author    Sergei Golubchik <vuvova@gmail.com>  2015-05-04 19:17:21 +0200
committer Sergei Golubchik <vuvova@gmail.com>  2015-05-04 19:17:21 +0200
commit    6d06fbbd1dc25b3c12568f9038060dfdb69f9683 (patch)
tree      21e27f3fddc89f9dda6b337091464ba10c490123 /storage
parent    1645930d0bd02f79df3ebff412b90acdc15bd9a0 (diff)
download  mariadb-git-6d06fbbd1dc25b3c12568f9038060dfdb69f9683.tar.gz
move to storage/innobase
Diffstat (limited to 'storage')
-rw-r--r-- storage/innobase/CMakeLists.txt 433
-rw-r--r-- storage/innobase/COPYING.Google 30
-rw-r--r-- storage/innobase/COPYING.Percona 30
-rw-r--r-- storage/innobase/Doxyfile 1419
-rw-r--r-- storage/innobase/api/api0api.cc 4061
-rw-r--r-- storage/innobase/api/api0misc.cc 206
-rw-r--r-- storage/innobase/btr/btr0btr.cc 5105
-rw-r--r-- storage/innobase/btr/btr0cur.cc 5644
-rw-r--r-- storage/innobase/btr/btr0pcur.cc 595
-rw-r--r-- storage/innobase/btr/btr0sea.cc 2020
-rw-r--r-- storage/innobase/buf/buf0buddy.cc 721
-rw-r--r-- storage/innobase/buf/buf0buf.cc 5431
-rw-r--r-- storage/innobase/buf/buf0checksum.cc 155
-rw-r--r-- storage/innobase/buf/buf0dblwr.cc 1171
-rw-r--r-- storage/innobase/buf/buf0dump.cc 621
-rw-r--r-- storage/innobase/buf/buf0flu.cc 2674
-rw-r--r-- storage/innobase/buf/buf0lru.cc 2745
-rw-r--r-- storage/innobase/buf/buf0rea.cc 921
-rwxr-xr-x storage/innobase/compile-innodb 25
-rw-r--r-- storage/innobase/data/data0data.cc 750
-rw-r--r-- storage/innobase/data/data0type.cc 298
-rw-r--r-- storage/innobase/dict/dict0boot.cc 512
-rw-r--r-- storage/innobase/dict/dict0crea.cc 1845
-rw-r--r-- storage/innobase/dict/dict0dict.cc 6707
-rw-r--r-- storage/innobase/dict/dict0load.cc 3147
-rw-r--r-- storage/innobase/dict/dict0mem.cc 783
-rw-r--r-- storage/innobase/dict/dict0stats.cc 4182
-rw-r--r-- storage/innobase/dict/dict0stats_bg.cc 367
-rw-r--r-- storage/innobase/dyn/dyn0dyn.cc 66
-rw-r--r-- storage/innobase/eval/eval0eval.cc 950
-rw-r--r-- storage/innobase/eval/eval0proc.cc 296
-rw-r--r-- storage/innobase/fil/fil0fil.cc 6547
-rw-r--r-- storage/innobase/fsp/fsp0fsp.cc 4123
-rw-r--r-- storage/innobase/fts/Makefile.query 32
-rw-r--r-- storage/innobase/fts/fts0ast.cc 744
-rw-r--r-- storage/innobase/fts/fts0blex.cc 1957
-rw-r--r-- storage/innobase/fts/fts0blex.l 73
-rw-r--r-- storage/innobase/fts/fts0config.cc 564
-rw-r--r-- storage/innobase/fts/fts0fts.cc 7550
-rw-r--r-- storage/innobase/fts/fts0opt.cc 3201
-rw-r--r-- storage/innobase/fts/fts0pars.cc 2010
-rw-r--r-- storage/innobase/fts/fts0pars.y 294
-rw-r--r-- storage/innobase/fts/fts0que.cc 4473
-rw-r--r-- storage/innobase/fts/fts0sql.cc 363
-rw-r--r-- storage/innobase/fts/fts0tlex.cc 1952
-rw-r--r-- storage/innobase/fts/fts0tlex.l 68
-rwxr-xr-x storage/innobase/fts/make_parser.sh 49
-rw-r--r-- storage/innobase/fut/fut0fut.cc 31
-rw-r--r-- storage/innobase/fut/fut0lst.cc 530
-rw-r--r-- storage/innobase/ha/ha0ha.cc 498
-rw-r--r-- storage/innobase/ha/ha0storage.cc 184
-rw-r--r-- storage/innobase/ha/hash0hash.cc 403
-rw-r--r-- storage/innobase/ha_innodb.def 4
-rw-r--r-- storage/innobase/handler/ha_innodb.cc 17176
-rw-r--r-- storage/innobase/handler/ha_innodb.h 642
-rw-r--r-- storage/innobase/handler/handler0alter.cc 5966
-rw-r--r-- storage/innobase/handler/i_s.cc 8183
-rw-r--r-- storage/innobase/handler/i_s.h 60
-rw-r--r-- storage/innobase/ibuf/ibuf0ibuf.cc 5223
-rw-r--r-- storage/innobase/include/api0api.h 1304
-rw-r--r-- storage/innobase/include/api0misc.h 78
-rw-r--r-- storage/innobase/include/btr0btr.h 773
-rw-r--r-- storage/innobase/include/btr0btr.ic 290
-rw-r--r-- storage/innobase/include/btr0cur.h 937
-rw-r--r-- storage/innobase/include/btr0cur.ic 223
-rw-r--r-- storage/innobase/include/btr0pcur.h 548
-rw-r--r-- storage/innobase/include/btr0pcur.ic 606
-rw-r--r-- storage/innobase/include/btr0sea.h 288
-rw-r--r-- storage/innobase/include/btr0sea.ic 82
-rw-r--r-- storage/innobase/include/btr0types.h 203
-rw-r--r-- storage/innobase/include/buf0buddy.h 77
-rw-r--r-- storage/innobase/include/buf0buddy.ic 143
-rw-r--r-- storage/innobase/include/buf0buf.h 2179
-rw-r--r-- storage/innobase/include/buf0buf.ic 1460
-rw-r--r-- storage/innobase/include/buf0checksum.h 88
-rw-r--r-- storage/innobase/include/buf0dblwr.h 162
-rw-r--r-- storage/innobase/include/buf0dump.h 72
-rw-r--r-- storage/innobase/include/buf0flu.h 286
-rw-r--r-- storage/innobase/include/buf0flu.ic 139
-rw-r--r-- storage/innobase/include/buf0lru.h 310
-rw-r--r-- storage/innobase/include/buf0lru.ic 25
-rw-r--r-- storage/innobase/include/buf0rea.h 177
-rw-r--r-- storage/innobase/include/buf0types.h 120
-rw-r--r-- storage/innobase/include/data0data.h 536
-rw-r--r-- storage/innobase/include/data0data.ic 649
-rw-r--r-- storage/innobase/include/data0type.h 544
-rw-r--r-- storage/innobase/include/data0type.ic 711
-rw-r--r-- storage/innobase/include/data0types.h 36
-rw-r--r-- storage/innobase/include/db0err.h 161
-rw-r--r-- storage/innobase/include/dict0boot.h 342
-rw-r--r-- storage/innobase/include/dict0boot.ic 96
-rw-r--r-- storage/innobase/include/dict0crea.h 246
-rw-r--r-- storage/innobase/include/dict0crea.ic 98
-rw-r--r-- storage/innobase/include/dict0dict.h 1841
-rw-r--r-- storage/innobase/include/dict0dict.ic 1433
-rw-r--r-- storage/innobase/include/dict0load.h 428
-rw-r--r-- storage/innobase/include/dict0load.ic 26
-rw-r--r-- storage/innobase/include/dict0mem.h 1214
-rw-r--r-- storage/innobase/include/dict0mem.ic 74
-rw-r--r-- storage/innobase/include/dict0priv.h 63
-rw-r--r-- storage/innobase/include/dict0priv.ic 125
-rw-r--r-- storage/innobase/include/dict0stats.h 202
-rw-r--r-- storage/innobase/include/dict0stats.ic 236
-rw-r--r-- storage/innobase/include/dict0stats_bg.h 127
-rw-r--r-- storage/innobase/include/dict0stats_bg.ic 45
-rw-r--r-- storage/innobase/include/dict0types.h 91
-rw-r--r-- storage/innobase/include/dyn0dyn.h 199
-rw-r--r-- storage/innobase/include/dyn0dyn.ic 306
-rw-r--r-- storage/innobase/include/eval0eval.h 114
-rw-r--r-- storage/innobase/include/eval0eval.ic 255
-rw-r--r-- storage/innobase/include/eval0proc.h 104
-rw-r--r-- storage/innobase/include/eval0proc.ic 88
-rw-r--r-- storage/innobase/include/fil0fil.h 1019
-rw-r--r-- storage/innobase/include/fsp0fsp.h 747
-rw-r--r-- storage/innobase/include/fsp0fsp.ic 314
-rw-r--r-- storage/innobase/include/fsp0types.h 116
-rw-r--r-- storage/innobase/include/fts0ast.h 339
-rw-r--r-- storage/innobase/include/fts0blex.h 349
-rw-r--r-- storage/innobase/include/fts0fts.h 1039
-rw-r--r-- storage/innobase/include/fts0opt.h 37
-rw-r--r-- storage/innobase/include/fts0pars.h 72
-rw-r--r-- storage/innobase/include/fts0priv.h 653
-rw-r--r-- storage/innobase/include/fts0priv.ic 130
-rw-r--r-- storage/innobase/include/fts0tlex.h 349
-rw-r--r-- storage/innobase/include/fts0types.h 474
-rw-r--r-- storage/innobase/include/fts0types.ic 388
-rw-r--r-- storage/innobase/include/fts0vlc.ic 142
-rw-r--r-- storage/innobase/include/fut0fut.h 55
-rw-r--r-- storage/innobase/include/fut0fut.ic 56
-rw-r--r-- storage/innobase/include/fut0lst.h 217
-rw-r--r-- storage/innobase/include/fut0lst.ic 167
-rw-r--r-- storage/innobase/include/ha0ha.h 265
-rw-r--r-- storage/innobase/include/ha0ha.ic 246
-rw-r--r-- storage/innobase/include/ha0storage.h 140
-rw-r--r-- storage/innobase/include/ha0storage.ic 146
-rw-r--r-- storage/innobase/include/ha_prototypes.h 596
-rw-r--r-- storage/innobase/include/handler0alter.h 114
-rw-r--r-- storage/innobase/include/hash0hash.h 575
-rw-r--r-- storage/innobase/include/hash0hash.ic 225
-rw-r--r-- storage/innobase/include/ibuf0ibuf.h 467
-rw-r--r-- storage/innobase/include/ibuf0ibuf.ic 367
-rw-r--r-- storage/innobase/include/ibuf0types.h 31
-rw-r--r-- storage/innobase/include/lock0iter.h 69
-rw-r--r-- storage/innobase/include/lock0lock.h 979
-rw-r--r-- storage/innobase/include/lock0lock.ic 92
-rw-r--r-- storage/innobase/include/lock0priv.h 126
-rw-r--r-- storage/innobase/include/lock0priv.ic 67
-rw-r--r-- storage/innobase/include/lock0types.h 47
-rw-r--r-- storage/innobase/include/log0log.h 999
-rw-r--r-- storage/innobase/include/log0log.ic 462
-rw-r--r-- storage/innobase/include/log0recv.h 505
-rw-r--r-- storage/innobase/include/log0recv.ic 53
-rw-r--r-- storage/innobase/include/mach0data.h 418
-rw-r--r-- storage/innobase/include/mach0data.ic 881
-rw-r--r-- storage/innobase/include/mem0dbg.h 150
-rw-r--r-- storage/innobase/include/mem0dbg.ic 109
-rw-r--r-- storage/innobase/include/mem0mem.h 425
-rw-r--r-- storage/innobase/include/mem0mem.ic 649
-rw-r--r-- storage/innobase/include/mem0pool.h 121
-rw-r--r-- storage/innobase/include/mem0pool.ic 24
-rw-r--r-- storage/innobase/include/mtr0log.h 251
-rw-r--r-- storage/innobase/include/mtr0log.ic 276
-rw-r--r-- storage/innobase/include/mtr0mtr.h 420
-rw-r--r-- storage/innobase/include/mtr0mtr.ic 296
-rw-r--r-- storage/innobase/include/mtr0types.h 31
-rw-r--r-- storage/innobase/include/os0file.h 1289
-rw-r--r-- storage/innobase/include/os0file.ic 449
-rw-r--r-- storage/innobase/include/os0once.h 125
-rw-r--r-- storage/innobase/include/os0proc.h 77
-rw-r--r-- storage/innobase/include/os0proc.ic 27
-rw-r--r-- storage/innobase/include/os0sync.h 743
-rw-r--r-- storage/innobase/include/os0sync.ic 234
-rw-r--r-- storage/innobase/include/os0thread.h 154
-rw-r--r-- storage/innobase/include/os0thread.ic 25
-rw-r--r-- storage/innobase/include/page0cur.h 387
-rw-r--r-- storage/innobase/include/page0cur.ic 317
-rw-r--r-- storage/innobase/include/page0page.h 1122
-rw-r--r-- storage/innobase/include/page0page.ic 1176
-rw-r--r-- storage/innobase/include/page0types.h 169
-rw-r--r-- storage/innobase/include/page0zip.h 538
-rw-r--r-- storage/innobase/include/page0zip.ic 456
-rw-r--r-- storage/innobase/include/pars0grm.h 261
-rw-r--r-- storage/innobase/include/pars0opt.h 75
-rw-r--r-- storage/innobase/include/pars0opt.ic 24
-rw-r--r-- storage/innobase/include/pars0pars.h 826
-rw-r--r-- storage/innobase/include/pars0pars.ic 24
-rw-r--r-- storage/innobase/include/pars0sym.h 258
-rw-r--r-- storage/innobase/include/pars0sym.ic 24
-rw-r--r-- storage/innobase/include/pars0types.h 50
-rw-r--r-- storage/innobase/include/que0que.h 530
-rw-r--r-- storage/innobase/include/que0que.ic 309
-rw-r--r-- storage/innobase/include/que0types.h 57
-rw-r--r-- storage/innobase/include/read0read.h 193
-rw-r--r-- storage/innobase/include/read0read.ic 148
-rw-r--r-- storage/innobase/include/read0types.h 32
-rw-r--r-- storage/innobase/include/rem0cmp.h 301
-rw-r--r-- storage/innobase/include/rem0cmp.ic 186
-rw-r--r-- storage/innobase/include/rem0rec.h 988
-rw-r--r-- storage/innobase/include/rem0rec.ic 1718
-rw-r--r-- storage/innobase/include/rem0types.h 74
-rw-r--r-- storage/innobase/include/row0ext.h 102
-rw-r--r-- storage/innobase/include/row0ext.ic 87
-rw-r--r-- storage/innobase/include/row0ftsort.h 279
-rw-r--r-- storage/innobase/include/row0import.h 91
-rw-r--r-- storage/innobase/include/row0import.ic 25
-rw-r--r-- storage/innobase/include/row0ins.h 240
-rw-r--r-- storage/innobase/include/row0ins.ic 26
-rw-r--r-- storage/innobase/include/row0log.h 239
-rw-r--r-- storage/innobase/include/row0log.ic 84
-rw-r--r-- storage/innobase/include/row0merge.h 430
-rw-r--r-- storage/innobase/include/row0mysql.h 915
-rw-r--r-- storage/innobase/include/row0mysql.ic 24
-rw-r--r-- storage/innobase/include/row0purge.h 128
-rw-r--r-- storage/innobase/include/row0purge.ic 25
-rw-r--r-- storage/innobase/include/row0quiesce.h 74
-rw-r--r-- storage/innobase/include/row0quiesce.ic 26
-rw-r--r-- storage/innobase/include/row0row.h 343
-rw-r--r-- storage/innobase/include/row0row.ic 174
-rw-r--r-- storage/innobase/include/row0sel.h 409
-rw-r--r-- storage/innobase/include/row0sel.ic 105
-rw-r--r-- storage/innobase/include/row0types.h 55
-rw-r--r-- storage/innobase/include/row0uins.h 54
-rw-r--r-- storage/innobase/include/row0uins.ic 25
-rw-r--r-- storage/innobase/include/row0umod.h 52
-rw-r--r-- storage/innobase/include/row0umod.ic 24
-rw-r--r-- storage/innobase/include/row0undo.h 135
-rw-r--r-- storage/innobase/include/row0undo.ic 24
-rw-r--r-- storage/innobase/include/row0upd.h 540
-rw-r--r-- storage/innobase/include/row0upd.ic 188
-rw-r--r-- storage/innobase/include/row0vers.h 146
-rw-r--r-- storage/innobase/include/row0vers.ic 30
-rw-r--r-- storage/innobase/include/srv0conc.h 111
-rw-r--r-- storage/innobase/include/srv0mon.h 896
-rw-r--r-- storage/innobase/include/srv0mon.ic 113
-rw-r--r-- storage/innobase/include/srv0srv.h 888
-rw-r--r-- storage/innobase/include/srv0srv.ic 24
-rw-r--r-- storage/innobase/include/srv0start.h 167
-rw-r--r-- storage/innobase/include/sync0arr.h 155
-rw-r--r-- storage/innobase/include/sync0arr.ic 64
-rw-r--r-- storage/innobase/include/sync0rw.h 813
-rw-r--r-- storage/innobase/include/sync0rw.ic 797
-rw-r--r-- storage/innobase/include/sync0sync.h 845
-rw-r--r-- storage/innobase/include/sync0sync.ic 414
-rw-r--r-- storage/innobase/include/sync0types.h 31
-rw-r--r-- storage/innobase/include/trx0i_s.h 311
-rw-r--r-- storage/innobase/include/trx0purge.h 218
-rw-r--r-- storage/innobase/include/trx0purge.ic 62
-rw-r--r-- storage/innobase/include/trx0rec.h 326
-rw-r--r-- storage/innobase/include/trx0rec.ic 113
-rw-r--r-- storage/innobase/include/trx0roll.h 297
-rw-r--r-- storage/innobase/include/trx0roll.ic 40
-rw-r--r-- storage/innobase/include/trx0rseg.h 230
-rw-r--r-- storage/innobase/include/trx0rseg.ic 167
-rw-r--r-- storage/innobase/include/trx0sys.h 674
-rw-r--r-- storage/innobase/include/trx0sys.ic 512
-rw-r--r-- storage/innobase/include/trx0trx.h 1116
-rw-r--r-- storage/innobase/include/trx0trx.ic 180
-rw-r--r-- storage/innobase/include/trx0types.h 147
-rw-r--r-- storage/innobase/include/trx0undo.h 604
-rw-r--r-- storage/innobase/include/trx0undo.ic 363
-rw-r--r-- storage/innobase/include/trx0xa.h 70
-rw-r--r-- storage/innobase/include/univ.i 667
-rw-r--r-- storage/innobase/include/usr0sess.h 77
-rw-r--r-- storage/innobase/include/usr0sess.ic 24
-rw-r--r-- storage/innobase/include/usr0types.h 31
-rw-r--r-- storage/innobase/include/ut0bh.h 152
-rw-r--r-- storage/innobase/include/ut0bh.ic 125
-rw-r--r-- storage/innobase/include/ut0byte.h 119
-rw-r--r-- storage/innobase/include/ut0byte.ic 173
-rw-r--r-- storage/innobase/include/ut0counter.h 203
-rw-r--r-- storage/innobase/include/ut0crc32.h 51
-rw-r--r-- storage/innobase/include/ut0dbg.h 132
-rw-r--r-- storage/innobase/include/ut0list.h 180
-rw-r--r-- storage/innobase/include/ut0list.ic 60
-rw-r--r-- storage/innobase/include/ut0lst.h 408
-rw-r--r-- storage/innobase/include/ut0mem.h 261
-rw-r--r-- storage/innobase/include/ut0mem.ic 317
-rw-r--r-- storage/innobase/include/ut0rbt.h 324
-rw-r--r-- storage/innobase/include/ut0rnd.h 148
-rw-r--r-- storage/innobase/include/ut0rnd.ic 255
-rw-r--r-- storage/innobase/include/ut0sort.h 106
-rw-r--r-- storage/innobase/include/ut0ut.h 497
-rw-r--r-- storage/innobase/include/ut0ut.ic 162
-rw-r--r-- storage/innobase/include/ut0vec.h 337
-rw-r--r-- storage/innobase/include/ut0vec.ic 425
-rw-r--r-- storage/innobase/include/ut0wqueue.h 105
-rw-r--r-- storage/innobase/lock/lock0iter.cc 111
-rw-r--r-- storage/innobase/lock/lock0lock.cc 7104
-rw-r--r-- storage/innobase/lock/lock0wait.cc 543
-rw-r--r-- storage/innobase/log/log0log.cc 3739
-rw-r--r-- storage/innobase/log/log0recv.cc 4019
-rw-r--r-- storage/innobase/mach/mach0data.cc 94
-rw-r--r-- storage/innobase/mem/mem0dbg.cc 1050
-rw-r--r-- storage/innobase/mem/mem0mem.cc 583
-rw-r--r-- storage/innobase/mem/mem0pool.cc 727
-rw-r--r-- storage/innobase/mtr/mtr0log.cc 609
-rw-r--r-- storage/innobase/mtr/mtr0mtr.cc 439
-rw-r--r-- storage/innobase/os/os0file.cc 5807
-rw-r--r-- storage/innobase/os/os0proc.cc 232
-rw-r--r-- storage/innobase/os/os0sync.cc 934
-rw-r--r-- storage/innobase/os/os0thread.cc 263
-rw-r--r-- storage/innobase/page/page0cur.cc 2145
-rw-r--r-- storage/innobase/page/page0page.cc 2813
-rw-r--r-- storage/innobase/page/page0zip.cc 4948
-rw-r--r-- storage/innobase/pars/lexyy.cc 3130
-rwxr-xr-x storage/innobase/pars/make_bison.sh 32
-rwxr-xr-x storage/innobase/pars/make_flex.sh 48
-rw-r--r-- storage/innobase/pars/pars0grm.cc 3034
-rw-r--r-- storage/innobase/pars/pars0grm.y 732
-rw-r--r-- storage/innobase/pars/pars0lex.l 704
-rw-r--r-- storage/innobase/pars/pars0opt.cc 1259
-rw-r--r-- storage/innobase/pars/pars0pars.cc 2668
-rw-r--r-- storage/innobase/pars/pars0sym.cc 440
-rw-r--r-- storage/innobase/que/que0que.cc 1318
-rw-r--r-- storage/innobase/read/read0read.cc 654
-rw-r--r-- storage/innobase/rem/rem0cmp.cc 1458
-rw-r--r-- storage/innobase/rem/rem0rec.cc 1963
-rw-r--r-- storage/innobase/row/row0ext.cc 142
-rw-r--r-- storage/innobase/row/row0ftsort.cc 1570
-rw-r--r-- storage/innobase/row/row0import.cc 3806
-rw-r--r-- storage/innobase/row/row0ins.cc 3328
-rw-r--r-- storage/innobase/row/row0log.cc 3633
-rw-r--r-- storage/innobase/row/row0merge.cc 3732
-rw-r--r-- storage/innobase/row/row0mysql.cc 5446
-rw-r--r-- storage/innobase/row/row0purge.cc 988
-rw-r--r-- storage/innobase/row/row0quiesce.cc 702
-rw-r--r-- storage/innobase/row/row0row.cc 1252
-rw-r--r-- storage/innobase/row/row0sel.cc 5385
-rw-r--r-- storage/innobase/row/row0uins.cc 475
-rw-r--r-- storage/innobase/row/row0umod.cc 1160
-rw-r--r-- storage/innobase/row/row0undo.cc 375
-rw-r--r-- storage/innobase/row/row0upd.cc 2698
-rw-r--r-- storage/innobase/row/row0vers.cc 773
-rw-r--r-- storage/innobase/srv/srv0conc.cc 597
-rw-r--r-- storage/innobase/srv/srv0mon.cc 1931
-rw-r--r-- storage/innobase/srv/srv0srv.cc 2880
-rw-r--r-- storage/innobase/srv/srv0start.cc 3219
-rw-r--r-- storage/innobase/sync/sync0arr.cc 1156
-rw-r--r-- storage/innobase/sync/sync0rw.cc 1049
-rw-r--r-- storage/innobase/sync/sync0sync.cc 1608
-rw-r--r-- storage/innobase/trx/trx0i_s.cc 1667
-rw-r--r-- storage/innobase/trx/trx0purge.cc 1405
-rw-r--r-- storage/innobase/trx/trx0rec.cc 1656
-rw-r--r-- storage/innobase/trx/trx0roll.cc 1391
-rw-r--r-- storage/innobase/trx/trx0rseg.cc 425
-rw-r--r-- storage/innobase/trx/trx0sys.cc 1311
-rw-r--r-- storage/innobase/trx/trx0trx.cc 2306
-rw-r--r-- storage/innobase/trx/trx0undo.cc 2026
-rw-r--r-- storage/innobase/usr/usr0sess.cc 68
-rw-r--r-- storage/innobase/ut/ut0bh.cc 159
-rw-r--r-- storage/innobase/ut/ut0byte.cc 30
-rw-r--r-- storage/innobase/ut/ut0crc32.cc 318
-rw-r--r-- storage/innobase/ut/ut0dbg.cc 139
-rw-r--r-- storage/innobase/ut/ut0list.cc 203
-rw-r--r-- storage/innobase/ut/ut0mem.cc 609
-rw-r--r-- storage/innobase/ut/ut0rbt.cc 1328
-rw-r--r-- storage/innobase/ut/ut0rnd.cc 97
-rw-r--r-- storage/innobase/ut/ut0ut.cc 840
-rw-r--r-- storage/innobase/ut/ut0vec.cc 78
-rw-r--r-- storage/innobase/ut/ut0wqueue.cc 175
360 files changed, 317343 insertions, 0 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
new file mode 100644
index 00000000000..eeb53f96c9f
--- /dev/null
+++ b/storage/innobase/CMakeLists.txt
@@ -0,0 +1,433 @@
+# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+# This is the CMakeLists for InnoDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+
+# OS tests
+IF(UNIX)
+ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+ CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
+ ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+ IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+ ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+ LINK_LIBRARIES(aio)
+ ENDIF()
+ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
+ ADD_DEFINITIONS("-DUNIV_HPUX")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
+ ADD_DEFINITIONS("-DUNIV_AIX")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+ ADD_DEFINITIONS("-DUNIV_SOLARIS")
+ ENDIF()
+ENDIF()
+
+IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+# After: WL#5825 Using C++ Standard Library with MySQL code
+# we no longer use -fno-exceptions
+# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG and UNIV_SYNC_DEBUG in debug builds
+SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG -DUNIV_SYNC_DEBUG")
+
+# Add -Wconversion if compiling with GCC
+## As of Mar 15 2011 this flag causes 3573+ warnings. If you are reading this
+## please fix them and enable the following code:
+#IF(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+#SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion")
+#ENDIF()
+
+CHECK_FUNCTION_EXISTS(sched_getcpu HAVE_SCHED_GETCPU)
+
+IF(NOT MSVC)
+# either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+IF(NOT CMAKE_CROSSCOMPILING)
+ CHECK_C_SOURCE_RUNS(
+ "
+ int main()
+ {
+ long x;
+ long y;
+ long res;
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x, y);
+ if (!res || x != y) {
+ return(1);
+ }
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x + 1, y);
+ if (res || x != 10) {
+ return(1);
+ }
+ x = 10;
+ y = 123;
+ res = __sync_add_and_fetch(&x, y);
+ if (res != 123 + 10 || x != 123 + 10) {
+ return(1);
+ }
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_BUILTINS
+ )
+ CHECK_C_SOURCE_RUNS(
+ "
+ int main()
+ {
+ long res;
+ char c;
+
+ c = 10;
+ res = __sync_lock_test_and_set(&c, 123);
+ if (res != 10 || c != 123) {
+ return(1);
+ }
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE
+ )
+ CHECK_C_SOURCE_RUNS(
+ "#include<stdint.h>
+ int main()
+ {
+ int64_t x,y,res;
+
+ x = 10;
+ y = 123;
+ res = __sync_sub_and_fetch(&y, x);
+ if (res != y || y != 113) {
+ return(1);
+ }
+ res = __sync_add_and_fetch(&y, x);
+ if (res != y || y != 123) {
+ return(1);
+ }
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_BUILTINS_64
+ )
+ CHECK_C_SOURCE_RUNS(
+ "#include<stdint.h>
+ int main()
+ {
+ __sync_synchronize();
+ return(0);
+ }"
+ HAVE_IB_GCC_SYNC_SYNCHRONISE
+ )
+ CHECK_C_SOURCE_RUNS(
+ "#include<stdint.h>
+ int main()
+ {
+ __atomic_thread_fence(__ATOMIC_ACQUIRE);
+ __atomic_thread_fence(__ATOMIC_RELEASE);
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_THREAD_FENCE
+ )
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_BYTE=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS_64)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS_64=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_SYNC_SYNCHRONISE)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_SYNC_SYNCHRONISE=1)
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_THREAD_FENCE)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_THREAD_FENCE=1)
+ENDIF()
+
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+IF(NOT CMAKE_CROSSCOMPILING)
+ CHECK_C_SOURCE_RUNS(
+ "
+ #include <pthread.h>
+ #include <string.h>
+
+ int main() {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ __sync_bool_compare_and_swap(&x1, x2, x3);
+
+ return(0);
+ }"
+ HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ENDIF()
+IF(HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_GCC=1)
+ENDIF()
+
+ENDIF(NOT MSVC)
+
+CHECK_FUNCTION_EXISTS(asprintf HAVE_ASPRINTF)
+CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF)
+
+# Solaris atomics
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+ CHECK_FUNCTION_EXISTS(atomic_cas_ulong HAVE_ATOMIC_CAS_ULONG)
+ CHECK_FUNCTION_EXISTS(atomic_cas_32 HAVE_ATOMIC_CAS_32)
+ CHECK_FUNCTION_EXISTS(atomic_cas_64 HAVE_ATOMIC_CAS_64)
+ CHECK_FUNCTION_EXISTS(atomic_add_long_nv HAVE_ATOMIC_ADD_LONG_NV)
+ CHECK_FUNCTION_EXISTS(atomic_swap_uchar HAVE_ATOMIC_SWAP_UCHAR)
+ IF(HAVE_ATOMIC_CAS_ULONG AND
+ HAVE_ATOMIC_CAS_32 AND
+ HAVE_ATOMIC_CAS_64 AND
+ HAVE_ATOMIC_ADD_LONG_NV AND
+ HAVE_ATOMIC_SWAP_UCHAR)
+ SET(HAVE_IB_SOLARIS_ATOMICS 1)
+ ENDIF()
+
+ IF(HAVE_IB_SOLARIS_ATOMICS)
+ ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
+ ENDIF()
+
+ IF(NOT CMAKE_CROSSCOMPILING)
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+ CHECK_C_SOURCE_COMPILES(
+ " #include <pthread.h>
+ #include <string.h>
+
+ int main(int argc, char** argv) {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ if (sizeof(pthread_t) == 4) {
+
+ atomic_cas_32(&x1, x2, x3);
+
+ } else if (sizeof(pthread_t) == 8) {
+
+ atomic_cas_64(&x1, x2, x3);
+
+ } else {
+
+ return(1);
+ }
+
+ return(0);
+ }
+ " HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+ CHECK_C_SOURCE_COMPILES(
+ "#include <mbarrier.h>
+ int main() {
+ __machine_r_barrier();
+ __machine_w_barrier();
+ return(0);
+ }"
+ HAVE_IB_MACHINE_BARRIER_SOLARIS)
+ ENDIF()
+ IF(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+ ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_SOLARIS=1)
+ ENDIF()
+ IF(HAVE_IB_MACHINE_BARRIER_SOLARIS)
+ ADD_DEFINITIONS(-DHAVE_IB_MACHINE_BARRIER_SOLARIS=1)
+ ENDIF()
+ENDIF()
+
+
+IF(UNIX)
+# this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+# to use in the source
+SET(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
+CHECK_TYPE_SIZE(pthread_t SIZEOF_PTHREAD_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES)
+ENDIF()
+
+IF(SIZEOF_PTHREAD_T)
+ ADD_DEFINITIONS(-DSIZEOF_PTHREAD_T=${SIZEOF_PTHREAD_T})
+ENDIF()
+
+IF(MSVC)
+ ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS)
+ ADD_DEFINITIONS(-DHAVE_WINDOWS_MM_FENCE)
+ENDIF()
+
+
+# Include directories under innobase
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
+ ${CMAKE_SOURCE_DIR}/storage/innobase/handler)
+
+# Sun Studio bug with -xO2
+IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro"
+ AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2"
+ AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+ # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
+ # -xO3
+ SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc
+ PROPERTIES COMPILE_FLAGS -xO3)
+ENDIF()
+
+# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
+# due to a 64-bit compiler error; see MySQL Bugs #19424, #36366, #34297
+IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+ SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.cc mem/mem0pool.cc
+ PROPERTIES COMPILE_FLAGS -Od)
+ENDIF()
+
+SET(INNOBASE_SOURCES
+ api/api0api.cc
+ api/api0misc.cc
+ btr/btr0btr.cc
+ btr/btr0cur.cc
+ btr/btr0pcur.cc
+ btr/btr0sea.cc
+ buf/buf0buddy.cc
+ buf/buf0buf.cc
+ buf/buf0dblwr.cc
+ buf/buf0checksum.cc
+ buf/buf0dump.cc
+ buf/buf0flu.cc
+ buf/buf0lru.cc
+ buf/buf0rea.cc
+ data/data0data.cc
+ data/data0type.cc
+ dict/dict0boot.cc
+ dict/dict0crea.cc
+ dict/dict0dict.cc
+ dict/dict0load.cc
+ dict/dict0mem.cc
+ dict/dict0stats.cc
+ dict/dict0stats_bg.cc
+ dyn/dyn0dyn.cc
+ eval/eval0eval.cc
+ eval/eval0proc.cc
+ fil/fil0fil.cc
+ fsp/fsp0fsp.cc
+ fut/fut0fut.cc
+ fut/fut0lst.cc
+ ha/ha0ha.cc
+ ha/ha0storage.cc
+ ha/hash0hash.cc
+ fts/fts0fts.cc
+ fts/fts0ast.cc
+ fts/fts0blex.cc
+ fts/fts0config.cc
+ fts/fts0opt.cc
+ fts/fts0pars.cc
+ fts/fts0que.cc
+ fts/fts0sql.cc
+ fts/fts0tlex.cc
+ handler/ha_innodb.cc
+ handler/handler0alter.cc
+ handler/i_s.cc
+ ibuf/ibuf0ibuf.cc
+ lock/lock0iter.cc
+ lock/lock0lock.cc
+ lock/lock0wait.cc
+ log/log0log.cc
+ log/log0recv.cc
+ mach/mach0data.cc
+ mem/mem0mem.cc
+ mem/mem0pool.cc
+ mtr/mtr0log.cc
+ mtr/mtr0mtr.cc
+ os/os0file.cc
+ os/os0proc.cc
+ os/os0sync.cc
+ os/os0thread.cc
+ page/page0cur.cc
+ page/page0page.cc
+ page/page0zip.cc
+ pars/lexyy.cc
+ pars/pars0grm.cc
+ pars/pars0opt.cc
+ pars/pars0pars.cc
+ pars/pars0sym.cc
+ que/que0que.cc
+ read/read0read.cc
+ rem/rem0cmp.cc
+ rem/rem0rec.cc
+ row/row0ext.cc
+ row/row0ftsort.cc
+ row/row0import.cc
+ row/row0ins.cc
+ row/row0merge.cc
+ row/row0mysql.cc
+ row/row0log.cc
+ row/row0purge.cc
+ row/row0row.cc
+ row/row0sel.cc
+ row/row0uins.cc
+ row/row0umod.cc
+ row/row0undo.cc
+ row/row0upd.cc
+ row/row0quiesce.cc
+ row/row0vers.cc
+ srv/srv0conc.cc
+ srv/srv0mon.cc
+ srv/srv0srv.cc
+ srv/srv0start.cc
+ sync/sync0arr.cc
+ sync/sync0rw.cc
+ sync/sync0sync.cc
+ trx/trx0i_s.cc
+ trx/trx0purge.cc
+ trx/trx0rec.cc
+ trx/trx0roll.cc
+ trx/trx0rseg.cc
+ trx/trx0sys.cc
+ trx/trx0trx.cc
+ trx/trx0undo.cc
+ usr/usr0sess.cc
+ ut/ut0bh.cc
+ ut/ut0byte.cc
+ ut/ut0crc32.cc
+ ut/ut0dbg.cc
+ ut/ut0list.cc
+ ut/ut0mem.cc
+ ut/ut0rbt.cc
+ ut/ut0rnd.cc
+ ut/ut0ut.cc
+ ut/ut0vec.cc
+ ut/ut0wqueue.cc)
+
+IF(WITH_INNODB)
+ # Legacy option
+ SET(WITH_INNOBASE_STORAGE_ENGINE TRUE)
+ENDIF()
+
+MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
+ DEFAULT
+ MODULE_OUTPUT_NAME ha_innodb
+ LINK_LIBRARIES ${ZLIB_LIBRARY})
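Note: the CHECK_C_SOURCE_RUNS probes in the CMakeLists above compile and run small test programs, and each successful probe is turned into a -D macro by the matching ADD_DEFINITIONS call. The sketch below is an illustration only, not InnoDB source: it shows the kind of code such macros end up gating, assuming HAVE_IB_GCC_ATOMIC_BUILTINS and HAVE_IB_ATOMIC_PTHREAD_T_GCC are defined as above; the function names and the non-atomic fallbacks are invented for the example.

    #include <pthread.h>

    /* Hypothetical sketch: branch on the macros detected above. */
    static long
    counter_add(long* counter, long amount)
    {
    #ifdef HAVE_IB_GCC_ATOMIC_BUILTINS
        /* Atomic read-modify-write, exactly what the probe exercised. */
        return(__sync_add_and_fetch(counter, amount));
    #else
        /* Fallback; a real build would protect this with a mutex. */
        *counter += amount;
        return(*counter);
    #endif
    }

    static int
    owner_cas(pthread_t* slot, pthread_t old_owner, pthread_t new_owner)
    {
    #ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC
        /* The probe confirmed pthread_t can be swapped atomically here. */
        return(__sync_bool_compare_and_swap(slot, old_owner, new_owner));
    #else
        /* Non-atomic fallback, for the sketch only. */
        if (pthread_equal(*slot, old_owner)) {
            *slot = new_owner;
            return(1);
        }
        return(0);
    #endif
    }

Where pthread_t is a struct rather than an integer or pointer, the compare-and-swap probe fails to compile, the macro stays undefined, and only the fallback path is built; that is precisely what the run-time checks are for.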
diff --git a/storage/innobase/COPYING.Google b/storage/innobase/COPYING.Google
new file mode 100644
index 00000000000..5ade2b0e381
--- /dev/null
+++ b/storage/innobase/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Google Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/COPYING.Percona b/storage/innobase/COPYING.Percona
new file mode 100644
index 00000000000..8c786811719
--- /dev/null
+++ b/storage/innobase/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, 2009, Percona Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Percona Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/Doxyfile b/storage/innobase/Doxyfile
new file mode 100644
index 00000000000..7cf5048fa52
--- /dev/null
+++ b/storage/innobase/Doxyfile
@@ -0,0 +1,1419 @@
+# Doxyfile 1.5.6
+
+# Usage: SVNVERSION=-r$(svnversion) doxygen
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "InnoDB Plugin"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = 1.0$(SVNVERSION)
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = dox
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge number of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek,
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
+# and Ukrainian.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names, as on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class, and the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface, are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = . include/univ.i
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS = *.c *.ic *.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
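+
+# Illustrative example only (this project uses no filter): a filter such as
+#   INPUT_FILTER = "sed -e s/UNIV_INTERN//"
+# would strip the UNIV_INTERN linkage macro from each file before parsing.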
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen's
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls whether a separate .chi index file is generated (YES) or
+# included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to FRAME, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet Explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature. Other possible values
+# for this tag are: HIERARCHIES, which will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list;
+# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which
+# disables this behavior completely. For backwards compatibility with previous
+# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE
+# respectively.
+
+GENERATE_TREEVIEW = NONE
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# Use this tag to change the font size of LaTeX formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate LaTeX output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate the index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded, use the := operator
+# instead of the = operator.
+
+PREDEFINED = DOXYGEN UNIV_DEBUG UNIV_SYNC_DEBUG __attribute__()=
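+
+# With __attribute__()= in the list above, GCC attribute annotations are
+# removed before parsing, so a declaration like (illustrative)
+#   void foo() __attribute__((nonnull));
+# is seen by doxygen as a plain "void foo();".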
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED = UT_LIST_BASE_NODE_T UT_LIST_NODE_T
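+
+# For example (illustrative), a member declared in the sources as
+#   UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+# is expanded using the macro's real definition, so the generated
+# documentation shows the underlying list structure.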
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = YES
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible Unicode characters, however, so when you
+# need these (or just want a different-looking font) you can specify the font
+# name using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = FreeSans
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = NO
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will show a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 3
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is enabled by default, which results in a transparent
+# background. Warning: Depending on the platform used, enabling this option
+# may lead to badly anti-aliased labels on the edges of a graph (i.e. they
+# become hard to read).
+
+DOT_TRANSPARENT = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/storage/innobase/api/api0api.cc b/storage/innobase/api/api0api.cc
new file mode 100644
index 00000000000..2f5999e9a3a
--- /dev/null
+++ b/storage/innobase/api/api0api.cc
@@ -0,0 +1,4061 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file api/api0api.cc
+InnoDB Native API
+
+2008-08-01 Created Sunny Bains
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include "api0api.h"
+#include "api0misc.h"
+#include "srv0start.h"
+#include "dict0dict.h"
+#include "btr0pcur.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "trx0roll.h"
+#include "dict0crea.h"
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "lock0types.h"
+#include "row0sel.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "ut0dbg.h"
+#include "dict0priv.h"
+#include "ut0ut.h"
+#include "ha_prototypes.h"
+#include "trx0roll.h"
+
+/** configure variable for binlog option with InnoDB APIs */
+my_bool ib_binlog_enabled = FALSE;
+
+/** configure variable for MDL option with InnoDB APIs */
+my_bool ib_mdl_enabled = FALSE;
+
+/** configure variable for disable rowlock with InnoDB APIs */
+my_bool ib_disable_row_lock = FALSE;
+
+/** configure variable for Transaction isolation levels */
+ulong ib_trx_level_setting = IB_TRX_READ_UNCOMMITTED;
+
+/** configure variable for background commit interval in seconds */
+ulong ib_bk_commit_interval = 0;
+
+/** InnoDB tuple types. */
+enum ib_tuple_type_t{
+ TPL_TYPE_ROW, /*!< Data row tuple */
+ TPL_TYPE_KEY /*!< Index key tuple */
+};
+
+/** Query types supported. */
+enum ib_qry_type_t{
+ QRY_NON, /*!< None/Sentinel */
+ QRY_INS, /*!< Insert operation */
+ QRY_UPD, /*!< Update operation */
+ QRY_SEL /*!< Select operation */
+};
+
+/** Query graph types. */
+struct ib_qry_grph_t {
+ que_fork_t* ins; /*!< Innobase SQL query graph used
+ in inserts */
+ que_fork_t* upd; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ que_fork_t* sel; /*!< dummy query graph used in
+ selects */
+};
+
+/** Query node types. */
+struct ib_qry_node_t {
+ ins_node_t* ins; /*!< Innobase SQL insert node
+ used to perform inserts to the table */
+ upd_node_t* upd; /*!< Innobase SQL update node
+ used to perform updates and deletes */
+ sel_node_t* sel; /*!< Innobase SQL select node
+ used to perform selects on the table */
+};
+
+/** Query processing fields. */
+struct ib_qry_proc_t {
+
+ ib_qry_node_t node; /*!< Query node*/
+
+ ib_qry_grph_t grph; /*!< Query graph */
+};
+
+/** Cursor instance for traversing tables/indexes. This will eventually
+become row_prebuilt_t. */
+struct ib_cursor_t {
+ mem_heap_t* heap; /*!< Instance heap */
+
+ mem_heap_t* query_heap; /*!< Heap to use for query graphs */
+
+ ib_qry_proc_t q_proc; /*!< Query processing info */
+
+ ib_match_mode_t match_mode; /*!< ib_cursor_moveto match mode */
+
+ row_prebuilt_t* prebuilt; /*!< For reading rows */
+
+ bool valid_trx; /*!< Valid transaction attached */
+};
+
+/** InnoDB table columns used during table and index schema creation. */
+struct ib_col_t {
+ const char* name; /*!< Name of column */
+
+ ib_col_type_t ib_col_type; /*!< Main type of the column */
+
+ ulint len; /*!< Length of the column */
+
+ ib_col_attr_t ib_col_attr; /*!< Column attributes */
+
+};
+
+/** InnoDB index columns used during index and index schema creation. */
+struct ib_key_col_t {
+ const char* name; /*!< Name of column */
+
+ ulint prefix_len; /*!< Column index prefix len or 0 */
+};
+
+struct ib_table_def_t;
+
+/** InnoDB index schema used during index creation */
+struct ib_index_def_t {
+ mem_heap_t* heap; /*!< Heap used to build this and all
+ its columns in the list */
+
+ const char* name; /*!< Index name */
+
+ dict_table_t* table; /*!< Parent InnoDB table */
+
+ ib_table_def_t* schema; /*!< Parent table schema that owns
+ this instance */
+
+ ibool clustered; /*!< True if clustered index */
+
+ ibool unique; /*!< True if unique index */
+
+ ib_vector_t* cols; /*!< Vector of columns */
+
+	trx_t*		usr_trx;	/*!< User transaction covering the
+ DDL operations */
+};
+
+/** InnoDB table schema used during table creation */
+struct ib_table_def_t {
+ mem_heap_t* heap; /*!< Heap used to build this and all
+ its columns in the list */
+ const char* name; /*!< Table name */
+
+ ib_tbl_fmt_t ib_tbl_fmt; /*!< Row format */
+
+ ulint page_size; /*!< Page size */
+
+ ib_vector_t* cols; /*!< Vector of columns */
+
+ ib_vector_t* indexes; /*!< Vector of indexes */
+
+ dict_table_t* table; /* Table read from or NULL */
+};
+
+/** InnoDB tuple used for key operations. */
+struct ib_tuple_t {
+ mem_heap_t* heap; /*!< Heap used to build
+ this and for copying
+ the column values. */
+
+	ib_tuple_type_t	type;		/*!< Tuple discriminator. */
+
+ const dict_index_t* index; /*!< Index for tuple can be either
+ secondary or cluster index. */
+
+ dtuple_t* ptr; /*!< The internal tuple
+ instance */
+};
+
+/** The following counter is used to convey information to InnoDB
+about server activity: in case of normal DML ops it is not
+sensible to call srv_active_wake_master_thread after each
+operation, so we only do it every INNOBASE_WAKE_INTERVAL'th step. */
+
+#define INNOBASE_WAKE_INTERVAL 32
+
+/*****************************************************************//**
+Check whether the Innodb persistent cursor is positioned.
+@return IB_TRUE if positioned */
+UNIV_INLINE
+ib_bool_t
+ib_btr_cursor_is_positioned(
+/*========================*/
+ btr_pcur_t* pcur) /*!< in: InnoDB persistent cursor */
+{
+ return(pcur->old_stored == BTR_PCUR_OLD_STORED
+ && (pcur->pos_state == BTR_PCUR_IS_POSITIONED
+ || pcur->pos_state == BTR_PCUR_WAS_POSITIONED));
+}
+
+/********************************************************************//**
+Open a table using the table id, if found then increment table ref count.
+@return table instance if found */
+static
+dict_table_t*
+ib_open_table_by_id(
+/*================*/
+ ib_id_u64_t tid, /*!< in: table id to lookup */
+ ib_bool_t locked) /*!< in: TRUE if own dict mutex */
+{
+ dict_table_t* table;
+ table_id_t table_id;
+
+ table_id = tid;
+
+ if (!locked) {
+ dict_mutex_enter_for_mysql();
+ }
+
+ table = dict_table_open_on_id(table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (table != NULL && table->ibd_file_missing) {
+ table = NULL;
+ }
+
+ if (!locked) {
+ dict_mutex_exit_for_mysql();
+ }
+
+ return(table);
+}
+
+/********************************************************************//**
+Open a table using the table name, if found then increment table ref count.
+@return table instance if found */
+UNIV_INTERN
+void*
+ib_open_table_by_name(
+/*==================*/
+ const char* name) /*!< in: table name to lookup */
+{
+ dict_table_t* table;
+
+ table = dict_table_open_on_name(name, FALSE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (table != NULL && table->ibd_file_missing) {
+ table = NULL;
+ }
+
+ return(table);
+}
+
+/********************************************************************//**
+Find table using table name.
+@return table instance if found */
+static
+dict_table_t*
+ib_lookup_table_by_name(
+/*====================*/
+ const char* name) /*!< in: table name to lookup */
+{
+ dict_table_t* table;
+
+ table = dict_table_get_low(name);
+
+ if (table != NULL && table->ibd_file_missing) {
+ table = NULL;
+ }
+
+ return(table);
+}
+
+/********************************************************************//**
+Increments an activity counter and every INNOBASE_WAKE_INTERVAL'th
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+UNIV_INLINE
+void
+ib_wake_master_thread(void)
+/*=======================*/
+{
+ static ulint ib_signal_counter = 0;
+
+ ++ib_signal_counter;
+
+ if ((ib_signal_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
+
+/*********************************************************************//**
+Calculate the max row size of the columns in a cluster index.
+@return max row length */
+UNIV_INLINE
+ulint
+ib_get_max_row_len(
+/*===============*/
+ dict_index_t* cluster) /*!< in: cluster index */
+{
+ ulint i;
+ ulint max_len = 0;
+ ulint n_fields = cluster->n_fields;
+
+ /* Add the size of the ordering columns in the
+ clustered index. */
+ for (i = 0; i < n_fields; ++i) {
+ const dict_col_t* col;
+
+ col = dict_index_get_nth_col(cluster, i);
+
+ /* Use the maximum output size of
+ mach_write_compressed(), although the encoded
+ length should always fit in 2 bytes. */
+ max_len += dict_col_get_max_size(col);
+ }
+
+ return(max_len);
+}
+
+/*****************************************************************//**
+Read the columns from a rec into a tuple. */
+static
+void
+ib_read_tuple(
+/*==========*/
+ const rec_t* rec, /*!< in: Record to read */
+ ib_bool_t page_format, /*!< in: IB_TRUE if compressed format */
+ ib_tuple_t* tuple, /*!< in: tuple to read into */
+ void** rec_buf, /*!< in/out: row buffer */
+ ulint* len) /*!< in/out: buffer len */
+{
+ ulint i;
+ void* ptr;
+ rec_t* copy;
+ ulint rec_meta_data;
+ ulint n_index_fields;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ dtuple_t* dtuple = tuple->ptr;
+ const dict_index_t* index = tuple->index;
+ ulint offset_size;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &tuple->heap);
+
+ rec_meta_data = rec_get_info_bits(rec, page_format);
+ dtuple_set_info_bits(dtuple, rec_meta_data);
+
+ offset_size = rec_offs_size(offsets);
+
+ if (rec_buf && *rec_buf) {
+ if (*len < offset_size) {
+ free(*rec_buf);
+ *rec_buf = malloc(offset_size);
+ *len = offset_size;
+ }
+ ptr = *rec_buf;
+ } else {
+ /* Make a copy of the rec. */
+ ptr = mem_heap_alloc(tuple->heap, offset_size);
+ }
+
+ copy = rec_copy(ptr, rec, offsets);
+
+ n_index_fields = ut_min(
+ rec_offs_n_fields(offsets), dtuple_get_n_fields(dtuple));
+
+ for (i = 0; i < n_index_fields; ++i) {
+ ulint len;
+ const byte* data;
+ dfield_t* dfield;
+
+ if (tuple->type == TPL_TYPE_ROW) {
+ const dict_col_t* col;
+ ulint col_no;
+ const dict_field_t* index_field;
+
+ index_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(index_field);
+ col_no = dict_col_get_no(col);
+
+ dfield = dtuple_get_nth_field(dtuple, col_no);
+ } else {
+ dfield = dtuple_get_nth_field(dtuple, i);
+ }
+
+ data = rec_get_nth_field(copy, offsets, i, &len);
+
+ /* Fetch and copy any externally stored column. */
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ ulint zip_size;
+
+ zip_size = dict_table_zip_size(index->table);
+
+ data = btr_rec_copy_externally_stored_field(
+ copy, offsets, zip_size, i, &len,
+ tuple->heap);
+
+ ut_a(len != UNIV_SQL_NULL);
+ }
+
+ dfield_set_data(dfield, data, len);
+ }
+}
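+
+/* Note on the buffer management above (descriptive comment): when the caller
+supplies a pre-allocated buffer via rec_buf, the record copy is written into
+that buffer, which is freed and re-malloc()'ed whenever the record is larger
+than *len; when no buffer is supplied, each copy is instead allocated from the
+tuple's own heap and lives until the tuple is freed. */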
+
+/*****************************************************************//**
+Create an InnoDB key tuple.
+@return tuple instance created, or NULL */
+static
+ib_tpl_t
+ib_key_tuple_new_low(
+/*=================*/
+ const dict_index_t* index, /*!< in: index for which tuple
+ required */
+ ulint n_cols, /*!< in: no. of user defined cols */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ib_tuple_t* tuple;
+ ulint i;
+ ulint n_cmp_cols;
+
+ tuple = static_cast<ib_tuple_t*>(
+ mem_heap_alloc(heap, sizeof(*tuple)));
+
+ if (tuple == NULL) {
+ mem_heap_free(heap);
+ return(NULL);
+ }
+
+ tuple->heap = heap;
+ tuple->index = index;
+ tuple->type = TPL_TYPE_KEY;
+
+	/* Is it a generated clustered index? */
+ if (n_cols == 0) {
+ ++n_cols;
+ }
+
+ tuple->ptr = dtuple_create(heap, n_cols);
+
+ /* Copy types and set to SQL_NULL. */
+ dict_index_copy_types(tuple->ptr, index, n_cols);
+
+ for (i = 0; i < n_cols; i++) {
+
+ dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple->ptr, i);
+ dfield_set_null(dfield);
+ }
+
+ n_cmp_cols = dict_index_get_n_ordering_defined_by_user(index);
+
+ dtuple_set_n_fields_cmp(tuple->ptr, n_cmp_cols);
+
+ return((ib_tpl_t) tuple);
+}
+
+/*****************************************************************//**
+Create an InnoDB key tuple.
+@return tuple instance created, or NULL */
+static
+ib_tpl_t
+ib_key_tuple_new(
+/*=============*/
+ const dict_index_t* index, /*!< in: index of tuple */
+ ulint n_cols) /*!< in: no. of user defined cols */
+{
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(64);
+
+ if (heap == NULL) {
+ return(NULL);
+ }
+
+ return(ib_key_tuple_new_low(index, n_cols, heap));
+}
+
+/*****************************************************************//**
+Create an InnoDB row tuple.
+@return tuple instance, or NULL */
+static
+ib_tpl_t
+ib_row_tuple_new_low(
+/*=================*/
+ const dict_index_t* index, /*!< in: index of tuple */
+ ulint n_cols, /*!< in: no. of cols in tuple */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ib_tuple_t* tuple;
+
+ tuple = static_cast<ib_tuple_t*>(mem_heap_alloc(heap, sizeof(*tuple)));
+
+ if (tuple == NULL) {
+ mem_heap_free(heap);
+ return(NULL);
+ }
+
+ tuple->heap = heap;
+ tuple->index = index;
+ tuple->type = TPL_TYPE_ROW;
+
+ tuple->ptr = dtuple_create(heap, n_cols);
+
+ /* Copy types and set to SQL_NULL. */
+ dict_table_copy_types(tuple->ptr, index->table);
+
+ return((ib_tpl_t) tuple);
+}
+
+/*****************************************************************//**
+Create an InnoDB row tuple.
+@return tuple instance, or NULL */
+static
+ib_tpl_t
+ib_row_tuple_new(
+/*=============*/
+ const dict_index_t* index, /*!< in: index of tuple */
+ ulint n_cols) /*!< in: no. of cols in tuple */
+{
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(64);
+
+ if (heap == NULL) {
+ return(NULL);
+ }
+
+ return(ib_row_tuple_new_low(index, n_cols, heap));
+}
+
+/*****************************************************************//**
+Begin a transaction.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_trx_start(
+/*=========*/
+ ib_trx_t ib_trx, /*!< in: transaction to restart */
+ ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */
+ ib_bool_t read_write, /*!< in: true if read write
+ transaction */
+ ib_bool_t auto_commit, /*!< in: auto commit after each
+ single DML */
+ void* thd) /*!< in: THD */
+{
+ ib_err_t err = DB_SUCCESS;
+ trx_t* trx = (trx_t*) ib_trx;
+
+ ut_a(ib_trx_level <= IB_TRX_SERIALIZABLE);
+
+ trx->api_trx = true;
+ trx->api_auto_commit = auto_commit;
+ trx->read_write = read_write;
+
+ trx_start_if_not_started(trx);
+
+ trx->isolation_level = ib_trx_level;
+
+	/* FIXME: This is a placeholder, we should add an arg that comes
+	from the client. */
+ trx->mysql_thd = static_cast<THD*>(thd);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Begin a transaction. This will allocate a new transaction handle and
+put the transaction in the active state.
+@return innobase txn handle */
+UNIV_INTERN
+ib_trx_t
+ib_trx_begin(
+/*=========*/
+ ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */
+ ib_bool_t read_write, /*!< in: true if read write
+ transaction */
+ ib_bool_t auto_commit) /*!< in: auto commit after each
+ single DML */
+{
+ trx_t* trx;
+ ib_bool_t started;
+
+ trx = trx_allocate_for_mysql();
+
+ started = ib_trx_start(static_cast<ib_trx_t>(trx), ib_trx_level,
+ read_write, auto_commit, NULL);
+ ut_a(started);
+
+ return(static_cast<ib_trx_t>(trx));
+}
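+
+/* Example usage (illustrative sketch only; error handling omitted):
+
+	ib_trx_t	ib_trx;
+
+	ib_trx = ib_trx_begin(IB_TRX_REPEATABLE_READ, IB_TRUE, IB_FALSE);
+
+	... perform DML through cursors opened with this transaction ...
+
+	ib_trx_commit(ib_trx);
+	ib_trx_release(ib_trx);
+*/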
+
+/*****************************************************************//**
+Get the transaction's state.
+@return transaction state */
+UNIV_INTERN
+ib_trx_state_t
+ib_trx_state(
+/*=========*/
+ ib_trx_t ib_trx) /*!< in: trx handle */
+{
+ trx_t* trx = (trx_t*) ib_trx;
+
+ return((ib_trx_state_t) trx->state);
+}
+
+/*****************************************************************//**
+Get a trx start time.
+@return trx start_time */
+UNIV_INTERN
+ib_u64_t
+ib_trx_get_start_time(
+/*==================*/
+ ib_trx_t ib_trx) /*!< in: transaction */
+{
+ trx_t* trx = (trx_t*) ib_trx;
+ return(static_cast<ib_u64_t>(trx->start_time));
+}
+
+/*****************************************************************//**
+Release the resources of the transaction.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_trx_release(
+/*===========*/
+ ib_trx_t ib_trx) /*!< in: trx handle */
+{
+ trx_t* trx = (trx_t*) ib_trx;
+
+ ut_ad(trx != NULL);
+ trx_free_for_mysql(trx);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Commit a transaction. This function will also release the schema
+latches.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_commit(
+/*==========*/
+ ib_trx_t ib_trx) /*!< in: trx handle */
+{
+ ib_err_t err = DB_SUCCESS;
+ trx_t* trx = (trx_t*) ib_trx;
+
+ if (trx->state == TRX_STATE_NOT_STARTED) {
+ return(err);
+ }
+
+ trx_commit(trx);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Rollback a transaction. This function will also release the schema
+latches.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_trx_rollback(
+/*============*/
+ ib_trx_t ib_trx) /*!< in: trx handle */
+{
+ ib_err_t err;
+ trx_t* trx = (trx_t*) ib_trx;
+
+ err = static_cast<ib_err_t>(trx_rollback_for_mysql(trx));
+
+ /* It should always succeed */
+ ut_a(err == DB_SUCCESS);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Find an index definition from the index vector using index name.
+@return index def. if found else NULL */
+UNIV_INLINE
+const ib_index_def_t*
+ib_table_find_index(
+/*================*/
+ ib_vector_t* indexes, /*!< in: vector of indexes */
+ const char* name) /*!< in: index name */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ const ib_index_def_t* index_def;
+
+ index_def = (ib_index_def_t*) ib_vector_get(indexes, i);
+
+ if (innobase_strcasecmp(name, index_def->name) == 0) {
+ return(index_def);
+ }
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************//**
+Get the InnoDB internal precise type from the schema column definition.
+@return precise type in api format */
+UNIV_INLINE
+ulint
+ib_col_get_prtype(
+/*==============*/
+ const ib_col_t* ib_col) /*!< in: column definition */
+{
+ ulint prtype = 0;
+
+ if (ib_col->ib_col_attr & IB_COL_UNSIGNED) {
+ prtype |= DATA_UNSIGNED;
+
+ ut_a(ib_col->ib_col_type == IB_INT);
+ }
+
+ if (ib_col->ib_col_attr & IB_COL_NOT_NULL) {
+ prtype |= DATA_NOT_NULL;
+ }
+
+ return(prtype);
+}
+
+/*****************************************************************//**
+Get the InnoDB internal main type from the schema column definition.
+@return column main type */
+UNIV_INLINE
+ulint
+ib_col_get_mtype(
+/*==============*/
+ const ib_col_t* ib_col) /*!< in: column definition */
+{
+ /* Note: The api0api.h types should map directly to
+ the internal numeric codes. */
+ return(ib_col->ib_col_type);
+}
+
+/*****************************************************************//**
+Find a column in the column vector with the same name.
+@return col. def. if found else NULL */
+UNIV_INLINE
+const ib_col_t*
+ib_table_find_col(
+/*==============*/
+ const ib_vector_t* cols, /*!< in: column list head */
+ const char* name) /*!< in: column name to find */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(cols); ++i) {
+ const ib_col_t* ib_col;
+
+ ib_col = static_cast<const ib_col_t*>(
+ ib_vector_get((ib_vector_t*) cols, i));
+
+ if (innobase_strcasecmp(ib_col->name, name) == 0) {
+ return(ib_col);
+ }
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************//**
+Find a column in the column list with the same name.
+@return col. def. if found else NULL */
+UNIV_INLINE
+const ib_key_col_t*
+ib_index_find_col(
+/*==============*/
+ ib_vector_t* cols, /*!< in: column list head */
+ const char* name) /*!< in: column name to find */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(cols); ++i) {
+ const ib_key_col_t* ib_col;
+
+ ib_col = static_cast<ib_key_col_t*>(ib_vector_get(cols, i));
+
+ if (innobase_strcasecmp(ib_col->name, name) == 0) {
+ return(ib_col);
+ }
+ }
+
+ return(NULL);
+}
+
+#ifdef __WIN__
+/*****************************************************************//**
+Convert a string to lower case. */
+static
+void
+ib_to_lower_case(
+/*=============*/
+ char* ptr) /*!< string to convert to lower case */
+{
+ while (*ptr) {
+ *ptr = tolower(*ptr);
+ ++ptr;
+ }
+}
+#endif /* __WIN__ */
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name concatenated with '/' and the table name, for example:
+test/mytable. On Windows, normalization always converts both the database
+name and the table name to lower case. This function can be called for
+system tables, which don't have a database component. Tables without
+a database component are not normalized to lower case on Windows.
+The assumption is that they are system tables that reside in the system
+table space. */
+static
+void
+ib_normalize_table_name(
+/*====================*/
+ char* norm_name, /*!< out: normalized name as a
+ null-terminated string */
+ const char* name) /*!< in: table name string */
+{
+ const char* ptr = name;
+
+ /* Scan name from the end */
+
+ ptr += ut_strlen(name) - 1;
+
+ /* Find the start of the table name. */
+ while (ptr >= name && *ptr != '\\' && *ptr != '/' && ptr > name) {
+ --ptr;
+ }
+
+ /* For system tables there is no '/' or dbname. */
+ ut_a(ptr >= name);
+
+ if (ptr > name) {
+ const char* db_name;
+ const char* table_name;
+
+ table_name = ptr + 1;
+
+ --ptr;
+
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ }
+
+ db_name = ptr + 1;
+
+ memcpy(norm_name, db_name,
+ ut_strlen(name) + 1 - (db_name - name));
+
+ norm_name[table_name - db_name - 1] = '/';
+#ifdef __WIN__
+ ib_to_lower_case(norm_name);
+#endif
+ } else {
+ ut_strcpy(norm_name, name);
+ }
+}
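+
+/* Example (illustrative): both "test\\mytable" and "test/mytable"
+normalize to "test/mytable"; on Windows the result is additionally
+lower-cased. A caller sketch, where the buffer bound is indicative
+only and must cover the input string:
+
+ char norm_name[MAX_FULL_NAME_LEN + 1];
+
+ ib_normalize_table_name(norm_name, "test/MyTable");
+*/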
+
+/*****************************************************************//**
+Check whether the table name conforms to our requirements: a minimum
+length, exactly one '/' separating the database and table names and,
+on Windows, no characters that are reserved in DOS filenames.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_table_name_check(
+/*================*/
+ const char* name) /*!< in: table name to check */
+{
+ const char* slash = NULL;
+ ulint len = ut_strlen(name);
+
+ if (len < 2
+ || *name == '/'
+ || name[len - 1] == '/'
+ || (name[0] == '.' && name[1] == '/')
+ || (name[0] == '.' && name[1] == '.' && name[2] == '/')) {
+
+ return(DB_DATA_MISMATCH);
+ }
+
+ for ( ; *name; ++name) {
+#ifdef __WIN__
+ /* Check for reserved characters in DOS filenames. */
+ switch (*name) {
+ case ':':
+ case '|':
+ case '"':
+ case '*':
+ case '<':
+ case '>':
+ return(DB_DATA_MISMATCH);
+ }
+#endif /* __WIN__ */
+ if (*name == '/') {
+ if (slash) {
+ return(DB_DATA_MISMATCH);
+ }
+ slash = name;
+ }
+ }
+
+ return(slash ? DB_SUCCESS : DB_DATA_MISMATCH);
+}
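+
+/* Example (illustrative): exactly one '/' is required, so
+ib_table_name_check("test/t1") returns DB_SUCCESS, while "t1",
+"/t1", "test/t1/x" and "./test/t1" are all rejected with
+DB_DATA_MISMATCH. */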
+
+/*****************************************************************//**
+Get an index definition that is tagged as a clustered index.
+@return cluster index schema */
+UNIV_INLINE
+ib_index_def_t*
+ib_find_clustered_index(
+/*====================*/
+ ib_vector_t* indexes) /*!< in: index defs. to search */
+{
+ ulint i;
+ ulint n_indexes;
+
+ n_indexes = ib_vector_size(indexes);
+
+ for (i = 0; i < n_indexes; ++i) {
+ ib_index_def_t* ib_index_def;
+
+ ib_index_def = static_cast<ib_index_def_t*>(
+ ib_vector_get(indexes, i));
+
+ if (ib_index_def->clustered) {
+ return(ib_index_def);
+ }
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************//**
+Get a table id. The caller must have acquired the dictionary mutex.
+@return DB_SUCCESS if found */
+static
+ib_err_t
+ib_table_get_id_low(
+/*================*/
+ const char* table_name, /*!< in: table to find */
+ ib_id_u64_t* table_id) /*!< out: table id if found */
+{
+ dict_table_t* table;
+ ib_err_t err = DB_TABLE_NOT_FOUND;
+
+ *table_id = 0;
+
+ table = ib_lookup_table_by_name(table_name);
+
+ if (table != NULL) {
+ *table_id = table->id;
+
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Create an internal cursor instance.
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_create_cursor(
+/*=============*/
+ ib_crsr_t* ib_crsr, /*!< out: InnoDB cursor */
+ dict_table_t* table, /*!< in: table instance */
+ dict_index_t* index, /*!< in: index to use */
+ trx_t* trx) /*!< in: transaction */
+{
+ mem_heap_t* heap;
+ ib_cursor_t* cursor;
+ ib_err_t err = DB_SUCCESS;
+
+ heap = mem_heap_create(sizeof(*cursor) * 2);
+
+ if (heap != NULL) {
+ row_prebuilt_t* prebuilt;
+
+ cursor = static_cast<ib_cursor_t*>(
+ mem_heap_zalloc(heap, sizeof(*cursor)));
+
+ cursor->heap = heap;
+
+ cursor->query_heap = mem_heap_create(64);
+
+ if (cursor->query_heap == NULL) {
+ mem_heap_free(heap);
+
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ cursor->prebuilt = row_create_prebuilt(table, 0);
+
+ prebuilt = cursor->prebuilt;
+
+ prebuilt->trx = trx;
+
+ cursor->valid_trx = TRUE;
+
+ prebuilt->table = table;
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->innodb_api = TRUE;
+
+ prebuilt->index = index;
+
+ ut_a(prebuilt->index != NULL);
+
+ if (prebuilt->trx != NULL) {
+ ++prebuilt->trx->n_mysql_tables_in_use;
+
+ prebuilt->index_usable =
+ row_merge_is_index_usable(
+ prebuilt->trx, prebuilt->index);
+
+ /* Assign a read view if the transaction does
+ not have one yet */
+
+ trx_assign_read_view(prebuilt->trx);
+ }
+
+ *ib_crsr = (ib_crsr_t) cursor;
+ } else {
+ err = DB_OUT_OF_MEMORY;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Create an internal cursor instance, and set prebuilt->index to index
+with supplied index_id.
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_create_cursor_with_index_id(
+/*===========================*/
+ ib_crsr_t* ib_crsr, /*!< out: InnoDB cursor */
+ dict_table_t* table, /*!< in: table instance */
+ ib_id_u64_t index_id, /*!< in: index id or 0 */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_index_t* index;
+
+ if (index_id != 0) {
+ mutex_enter(&dict_sys->mutex);
+ index = dict_index_find_on_id_low(index_id);
+ mutex_exit(&dict_sys->mutex);
+ } else {
+ index = dict_table_get_first_index(table);
+ }
+
+ return(ib_create_cursor(ib_crsr, table, index, trx));
+}
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_table_using_id(
+/*==========================*/
+ ib_id_u64_t table_id, /*!< in: table id of table to open */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr) /*!< out,own: InnoDB cursor */
+{
+ ib_err_t err;
+ dict_table_t* table;
+
+ if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) {
+ table = ib_open_table_by_id(table_id, FALSE);
+ } else {
+ table = ib_open_table_by_id(table_id, TRUE);
+ }
+
+ if (table == NULL) {
+
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ err = ib_create_cursor_with_index_id(ib_crsr, table, 0,
+ (trx_t*) ib_trx);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Open an InnoDB index and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_index_using_id(
+/*==========================*/
+ ib_id_u64_t index_id, /*!< in: index id of index to open */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr) /*!< out: InnoDB cursor */
+{
+ ib_err_t err;
+ dict_table_t* table;
+ ulint table_id = (ulint) (index_id >> 32);
+
+ if (ib_trx == NULL || !ib_schema_lock_is_exclusive(ib_trx)) {
+ table = ib_open_table_by_id(table_id, FALSE);
+ } else {
+ table = ib_open_table_by_id(table_id, TRUE);
+ }
+
+ if (table == NULL) {
+
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ /* Pass the full index id; its high 32 bits encode the table id. */
+ err = ib_create_cursor_with_index_id(
+ ib_crsr, table, index_id, (trx_t*) ib_trx);
+
+ if (ib_crsr != NULL) {
+ const ib_cursor_t* cursor;
+
+ cursor = *(ib_cursor_t**) ib_crsr;
+
+ if (cursor->prebuilt->index == NULL) {
+ ib_err_t crsr_err;
+
+ crsr_err = ib_cursor_close(*ib_crsr);
+ ut_a(crsr_err == DB_SUCCESS);
+
+ *ib_crsr = NULL;
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Open an InnoDB secondary index cursor and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_index_using_name(
+/*============================*/
+ ib_crsr_t ib_open_crsr, /*!< in: open/active cursor */
+ const char* index_name, /*!< in: secondary index name */
+ ib_crsr_t* ib_crsr, /*!< out,own: InnoDB index cursor */
+ int* idx_type, /*!< out: index type */
+ ib_id_u64_t* idx_id) /*!< out: index id */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ index_id_t index_id = 0;
+ ib_err_t err = DB_TABLE_NOT_FOUND;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_open_crsr;
+
+ *idx_type = 0;
+ *idx_id = 0;
+ *ib_crsr = NULL;
+
+ /* We want to increment the ref count, so we do a redundant search. */
+ table = dict_table_open_on_id(cursor->prebuilt->table->id,
+ FALSE, DICT_TABLE_OP_NORMAL);
+ ut_a(table != NULL);
+
+ /* The first index is always the cluster index. */
+ index = dict_table_get_first_index(table);
+
+ /* Traverse the user defined indexes. */
+ while (index != NULL) {
+ if (innobase_strcasecmp(index->name, index_name) == 0) {
+ index_id = index->id;
+ *idx_type = index->type;
+ *idx_id = index_id;
+ break;
+ }
+ index = UT_LIST_GET_NEXT(indexes, index);
+ }
+
+ if (!index_id) {
+ dict_table_close(table, FALSE, FALSE);
+ return(DB_ERROR);
+ }
+
+ ut_ad(index->id == index_id);
+
+ err = ib_create_cursor(
+ ib_crsr, table, index, cursor->prebuilt->trx);
+
+ if (*ib_crsr != NULL) {
+ const ib_cursor_t* cursor;
+
+ cursor = *(ib_cursor_t**) ib_crsr;
+
+ if (cursor->prebuilt->index == NULL) {
+ err = ib_cursor_close(*ib_crsr);
+ ut_a(err == DB_SUCCESS);
+ *ib_crsr = NULL;
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_open_table(
+/*=================*/
+ const char* name, /*!< in: table name */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr) /*!< out,own: InnoDB cursor */
+{
+ ib_err_t err;
+ dict_table_t* table;
+ char* normalized_name;
+
+ normalized_name = static_cast<char*>(mem_alloc(ut_strlen(name) + 1));
+ ib_normalize_table_name(normalized_name, name);
+
+ if (ib_trx != NULL) {
+ if (!ib_schema_lock_is_exclusive(ib_trx)) {
+ table = (dict_table_t*)ib_open_table_by_name(
+ normalized_name);
+ } else {
+ /* NOTE: We do not acquire MySQL metadata lock */
+ table = ib_lookup_table_by_name(normalized_name);
+ }
+ } else {
+ table = (dict_table_t*)ib_open_table_by_name(normalized_name);
+ }
+
+ mem_free(normalized_name);
+ normalized_name = NULL;
+
+ /* It can happen that another thread has created the table but
+ not the cluster index or it's a broken table definition. Refuse to
+ open if that's the case. */
+ if (table != NULL && dict_table_get_first_index(table) == NULL) {
+ table = NULL;
+ }
+
+ if (table != NULL) {
+ err = ib_create_cursor_with_index_id(ib_crsr, table, 0,
+ (trx_t*) ib_trx);
+ } else {
+ err = DB_TABLE_NOT_FOUND;
+ }
+
+ return(err);
+}
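+
+/* A minimal usage sketch; the table name is illustrative and ib_trx
+is assumed to have been started elsewhere through this API:
+
+ ib_crsr_t crsr;
+ ib_err_t err;
+
+ err = ib_cursor_open_table("test/t1", ib_trx, &crsr);
+
+ if (err == DB_SUCCESS) {
+ ... (read or modify rows via crsr)
+ err = ib_cursor_close(crsr);
+ }
+*/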
+
+/********************************************************************//**
+Free a context struct for a table handle. */
+static
+void
+ib_qry_proc_free(
+/*=============*/
+ ib_qry_proc_t* q_proc) /*!< in, own: qproc struct */
+{
+ que_graph_free_recursive(q_proc->grph.ins);
+ que_graph_free_recursive(q_proc->grph.upd);
+ que_graph_free_recursive(q_proc->grph.sel);
+
+ memset(q_proc, 0x0, sizeof(*q_proc));
+}
+
+/*****************************************************************//**
+Set a cursor's transaction to NULL. */
+UNIV_INTERN
+void
+ib_cursor_clear_trx(
+/*================*/
+ ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ cursor->prebuilt->trx = NULL;
+}
+
+/*****************************************************************//**
+Reset the cursor.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_reset(
+/*============*/
+ ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ if (cursor->valid_trx && prebuilt->trx != NULL
+ && prebuilt->trx->n_mysql_tables_in_use > 0) {
+
+ --prebuilt->trx->n_mysql_tables_in_use;
+ }
+
+ /* The fields in this data structure are allocated from
+ the query heap and so need to be reset too. */
+ ib_qry_proc_free(&cursor->q_proc);
+
+ mem_heap_empty(cursor->query_heap);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Update the cursor with a new transaction and reset the cursor.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_new_trx(
+/*==============*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_trx_t ib_trx) /*!< in: transaction */
+{
+ ib_err_t err = DB_SUCCESS;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ trx_t* trx = (trx_t*) ib_trx;
+
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ row_update_prebuilt_trx(prebuilt, trx);
+
+ cursor->valid_trx = TRUE;
+
+ trx_assign_read_view(prebuilt->trx);
+
+ ib_qry_proc_free(&cursor->q_proc);
+
+ mem_heap_empty(cursor->query_heap);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Commit the transaction in a cursor.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_commit_trx(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_trx_t ib_trx) /*!< in: transaction */
+{
+ ib_err_t err = DB_SUCCESS;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+#ifdef UNIV_DEBUG
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ ut_ad(prebuilt->trx == (trx_t*) ib_trx);
+#endif /* UNIV_DEBUG */
+ ib_trx_commit(ib_trx);
+ cursor->valid_trx = FALSE;
+ return(err);
+}
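+
+/* Sketch of the cursor/transaction hand-over; ib_trx_begin() and its
+arguments are assumed from the public API in api0api.h:
+
+ ib_trx_t trx = ib_trx_begin(...);
+
+ err = ib_cursor_new_trx(crsr, trx); (attach and reset)
+ ... (do work with crsr)
+ err = ib_cursor_commit_trx(crsr, trx); (commit and detach)
+
+After ib_cursor_commit_trx() the cursor's transaction is no longer
+valid; attach a new one before reusing the cursor. */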
+
+/*****************************************************************//**
+Close an InnoDB table and free the cursor.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_close(
+/*============*/
+ ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt;
+ trx_t* trx;
+
+ if (!cursor) {
+ return(DB_SUCCESS);
+ }
+
+ prebuilt = cursor->prebuilt;
+ trx = prebuilt->trx;
+
+ ib_qry_proc_free(&cursor->q_proc);
+
+ /* The transaction could have been detached from the cursor. */
+ if (cursor->valid_trx && trx != NULL
+ && trx->n_mysql_tables_in_use > 0) {
+ --trx->n_mysql_tables_in_use;
+ }
+
+ row_prebuilt_free(prebuilt, FALSE);
+ cursor->prebuilt = NULL;
+
+ mem_heap_free(cursor->query_heap);
+ mem_heap_free(cursor->heap);
+ cursor = NULL;
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Close the table and decrement its reference count (n_ref_count).
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_close_table(
+/*==================*/
+ ib_crsr_t ib_crsr) /*!< in,own: InnoDB cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ if (prebuilt && prebuilt->table) {
+ dict_table_close(prebuilt->table, FALSE, FALSE);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Run the insert query and do error handling.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+ib_err_t
+ib_insert_row_with_lock_retry(
+/*==========================*/
+ que_thr_t* thr, /*!< in: insert query graph */
+ ins_node_t* node, /*!< in: insert node for the query */
+ trx_savept_t* savept) /*!< in: savepoint to rollback to
+ in case of an error */
+{
+ trx_t* trx;
+ ib_err_t err;
+ ib_bool_t lock_wait;
+
+ trx = thr_get_trx(thr);
+
+ do {
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ lock_wait = static_cast<ib_bool_t>(
+ ib_handle_errors(&err, trx, thr, savept));
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ } else {
+ lock_wait = FALSE;
+ }
+ } while (lock_wait);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Write a row.
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_execute_insert_query_graph(
+/*==========================*/
+ dict_table_t* table, /*!< in: table where to insert */
+ que_fork_t* ins_graph, /*!< in: query graph */
+ ins_node_t* node) /*!< in: insert node */
+{
+ trx_t* trx;
+ que_thr_t* thr;
+ trx_savept_t savept;
+ ib_err_t err = DB_SUCCESS;
+
+ trx = ins_graph->trx;
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(ins_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ err = ib_insert_row_with_lock_retry(thr, node, &savept);
+
+ if (err == DB_SUCCESS) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ dict_table_n_rows_inc(table);
+
+ srv_stats.n_rows_inserted.inc();
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*****************************************************************//**
+Create an insert query graph node. */
+static
+void
+ib_insert_query_graph_create(
+/*==========================*/
+ ib_cursor_t* cursor) /*!< in: Cursor instance */
+{
+ ib_qry_proc_t* q_proc = &cursor->q_proc;
+ ib_qry_node_t* node = &q_proc->node;
+ trx_t* trx = cursor->prebuilt->trx;
+
+ ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+ if (node->ins == NULL) {
+ dtuple_t* row;
+ ib_qry_grph_t* grph = &q_proc->grph;
+ mem_heap_t* heap = cursor->query_heap;
+ dict_table_t* table = cursor->prebuilt->table;
+
+ node->ins = ins_node_create(INS_DIRECT, table, heap);
+
+ node->ins->select = NULL;
+ node->ins->values_list = NULL;
+
+ row = dtuple_create(heap, dict_table_get_n_cols(table));
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node->ins, row);
+
+ grph->ins = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node->ins, trx,
+ heap)));
+
+ grph->ins->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*****************************************************************//**
+Insert a row to a table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_insert_row(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */
+ const ib_tpl_t ib_tpl) /*!< in: tuple to insert */
+{
+ ib_ulint_t i;
+ ib_qry_node_t* node;
+ ib_qry_proc_t* q_proc;
+ ulint n_fields;
+ dtuple_t* dst_dtuple;
+ ib_err_t err = DB_SUCCESS;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ const ib_tuple_t* src_tuple = (const ib_tuple_t*) ib_tpl;
+
+ ib_insert_query_graph_create(cursor);
+
+ ut_ad(src_tuple->type == TPL_TYPE_ROW);
+
+ q_proc = &cursor->q_proc;
+ node = &q_proc->node;
+
+ node->ins->state = INS_NODE_ALLOC_ROW_ID;
+ dst_dtuple = node->ins->row;
+
+ n_fields = dtuple_get_n_fields(src_tuple->ptr);
+ ut_ad(n_fields == dtuple_get_n_fields(dst_dtuple));
+
+ /* Do a shallow copy of the data fields and check for NULL
+ constraints on columns. */
+ for (i = 0; i < n_fields; i++) {
+ ulint mtype;
+ dfield_t* src_field;
+ dfield_t* dst_field;
+
+ src_field = dtuple_get_nth_field(src_tuple->ptr, i);
+
+ mtype = dtype_get_mtype(dfield_get_type(src_field));
+
+ /* Don't touch the system columns. */
+ if (mtype != DATA_SYS) {
+ ulint prtype;
+
+ prtype = dtype_get_prtype(dfield_get_type(src_field));
+
+ if ((prtype & DATA_NOT_NULL)
+ && dfield_is_null(src_field)) {
+
+ err = DB_DATA_MISMATCH;
+ break;
+ }
+
+ dst_field = dtuple_get_nth_field(dst_dtuple, i);
+ ut_ad(mtype
+ == dtype_get_mtype(dfield_get_type(dst_field)));
+
+ /* Do a shallow copy. */
+ dfield_set_data(
+ dst_field, src_field->data, src_field->len);
+
+ if (dst_field->len != IB_SQL_NULL) {
+ UNIV_MEM_ASSERT_RW(dst_field->data,
+ dst_field->len);
+ }
+ }
+ }
+
+ if (err == DB_SUCCESS) {
+ err = ib_execute_insert_query_graph(
+ src_tuple->index->table, q_proc->grph.ins, node->ins);
+ }
+
+ ib_wake_master_thread();
+
+ return(err);
+}
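+
+/* Insert sketch; the column numbers and values are illustrative and
+must match the table definition, and error handling is elided:
+
+ ib_tpl_t tpl = ib_clust_read_tuple_create(crsr);
+ ib_u32_t v = 42;
+
+ err = ib_col_set_value(tpl, 0, &v, sizeof(v), IB_TRUE);
+ err = ib_col_set_value(tpl, 1, "abc", 3, IB_TRUE);
+ err = ib_cursor_insert_row(crsr, tpl);
+
+ ib_tuple_delete(tpl);
+*/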
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates.
+@return update vector */
+UNIV_INLINE
+upd_t*
+ib_update_vector_create(
+/*====================*/
+ ib_cursor_t* cursor) /*!< in: current cursor */
+{
+ trx_t* trx = cursor->prebuilt->trx;
+ mem_heap_t* heap = cursor->query_heap;
+ dict_table_t* table = cursor->prebuilt->table;
+ ib_qry_proc_t* q_proc = &cursor->q_proc;
+ ib_qry_grph_t* grph = &q_proc->grph;
+ ib_qry_node_t* node = &q_proc->node;
+
+ ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+ if (node->upd == NULL) {
+ node->upd = static_cast<upd_node_t*>(
+ row_create_update_node_for_mysql(table, heap));
+ }
+
+ grph->upd = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(node->upd, trx, heap)));
+
+ grph->upd->state = QUE_FORK_ACTIVE;
+
+ return(node->upd->update);
+}
+
+/**********************************************************************//**
+Note that a column has changed. */
+static
+void
+ib_update_col(
+/*==========*/
+ ib_cursor_t* cursor, /*!< in: current cursor */
+ upd_field_t* upd_field, /*!< in/out: update field */
+ ulint col_no, /*!< in: column number */
+ dfield_t* dfield) /*!< in: updated dfield */
+{
+ ulint data_len;
+ dict_table_t* table = cursor->prebuilt->table;
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ data_len = dfield_get_len(dfield);
+
+ if (data_len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else {
+ dfield_copy_data(&upd_field->new_val, dfield);
+ }
+
+ upd_field->exp = NULL;
+
+ upd_field->orig_len = 0;
+
+ upd_field->field_no = dict_col_get_clust_pos(
+ &table->cols[col_no], index);
+}
+
+/**********************************************************************//**
+Checks which fields have changed in a row and stores the new data
+to an update vector.
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_calc_diff(
+/*=========*/
+ ib_cursor_t* cursor, /*!< in: current cursor */
+ upd_t* upd, /*!< in/out: update vector */
+ const ib_tuple_t* old_tuple, /*!< in: Old tuple in table */
+ const ib_tuple_t* new_tuple) /*!< in: New tuple to update */
+{
+ ulint i;
+ ulint n_changed = 0;
+ ib_err_t err = DB_SUCCESS;
+ ulint n_fields = dtuple_get_n_fields(new_tuple->ptr);
+
+ ut_a(old_tuple->type == TPL_TYPE_ROW);
+ ut_a(new_tuple->type == TPL_TYPE_ROW);
+ ut_a(old_tuple->index->table == new_tuple->index->table);
+
+ for (i = 0; i < n_fields; ++i) {
+ ulint mtype;
+ ulint prtype;
+ upd_field_t* upd_field;
+ dfield_t* new_dfield;
+ dfield_t* old_dfield;
+
+ new_dfield = dtuple_get_nth_field(new_tuple->ptr, i);
+ old_dfield = dtuple_get_nth_field(old_tuple->ptr, i);
+
+ mtype = dtype_get_mtype(dfield_get_type(old_dfield));
+ prtype = dtype_get_prtype(dfield_get_type(old_dfield));
+
+ /* Skip the system columns */
+ if (mtype == DATA_SYS) {
+ continue;
+
+ } else if ((prtype & DATA_NOT_NULL)
+ && dfield_is_null(new_dfield)) {
+
+ err = DB_DATA_MISMATCH;
+ break;
+ }
+
+ if (dfield_get_len(new_dfield) != dfield_get_len(old_dfield)
+ || (!dfield_is_null(old_dfield)
+ && memcmp(dfield_get_data(new_dfield),
+ dfield_get_data(old_dfield),
+ dfield_get_len(old_dfield)) != 0)) {
+
+ upd_field = &upd->fields[n_changed];
+
+ ib_update_col(cursor, upd_field, i, new_dfield);
+
+ ++n_changed;
+ }
+ }
+
+ if (err == DB_SUCCESS) {
+ upd->info_bits = 0;
+ upd->n_fields = n_changed;
+ }
+
+ return(err);
+}
+
+/**********************************************************************//**
+Run the update query and do error handling.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+ib_err_t
+ib_update_row_with_lock_retry(
+/*==========================*/
+ que_thr_t* thr, /*!< in: Update query graph */
+ upd_node_t* node, /*!< in: Update node for the query */
+ trx_savept_t* savept) /*!< in: savepoint to rollback to
+ in case of an error */
+{
+ trx_t* trx;
+ ib_err_t err;
+ ib_bool_t lock_wait;
+
+ trx = thr_get_trx(thr);
+
+ do {
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_RECORD_NOT_FOUND) {
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ lock_wait = static_cast<ib_bool_t>(
+ ib_handle_errors(&err, trx, thr, savept));
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ } else {
+ lock_wait = FALSE;
+ }
+ } else {
+ lock_wait = FALSE;
+ }
+ } while (lock_wait);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row.
+@return DB_SUCCESS or err code */
+UNIV_INLINE
+ib_err_t
+ib_execute_update_query_graph(
+/*==========================*/
+ ib_cursor_t* cursor, /*!< in: Cursor instance */
+ btr_pcur_t* pcur) /*!< in: Btree persistent cursor */
+{
+ ib_err_t err;
+ que_thr_t* thr;
+ upd_node_t* node;
+ trx_savept_t savept;
+ trx_t* trx = cursor->prebuilt->trx;
+ dict_table_t* table = cursor->prebuilt->table;
+ ib_qry_proc_t* q_proc = &cursor->q_proc;
+
+ /* The transaction must be running. */
+ ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+ node = q_proc->node.upd;
+
+ ut_a(dict_index_is_clust(pcur->btr_cur.index));
+ btr_pcur_copy_stored_position(node->pcur, pcur);
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(q_proc->grph.upd);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ err = ib_update_row_with_lock_retry(thr, node, &savept);
+
+ if (err == DB_SUCCESS) {
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (node->is_delete) {
+
+ dict_table_n_rows_dec(table);
+
+ srv_stats.n_rows_deleted.inc();
+ } else {
+ srv_stats.n_rows_updated.inc();
+ }
+
+ } else if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*****************************************************************//**
+Update a row in a table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_update_row(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */
+ const ib_tpl_t ib_new_tpl) /*!< in: New tuple to update */
+{
+ upd_t* upd;
+ ib_err_t err;
+ btr_pcur_t* pcur;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ const ib_tuple_t* old_tuple = (const ib_tuple_t*) ib_old_tpl;
+ const ib_tuple_t* new_tuple = (const ib_tuple_t*) ib_new_tpl;
+
+ if (dict_index_is_clust(prebuilt->index)) {
+ pcur = &cursor->prebuilt->pcur;
+ } else if (prebuilt->need_to_access_clustered) {
+ pcur = &cursor->prebuilt->clust_pcur;
+ } else {
+ return(DB_ERROR);
+ }
+
+ ut_a(old_tuple->type == TPL_TYPE_ROW);
+ ut_a(new_tuple->type == TPL_TYPE_ROW);
+
+ upd = ib_update_vector_create(cursor);
+
+ err = ib_calc_diff(cursor, upd, old_tuple, new_tuple);
+
+ if (err == DB_SUCCESS) {
+ /* Note that this is not a delete. */
+ cursor->q_proc.node.upd->is_delete = FALSE;
+
+ err = ib_execute_update_query_graph(cursor, pcur);
+ }
+
+ ib_wake_master_thread();
+
+ return(err);
+}
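+
+/* Update sketch: read the current row into an old tuple, copy it to
+a new tuple, change the columns of interest and update. Only the
+changed columns end up in the update vector built by ib_calc_diff().
+Outline (illustrative, error handling elided):
+
+ err = ib_cursor_read_row(crsr, old_tpl, NULL, NULL);
+ err = ib_tuple_copy(new_tpl, old_tpl);
+ err = ib_col_set_value(new_tpl, 1, "xyz", 3, IB_TRUE);
+ err = ib_cursor_update_row(crsr, old_tpl, new_tpl);
+*/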
+
+/**********************************************************************//**
+Build the update query graph to delete a row from an index.
+@return DB_SUCCESS or err code */
+static
+ib_err_t
+ib_delete_row(
+/*==========*/
+ ib_cursor_t* cursor, /*!< in: current cursor */
+ btr_pcur_t* pcur, /*!< in: Btree persistent cursor */
+ const rec_t* rec) /*!< in: record to delete */
+{
+ ulint i;
+ upd_t* upd;
+ ib_err_t err;
+ ib_tuple_t* tuple;
+ ib_tpl_t ib_tpl;
+ ulint n_cols;
+ upd_field_t* upd_field;
+ ib_bool_t page_format;
+ dict_table_t* table = cursor->prebuilt->table;
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ n_cols = dict_index_get_n_ordering_defined_by_user(index);
+ ib_tpl = ib_key_tuple_new(index, n_cols);
+
+ if (!ib_tpl) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ tuple = (ib_tuple_t*) ib_tpl;
+
+ upd = ib_update_vector_create(cursor);
+
+ page_format = static_cast<ib_bool_t>(
+ dict_table_is_comp(index->table));
+ ib_read_tuple(rec, page_format, tuple, NULL, NULL);
+
+ upd->n_fields = ib_tuple_get_n_cols(ib_tpl);
+ upd->info_bits = 0;
+
+ for (i = 0; i < upd->n_fields; ++i) {
+ dfield_t* dfield;
+
+ upd_field = &upd->fields[i];
+ dfield = dtuple_get_nth_field(tuple->ptr, i);
+
+ dfield_copy_data(&upd_field->new_val, dfield);
+
+ upd_field->exp = NULL;
+
+ upd_field->orig_len = 0;
+
+ upd_field->field_no = dict_col_get_clust_pos(
+ &table->cols[i], index);
+ }
+
+ /* Note that this is a delete. */
+ cursor->q_proc.node.upd->is_delete = TRUE;
+
+ err = ib_execute_update_query_graph(cursor, pcur);
+
+ ib_tuple_delete(ib_tpl);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Delete a row in a table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+ ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ ib_err_t err;
+ btr_pcur_t* pcur;
+ dict_index_t* index;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ index = dict_table_get_first_index(prebuilt->index->table);
+
+ /* Check whether this is a secondary index cursor */
+ if (index != prebuilt->index) {
+ if (prebuilt->need_to_access_clustered) {
+ pcur = &prebuilt->clust_pcur;
+ } else {
+ return(DB_ERROR);
+ }
+ } else {
+ pcur = &prebuilt->pcur;
+ }
+
+ if (ib_btr_cursor_is_positioned(pcur)) {
+ const rec_t* rec;
+ ib_bool_t page_format;
+ mtr_t mtr;
+ rec_t* copy = NULL;
+ byte ptr[UNIV_PAGE_SIZE_MAX];
+
+ page_format = static_cast<ib_bool_t>(
+ dict_table_is_comp(index->table));
+
+ mtr_start(&mtr);
+
+ if (btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, pcur, &mtr)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ /* Since the mtr will be committed, the rec
+ will not be protected. Make a copy of
+ the rec. */
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_size(offsets) < UNIV_PAGE_SIZE_MAX);
+ copy = rec_copy(ptr, rec, offsets);
+ }
+
+ mtr_commit(&mtr);
+
+ if (copy && !rec_get_deleted_flag(copy, page_format)) {
+ err = ib_delete_row(cursor, pcur, copy);
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+
+ ib_wake_master_thread();
+
+ return(err);
+}
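+
+/* Delete sketch: position the cursor on the row first, for example
+with ib_cursor_moveto() in IB_EXACT_MATCH mode, then:
+
+ err = ib_cursor_delete_row(crsr);
+
+DB_RECORD_NOT_FOUND is returned if the cursor is not positioned or
+the record is already delete-marked. */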
+
+/*****************************************************************//**
+Read current row.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_read_row(
+/*===============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_tpl_t ib_tpl, /*!< out: read cols into this tuple */
+ void** row_buf, /*!< in/out: row buffer */
+ ib_ulint_t* row_len) /*!< in/out: row buffer len */
+{
+ ib_err_t err;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ ut_a(cursor->prebuilt->trx->state != TRX_STATE_NOT_STARTED);
+
+ /* When searching with IB_EXACT_MATCH set, row_search_for_mysql()
+ will not position the persistent cursor but will copy the record
+ found into the row cache. It should be the only entry. */
+ if (!ib_cursor_is_positioned(ib_crsr)) {
+ err = DB_RECORD_NOT_FOUND;
+ } else {
+ mtr_t mtr;
+ btr_pcur_t* pcur;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ if (prebuilt->need_to_access_clustered
+ && tuple->type == TPL_TYPE_ROW) {
+ pcur = &prebuilt->clust_pcur;
+ } else {
+ pcur = &prebuilt->pcur;
+ }
+
+ if (pcur == NULL) {
+ return(DB_ERROR);
+ }
+
+ mtr_start(&mtr);
+
+ if (btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr)) {
+ const rec_t* rec;
+ ib_bool_t page_format;
+
+ page_format = static_cast<ib_bool_t>(
+ dict_table_is_comp(tuple->index->table));
+ rec = btr_pcur_get_rec(pcur);
+
+ if (prebuilt->innodb_api_rec &&
+ prebuilt->innodb_api_rec != rec) {
+ rec = prebuilt->innodb_api_rec;
+ }
+
+ if (!rec_get_deleted_flag(rec, page_format)) {
+ ib_read_tuple(rec, page_format, tuple,
+ row_buf, (ulint*) row_len);
+ err = DB_SUCCESS;
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+
+ } else {
+ err = DB_RECORD_NOT_FOUND;
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Position the cursor at one end of the index according to the search mode.
+@return DB_SUCCESS or err code */
+UNIV_INLINE
+ib_err_t
+ib_cursor_position(
+/*===============*/
+ ib_cursor_t* cursor, /*!< in: InnoDB cursor instance */
+ ib_srch_mode_t mode) /*!< in: Search mode */
+{
+ ib_err_t err;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ unsigned char* buf;
+
+ buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE));
+
+ /* We want to position at one of the ends;
+ row_search_for_mysql() uses the search_tuple fields to work
+ out what to do. */
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ err = static_cast<ib_err_t>(row_search_for_mysql(
+ buf, mode, prebuilt, 0, 0));
+
+ mem_free(buf);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Move cursor to the first record in the table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_first(
+/*============*/
+ ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ return(ib_cursor_position(cursor, IB_CUR_G));
+}
+
+/*****************************************************************//**
+Move cursor to the last record in the table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_last(
+/*===========*/
+ ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ return(ib_cursor_position(cursor, IB_CUR_L));
+}
+
+/*****************************************************************//**
+Move cursor to the next user record in the table.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_next(
+/*===========*/
+ ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ ib_err_t err;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ byte buf[UNIV_PAGE_SIZE_MAX];
+
+ /* We want to move to the next record */
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ err = static_cast<ib_err_t>(row_search_for_mysql(
+ buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT));
+
+ return(err);
+}
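+
+/* Full scan sketch combining the positioning functions above with
+ib_cursor_read_row(); error handling is elided and the scan ends
+with err == DB_END_OF_INDEX:
+
+ ib_tpl_t tpl = ib_clust_read_tuple_create(crsr);
+
+ for (err = ib_cursor_first(crsr);
+ err == DB_SUCCESS;
+ err = ib_cursor_next(crsr)) {
+
+ err = ib_cursor_read_row(crsr, tpl, NULL, NULL);
+ ... (consume the columns of tpl)
+ tpl = ib_tuple_clear(tpl);
+ }
+
+ ib_tuple_delete(tpl);
+*/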
+
+/*****************************************************************//**
+Search for key.
+@return DB_SUCCESS or err code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_moveto(
+/*=============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_tpl_t ib_tpl, /*!< in: Key to search for */
+ ib_srch_mode_t ib_srch_mode) /*!< in: search mode */
+{
+ ulint i;
+ ulint n_fields;
+ ib_err_t err = DB_SUCCESS;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ dtuple_t* search_tuple = prebuilt->search_tuple;
+ unsigned char* buf;
+
+ ut_a(tuple->type == TPL_TYPE_KEY);
+
+ n_fields = dict_index_get_n_ordering_defined_by_user(prebuilt->index);
+
+ dtuple_set_n_fields(search_tuple, n_fields);
+ dtuple_set_n_fields_cmp(search_tuple, n_fields);
+
+ /* Do a shallow copy */
+ for (i = 0; i < n_fields; ++i) {
+ dfield_copy(dtuple_get_nth_field(search_tuple, i),
+ dtuple_get_nth_field(tuple->ptr, i));
+ }
+
+ ut_a(prebuilt->select_lock_type <= LOCK_NUM);
+
+ prebuilt->innodb_api_rec = NULL;
+
+ buf = static_cast<unsigned char*>(mem_alloc(UNIV_PAGE_SIZE));
+
+ err = static_cast<ib_err_t>(row_search_for_mysql(
+ buf, ib_srch_mode, prebuilt, cursor->match_mode, 0));
+
+ mem_free(buf);
+
+ return(err);
+}
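+
+/* Key search sketch; the key value and column number are
+illustrative:
+
+ ib_tpl_t key = ib_sec_search_tuple_create(crsr);
+ ib_u32_t id = 42;
+
+ err = ib_col_set_value(key, 0, &id, sizeof(id), IB_TRUE);
+
+ ib_cursor_set_match_mode(crsr, IB_EXACT_MATCH);
+ err = ib_cursor_moveto(crsr, key, IB_CUR_GE);
+
+ ib_tuple_delete(key);
+*/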
+
+/*****************************************************************//**
+Set the cursor search mode. */
+UNIV_INTERN
+void
+ib_cursor_set_match_mode(
+/*=====================*/
+ ib_crsr_t ib_crsr, /*!< in: Cursor instance */
+ ib_match_mode_t match_mode) /*!< in: ib_cursor_moveto match mode */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ cursor->match_mode = match_mode;
+}
+
+/*****************************************************************//**
+Get the dfield instance for the column in the tuple.
+@return dfield instance in tuple */
+UNIV_INLINE
+dfield_t*
+ib_col_get_dfield(
+/*==============*/
+ ib_tuple_t* tuple, /*!< in: tuple instance */
+ ulint col_no) /*!< in: col no. in tuple */
+{
+ dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple->ptr, col_no);
+
+ return(dfield);
+}
+
+/*****************************************************************//**
+Predicate to check whether a column type has a length cap, i.e. is a
+variable or fixed length type with a declared maximum length.
+@return nonzero if the column length is capped */
+UNIV_INLINE
+ib_err_t
+ib_col_is_capped(
+/*==============*/
+ const dtype_t* dtype) /*!< in: column type */
+{
+ return(static_cast<ib_err_t>(
+ (dtype_get_mtype(dtype) == DATA_VARCHAR
+ || dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_MYSQL
+ || dtype_get_mtype(dtype) == DATA_VARMYSQL
+ || dtype_get_mtype(dtype) == DATA_FIXBINARY
+ || dtype_get_mtype(dtype) == DATA_BINARY)
+ && dtype_get_len(dtype) > 0));
+}
+
+/*****************************************************************//**
+Set a column of the tuple. Make a copy using the tuple's heap.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_col_set_value(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t col_no, /*!< in: column index in tuple */
+ const void* src, /*!< in: data value */
+ ib_ulint_t len, /*!< in: data value len */
+ ib_bool_t need_cpy) /*!< in: TRUE if the data must be copied */
+{
+ const dtype_t* dtype;
+ dfield_t* dfield;
+ void* dst = NULL;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+ ulint col_len;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ /* User wants to set the column to NULL. */
+ if (len == IB_SQL_NULL) {
+ dfield_set_null(dfield);
+ return(DB_SUCCESS);
+ }
+
+ dtype = dfield_get_type(dfield);
+ col_len = dtype_get_len(dtype);
+
+ /* Not allowed to update system columns. */
+ if (dtype_get_mtype(dtype) == DATA_SYS) {
+ return(DB_DATA_MISMATCH);
+ }
+
+ dst = dfield_get_data(dfield);
+
+ /* Since TEXT/CLOB also map to DATA_VARCHAR we need to make an
+ exception. Perhaps we need to set the precise type and check
+ for that. */
+ if (ib_col_is_capped(dtype)) {
+
+ len = ut_min(len, static_cast<ib_ulint_t>(col_len));
+
+ if (dst == NULL || len > dfield_get_len(dfield)) {
+ dst = mem_heap_alloc(tuple->heap, col_len);
+ ut_a(dst != NULL);
+ }
+ } else if (dst == NULL || len > dfield_get_len(dfield)) {
+ dst = mem_heap_alloc(tuple->heap, len);
+ }
+
+ if (dst == NULL) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ switch (dtype_get_mtype(dtype)) {
+ case DATA_INT: {
+
+ if (col_len == len) {
+ ibool usign;
+
+ usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+ mach_write_int_type(static_cast<byte*>(dst),
+ static_cast<const byte*>(src),
+ len, usign);
+
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+ break;
+ }
+
+ case DATA_FLOAT:
+ if (len == sizeof(float)) {
+ mach_float_write(static_cast<byte*>(dst), *(float*)src);
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+ break;
+
+ case DATA_DOUBLE:
+ if (len == sizeof(double)) {
+ mach_double_write(static_cast<byte*>(dst),
+ *(double*)src);
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+ break;
+
+ case DATA_SYS:
+ ut_error;
+ break;
+
+ case DATA_CHAR: {
+ ulint pad_char = ULINT_UNDEFINED;
+
+ pad_char = dtype_get_pad_char(
+ dtype_get_mtype(dtype), dtype_get_prtype(dtype));
+
+ ut_a(pad_char != ULINT_UNDEFINED);
+
+ memset((byte*) dst + len,
+ static_cast<int>(pad_char),
+ static_cast<size_t>(col_len - len));
+
+ memcpy(dst, src, len);
+
+ len = static_cast<ib_ulint_t>(col_len);
+ break;
+ }
+ case DATA_BLOB:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARCHAR:
+ case DATA_FIXBINARY:
+ if (need_cpy) {
+ memcpy(dst, src, len);
+ } else {
+ dfield_set_data(dfield, src, len);
+ dst = dfield_get_data(dfield);
+ }
+ break;
+
+ case DATA_MYSQL:
+ case DATA_VARMYSQL: {
+ ulint cset;
+ CHARSET_INFO* cs;
+ int error = 0;
+ ulint true_len = len;
+
+ /* For multi-byte character sets we need to
+ calculate the true length of the data. */
+ cset = dtype_get_charset_coll(
+ dtype_get_prtype(dtype));
+ cs = all_charsets[cset];
+ if (cs) {
+ uint pos = (uint)(col_len / cs->mbmaxlen);
+
+ if (len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint)
+ cs->cset->well_formed_len(
+ cs,
+ (const char*)src,
+ (const char*)src + len,
+ pos,
+ &error);
+
+ if (true_len < len) {
+ len = static_cast<ib_ulint_t>(true_len);
+ }
+ }
+ }
+
+ /* Any invalid bytes in the data must be truncated.
+ If len == 0, all bytes of the data are invalid and
+ the data is truncated to empty. */
+ memcpy(dst, src, len);
+
+ /* For DATA_MYSQL, need to pad the unused
+ space with spaces. */
+ if (dtype_get_mtype(dtype) == DATA_MYSQL) {
+ ulint n_chars;
+
+ if (len < col_len) {
+ ulint pad_len = col_len - len;
+
+ ut_a(cs != NULL);
+ ut_a(!(pad_len % cs->mbminlen));
+
+ cs->cset->fill(cs, (char*)dst + len,
+ pad_len,
+ 0x20 /* space */);
+ }
+
+ /* For the rationale of the code below, see
+ row_mysql_store_col_in_innobase_format(). */
+
+ ut_a(!(dtype_get_len(dtype)
+ % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype)
+ / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars
+ && ((char*)dst)[col_len - 1] == 0x20) {
+ col_len--;
+ }
+
+ len = static_cast<ib_ulint_t>(col_len);
+ }
+ break;
+ }
+
+ default:
+ ut_error;
+ }
+
+ if (dst != dfield_get_data(dfield)) {
+ dfield_set_data(dfield, dst, len);
+ } else {
+ dfield_set_len(dfield, len);
+ }
+
+ return(DB_SUCCESS);
+}
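+
+/* Usage notes (illustrative): a column is set to SQL NULL by passing
+len == IB_SQL_NULL, and need_cpy == IB_FALSE stores only a pointer
+for variable length types, so the source must remain valid until the
+tuple has been consumed:
+
+ ib_col_set_value(tpl, 0, NULL, IB_SQL_NULL, IB_TRUE);
+ ib_col_set_value(tpl, 1, "abc", 3, IB_FALSE);
+*/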
+
+/*****************************************************************//**
+Get the size of the data available in a column of the tuple.
+@return bytes avail or IB_SQL_NULL */
+UNIV_INTERN
+ib_ulint_t
+ib_col_get_len(
+/*===========*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i) /*!< in: column index in tuple */
+{
+ const dfield_t* dfield;
+ ulint data_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, i);
+
+ data_len = dfield_get_len(dfield);
+
+ return(static_cast<ib_ulint_t>(
+ data_len == UNIV_SQL_NULL ? IB_SQL_NULL : data_len));
+}
+
+/*****************************************************************//**
+Copy a column value from the tuple.
+@return bytes copied or IB_SQL_NULL */
+UNIV_INLINE
+ib_ulint_t
+ib_col_copy_value_low(
+/*==================*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i, /*!< in: column index in tuple */
+ void* dst, /*!< out: copied data value */
+ ib_ulint_t len) /*!< in: max data value len to copy */
+{
+ const void* data;
+ const dfield_t* dfield;
+ ulint data_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, i);
+
+ data = dfield_get_data(dfield);
+ data_len = dfield_get_len(dfield);
+
+ if (data_len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype = dfield_get_type(dfield);
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ case DATA_INT: {
+ ibool usign;
+ ullint ret;
+
+ ut_a(data_len == len);
+
+ usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+ ret = mach_read_int_type(static_cast<const byte*>(data),
+ data_len, usign);
+
+ if (usign) {
+ if (len == 1) {
+ *(ib_u8_t*)dst = (ib_u8_t)ret;
+ } else if (len == 2) {
+ *(ib_u16_t*)dst = (ib_u16_t)ret;
+ } else if (len == 4) {
+ *(ib_u32_t*)dst = (ib_u32_t)ret;
+ } else {
+ *(ib_u64_t*)dst = (ib_u64_t)ret;
+ }
+ } else {
+ if (len == 1) {
+ *(ib_i8_t*)dst = (ib_i8_t)ret;
+ } else if (len == 2) {
+ *(ib_i16_t*)dst = (ib_i16_t)ret;
+ } else if (len == 4) {
+ *(ib_i32_t*)dst = (ib_i32_t)ret;
+ } else {
+ *(ib_i64_t*)dst = (ib_i64_t)ret;
+ }
+ }
+
+ break;
+ }
+ case DATA_FLOAT:
+ if (len == data_len) {
+ float f;
+
+ ut_a(data_len == sizeof(f));
+ f = mach_float_read(static_cast<const byte*>(
+ data));
+ memcpy(dst, &f, sizeof(f));
+ } else {
+ data_len = 0;
+ }
+ break;
+ case DATA_DOUBLE:
+ if (len == data_len) {
+ double d;
+
+ ut_a(data_len == sizeof(d));
+ d = mach_double_read(static_cast<const byte*>(
+ data));
+ memcpy(dst, &d, sizeof(d));
+ } else {
+ data_len = 0;
+ }
+ break;
+ default:
+ data_len = ut_min(data_len, len);
+ memcpy(dst, data, data_len);
+ }
+ } else {
+ data_len = IB_SQL_NULL;
+ }
+
+ return(static_cast<ib_ulint_t>(data_len));
+}
+
+/*****************************************************************//**
+Copy a column value from the tuple.
+@return bytes copied or IB_SQL_NULL */
+UNIV_INTERN
+ib_ulint_t
+ib_col_copy_value(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i, /*!< in: column index in tuple */
+ void* dst, /*!< out: copied data value */
+ ib_ulint_t len) /*!< in: max data value len to copy */
+{
+ return(ib_col_copy_value_low(ib_tpl, i, dst, len));
+}
+
+/*****************************************************************//**
+Get the InnoDB column attribute from the internal column precise type.
+@return precise type in api format */
+UNIV_INLINE
+ib_col_attr_t
+ib_col_get_attr(
+/*============*/
+ ulint prtype) /*!< in: column definition */
+{
+ ib_col_attr_t attr = IB_COL_NONE;
+
+ if (prtype & DATA_UNSIGNED) {
+ attr = static_cast<ib_col_attr_t>(attr | IB_COL_UNSIGNED);
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ attr = static_cast<ib_col_attr_t>(attr | IB_COL_NOT_NULL);
+ }
+
+ return(attr);
+}
+
+/*****************************************************************//**
+Get a column name from the tuple.
+@return name of the column */
+UNIV_INTERN
+const char*
+ib_col_get_name(
+/*============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_ulint_t i) /*!< in: column index in tuple */
+{
+ const char* name;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_table_t* table = cursor->prebuilt->table;
+ dict_col_t* col = dict_table_get_nth_col(table, i);
+ ulint col_no = dict_col_get_no(col);
+
+ name = dict_table_get_col_name(table, col_no);
+
+ return(name);
+}
+
+/*****************************************************************//**
+Get an index field name from the cursor.
+@return name of the field */
+UNIV_INTERN
+const char*
+ib_get_idx_field_name(
+/*==================*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_ulint_t i) /*!< in: column index in tuple */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_index_t* index = cursor->prebuilt->index;
+ dict_field_t* field;
+
+ if (index) {
+ field = dict_index_get_nth_field(cursor->prebuilt->index, i);
+
+ if (field) {
+ return(field->name);
+ }
+ }
+
+ return(NULL);
+}
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple.
+@return len of column data */
+UNIV_INLINE
+ib_ulint_t
+ib_col_get_meta_low(
+/*================*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i, /*!< in: column index in tuple */
+ ib_col_meta_t* ib_col_meta) /*!< out: column meta data */
+{
+ ib_u16_t prtype;
+ const dfield_t* dfield;
+ ulint data_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, i);
+
+ data_len = dfield_get_len(dfield);
+
+ /* We assume 1-1 mapping between the ENUM and internal type codes. */
+ ib_col_meta->type = static_cast<ib_col_type_t>(
+ dtype_get_mtype(dfield_get_type(dfield)));
+
+ ib_col_meta->type_len = static_cast<ib_u32_t>(
+ dtype_get_len(dfield_get_type(dfield)));
+
+ prtype = (ib_u16_t) dtype_get_prtype(dfield_get_type(dfield));
+
+ ib_col_meta->attr = ib_col_get_attr(prtype);
+ ib_col_meta->client_type = prtype & DATA_MYSQL_TYPE_MASK;
+
+ return(static_cast<ib_ulint_t>(data_len));
+}
+
+/*************************************************************//**
+Check that a column in an InnoDB tuple is an integer of the expected
+size and signedness.
+@return DB_SUCCESS if it matches, else DB_DATA_MISMATCH or DB_UNDERFLOW */
+UNIV_INLINE
+ib_err_t
+ib_tuple_check_int(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_bool_t usign, /*!< in: true if unsigned */
+ ulint size) /*!< in: size of integer */
+{
+ ib_col_meta_t ib_col_meta;
+
+ ib_col_get_meta_low(ib_tpl, i, &ib_col_meta);
+
+ if (ib_col_meta.type != IB_INT) {
+ return(DB_DATA_MISMATCH);
+ } else if (ib_col_meta.type_len == IB_SQL_NULL) {
+ return(DB_UNDERFLOW);
+ } else if (ib_col_meta.type_len != size) {
+ return(DB_DATA_MISMATCH);
+ } else if ((ib_col_meta.attr & IB_COL_UNSIGNED) && !usign) {
+ return(DB_DATA_MISMATCH);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Read a signed int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i8(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i8_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u8(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u8_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read a signed int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i16(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i16_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u16(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u16_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read a signed int 32 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i32(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i32_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 32 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u32(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u32_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read a signed int 64 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_i64(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i64_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_FALSE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Read an unsigned int 64 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_u64(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u64_t* ival) /*!< out: integer value */
+{
+ ib_err_t err;
+
+ err = ib_tuple_check_int(ib_tpl, i, IB_TRUE, sizeof(*ival));
+
+ if (err == DB_SUCCESS) {
+ ib_col_copy_value_low(ib_tpl, i, ival, sizeof(*ival));
+ }
+
+ return(err);
+}
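+
+/* The readers above require an exact match of size and signedness
+with the column definition; for example, for an unsigned 32 bit
+column (the column number is illustrative):
+
+ ib_u32_t val;
+
+ err = ib_tuple_read_u32(tpl, 2, &val);
+
+A size or signedness mismatch yields DB_DATA_MISMATCH. */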
+
+/*****************************************************************//**
+Get a column value pointer from the tuple.
+@return NULL or pointer to buffer */
+UNIV_INTERN
+const void*
+ib_col_get_value(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i) /*!< in: column index in tuple */
+{
+ const void* data;
+ const dfield_t* dfield;
+ ulint data_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, i);
+
+ data = dfield_get_data(dfield);
+ data_len = dfield_get_len(dfield);
+
+ return(data_len != UNIV_SQL_NULL ? data : NULL);
+}
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple.
+@return len of column data */
+UNIV_INTERN
+ib_ulint_t
+ib_col_get_meta(
+/*============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i, /*!< in: column index in tuple */
+ ib_col_meta_t* ib_col_meta) /*!< out: column meta data */
+{
+ return(ib_col_get_meta_low(ib_tpl, i, ib_col_meta));
+}
+
+/*****************************************************************//**
+"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple.
+@return new tuple, or NULL */
+UNIV_INTERN
+ib_tpl_t
+ib_tuple_clear(
+/*============*/
+ ib_tpl_t ib_tpl) /*!< in,own: tuple (will be freed) */
+{
+ const dict_index_t* index;
+ ulint n_cols;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+ ib_tuple_type_t type = tuple->type;
+ mem_heap_t* heap = tuple->heap;
+
+ index = tuple->index;
+ n_cols = dtuple_get_n_fields(tuple->ptr);
+
+ mem_heap_empty(heap);
+
+ if (type == TPL_TYPE_ROW) {
+ return(ib_row_tuple_new_low(index, n_cols, heap));
+ } else {
+ return(ib_key_tuple_new_low(index, n_cols, heap));
+ }
+}
+
+/*****************************************************************//**
+Create a new clustered index key search tuple and copy into it the
+columns of the secondary index key tuple that refer to the clustered
+index record. The column data is deep copied.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_tuple_get_cluster_key(
+/*=====================*/
+ ib_crsr_t ib_crsr, /*!< in: secondary index cursor */
+ ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */
+ const ib_tpl_t ib_src_tpl) /*!< in: source tuple */
+{
+ ulint i;
+ ulint n_fields;
+ ib_err_t err = DB_SUCCESS;
+ ib_tuple_t* dst_tuple = NULL;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ ib_tuple_t* src_tuple = (ib_tuple_t*) ib_src_tpl;
+ dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(cursor->prebuilt->table);
+
+ /* We need to ensure that the src tuple belongs to the same table
+ as the open cursor and that it's not a tuple for a cluster index. */
+ if (src_tuple->type != TPL_TYPE_KEY) {
+ return(DB_ERROR);
+ } else if (src_tuple->index->table != cursor->prebuilt->table) {
+ return(DB_DATA_MISMATCH);
+ } else if (src_tuple->index == clust_index) {
+ return(DB_ERROR);
+ }
+
+ /* Create the cluster index key search tuple. */
+ *ib_dst_tpl = ib_clust_search_tuple_create(ib_crsr);
+
+ if (!*ib_dst_tpl) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dst_tuple = (ib_tuple_t*) *ib_dst_tpl;
+ ut_a(dst_tuple->index == clust_index);
+
+ n_fields = dict_index_get_n_unique(dst_tuple->index);
+
+ /* Do a deep copy of the data fields. */
+ for (i = 0; i < n_fields; i++) {
+ ulint pos;
+ dfield_t* src_field;
+ dfield_t* dst_field;
+
+ pos = dict_index_get_nth_field_pos(
+ src_tuple->index, dst_tuple->index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ src_field = dtuple_get_nth_field(src_tuple->ptr, pos);
+ dst_field = dtuple_get_nth_field(dst_tuple->ptr, i);
+
+ if (!dfield_is_null(src_field)) {
+ UNIV_MEM_ASSERT_RW(src_field->data, src_field->len);
+
+ dst_field->data = mem_heap_dup(
+ dst_tuple->heap,
+ src_field->data,
+ src_field->len);
+
+ dst_field->len = src_field->len;
+ } else {
+ dfield_set_null(dst_field);
+ }
+ }
+
+ return(err);
+}
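+
+/* Sketch: after a secondary index search, fetch the full row through
+the clustered index; the cursor handles are illustrative:
+
+ ib_tpl_t clust_key = NULL;
+
+ err = ib_tuple_get_cluster_key(sec_crsr, &clust_key, sec_key_tpl);
+
+ if (err == DB_SUCCESS) {
+ err = ib_cursor_moveto(clust_crsr, clust_key, IB_CUR_GE);
+ ... (read the row via clust_crsr)
+ ib_tuple_delete(clust_key);
+ }
+*/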
+
+/*****************************************************************//**
+Copy the contents of source tuple to destination tuple. The tuples
+must be of the same type and belong to the same table/index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_tuple_copy(
+/*==========*/
+ ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */
+ const ib_tpl_t ib_src_tpl) /*!< in: source tuple */
+{
+ ulint i;
+ ulint n_fields;
+ ib_err_t err = DB_SUCCESS;
+ const ib_tuple_t* src_tuple = (const ib_tuple_t*) ib_src_tpl;
+ ib_tuple_t* dst_tuple = (ib_tuple_t*) ib_dst_tpl;
+
+ /* Make sure src and dst are not the same. */
+ ut_a(src_tuple != dst_tuple);
+
+ /* Make sure they are the same type and refer to the same index. */
+ if (src_tuple->type != dst_tuple->type
+ || src_tuple->index != dst_tuple->index) {
+
+ return(DB_DATA_MISMATCH);
+ }
+
+ n_fields = dtuple_get_n_fields(src_tuple->ptr);
+ ut_ad(n_fields == dtuple_get_n_fields(dst_tuple->ptr));
+
+ /* Do a deep copy of the data fields. */
+ for (i = 0; i < n_fields; ++i) {
+ dfield_t* src_field;
+ dfield_t* dst_field;
+
+ src_field = dtuple_get_nth_field(src_tuple->ptr, i);
+ dst_field = dtuple_get_nth_field(dst_tuple->ptr, i);
+
+ if (!dfield_is_null(src_field)) {
+ UNIV_MEM_ASSERT_RW(src_field->data, src_field->len);
+
+ dst_field->data = mem_heap_dup(
+ dst_tuple->heap,
+ src_field->data,
+ src_field->len);
+
+ dst_field->len = src_field->len;
+ } else {
+ dfield_set_null(dst_field);
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Create an InnoDB key tuple for searching the cursor's current index.
+@return own: Tuple for current index */
+UNIV_INTERN
+ib_tpl_t
+ib_sec_search_tuple_create(
+/*=======================*/
+ ib_crsr_t ib_crsr) /*!< in: Cursor instance */
+{
+ ulint n_cols;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_index_t* index = cursor->prebuilt->index;
+
+ n_cols = dict_index_get_n_unique_in_tree(index);
+ return(ib_key_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Create an InnoDB tuple used for reading index records.
+@return own: Tuple for current index */
+UNIV_INTERN
+ib_tpl_t
+ib_sec_read_tuple_create(
+/*=====================*/
+ ib_crsr_t ib_crsr) /*!< in: Cursor instance */
+{
+ ulint n_cols;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_index_t* index = cursor->prebuilt->index;
+
+ n_cols = dict_index_get_n_fields(index);
+ return(ib_row_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Create an InnoDB tuple used for table key operations.
+@return own: Tuple for current table */
+UNIV_INTERN
+ib_tpl_t
+ib_clust_search_tuple_create(
+/*=========================*/
+ ib_crsr_t ib_crsr) /*!< in: Cursor instance */
+{
+ ulint n_cols;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(cursor->prebuilt->table);
+
+ n_cols = dict_index_get_n_ordering_defined_by_user(index);
+ return(ib_key_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Create an InnoDB tuple for table row operations.
+@return own: Tuple for current table */
+UNIV_INTERN
+ib_tpl_t
+ib_clust_read_tuple_create(
+/*=======================*/
+ ib_crsr_t ib_crsr) /*!< in: Cursor instance */
+{
+ ulint n_cols;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(cursor->prebuilt->table);
+
+ n_cols = dict_table_get_n_cols(cursor->prebuilt->table);
+ return(ib_row_tuple_new(index, n_cols));
+}
+
+/*****************************************************************//**
+Return the number of user columns in the tuple definition.
+@return number of user columns */
+UNIV_INTERN
+ib_ulint_t
+ib_tuple_get_n_user_cols(
+/*=====================*/
+ const ib_tpl_t ib_tpl) /*!< in: Tuple for current table */
+{
+ const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl;
+
+ if (tuple->type == TPL_TYPE_ROW) {
+ return(static_cast<ib_ulint_t>(
+ dict_table_get_n_user_cols(tuple->index->table)));
+ }
+
+ return(static_cast<ib_ulint_t>(
+ dict_index_get_n_ordering_defined_by_user(tuple->index)));
+}
+
+/*****************************************************************//**
+Return the number of columns in the tuple definition.
+@return number of columns */
+UNIV_INTERN
+ib_ulint_t
+ib_tuple_get_n_cols(
+/*================*/
+ const ib_tpl_t ib_tpl) /*!< in: Tuple for table/index */
+{
+ const ib_tuple_t* tuple = (const ib_tuple_t*) ib_tpl;
+
+ return(static_cast<ib_ulint_t>(dtuple_get_n_fields(tuple->ptr)));
+}
+
+/*****************************************************************//**
+Destroy an InnoDB tuple. */
+UNIV_INTERN
+void
+ib_tuple_delete(
+/*============*/
+ ib_tpl_t ib_tpl) /*!< in,own: Tuple instance to delete */
+{
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ if (!ib_tpl) {
+ return;
+ }
+
+ mem_heap_free(tuple->heap);
+}
+
+/*****************************************************************//**
+Get a table id. This function will acquire the dictionary mutex.
+@return DB_SUCCESS if found */
+UNIV_INTERN
+ib_err_t
+ib_table_get_id(
+/*============*/
+ const char* table_name, /*!< in: table to find */
+ ib_id_u64_t* table_id) /*!< out: table id if found */
+{
+ ib_err_t err;
+
+ dict_mutex_enter_for_mysql();
+
+ err = ib_table_get_id_low(table_name, table_id);
+
+ dict_mutex_exit_for_mysql();
+
+ return(err);
+}
+
+/*****************************************************************//**
+Get an index id.
+@return DB_SUCCESS if found */
+UNIV_INTERN
+ib_err_t
+ib_index_get_id(
+/*============*/
+ const char* table_name, /*!< in: find index for this table */
+ const char* index_name, /*!< in: index to find */
+ ib_id_u64_t* index_id) /*!< out: index id if found */
+{
+ dict_table_t* table;
+ char* normalized_name;
+ ib_err_t err = DB_TABLE_NOT_FOUND;
+
+ *index_id = 0;
+
+ normalized_name = static_cast<char*>(
+ mem_alloc(ut_strlen(table_name) + 1));
+ ib_normalize_table_name(normalized_name, table_name);
+
+ table = ib_lookup_table_by_name(normalized_name);
+
+ mem_free(normalized_name);
+ normalized_name = NULL;
+
+ if (table != NULL) {
+ dict_index_t* index;
+
+ index = dict_table_get_index_on_name(table, index_name);
+
+ if (index != NULL) {
+			/* We only support 32 bit table and index ids,
+			because we need to pack the table id into the
+			index id. */
+
+ *index_id = (table->id);
+ *index_id <<= 32;
+ *index_id |= (index->id);
+
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
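+
+/* For illustration: the packed id produced above can be split again,
+assuming, as required, that both original ids fit in 32 bits: the
+table id is (index_id >> 32) and the index id is
+(index_id & 0xFFFFFFFF). */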
+
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR '\\'
+#else
+#define SRV_PATH_SEPARATOR '/'
+#endif
+
+
+/*****************************************************************//**
+Check if cursor is positioned.
+@return IB_TRUE if positioned */
+UNIV_INTERN
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+ const ib_crsr_t ib_crsr) /*!< in: InnoDB cursor instance */
+{
+ const ib_cursor_t* cursor = (const ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ return(ib_btr_cursor_is_positioned(&prebuilt->pcur));
+}
+
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode.
+@return TRUE if exclusive latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+ const ib_trx_t ib_trx) /*!< in: transaction */
+{
+ const trx_t* trx = (const trx_t*) ib_trx;
+
+ return(trx->dict_operation_lock_mode == RW_X_LATCH);
+}
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in shared mode.
+@return TRUE if shared latch */
+UNIV_INTERN
+ib_bool_t
+ib_schema_lock_is_shared(
+/*=====================*/
+ const ib_trx_t ib_trx) /*!< in: transaction */
+{
+ const trx_t* trx = (const trx_t*) ib_trx;
+
+ return(trx->dict_operation_lock_mode == RW_S_LATCH);
+}
+
+/*****************************************************************//**
+Set a lock on an InnoDB cursor/table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ trx_t* trx = prebuilt->trx;
+ dict_table_t* table = prebuilt->table;
+
+ return(ib_trx_lock_table_with_retry(
+ trx, table, (enum lock_mode) ib_lck_mode));
+}
+
+/*****************************************************************//**
+Set a lock on an InnoDB table using the table id.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_table_lock(
+/*==========*/
+ ib_trx_t ib_trx, /*!< in/out: transaction */
+ ib_id_u64_t table_id, /*!< in: table id */
+ ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */
+{
+ ib_err_t err;
+ que_thr_t* thr;
+ mem_heap_t* heap;
+ dict_table_t* table;
+ ib_qry_proc_t q_proc;
+ trx_t* trx = (trx_t*) ib_trx;
+
+ ut_a(trx->state != TRX_STATE_NOT_STARTED);
+
+ table = ib_open_table_by_id(table_id, FALSE);
+
+ if (table == NULL) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM));
+
+ heap = mem_heap_create(128);
+
+ q_proc.node.sel = sel_node_create(heap);
+
+ thr = pars_complete_graph_for_exec(q_proc.node.sel, trx, heap);
+
+ q_proc.grph.sel = static_cast<que_fork_t*>(que_node_get_parent(thr));
+ q_proc.grph.sel->state = QUE_FORK_ACTIVE;
+
+ trx->op_info = "setting table lock";
+
+ ut_a(ib_lck_mode == IB_LOCK_IS || ib_lck_mode == IB_LOCK_IX);
+ err = static_cast<ib_err_t>(
+ lock_table(0, table, (enum lock_mode) ib_lck_mode, thr));
+
+ trx->error_state = err;
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Unlock an InnoDB table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_unlock(
+/*=============*/
+ ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */
+{
+ ib_err_t err = DB_SUCCESS;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ if (prebuilt->trx->mysql_n_tables_locked > 0) {
+ --prebuilt->trx->mysql_n_tables_locked;
+ } else {
+ err = DB_ERROR;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Set the lock mode of the cursor.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_set_lock_mode(
+/*====================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_lck_mode_t ib_lck_mode) /*!< in: InnoDB lock mode */
+{
+ ib_err_t err = DB_SUCCESS;
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ ut_a(ib_lck_mode <= static_cast<ib_lck_mode_t>(LOCK_NUM));
+
+ if (ib_lck_mode == IB_LOCK_X) {
+ err = ib_cursor_lock(ib_crsr, IB_LOCK_IX);
+ } else if (ib_lck_mode == IB_LOCK_S) {
+ err = ib_cursor_lock(ib_crsr, IB_LOCK_IS);
+ }
+
+ if (err == DB_SUCCESS) {
+ prebuilt->select_lock_type = (enum lock_mode) ib_lck_mode;
+ ut_a(prebuilt->trx->state != TRX_STATE_NOT_STARTED);
+ }
+
+ return(err);
+}
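+
+/* Usage sketch (illustrative): request row-level S locks for a scan;
+the matching table intention lock (IB_LOCK_IS) is taken by the
+ib_cursor_lock() call above.
+
+	err = ib_cursor_set_lock_mode(ib_crsr, IB_LOCK_S);
+*/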
+
+/*****************************************************************//**
+Flag that the cursor needs to access the clustered index record. */
+UNIV_INTERN
+void
+ib_cursor_set_cluster_access(
+/*=========================*/
+ ib_crsr_t ib_crsr) /*!< in/out: InnoDB cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ prebuilt->need_to_access_clustered = TRUE;
+}
+
+/*************************************************************//**
+Convert and write an INT column value to an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INLINE
+ib_err_t
+ib_tuple_write_int(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ ulint col_no, /*!< in: column number */
+ const void* value, /*!< in: integer value */
+ ulint value_len) /*!< in: sizeof value type */
+{
+ const dfield_t* dfield;
+ ulint data_len;
+ ulint type_len;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ ut_a(col_no < ib_tuple_get_n_cols(ib_tpl));
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ data_len = dfield_get_len(dfield);
+ type_len = dtype_get_len(dfield_get_type(dfield));
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) != DATA_INT
+ || value_len != data_len) {
+
+ return(DB_DATA_MISMATCH);
+ }
+
+ return(ib_col_set_value(
+ ib_tpl, static_cast<ib_ulint_t>(col_no),
+ value, static_cast<ib_ulint_t>(type_len), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i8_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i16(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i16_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i32_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i64_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u8_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+	ib_tpl_t	ib_tpl,		/*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u16_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u32(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u32_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u64_t val) /*!< in: value to write */
+{
+ return(ib_col_set_value(ib_tpl, col_no, &val, sizeof(val), true));
+}
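+
+/* A minimal sketch of the typical write path, assuming a cursor on a
+table whose first two columns are 32 and 64 bit integers; error
+handling is elided:
+
+	ib_tpl_t	tpl = ib_clust_read_tuple_create(crsr);
+
+	err = ib_tuple_write_i32(tpl, 0, 42);
+	err = ib_tuple_write_i64(tpl, 1, 1000000000);
+	err = ib_cursor_insert_row(crsr, tpl);
+
+	ib_tuple_delete(tpl);
+*/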
+
+/*****************************************************************//**
+Inform the cursor that it's the start of an SQL statement. */
+UNIV_INTERN
+void
+ib_cursor_stmt_begin(
+/*=================*/
+ ib_crsr_t ib_crsr) /*!< in: cursor */
+{
+ ib_cursor_t* cursor = (ib_cursor_t*) ib_crsr;
+
+ cursor->prebuilt->sql_stat_start = TRUE;
+}
+
+/*****************************************************************//**
+Write a double value to a column.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ double val) /*!< in: value to write */
+{
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) {
+ return(ib_col_set_value(ib_tpl, col_no,
+ &val, sizeof(val), true));
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+}
+
+/*************************************************************//**
+Read a double column value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ double* dval) /*!< out: double value */
+{
+ ib_err_t err;
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_DOUBLE) {
+ ib_col_copy_value_low(ib_tpl, col_no, dval, sizeof(*dval));
+ err = DB_SUCCESS;
+ } else {
+ err = DB_DATA_MISMATCH;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Write a float value to a column.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ float val) /*!< in: value to write */
+{
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) {
+ return(ib_col_set_value(ib_tpl, col_no,
+ &val, sizeof(val), true));
+ } else {
+ return(DB_DATA_MISMATCH);
+ }
+}
+
+/*************************************************************//**
+Read a float value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ float* fval) /*!< out: float value */
+{
+ ib_err_t err;
+ const dfield_t* dfield;
+ ib_tuple_t* tuple = (ib_tuple_t*) ib_tpl;
+
+ dfield = ib_col_get_dfield(tuple, col_no);
+
+ if (dtype_get_mtype(dfield_get_type(dfield)) == DATA_FLOAT) {
+ ib_col_copy_value_low(ib_tpl, col_no, fval, sizeof(*fval));
+ err = DB_SUCCESS;
+ } else {
+ err = DB_DATA_MISMATCH;
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Truncate a table. The cursor handle will be closed and set to NULL
+on success.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+ ib_crsr_t* ib_crsr, /*!< in/out: cursor for table
+ to truncate */
+ ib_id_u64_t* table_id) /*!< out: new table id */
+{
+ ib_err_t err;
+ ib_cursor_t* cursor = *(ib_cursor_t**) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+
+ *table_id = 0;
+
+ err = ib_cursor_lock(*ib_crsr, IB_LOCK_X);
+
+ if (err == DB_SUCCESS) {
+ trx_t* trx;
+ dict_table_t* table = prebuilt->table;
+
+ /* We are going to free the cursor and the prebuilt. Store
+ the transaction handle locally. */
+ trx = prebuilt->trx;
+ err = ib_cursor_close(*ib_crsr);
+ ut_a(err == DB_SUCCESS);
+
+ *ib_crsr = NULL;
+
+		/* A temporary workaround for the assertion in
+		trx_start_for_ddl_low(): we have already started
+		the trx. */
+ if (trx->state == TRX_STATE_ACTIVE) {
+#ifdef UNIV_DEBUG
+ trx->start_file = 0;
+#endif /* UNIV_DEBUG */
+ trx->dict_operation = TRX_DICT_OP_TABLE;
+ }
+
+ /* This function currently commits the transaction
+ on success. */
+ err = static_cast<ib_err_t>(
+ row_truncate_table_for_mysql(table, trx));
+
+ if (err == DB_SUCCESS) {
+ *table_id = (table->id);
+ }
+ }
+
+ return(err);
+}
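+
+/* Usage sketch (illustrative): the caller passes in its cursor handle
+and must treat it as invalid afterwards, since a successful truncate
+closes it and sets it to NULL:
+
+	ib_id_u64_t	new_id;
+
+	err = ib_cursor_truncate(&crsr, &new_id);
+*/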
+
+/*****************************************************************//**
+Truncate a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_table_truncate(
+/*==============*/
+ const char* table_name, /*!< in: table name */
+ ib_id_u64_t* table_id) /*!< out: new table id */
+{
+ ib_err_t err;
+ dict_table_t* table;
+ ib_err_t trunc_err;
+ ib_trx_t ib_trx = NULL;
+ ib_crsr_t ib_crsr = NULL;
+ ib_ulint_t memcached_sync = 0;
+
+ ib_trx = ib_trx_begin(IB_TRX_SERIALIZABLE, true, false);
+
+ dict_mutex_enter_for_mysql();
+
+ table = dict_table_open_on_name(table_name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (table != NULL && dict_table_get_first_index(table)) {
+ err = ib_create_cursor_with_index_id(&ib_crsr, table, 0,
+ (trx_t*) ib_trx);
+ } else {
+ err = DB_TABLE_NOT_FOUND;
+ }
+
+ /* Remember the memcached_sync_count and set it to 0, so the
+ truncate can be executed. */
+ if (table != NULL && err == DB_SUCCESS) {
+ memcached_sync = static_cast<ib_ulint_t>(
+ table->memcached_sync_count);
+ table->memcached_sync_count = 0;
+ }
+
+ dict_mutex_exit_for_mysql();
+
+ if (err == DB_SUCCESS) {
+ trunc_err = ib_cursor_truncate(&ib_crsr, table_id);
+ } else {
+ trunc_err = err;
+ }
+
+ if (ib_crsr != NULL) {
+ err = ib_cursor_close(ib_crsr);
+ ut_a(err == DB_SUCCESS);
+ }
+
+ if (trunc_err == DB_SUCCESS) {
+ ut_a(ib_trx_state(ib_trx) == static_cast<ib_trx_state_t>(
+ TRX_STATE_NOT_STARTED));
+
+ err = ib_trx_release(ib_trx);
+ ut_a(err == DB_SUCCESS);
+ } else {
+ err = ib_trx_rollback(ib_trx);
+ ut_a(err == DB_SUCCESS);
+ }
+
+ /* Set the memcached_sync_count back. */
+ if (table != NULL && memcached_sync != 0) {
+ dict_mutex_enter_for_mysql();
+
+ table->memcached_sync_count = memcached_sync;
+
+ dict_mutex_exit_for_mysql();
+ }
+
+ return(trunc_err);
+}
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ib_err_t
+ib_close_thd(
+/*=========*/
+ void* thd) /*!< in: handle to the MySQL thread of the user
+				whose resources should be freed */
+{
+ innobase_close_thd(static_cast<THD*>(thd));
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Return the isolation level set by "innodb_api_trx_level".
+@return trx isolation level */
+UNIV_INTERN
+ib_trx_state_t
+ib_cfg_trx_level()
+/*==============*/
+{
+ return(static_cast<ib_trx_state_t>(ib_trx_level_setting));
+}
+
+/*****************************************************************//**
+Return the configured background commit interval (in seconds).
+@return background commit interval (in seconds) */
+UNIV_INTERN
+ib_ulint_t
+ib_cfg_bk_commit_interval()
+/*=======================*/
+{
+ return(static_cast<ib_ulint_t>(ib_bk_commit_interval));
+}
+
+/*****************************************************************//**
+Get the generic configuration status flags.
+@return configuration status bitmask */
+UNIV_INTERN
+int
+ib_cfg_get_cfg()
+/*============*/
+{
+ int cfg_status;
+
+ cfg_status = (ib_binlog_enabled) ? IB_CFG_BINLOG_ENABLED : 0;
+
+ if (ib_mdl_enabled) {
+ cfg_status |= IB_CFG_MDL_ENABLED;
+ }
+
+ if (ib_disable_row_lock) {
+ cfg_status |= IB_CFG_DISABLE_ROWLOCK;
+ }
+
+ return(cfg_status);
+}
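+
+/* The return value is a bitmask; callers test individual flags, for
+example (illustrative):
+
+	if (ib_cfg_get_cfg() & IB_CFG_BINLOG_ENABLED) {
+		... binlogging of API operations is enabled ...
+	}
+*/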
+
+/*****************************************************************//**
+Increase/decrease the memcached sync count of a table, used to
+synchronize memcached DML with SQL DDL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ib_err_t
+ib_cursor_set_memcached_sync(
+/*=========================*/
+ ib_crsr_t ib_crsr, /*!< in: cursor */
+ ib_bool_t flag) /*!< in: true for increase */
+{
+ const ib_cursor_t* cursor = (const ib_cursor_t*) ib_crsr;
+ row_prebuilt_t* prebuilt = cursor->prebuilt;
+ dict_table_t* table = prebuilt->table;
+ ib_err_t err = DB_SUCCESS;
+
+ if (table != NULL) {
+		/* If memcached_sync_count is DICT_TABLE_IN_DDL (-1),
+		the table is in the middle of a DDL operation, so we
+		just return an error. */
+ if (table->memcached_sync_count == DICT_TABLE_IN_DDL) {
+ return(DB_ERROR);
+ }
+
+ if (flag) {
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_increment_lint(&table->memcached_sync_count, 1);
+#else
+ dict_mutex_enter_for_mysql();
+ ++table->memcached_sync_count;
+ dict_mutex_exit_for_mysql();
+#endif
+ } else {
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_decrement_lint(&table->memcached_sync_count, 1);
+#else
+ dict_mutex_enter_for_mysql();
+ --table->memcached_sync_count;
+ dict_mutex_exit_for_mysql();
+#endif
+ ut_a(table->memcached_sync_count >= 0);
+ }
+ } else {
+ err = DB_TABLE_NOT_FOUND;
+ }
+
+ return(err);
+}
diff --git a/storage/innobase/api/api0misc.cc b/storage/innobase/api/api0misc.cc
new file mode 100644
index 00000000000..b2370105938
--- /dev/null
+++ b/storage/innobase/api/api0misc.cc
@@ -0,0 +1,206 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file api/api0misc.cc
+InnoDB Native API
+
+2008-08-01 Created by Sunny Bains
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#include <errno.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+
+#include "api0misc.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "dict0mem.h"
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "lock0lock.h"
+#include "ha_prototypes.h"
+#include <m_ctype.h>
+#include <mysys_err.h>
+#include <mysql/plugin.h>
+
+/*********************************************************************//**
+Sets a lock on a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+{
+ que_thr_t* thr;
+ dberr_t err;
+ mem_heap_t* heap;
+ sel_node_t* node;
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = "setting table lock";
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(static_cast<que_fork_t*>(
+ que_node_get_parent(thr)));
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_QUE_THR_SUSPENDED) {
+ ibool was_lock_wait;
+
+ was_lock_wait = ib_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+ } else {
+ que_thr_t* run_thr;
+ que_node_t* parent;
+
+ parent = que_node_get_parent(thr);
+ run_thr = que_fork_start_command(
+ static_cast<que_fork_t*>(parent));
+
+ ut_a(run_thr == thr);
+
+ /* There was a lock wait but the thread was not
+ in a ready to run or running state. */
+ trx->error_state = DB_LOCK_WAIT;
+
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept) /*!< in: savepoint or NULL */
+{
+ dberr_t err;
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ trx_rollback_for_mysql(trx);
+ break;
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_rollback_to_savepoint(trx, savept);
+ }
+ break;
+ case DB_LOCK_WAIT:
+ lock_wait_suspend_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE); /* Operation needs to be retried. */
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx_rollback_for_mysql(trx);
+ break;
+
+ case DB_MUST_GET_MORE_FILE_SPACE:
+
+ exit(1);
+
+ case DB_CORRUPTION:
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ break;
+ default:
+ ut_error;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+}
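+
+/* Callers follow the pattern of ib_trx_lock_table_with_retry() above:
+run the query thread, and on failure retry for as long as
+ib_handle_errors() reports a lock wait (sketch; details elided):
+
+	do {
+		err = ... execute the query graph ...;
+	} while (err != DB_SUCCESS
+		 && ib_handle_errors(&err, trx, thr, NULL));
+*/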
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
new file mode 100644
index 00000000000..79b533481b7
--- /dev/null
+++ b/storage/innobase/btr/btr0btr.cc
@@ -0,0 +1,5105 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.cc
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0zip.h"
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+
+/**************************************************************//**
+Checks if the page in the cursor can be merged with given page.
+If necessary, re-organize the merge_page.
+@return TRUE if possible to merge. */
+UNIV_INTERN
+ibool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ ulint page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Report that an index page is corrupted. */
+UNIV_INTERN
+void
+btr_corruption_report(
+/*==================*/
+ const buf_block_t* block, /*!< in: corrupted block */
+ const dict_index_t* index) /*!< in: index tree */
+{
+ fprintf(stderr, "InnoDB: flag mismatch in space %u page %u"
+ " index %s of table %s\n",
+ (unsigned) buf_block_get_space(block),
+ (unsigned) buf_block_get_page_no(block),
+ index->name, index->table_name);
+ if (block->page.zip.data) {
+ buf_page_print(block->page.zip.data,
+ buf_block_get_zip_size(block),
+ BUF_PAGE_PRINT_NO_CRASH);
+ }
+ buf_page_print(buf_block_get_frame(block), 0, 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_BLOB_DEBUG
+# include "srv0srv.h"
+# include "ut0rbt.h"
+
+/** TRUE when messages about index->blobs modification are enabled. */
+static ibool btr_blob_dbg_msg;
+
+/** Issue a message about an operation on index->blobs.
+@param op operation
+@param b the entry being subjected to the operation
+@param ctx the context of the operation */
+#define btr_blob_dbg_msg_issue(op, b, ctx) \
+ fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n", \
+ (b)->ref_page_no, (b)->ref_heap_no, \
+ (b)->ref_field_no, (b)->blob_page_no, ctx, \
+ (b)->owner, (b)->always_owner, (b)->del)
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("insert", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ rbt_insert(index->blobs, b, b);
+ mutex_exit(&index->blobs_mutex);
+}
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ if (btr_blob_dbg_msg) {
+ btr_blob_dbg_msg_issue("delete", b, ctx);
+ }
+ mutex_enter(&index->blobs_mutex);
+ ut_a(rbt_delete(index->blobs, b));
+ mutex_exit(&index->blobs_mutex);
+}
+
+/**************************************************************//**
+Comparator for items (btr_blob_dbg_t) in index->blobs.
+The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
+@return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
+static
+int
+btr_blob_dbg_cmp(
+/*=============*/
+ const void* a, /*!< in: first btr_blob_dbg_t to compare */
+ const void* b) /*!< in: second btr_blob_dbg_t to compare */
+{
+ const btr_blob_dbg_t* aa = static_cast<const btr_blob_dbg_t*>(a);
+ const btr_blob_dbg_t* bb = static_cast<const btr_blob_dbg_t*>(b);
+
+ ut_ad(aa != NULL);
+ ut_ad(bb != NULL);
+
+ if (aa->ref_page_no != bb->ref_page_no) {
+ return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
+ }
+ if (aa->ref_heap_no != bb->ref_heap_no) {
+ return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
+ }
+ if (aa->ref_field_no != bb->ref_field_no) {
+ return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
+ }
+ return(0);
+}
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: off-page column number */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_t b;
+ const page_t* page = page_align(rec);
+
+ ut_a(index->blobs);
+
+ b.blob_page_no = page_no;
+ b.ref_page_no = page_get_page_no(page);
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = field_no;
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner = TRUE;
+ b.del = FALSE;
+ ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+}
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count = 0;
+ ulint i;
+ btr_blob_dbg_t b;
+ ibool del;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* the column has not been stored yet */
+ continue;
+ }
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ b.always_owner = b.owner
+ = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = del;
+
+ btr_blob_dbg_rbt_insert(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ const ib_rbt_node_t* node;
+
+ if (!index->blobs) {
+ return;
+ }
+
+ /* We intentionally do not acquire index->blobs_mutex here.
+ This function is to be called from a debugger, and the caller
+ should make sure that the index->blobs_mutex is held. */
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+ fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no,
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "");
+ }
+}
+
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint i;
+ ulint count = 0;
+ btr_blob_dbg_t b;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ /* The column has not been stored yet.
+ The BLOB pointer must be all zero.
+ There cannot be a BLOB starting at
+ page 0, because page 0 is reserved for
+ the tablespace header. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* fall through */
+ case FIL_NULL:
+ /* the column has been freed already */
+ continue;
+ }
+
+ btr_blob_dbg_rbt_delete(index, &b, ctx);
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+{
+ const ib_rbt_node_t* node;
+ ibool success = TRUE;
+
+ if (!index->blobs) {
+ return(success);
+ }
+
+ mutex_enter(&index->blobs_mutex);
+
+ for (node = rbt_first(index->blobs);
+ node != NULL; node = rbt_next(index->blobs, node)) {
+ const btr_blob_dbg_t* b
+ = rbt_value(btr_blob_dbg_t, node);
+
+ if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
+ continue;
+ }
+
+ fprintf(stderr,
+ "InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
+ b->owner ? "" : "(disowned)",
+ b->always_owner ? "" : "(has disowned)",
+ b->del ? "(deleted)" : "",
+ b->ref_page_no, b->ref_heap_no, b->ref_field_no,
+ b->blob_page_no);
+
+ if (b->blob_page_no != page_no || b->owner || !b->del) {
+ success = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+ return(success);
+}
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+{
+ ulint count = 0;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_a(!rec || page_align(rec) == page);
+
+ if (!index->blobs || !page_is_leaf(page)
+ || !dict_index_is_clust(index)) {
+ return(0);
+ }
+
+ if (rec == NULL) {
+ rec = page_get_infimum_rec(page);
+ }
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ count += op(rec, index, offsets, ctx);
+ rec = page_rec_get_next_const(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ return(count);
+}
+
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
+}
+
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint count;
+
+ count = btr_blob_dbg_op(page, NULL, index, ctx,
+ btr_blob_dbg_remove_rec);
+
+ /* Check that no references exist. */
+ btr_blob_dbg_assert_empty(index, page_get_page_no(page));
+
+ return(count);
+}
+
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns.
+Used when a page reorganization fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+{
+ ulint removed;
+ ulint added;
+
+ ut_a(page_get_page_no(npage) == page_get_page_no(page));
+ ut_a(page_get_space_id(npage) == page_get_space_id(page));
+
+ removed = btr_blob_dbg_remove(npage, index, ctx);
+ added = btr_blob_dbg_add(page, index, ctx);
+ ut_a(added == removed);
+}
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ btr_blob_dbg_t* c;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(dict_index_is_clust(index));
+	ut_a(del == !!del);	/* must be FALSE==0 or TRUE==1 */
+
+ if (!rec_offs_any_extern(offsets) || !index->blobs) {
+
+ return;
+ }
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field_ref = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+
+ switch (b.blob_page_no) {
+ case 0:
+ ut_a(memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+				/* page number 0 is for the page
+				allocation bitmap; fall through,
+				since this is an error too */
+ case FIL_NULL:
+ /* the column has been freed already */
+ ut_error;
+ }
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ ut_a(node);
+
+ c = rbt_value(btr_blob_dbg_t, node);
+ /* The flag should be modified. */
+ c->del = del;
+ if (btr_blob_dbg_msg) {
+ b = *c;
+ mutex_exit(&index->blobs_mutex);
+ btr_blob_dbg_msg_issue("del_mk", &b, "");
+ } else {
+ mutex_exit(&index->blobs_mutex);
+ }
+ }
+ }
+}
+
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: rec_get_offs(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+{
+ const ib_rbt_node_t* node;
+ btr_blob_dbg_t b;
+ const byte* field_ref;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(rec_offs_nth_extern(offsets, i));
+
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
+ b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ ut_a(b.owner == own);
+
+ mutex_enter(&index->blobs_mutex);
+ node = rbt_lookup(index->blobs, &b);
+ /* row_ins_clust_index_entry_by_modify() invokes
+ btr_cur_unmark_extern_fields() also for the newly inserted
+ references, which are all zero bytes until the columns are stored.
+ The node lookup must fail if and only if that is the case. */
+ ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+ == !node);
+
+ if (node) {
+ btr_blob_dbg_t* c = rbt_value(btr_blob_dbg_t, node);
+ /* Some code sets ownership from TRUE to TRUE.
+ We do not allow changing ownership from FALSE to FALSE. */
+ ut_a(own || c->owner);
+
+ c->owner = own;
+ if (!own) {
+ c->always_owner = FALSE;
+ }
+ }
+
+ mutex_exit(&index->blobs_mutex);
+}
+#endif /* UNIV_BLOB_DEBUG */
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+A tree latch protects all non-leaf nodes of the tree. Each node of a tree
+also has a latch of its own.
+
+A B-tree operation normally first acquires an S-latch on the tree. It
+searches down the tree and releases the tree latch when it has the
+leaf node latch. To save CPU time we do not acquire any latch on
+non-leaf nodes of the tree during a search; those pages are only buffer-fixed.
+
+If an operation needs to restructure the tree, it acquires an X-latch on
+the tree before searching to a leaf node. If it needs, for example, to
+split a leaf,
+(1) InnoDB decides the split point in the leaf,
+(2) allocates a new page,
+(3) inserts the appropriate node pointer to the first non-leaf level,
+(4) releases the tree X-latch,
+(5) and then moves records from the leaf to the newly allocated page.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
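+
+/* In code terms, a plain leaf-level search following the strategy
+above is sketched like this (arguments abbreviated; see btr0cur.cc
+for the real implementation):
+
+	mtr_start(&mtr);
+	btr_cur_search_to_nth_level(index, 0, tuple, PAGE_CUR_LE,
+				    BTR_SEARCH_LEAF, &cursor, ...);
+	... the leaf page is now latched; read the record ...
+	mtr_commit(&mtr);
+*/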
+
+#ifdef UNIV_BTR_DEBUG
+/**************************************************************//**
+Checks a file segment header within a B-tree root page.
+@return TRUE if valid */
+static
+ibool
+btr_root_fseg_validate(
+/*===================*/
+ const fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space) /*!< in: tablespace identifier */
+{
+ ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+ ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
+ ut_a(offset >= FIL_PAGE_DATA);
+ ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+ return(TRUE);
+}
+#endif /* UNIV_BTR_DEBUG */
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+static
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ const dict_index_t* index, /*!< in: index tree */
+ ulint mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint root_page_no;
+ buf_block_t* block;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ root_page_no = dict_index_get_page(index);
+
+ block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr);
+ btr_assert_not_corrupted(block, index);
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ return(block);
+}
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH,
+ mtr)));
+}
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return tree height (level of the root) */
+UNIV_INTERN
+ulint
+btr_height_get(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint height;
+ buf_block_t* root_block;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK)
+ || mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+
+ /* S latches the page */
+ root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
+
+ height = btr_page_get_level(buf_block_get_frame(root_block), mtr);
+
+ /* Release the S latch on the root page. */
+ mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX);
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(&root_block->lock);
+#endif /* UNIV_SYNC_DEBUG */
+
+ return(height);
+}
+
+/**************************************************************//**
+Checks a file segment header within a B-tree root page and updates
+the segment header space id.
+@return TRUE if valid */
+static
+bool
+btr_root_fseg_adjust_on_import(
+/*===========================*/
+ fseg_header_t* seg_header, /*!< in/out: segment header */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page,
+ or NULL */
+ ulint space, /*!< in: tablespace identifier */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+ if (offset < FIL_PAGE_DATA
+ || offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) {
+
+ return(FALSE);
+
+ } else if (page_zip) {
+ mach_write_to_4(seg_header + FSEG_HDR_SPACE, space);
+ page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE,
+ 4, mtr);
+ } else {
+ mlog_write_ulint(seg_header + FSEG_HDR_SPACE,
+ space, MLOG_4BYTES, mtr);
+ }
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+{
+ dberr_t err;
+ mtr_t mtr;
+ page_t* page;
+ buf_block_t* block;
+ page_zip_des_t* page_zip;
+ dict_table_t* table = index->table;
+ ulint space_id = dict_index_get_space(index);
+ ulint zip_size = dict_table_zip_size(table);
+ ulint root_page_no = dict_index_get_page(index);
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
+ return(DB_CORRUPTION););
+
+ block = btr_block_get(
+ space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr);
+
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ /* Check that this is a B-tree page and both the PREV and NEXT
+ pointers are FIL_NULL, because the root page does not have any
+ siblings. */
+ if (fil_page_get_type(page) != FIL_PAGE_INDEX
+ || fil_page_get_prev(page) != FIL_NULL
+ || fil_page_get_next(page) != FIL_NULL) {
+
+ err = DB_CORRUPTION;
+
+ } else if (dict_index_is_clust(index)) {
+ bool page_is_compact_format;
+
+ page_is_compact_format = page_is_comp(page) > 0;
+
+ /* Check if the page format and table format agree. */
+ if (page_is_compact_format != dict_table_is_comp(table)) {
+ err = DB_CORRUPTION;
+ } else {
+
+ /* Check that the table flags and the tablespace
+ flags match. */
+ ulint flags = fil_space_get_flags(table->space);
+
+ if (flags
+ && flags != dict_tf_to_fsp_flags(table->flags)) {
+
+ err = DB_CORRUPTION;
+ } else {
+ err = DB_SUCCESS;
+ }
+ }
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ /* Check and adjust the file segment headers, if all OK so far. */
+ if (err == DB_SUCCESS
+ && (!btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + page, page_zip, space_id, &mtr)
+ || !btr_root_fseg_adjust_on_import(
+ FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + page, page_zip, space_id, &mtr))) {
+
+ err = DB_CORRUPTION;
+ }
+
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed that
+the caller has appropriate latches on the page and its neighbor.
+@return previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+				needed, also on the previous page */
+{
+ page_t* page;
+ page_t* prev_page;
+ ulint prev_page_no;
+
+ if (!page_rec_is_infimum(rec)) {
+
+ rec_t* prev_rec = page_rec_get_prev(rec);
+
+ if (!page_rec_is_infimum(prev_rec)) {
+
+ return(prev_rec);
+ }
+ }
+
+ page = page_align(rec);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ if (prev_page_no != FIL_NULL) {
+
+ ulint space;
+ ulint zip_size;
+ buf_block_t* prev_block;
+
+ space = page_get_space_id(page);
+ zip_size = fil_space_get_zip_size(space);
+
+ prev_block = buf_page_get_with_no_latch(space, zip_size,
+ prev_page_no, mtr);
+ prev_page = buf_block_get_frame(prev_block);
+		/* The caller must already have a latch on the sibling page */
+ ut_ad(mtr_memo_contains(mtr, prev_block,
+ MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, prev_block,
+ MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_page) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Gets a pointer to the next user record in the tree. It is assumed that the
+caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+				needed, also on the next page */
+{
+ page_t* page;
+ page_t* next_page;
+ ulint next_page_no;
+
+ if (!page_rec_is_supremum(rec)) {
+
+ rec_t* next_rec = page_rec_get_next(rec);
+
+ if (!page_rec_is_supremum(next_rec)) {
+
+ return(next_rec);
+ }
+ }
+
+ page = page_align(rec);
+ next_page_no = btr_page_get_next(page, mtr);
+
+ if (next_page_no != FIL_NULL) {
+ ulint space;
+ ulint zip_size;
+ buf_block_t* next_block;
+
+ space = page_get_space_id(page);
+ zip_size = fil_space_get_zip_size(space);
+
+ next_block = buf_page_get_with_no_latch(space, zip_size,
+ next_page_no, mtr);
+ next_page = buf_block_get_frame(next_block);
+		/* The caller must already have a latch on the sibling page */
+ ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, next_block,
+ MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ return(page_rec_get_next(page_get_infimum_rec(next_page)));
+ }
+
+ return(NULL);
+}
+
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+static
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
+
+ if (page_zip) {
+ page_create_zip(block, index, level, 0, mtr);
+ } else {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ /* Set the level of the new index page */
+ btr_page_set_level(page, NULL, level, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+
+ btr_page_set_index_id(page, page_zip, index->id, mtr);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
+@return new allocated block, x-latched */
+static
+buf_block_t*
+btr_page_alloc_for_ibuf(
+/*====================*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fil_addr_t node_addr;
+ page_t* root;
+ page_t* new_page;
+ buf_block_t* new_block;
+
+ root = btr_root_get(index, mtr);
+
+ node_addr = flst_get_first(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ ut_a(node_addr.page != FIL_NULL);
+
+ new_block = buf_page_get(dict_index_get_space(index),
+ dict_table_zip_size(index->table),
+ node_addr.page, RW_X_LATCH, mtr);
+ new_page = buf_block_get_frame(new_block);
+ buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);
+
+ flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ mtr);
+ ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ mtr));
+
+ return(new_block);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+static __attribute__((nonnull, warn_unused_result))
+buf_block_t*
+btr_page_alloc_low(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mtr or another
+ mini-transaction in which the
+ page should be initialized.
+ If init_mtr!=mtr, but the page
+ is already X-latched in mtr, do
+ not initialize the page. */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+
+ root = btr_root_get(index, mtr);
+
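+	/* Pick the file segment that matches the page level: leaf pages
+	are allocated from the leaf segment, all upper levels from the
+	non-leaf (top) segment of the index tree. */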
+ if (level == 0) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ } else {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+ }
+
+ /* Parameter TRUE below states that the caller has made the
+ reservation for free extents, and thus we know that a page can
+ be allocated: */
+
+ return(fseg_alloc_free_page_general(
+ seg_header, hint_page_no, file_direction,
+ TRUE, mtr, init_mtr));
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mini-transaction
+ for x-latching and initializing
+ the page */
+{
+ buf_block_t* new_block;
+
+ if (dict_index_is_ibuf(index)) {
+
+ return(btr_page_alloc_for_ibuf(index, mtr));
+ }
+
+ new_block = btr_page_alloc_low(
+ index, hint_page_no, file_direction, level, mtr, init_mtr);
+
+ if (new_block) {
+ buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+ }
+
+ return(new_block);
+}
+
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+ ulint n;
+ ulint dummy;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+
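+	/* The size cannot be reported while the index is incomplete
+	(its name starts with TEMP_INDEX_PREFIX), while it is being
+	rebuilt by online DDL, or when it has no root page. */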
+ if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
+ || *index->name == TEMP_INDEX_PREFIX) {
+ return(ULINT_UNDEFINED);
+ }
+
+ root = btr_root_get(index, mtr);
+
+ if (flag == BTR_N_LEAF_PAGES) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ fseg_n_reserved_pages(seg_header, &n, mtr);
+
+ } else if (flag == BTR_TOTAL_SIZE) {
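+		/* The total size is the number of pages reserved by the
+		non-leaf segment plus the number of pages reserved by the
+		leaf segment. */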
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
+
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
+ } else {
+ ut_error;
+ }
+
+ return(n);
+}
+
+/**************************************************************//**
+Frees a page used in an ibuf tree. Puts the page to the free list of the
+ibuf tree. */
+static
+void
+btr_page_free_for_ibuf(
+/*===================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* root;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ root = btr_root_get(index, mtr);
+
+ flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ buf_block_get_frame(block)
+ + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+
+ ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ mtr));
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for (BLOB)
+external storage pages, because page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ ulint level, /*!< in: page level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* The page gets invalid for optimistic searches: increment the frame
+ modify clock */
+
+ buf_block_modify_clock_inc(block);
+ btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
+
+ if (dict_index_is_ibuf(index)) {
+
+ btr_page_free_for_ibuf(index, block, mtr);
+
+ return;
+ }
+
+ root = btr_root_get(index, mtr);
+
+ if (level == 0) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ } else {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+ }
+
+ fseg_free_page(seg_header,
+ buf_block_get_space(block),
+ buf_block_get_page_no(block), mtr);
+
+ /* The page was marked free in the allocation bitmap, but it
+ should remain buffer-fixed until mtr_commit(mtr) or until it
+ is explicitly freed from the mini-transaction. */
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* TODO: Discard any operations on the page from the redo log
+ and remove the block from the flush list and the buffer pool.
+ This would free up buffer pool earlier and reduce writes to
+ both the tablespace and the redo log. */
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: this cannot be used to free
+(BLOB) external storage pages, because the page must contain info on its
+level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page = buf_block_get_frame(block);
+ ulint level = btr_page_get_level(page, mtr);
+
+ ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
+ btr_page_free_low(index, block, level, mtr);
+}
+
+/**************************************************************//**
+Sets the child node file address in a node pointer. */
+UNIV_INLINE
+void
+btr_node_ptr_set_child_page_no(
+/*===========================*/
+ rec_t* rec, /*!< in: node pointer record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint page_no,/*!< in: child node address */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* field;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!page_is_leaf(page_align(rec)));
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == REC_NODE_PTR_SIZE);
+
+ if (page_zip) {
+ page_zip_write_node_ptr(page_zip, rec,
+ rec_offs_data_size(offsets),
+ page_no, mtr);
+ } else {
+ mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
+ }
+}
+
+/************************************************************//**
+Returns the child page of a node pointer and x-latches it.
+@return child page, x-latched */
+static
+buf_block_t*
+btr_node_ptr_get_child(
+/*===================*/
+ const rec_t* node_ptr,/*!< in: node pointer */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ ulint space;
+
+ ut_ad(rec_offs_validate(node_ptr, index, offsets));
+ space = page_get_space_id(page_align(node_ptr));
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+
+ return(btr_block_get(space, dict_table_zip_size(index->table),
+ page_no, RW_X_LATCH, index, mtr));
+}
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_node_ptr_func(
+/*==============================*/
+ ulint* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dtuple_t* tuple;
+ rec_t* user_rec;
+ rec_t* node_ptr;
+ ulint level;
+ ulint page_no;
+ dict_index_t* index;
+
+ page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
+
+ user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+ tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
+
+ btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
+ BTR_CONT_MODIFY_TREE, cursor, 0,
+ file, line, mtr);
+
+ node_ptr = btr_cur_get_rec(cursor);
+ ut_ad(!page_rec_is_comp(node_ptr)
+ || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ rec_t* print_rec;
+ fputs("InnoDB: Dump of the child page:\n", stderr);
+ buf_page_print(page_align(user_rec), 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ fputs("InnoDB: Dump of the parent page:\n", stderr);
+ buf_page_print(page_align(node_ptr), 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: Corruption of an index tree: table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, index->table_name);
+ fputs(", index ", stderr);
+ ut_print_name(stderr, NULL, FALSE, index->name);
+ fprintf(stderr, ",\n"
+ "InnoDB: father ptr page no %lu, child page no %lu\n",
+ (ulong)
+ btr_node_ptr_get_child_page_no(node_ptr, offsets),
+ (ulong) page_no);
+ print_rec = page_rec_get_next(
+ page_get_infimum_rec(page_align(user_rec)));
+ offsets = rec_get_offsets(print_rec, index,
+ offsets, ULINT_UNDEFINED, &heap);
+ page_rec_print(print_rec, offsets);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(node_ptr, offsets);
+
+ fputs("InnoDB: You should dump + drop + reimport the table"
+ " to fix the\n"
+ "InnoDB: corruption. If the crash happens at "
+ "the database startup, see\n"
+ "InnoDB: " REFMAN "forcing-innodb-recovery.html about\n"
+ "InnoDB: forcing recovery. "
+ "Then dump + drop + reimport.\n", stderr);
+
+ ut_error;
+ }
+
+ return(offsets);
+}
+
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_block(
+/*======================*/
+ ulint* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+ return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
+}
+
+/************************************************************//**
+Seeks to the upper level node pointer to a page.
+It is assumed that mtr holds an x-latch on the tree. */
+static
+void
+btr_page_get_father(
+/*================*/
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ mem_heap_t* heap;
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+
+ heap = mem_heap_create(100);
+ btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
+ mem_heap_free(heap);
+}
+
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, or FIL_NULL if it did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+ ulint type, /*!< in: type of the index */
+ ulint space, /*!< in: space where created */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ index_id_t index_id,/*!< in: index id */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint page_no;
+ buf_block_t* block;
+ buf_frame_t* frame;
+ page_t* page;
+ page_zip_des_t* page_zip;
+
+ /* Create the two new segments (one, in the case of an ibuf tree) for
+ the index tree; the segment headers are put on the allocated root page
+ (for an ibuf tree, not in the root, but on a separate ibuf header
+ page) */
+
+ if (type & DICT_IBUF) {
+ /* Allocate first the ibuf header page */
+ buf_block_t* ibuf_hdr_block = fseg_create(
+ space, 0,
+ IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
+
+ buf_block_dbg_add_level(
+ ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);
+
+ ut_ad(buf_block_get_page_no(ibuf_hdr_block)
+ == IBUF_HEADER_PAGE_NO);
+ /* Allocate then the next page to the segment: it will be the
+ tree root page */
+
+ block = fseg_alloc_free_page(
+ buf_block_get_frame(ibuf_hdr_block)
+ + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ IBUF_TREE_ROOT_PAGE_NO,
+ FSP_UP, mtr);
+ ut_ad(buf_block_get_page_no(block) == IBUF_TREE_ROOT_PAGE_NO);
+ } else {
+#ifdef UNIV_BLOB_DEBUG
+ if ((type & DICT_CLUSTERED) && !index->blobs) {
+ mutex_create(PFS_NOT_INSTRUMENTED,
+ &index->blobs_mutex, SYNC_ANY_LATCH);
+ index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
+ btr_blob_dbg_cmp);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+ block = fseg_create(space, 0,
+ PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
+ }
+
+ if (block == NULL) {
+
+ return(FIL_NULL);
+ }
+
+ page_no = buf_block_get_page_no(block);
+ frame = buf_block_get_frame(block);
+
+ if (type & DICT_IBUF) {
+ /* It is an insert buffer tree: initialize the free list */
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
+
+ flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ } else {
+ /* It is a non-ibuf tree: create a file segment for leaf
+ pages */
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+ if (!fseg_create(space, page_no,
+ PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
+			/* Not enough space for the new segment; free
+			the root segment before returning. */
+ btr_free_root(space, zip_size, page_no, mtr);
+
+ return(FIL_NULL);
+ }
+
+ /* The fseg create acquires a second latch on the page,
+ therefore we must declare it: */
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+ }
+
+ /* Create a new index page on the allocated segment page */
+ page_zip = buf_block_get_page_zip(block);
+
+ if (page_zip) {
+ page = page_create_zip(block, index, 0, 0, mtr);
+ } else {
+ page = page_create(block, mtr,
+ dict_table_is_comp(index->table));
+ /* Set the level of the new index page */
+ btr_page_set_level(page, NULL, 0, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+
+ /* Set the index id of the page */
+ btr_page_set_index_id(page, page_zip, index_id, mtr);
+
+ /* Set the next node and previous node fields */
+ btr_page_set_next(page, page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
+
+ /* We reset the free bits for the page to allow creation of several
+	trees in the same mtr; otherwise, the latch on a bitmap page would
+ prevent it because of the latching order */
+
+ if (!(type & DICT_CLUSTERED)) {
+ ibuf_reset_free_bits(block);
+ }
+
+ /* In the following assertion we test that two records of maximum
+ allowed size fit on the root page: this fact is needed to ensure
+ correctness of split algorithms */
+
+ ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+ return(page_no);
+}
+
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no) /*!< in: root page number */
+{
+ ibool finished;
+ page_t* root;
+ mtr_t mtr;
+
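+	/* Both segments are freed in small steps, one mini-transaction
+	at a time, so that no single mtr grows too large. */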
+leaf_loop:
+ mtr_start(&mtr);
+
+ root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
+ NULL, &mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* NOTE: page hash indexes are dropped when a page is freed inside
+ fsp0fsp. */
+
+ finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+ &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto leaf_loop;
+ }
+top_loop:
+ mtr_start(&mtr);
+
+ root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
+ NULL, &mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ finished = fseg_free_step_not_header(
+ root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto top_loop;
+ }
+}
+
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already have been freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no, /*!< in: root page number */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+ fseg_header_t* header;
+
+ block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
+ NULL, mtr);
+
+ btr_search_drop_page_hash_index(block);
+
+ header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(header, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ while (!fseg_free_step(header, mtr)) {
+ /* Free the entire segment in small steps. */
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_low(
+/*====================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block = page_cur_get_block(cursor);
+#ifndef UNIV_HOTBACKUP
+ buf_pool_t* buf_pool = buf_pool_from_bpage(&block->page);
+#endif /* !UNIV_HOTBACKUP */
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+ ulint log_mode;
+ ulint data_size1;
+ ulint data_size2;
+ ulint max_ins_size1;
+ ulint max_ins_size2;
+ bool success = false;
+ ulint pos;
+ bool log_compressed;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_assert_not_corrupted(block, index);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ data_size1 = page_get_data_size(page);
+ max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
+
+ /* Turn logging off */
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
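+	/* The record moves below are not logged individually. On
+	success, either the compressed page image or a single
+	MLOG_*_PAGE_REORGANIZE record is written instead; recovery
+	replays the latter by reorganizing the page again (see
+	btr_parse_page_reorganize()). */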
+
+#ifndef UNIV_HOTBACKUP
+ temp_block = buf_block_alloc(buf_pool);
+#else /* !UNIV_HOTBACKUP */
+ ut_ad(block == back_block1);
+ temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+ temp_page = temp_block->frame;
+
+ MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
+
+ /* Copy the old page to temporary space */
+ buf_frame_copy(temp_page, page);
+
+#ifndef UNIV_HOTBACKUP
+ if (!recovery) {
+ btr_search_drop_page_hash_index(block);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+ btr_blob_dbg_remove(page, index, "btr_page_reorganize");
+
+ /* Save the cursor position. */
+ pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, dict_table_is_comp(index->table));
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ /* Copy max trx id to recreated page */
+ trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
+ page_set_max_trx_id(block, NULL, max_trx_id, mtr);
+ /* In crash recovery, dict_index_is_sec_or_ibuf() always
+ holds, even for clustered indexes. max_trx_id is
+ unused in clustered index pages. */
+ ut_ad(max_trx_id != 0 || recovery);
+ }
+
+ /* If innodb_log_compressed_pages is ON, page reorganize should log the
+compressed page image. */
+ log_compressed = page_zip && page_zip_log_pages;
+
+ if (log_compressed) {
+ mtr_set_log_mode(mtr, log_mode);
+ }
+
+ if (page_zip
+ && !page_zip_compress(page_zip, page, index, z_level, mtr)) {
+
+ /* Restore the old page and exit. */
+ btr_blob_dbg_restore(page, temp_page, index,
+ "btr_page_reorganize_compress_fail");
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the bytes that we skip are identical. */
+ ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+ PAGE_HEADER + PAGE_N_RECS + temp_page,
+ PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+ ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
+ FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
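+		/* Recompression failed: restore the page from the
+		temporary copy. The byte ranges asserted identical above
+		are left in place; only the bytes that may have changed
+		are copied back. */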
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+ PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+ memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+ UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ goto func_exit;
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (!recovery) {
+ /* Update the record lock bitmaps */
+ lock_move_reorganize_page(block, temp_block);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ data_size2 = page_get_data_size(page);
+ max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
+
+ if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ fprintf(stderr,
+ "InnoDB: Error: page old data size %lu"
+ " new data size %lu\n"
+ "InnoDB: Error: page old max ins size %lu"
+ " new max ins size %lu\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ (unsigned long) data_size1, (unsigned long) data_size2,
+ (unsigned long) max_ins_size1,
+ (unsigned long) max_ins_size2);
+ ut_ad(0);
+ } else {
+ success = true;
+ }
+
+ /* Restore the cursor position. */
+ if (pos > 0) {
+ cursor->rec = page_rec_get_nth(page, pos);
+ } else {
+ ut_ad(cursor->rec == page_get_infimum_rec(page));
+ }
+
+func_exit:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Restore logging mode */
+ mtr_set_log_mode(mtr, log_mode);
+
+#ifndef UNIV_HOTBACKUP
+ if (success) {
+ byte type;
+ byte* log_ptr;
+
+ /* Write the log record */
+ if (page_zip) {
+ ut_ad(page_is_comp(page));
+ type = MLOG_ZIP_PAGE_REORGANIZE;
+ } else if (page_is_comp(page)) {
+ type = MLOG_COMP_PAGE_REORGANIZE;
+ } else {
+ type = MLOG_PAGE_REORGANIZE;
+ }
+
+ log_ptr = log_compressed
+ ? NULL
+ : mlog_open_and_write_index(
+ mtr, page, index, type,
+ page_zip ? 1 : 0);
+
+ /* For compressed pages write the compression level. */
+ if (log_ptr && page_zip) {
+ mach_write_to_1(log_ptr, z_level);
+ mlog_close(mtr, log_ptr + 1);
+ }
+
+ MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(success);
+}
+
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+static __attribute__((nonnull))
+bool
+btr_page_reorganize_block(
+/*======================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+
+ return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ return(btr_page_reorganize_low(false, page_zip_level,
+ cursor, index, mtr));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ dict_index_t* index, /*!< in: record descriptor */
+ bool compressed,/*!< in: true if compressed page */
+ buf_block_t* block, /*!< in: page to be reorganized, or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint level;
+
+ ut_ad(ptr && end_ptr);
+
+	/* If dealing with a compressed page, the record stores in one
+	byte the compression level that was used when the page was
+	originally compressed. Otherwise the record is empty. */
+ if (compressed) {
+ if (ptr == end_ptr) {
+ return(NULL);
+ }
+
+ level = mach_read_from_1(ptr);
+
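+		/* The compression level is a zlib level and must be in
+		the range 0..9. */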
+ ut_a(level <= 9);
+ ++ptr;
+ } else {
+ level = page_zip_level;
+ }
+
+ if (block != NULL) {
+ btr_page_reorganize_block(true, level, block, index, mtr);
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Empties an index page. @see btr_page_create(). */
+static
+void
+btr_page_empty(
+/*===========*/
+ buf_block_t* block, /*!< in: page to be emptied */
+ page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_zip == buf_block_get_page_zip(block));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ btr_search_drop_page_hash_index(block);
+ btr_blob_dbg_remove(page, index, "btr_page_empty");
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ if (page_zip) {
+ page_create_zip(block, index, level, 0, mtr);
+ } else {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ btr_page_set_level(page, NULL, level, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+}
+
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed;
+we cannot reverse it. Therefore, enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ ulint** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ page_t* root;
+ page_t* new_page;
+ ulint new_page_no;
+ rec_t* rec;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t* page_cursor;
+ page_zip_des_t* root_page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* root_block;
+ buf_block_t* new_block;
+
+ root = btr_cur_get_page(cursor);
+ root_block = btr_cur_get_block(cursor);
+ root_page_zip = buf_block_get_page_zip(root_block);
+ ut_ad(!page_is_empty(root));
+ index = btr_cur_get_index(cursor);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ ulint space = dict_index_get_space(index);
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+
+ ut_a(dict_index_get_page(index) == page_get_page_no(root));
+#endif /* UNIV_BTR_DEBUG */
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Allocate a new page to the tree. Root splitting is done by first
+ moving the root records to the new page, emptying the root, putting
+ a node pointer to the new page, and then splitting the new page. */
+
+ level = btr_page_get_level(root, mtr);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ ut_a(!new_page_zip == !root_page_zip);
+ ut_a(!new_page_zip
+ || page_zip_get_size(new_page_zip)
+ == page_zip_get_size(root_page_zip));
+
+ btr_page_create(new_block, new_page_zip, index, level, mtr);
+
+ /* Set the next node and previous node fields of new page */
+ btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
+
+ /* Copy the records from root to the new page one by one. */
+
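+	/* If UNIV_ZIP_COPY is defined, or if copying record by record
+	fails (recompression of a compressed page can fail), fall back
+	to copying the compressed page image byte for byte. */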
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(new_block, root_block,
+ page_get_infimum_rec(root),
+ index, mtr)) {
+ ut_a(new_page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(new_page_zip, new_page,
+ root_page_zip, root, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(new_block, root_block,
+ page_get_infimum_rec(root));
+
+ btr_search_move_or_delete_hash_entries(new_block, root_block,
+ index);
+ }
+
+ /* If this is a pessimistic insert which is actually done to
+	perform a pessimistic update, then we have stored the lock
+ information of the record to be inserted on the infimum of the
+ root page: we cannot discard the lock structs on the root page */
+
+ lock_update_root_raise(new_block, root_block);
+
+ /* Create a memory heap where the node pointer is stored */
+ if (!*heap) {
+ *heap = mem_heap_create(1000);
+ }
+
+ rec = page_rec_get_next(page_get_infimum_rec(new_page));
+ new_page_no = buf_block_get_page_no(new_block);
+
+ /* Build the node pointer (= node key and page address) for the
+ child */
+
+ node_ptr = dict_index_build_node_ptr(
+ index, rec, new_page_no, *heap, level);
+ /* The node pointer must be marked as the predefined minimum record,
+ as there is no lower alphabetical limit to records in the leftmost
+ node of a level: */
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+
+ /* Rebuild the root page to get free space */
+ btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
+
+ /* Set the next node and previous node fields, although
+ they should already have been set. The previous node field
+ must be FIL_NULL if root_page_zip != NULL, because the
+ REC_INFO_MIN_REC_FLAG (of the first user record) will be
+ set if and only if btr_page_get_prev() == FIL_NULL. */
+ btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Insert node pointer to the root */
+
+ page_cur_set_before_first(root_block, page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+ index, offsets, heap, 0, mtr);
+
+ /* The root page should only contain the node pointer
+ to new_page at this point. Thus, the data should fit. */
+ ut_a(node_ptr_rec);
+
+ /* We play safe and reset the free bits for the new page */
+
+#if 0
+ fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
+#endif
+
+ if (!dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(new_block);
+ }
+
+ /* Reposition the cursor to the child node */
+ page_cur_search(new_block, index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ /* Split the child and insert tuple */
+ return(btr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr));
+}
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the left.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec) /*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple to be inserted should
+ be first */
+{
+ page_t* page;
+ rec_t* insert_point;
+ rec_t* infimum;
+
+ page = btr_cur_get_page(cursor);
+ insert_point = btr_cur_get_rec(cursor);
+
+ if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+ == page_rec_get_next(insert_point)) {
+
+ infimum = page_get_infimum_rec(page);
+
+		/* If the convergence is in the middle of a page, also
+		include in the upper page the record immediately preceding
+		the new insert. Otherwise, we could repeatedly move lots
+		of records smaller than the convergence point from page
+		to page. */
+
+ if (infimum != insert_point
+ && page_rec_get_next(infimum) != insert_point) {
+
+ *split_rec = insert_point;
+ } else {
+ *split_rec = page_rec_get_next(insert_point);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the right.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec) /*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple to be inserted should
+ be first */
+{
+ page_t* page;
+ rec_t* insert_point;
+
+ page = btr_cur_get_page(cursor);
+ insert_point = btr_cur_get_rec(cursor);
+
+ /* We use eager heuristics: if the new insert would be right after
+ the previous insert on the same page, we assume that there is a
+ pattern of sequential inserts here. */
+
+ if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
+
+ rec_t* next_rec;
+
+ next_rec = page_rec_get_next(insert_point);
+
+ if (page_rec_is_supremum(next_rec)) {
+split_at_new:
+ /* Split at the new record to insert */
+ *split_rec = NULL;
+ } else {
+ rec_t* next_next_rec = page_rec_get_next(next_rec);
+ if (page_rec_is_supremum(next_next_rec)) {
+
+ goto split_at_new;
+ }
+
+ /* If there are >= 2 user records up from the insert
+ point, split all but 1 off. We want to keep one because
+ then sequential inserts can use the adaptive hash
+ index, as they can do the necessary checks of the right
+ search position just by looking at the records on this
+ page. */
+
+ *split_rec = next_next_rec;
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ ulint total_space;
+ ulint incl_data;
+ rec_t* ins_rec;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint n;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ page = btr_cur_get_page(cursor);
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ page_zip = btr_cur_get_page_zip(cursor);
+ if (page_zip) {
+ /* Estimate the free space of an empty compressed page. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index->n_fields,
+ page_zip_get_size(page_zip));
+
+ if (free_space > (ulint) free_space_zip) {
+ free_space = (ulint) free_space_zip;
+ }
+ }
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = page_get_n_recs(page) + 1;
+ ut_ad(total_n_recs >= 2);
+ total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+ n = 0;
+ incl_data = 0;
+ ins_rec = btr_cur_get_rec(cursor);
+ rec = page_get_infimum_rec(page);
+
+ heap = NULL;
+ offsets = NULL;
+
+	/* We include records in the left half until the space they
+	reserve exceeds half of total_space. At that point, if the
+	included records fit on the left page and something is also
+	left over for the right page, they are put on the left page;
+	otherwise the last included record becomes the first record
+	on the right half-page. */
+
+ do {
+ /* Decide the next record to include */
+ if (rec == ins_rec) {
+ rec = NULL; /* NULL denotes that tuple is
+ now included */
+ } else if (rec == NULL) {
+ rec = page_rec_get_next(ins_rec);
+ } else {
+ rec = page_rec_get_next(rec);
+ }
+
+ if (rec == NULL) {
+ /* Include tuple */
+ incl_data += insert_size;
+ } else {
+ offsets = rec_get_offsets(rec, cursor->index,
+ offsets, ULINT_UNDEFINED,
+ &heap);
+ incl_data += rec_offs_size(offsets);
+ }
+
+ n++;
+ } while (incl_data + page_dir_calc_reserved_space(n)
+ < total_space / 2);
+
+ if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
+ /* The next record will be the first on
+ the right half page if it is not the
+		supremum record of the page */
+
+ if (rec == ins_rec) {
+ rec = NULL;
+
+ goto func_exit;
+ } else if (rec == NULL) {
+ next_rec = page_rec_get_next(ins_rec);
+ } else {
+ next_rec = page_rec_get_next(rec);
+ }
+ ut_ad(next_rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ rec = next_rec;
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(rec);
+}
+
+/*************************************************************//**
+Returns true if the insert fits on the appropriate half-page with the
+chosen split_rec.
+@return true if fits */
+static __attribute__((nonnull(1,3,4,6), warn_unused_result))
+bool
+btr_page_insert_fits(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert
+ should be made */
+ const rec_t* split_rec,/*!< in: suggestion for first record
+ on upper half-page, or NULL if
+ tuple to be inserted should be first */
+ ulint** offsets,/*!< in: rec_get_offsets(
+ split_rec, cursor->index); out: garbage */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mem_heap_t** heap) /*!< in: temporary memory heap */
+{
+ page_t* page;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ const rec_t* rec;
+ const rec_t* end_rec;
+
+ page = btr_cur_get_page(cursor);
+
+ ut_ad(!split_rec
+ || !page_is_comp(page) == !rec_offs_comp(*offsets));
+ ut_ad(!split_rec
+ || rec_offs_validate(split_rec, cursor->index, *offsets));
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = page_get_n_recs(page) + 1;
+
+	/* We determine which records (from rec to end_rec, not
+	including end_rec) will end up on the half-page other than
+	the one where the tuple is inserted. */
+
+ if (split_rec == NULL) {
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = split_rec;
+ } else {
+ rec = split_rec;
+ end_rec = page_get_supremum_rec(page);
+ }
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ while (rec != end_rec) {
+ /* In this loop we calculate the amount of reserved
+		space after rec is removed from the page. */
+
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ ULINT_UNDEFINED, heap);
+
+ total_data -= rec_offs_size(*offsets);
+ total_n_recs--;
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(true);
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ return(false);
+}
+
+/*******************************************************//**
+Inserts a data tuple into a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ big_rec_t* dummy_big_rec;
+ btr_cur_t cursor;
+ dberr_t err;
+ rec_t* rec;
+ ulint* offsets = NULL;
+ mem_heap_t* heap = NULL;
+
+ ut_ad(level > 0);
+
+ btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, 0, file, line, mtr);
+
+ ut_ad(cursor.flag == BTR_CUR_BINARY);
+
+ err = btr_cur_optimistic_insert(
+ flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ ut_a(err == DB_SUCCESS);
+ }
+ mem_heap_free(heap);
+}
+
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree. */
+static __attribute__((nonnull))
+void
+btr_attach_half_pages(
+/*==================*/
+ ulint flags, /*!< in: undo logging and
+ locking flags */
+ dict_index_t* index, /*!< in: the index tree */
+ buf_block_t* block, /*!< in/out: page to be split */
+ const rec_t* split_rec, /*!< in: first record on upper
+ half page */
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ ulint direction, /*!< in: FSP_UP or FSP_DOWN */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+ ulint next_page_no;
+ ulint level;
+ page_t* page = buf_block_get_frame(block);
+ page_t* lower_page;
+ page_t* upper_page;
+ ulint lower_page_no;
+ ulint upper_page_no;
+ page_zip_des_t* lower_page_zip;
+ page_zip_des_t* upper_page_zip;
+ dtuple_t* node_ptr_upper;
+ mem_heap_t* heap;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Create a memory heap where the data tuple is stored */
+ heap = mem_heap_create(1024);
+
+ /* Based on split direction, decide upper and lower pages */
+ if (direction == FSP_DOWN) {
+
+ btr_cur_t cursor;
+ ulint* offsets;
+
+ lower_page = buf_block_get_frame(new_block);
+ lower_page_no = buf_block_get_page_no(new_block);
+ lower_page_zip = buf_block_get_page_zip(new_block);
+ upper_page = buf_block_get_frame(block);
+ upper_page_no = buf_block_get_page_no(block);
+ upper_page_zip = buf_block_get_page_zip(block);
+
+ /* Look up the index for the node pointer to page */
+ offsets = btr_page_get_father_block(NULL, heap, index,
+ block, mtr, &cursor);
+
+ /* Replace the address of the old child node (= page) with the
+ address of the new lower half */
+
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_rec(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ offsets, lower_page_no, mtr);
+ mem_heap_empty(heap);
+ } else {
+ lower_page = buf_block_get_frame(block);
+ lower_page_no = buf_block_get_page_no(block);
+ lower_page_zip = buf_block_get_page_zip(block);
+ upper_page = buf_block_get_frame(new_block);
+ upper_page_no = buf_block_get_page_no(new_block);
+ upper_page_zip = buf_block_get_page_zip(new_block);
+ }
+
+ /* Get the level of the split pages */
+ level = btr_page_get_level(buf_block_get_frame(block), mtr);
+ ut_ad(level
+ == btr_page_get_level(buf_block_get_frame(new_block), mtr));
+
+ /* Build the node pointer (= node key and page address) for the upper
+ half */
+
+ node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
+ upper_page_no, heap, level);
+
+ /* Insert it next to the pointer to the lower half. Note that this
+ may generate recursion leading to a split on the higher level. */
+
+ btr_insert_on_non_leaf_level(flags, index, level + 1,
+ node_ptr_upper, mtr);
+
+ /* Free the memory heap */
+ mem_heap_free(heap);
+
+ /* Get the previous and next pages of page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ next_page_no = btr_page_get_next(page, mtr);
+ space = buf_block_get_space(block);
+ zip_size = buf_block_get_zip_size(block);
+
+ /* Update page links of the level */
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block = btr_block_get(
+ space, zip_size, prev_page_no, RW_X_LATCH, index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_block->frame, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_next(buf_block_get_frame(prev_block),
+ buf_block_get_page_zip(prev_block),
+ lower_page_no, mtr);
+ }
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block = btr_block_get(
+ space, zip_size, next_page_no, RW_X_LATCH, index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(buf_block_get_frame(next_block),
+ buf_block_get_page_zip(next_block),
+ upper_page_no, mtr);
+ }
+
+ btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
+ btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
+
+ btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
+ btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
+}
+
+/*************************************************************//**
+Determine if a tuple is smaller than any record on the page.
+@return TRUE if smaller */
+static __attribute__((nonnull, warn_unused_result))
+bool
+btr_page_tuple_smaller(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: b-tree cursor */
+ const dtuple_t* tuple, /*!< in: tuple to consider */
+ ulint** offsets,/*!< in/out: temporary storage */
+ ulint n_uniq, /*!< in: number of unique fields
+ in the index page records */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ buf_block_t* block;
+ const rec_t* first_rec;
+ page_cur_t pcur;
+
+ /* Read the first user record in the page. */
+ block = btr_cur_get_block(cursor);
+ page_cur_set_before_first(block, &pcur);
+ page_cur_move_to_next(&pcur);
+ first_rec = page_cur_get_rec(&pcur);
+
+ *offsets = rec_get_offsets(
+ first_rec, cursor->index, *offsets,
+ n_uniq, heap);
+
+ return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
+}
+
+/** Insert the tuple into the right sibling page, if the cursor is at the end
+of a page.
+@param[in] flags undo logging and locking flags
+@param[in,out] cursor cursor at which to insert; when the function succeeds,
+ the cursor is positioned before the insert point.
+@param[out] offsets offsets on inserted record
+@param[in,out] heap memory heap for allocating offsets
+@param[in] tuple tuple to insert
+@param[in] n_ext number of externally stored columns
+@param[in,out] mtr mini-transaction
+@return inserted record (first record on the right sibling page);
+ the cursor will be positioned on the page infimum
+@retval NULL if the operation was not performed */
+static
+rec_t*
+btr_insert_into_right_sibling(
+ ulint flags,
+ btr_cur_t* cursor,
+ ulint** offsets,
+ mem_heap_t* heap,
+ const dtuple_t* tuple,
+ ulint n_ext,
+ mtr_t* mtr)
+{
+ buf_block_t* block = btr_cur_get_block(cursor);
+ page_t* page = buf_block_get_frame(block);
+ ulint next_page_no = btr_page_get_next(page, mtr);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(heap);
+
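+	/* This fast path applies only when a right sibling exists and
+	the cursor rests on the last user record of the page, i.e. the
+	record following it is the supremum */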
+ if (next_page_no == FIL_NULL || !page_rec_is_supremum(
+ page_rec_get_next(btr_cur_get_rec(cursor)))) {
+
+ return(NULL);
+ }
+
+ page_cur_t next_page_cursor;
+ buf_block_t* next_block;
+ page_t* next_page;
+ btr_cur_t next_father_cursor;
+ rec_t* rec = NULL;
+ ulint zip_size = buf_block_get_zip_size(block);
+ ulint max_size;
+
+ next_block = btr_block_get(
+ buf_block_get_space(block), zip_size,
+ next_page_no, RW_X_LATCH, cursor->index, mtr);
+ next_page = buf_block_get_frame(next_block);
+
+ bool is_leaf = page_is_leaf(next_page);
+
+ btr_page_get_father(
+ cursor->index, next_block, mtr, &next_father_cursor);
+
+ page_cur_search(
+ next_block, cursor->index, tuple, PAGE_CUR_LE,
+ &next_page_cursor);
+
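+	/* Remember the free space available after a reorganization of
+	the next page; it is used further below to update the insert
+	buffer free bits on uncompressed leaf pages */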
+ max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
+
+	/* Extend the gap locks for the next page */
+ lock_update_split_left(next_block, block);
+
+ rec = page_cur_tuple_insert(
+ &next_page_cursor, tuple, cursor->index, offsets, &heap,
+ n_ext, mtr);
+
+ if (rec == NULL) {
+ if (zip_size && is_leaf
+ && !dict_index_is_clust(cursor->index)) {
+			/* Reset the IBUF_BITMAP_FREE bits, because
+			page_cur_tuple_insert() will have attempted a
+			page reorganization before failing. */
+ ibuf_reset_free_bits(next_block);
+ }
+ return(NULL);
+ }
+
+ ibool compressed;
+ dberr_t err;
+ ulint level = btr_page_get_level(next_page, mtr);
+
+	/* Adjust the cursor position */
+ *btr_cur_get_page_cur(cursor) = next_page_cursor;
+
+ ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
+ ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);
+
+	/* We have to change the parent node pointer: the inserted record
+	became the first record on the page, and a node pointer carries
+	the key of its child page's first record */
+
+ compressed = btr_cur_pessimistic_delete(
+ &err, TRUE, &next_father_cursor,
+ BTR_CREATE_FLAG, RB_NONE, mtr);
+
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
+ }
+
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ cursor->index, rec, buf_block_get_page_no(next_block),
+ heap, level);
+
+ btr_insert_on_non_leaf_level(
+ flags, cursor->index, level + 1, node_ptr, mtr);
+
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+
+ if (is_leaf && !dict_index_is_clust(cursor->index)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ if (zip_size) {
+ ibuf_update_free_bits_zip(next_block, mtr);
+ } else {
+ ibuf_update_free_bits_if_full(
+ next_block, max_size,
+ rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ return(rec);
+}
+
+/*************************************************************//**
+Splits an index page in half and inserts the tuple. It is assumed
+that mtr holds an x-latch on the index tree. NOTE: the tree x-latch is
+released within this function! NOTE also that the operation of this
+function must always succeed; we cannot reverse it. Therefore, enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ ulint** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint page_no;
+ byte direction;
+ ulint hint_page_no;
+ buf_block_t* new_block;
+ page_t* new_page;
+ page_zip_des_t* new_page_zip;
+ rec_t* split_rec;
+ buf_block_t* left_block;
+ buf_block_t* right_block;
+ buf_block_t* insert_block;
+ page_cur_t* page_cursor;
+ rec_t* first_rec;
+ byte* buf = 0; /* remove warning */
+ rec_t* move_limit;
+ ibool insert_will_fit;
+ ibool insert_left;
+ ulint n_iterations = 0;
+ rec_t* rec;
+ ulint n_uniq;
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+func_start:
+ mem_heap_empty(*heap);
+ *offsets = NULL;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(!dict_index_is_online_ddl(cursor->index)
+ || (flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(cursor->index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!page_is_empty(page));
+
+	/* Try to insert into the next page, if possible, before splitting */
+ rec = btr_insert_into_right_sibling(
+ flags, cursor, offsets, *heap, tuple, n_ext, mtr);
+
+ if (rec != NULL) {
+ return(rec);
+ }
+
+ page_no = buf_block_get_page_no(block);
+
+ /* 1. Decide the split record; split_rec == NULL means that the
+ tuple to be inserted should be the first record on the upper
+ half-page */
+ insert_left = FALSE;
+
+ if (n_iterations > 0) {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+ split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+
+ if (split_rec == NULL) {
+ insert_left = btr_page_tuple_smaller(
+ cursor, tuple, offsets, n_uniq, heap);
+ }
+ } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+
+ } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
+ direction = FSP_DOWN;
+ hint_page_no = page_no - 1;
+ ut_ad(split_rec);
+ } else {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+
+		/* If there is only one record in the index page, we
+		cannot split the page in the middle by default. We need
+		to determine whether the new record will be inserted
+		to the left or to the right. */
+
+ if (page_get_n_recs(page) > 1) {
+ split_rec = page_get_middle_rec(page);
+ } else if (btr_page_tuple_smaller(cursor, tuple,
+ offsets, n_uniq, heap)) {
+ split_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ } else {
+ split_rec = NULL;
+ }
+ }
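+
+	/* To summarize the choice above: a repeated split of the same
+	page, or a split record suggested by
+	btr_page_get_split_rec_to_right() (a heuristic for detecting
+	ascending inserts), splits to the right (FSP_UP);
+	btr_page_get_split_rec_to_left() splits to the left (FSP_DOWN);
+	otherwise we split at the middle record of the page */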
+
+ /* 2. Allocate a new page to the index */
+ new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
+ btr_page_get_level(page, mtr), mtr, mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ btr_page_create(new_block, new_page_zip, cursor->index,
+ btr_page_get_level(page, mtr), mtr);
+
+ /* 3. Calculate the first record on the upper half-page, and the
+ first record (move_limit) on original page which ends up on the
+ upper half */
+
+ if (split_rec) {
+ first_rec = move_limit = split_rec;
+
+ *offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
+ n_uniq, heap);
+
+ insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
+
+ if (!insert_left && new_page_zip && n_iterations > 0) {
+ /* If a compressed page has already been split,
+ avoid further splits by inserting the record
+ to an empty page. */
+ split_rec = NULL;
+ goto insert_empty;
+ }
+ } else if (insert_left) {
+ ut_a(n_iterations > 0);
+ first_rec = page_rec_get_next(page_get_infimum_rec(page));
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ } else {
+insert_empty:
+ ut_ad(!split_rec);
+ ut_ad(!insert_left);
+ buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index,
+ tuple, n_ext));
+
+ first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
+ tuple, n_ext);
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ }
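+
+	/* When split_rec == NULL, first_rec points to a temporary record
+	converted from the tuple into buf; it is needed only by
+	btr_attach_half_pages(), and buf is freed again below */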
+
+ /* 4. Do first the modifications in the tree structure */
+
+ btr_attach_half_pages(flags, cursor->index, block,
+ first_rec, new_block, direction, mtr);
+
+ /* If the split is made on the leaf level and the insert will fit
+ on the appropriate half-page, we may release the tree x-latch.
+ We can then move the records after releasing the tree latch,
+ thus reducing the tree latch contention. */
+
+ if (split_rec) {
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, split_rec,
+ offsets, tuple, n_ext, heap);
+ } else {
+ if (!insert_left) {
+ mem_free(buf);
+ buf = NULL;
+ }
+
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, NULL,
+ offsets, tuple, n_ext, heap);
+ }
+
+ if (insert_will_fit && page_is_leaf(page)
+ && !dict_index_is_online_ddl(cursor->index)) {
+
+ mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK);
+ }
+
+ /* 5. Move then the records to the new page */
+ if (direction == FSP_DOWN) {
+ /* fputs("Split left\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_move_rec_list_start(new_block, block, move_limit,
+ cursor->index, mtr)) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_end(move_limit - page + new_page,
+ new_block, cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_start(
+ new_block, block, move_limit,
+ new_page + PAGE_NEW_INFIMUM);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block, cursor->index);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_start(move_limit, block,
+ cursor->index, mtr);
+ }
+
+ left_block = new_block;
+ right_block = block;
+
+ lock_update_split_left(right_block, left_block);
+ } else {
+ /* fputs("Split right\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_move_rec_list_end(new_block, block, move_limit,
+ cursor->index, mtr)) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_start(move_limit - page
+ + new_page, new_block,
+ cursor->index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(new_block, block, move_limit);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block, cursor->index);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_end(move_limit, block,
+ cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ }
+
+ left_block = block;
+ right_block = new_block;
+
+ lock_update_split_right(right_block, left_block);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (page_zip) {
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+ ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* At this point, split_rec, move_limit and first_rec may point
+ to garbage on the old page. */
+
+ /* 6. The split and the tree modification is now completed. Decide the
+ page where the tuple should be inserted */
+
+ if (insert_left) {
+ insert_block = left_block;
+ } else {
+ insert_block = right_block;
+ }
+
+ /* 7. Reposition the cursor for insert and try insertion */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_search(insert_block, cursor->index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_t* insert_page
+ = buf_block_get_frame(insert_block);
+
+ page_zip_des_t* insert_page_zip
+ = buf_block_get_page_zip(insert_block);
+
+ ut_a(!insert_page_zip
+ || page_zip_validate(insert_page_zip, insert_page,
+ cursor->index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (rec != NULL) {
+
+ goto func_exit;
+ }
+
+ /* 8. If insert did not fit, try page reorganization.
+ For compressed pages, page_cur_tuple_insert() will have
+ attempted this already. */
+
+ if (page_cur_get_page_zip(page_cursor)
+ || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+
+ goto insert_failed;
+ }
+
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+ if (rec == NULL) {
+ /* The insert did not fit on the page: loop back to the
+ start of the function for a new split */
+insert_failed:
+		/* We play it safe and reset the free bits */
+ if (!dict_index_is_clust(cursor->index)) {
+ ibuf_reset_free_bits(new_block);
+ ibuf_reset_free_bits(block);
+ }
+
+ /* fprintf(stderr, "Split second round %lu\n",
+ page_get_page_no(page)); */
+ n_iterations++;
+ ut_ad(n_iterations < 2
+ || buf_block_get_page_zip(insert_block));
+ ut_ad(!insert_will_fit);
+
+ goto func_start;
+ }
+
+func_exit:
+	/* The insert fit on the page: update the free bits for the
+	left and right pages in the same mtr */
+
+ if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
+ ibuf_update_free_bits_for_two_pages_low(
+ buf_block_get_zip_size(left_block),
+ left_block, right_block, mtr);
+ }
+
+#if 0
+ fprintf(stderr, "Split and insert done %lu %lu\n",
+ buf_block_get_page_no(left_block),
+ buf_block_get_page_no(right_block));
+#endif
+ MONITOR_INC(MONITOR_INDEX_SPLIT);
+
+ ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ return(rec);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,index,mtr)
+#else /* UNIV_SYNC_DEBUG */
+/*************************************************************//**
+Removes a page from the level list of pages.
+@param space in: space where removed
+@param zip_size in: compressed page size in bytes, or 0 for uncompressed
+@param page in/out: page to remove
+@param index in: index tree
+@param mtr in/out: mini-transaction */
+# define btr_level_list_remove(space,zip_size,page,index,mtr) \
+ btr_level_list_remove_func(space,zip_size,page,mtr)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Removes a page from the level list of pages. */
+static __attribute__((nonnull))
+void
+btr_level_list_remove_func(
+/*=======================*/
+ ulint space, /*!< in: space where removed */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* page, /*!< in/out: page to remove */
+#ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree */
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint prev_page_no;
+ ulint next_page_no;
+
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(space == page_get_space_id(page));
+ /* Get the previous and next page numbers of page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ next_page_no = btr_page_get_next(page, mtr);
+
+ /* Update page links of the level */
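+	/* The page is simply unlinked from its doubly linked level list:
+	   before:  prev <-> page <-> next
+	   after:   prev <-> next */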
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block
+ = btr_block_get(space, zip_size, prev_page_no,
+ RW_X_LATCH, index, mtr);
+ page_t* prev_page
+ = buf_block_get_frame(prev_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_page) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_next(prev_page,
+ buf_block_get_page_zip(prev_block),
+ next_page_no, mtr);
+ }
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block
+ = btr_block_get(space, zip_size, next_page_no,
+ RW_X_LATCH, index, mtr);
+ page_t* next_page
+ = buf_block_get_frame(next_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(next_page,
+ buf_block_get_page_zip(next_block),
+ prev_page_no, mtr);
+ }
+}
+
+/****************************************************************//**
+Writes the redo log record for setting an index record as the predefined
+minimum record. */
+UNIV_INLINE
+void
+btr_set_min_rec_mark_log(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(rec, type, mtr);
+
+ /* Write rec offset as a 2-byte ulint */
+ mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
+}
+#else /* !UNIV_HOTBACKUP */
+# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ rec_t* rec;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
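+	/* In the parse-ahead phase of crash recovery this function may
+	be invoked with page == NULL; then we only advance past the
+	2-byte offset without applying the change */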
+ if (page) {
+ ut_a(!page_is_comp(page) == !comp);
+
+ rec = page + mach_read_from_2(ptr);
+
+ btr_set_min_rec_mark(rec, mtr);
+ }
+
+ return(ptr + 2);
+}
+
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /*!< in: record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint info_bits;
+
+ if (page_rec_is_comp(rec)) {
+ info_bits = rec_get_info_bits(rec, TRUE);
+
+ rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+ btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
+ } else {
+ info_bits = rec_get_info_bits(rec, FALSE);
+
+ rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+ btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes, on the upper level, the node pointer to a page. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page whose node pointer is deleted */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t cursor;
+ ibool compressed;
+ dberr_t err;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Delete node pointer on father page */
+ btr_page_get_father(index, block, mtr, &cursor);
+
+ compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor,
+ BTR_CREATE_FLAG, RB_NONE, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&cursor, FALSE, mtr);
+ }
+}
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its records
+to the father page, thus reducing the tree height.
+@return father block */
+static
+buf_block_t*
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* father_block;
+ page_t* father_page;
+ ulint page_level;
+ page_zip_des_t* father_page_zip;
+ page_t* page = buf_block_get_frame(block);
+ ulint root_page_no;
+ buf_block_t* blocks[BTR_MAX_LEVELS];
+ ulint n_blocks; /*!< last used index in blocks[] */
+ ulint i;
+ bool lift_father_up;
+ buf_block_t* block_orig = block;
+
+ ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ page_level = btr_page_get_level(page, mtr);
+ root_page_no = dict_index_get_page(index);
+
+ {
+ btr_cur_t cursor;
+ ulint* offsets = NULL;
+ mem_heap_t* heap = mem_heap_create(
+ sizeof(*offsets)
+ * (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields));
+ buf_block_t* b;
+
+ offsets = btr_page_get_father_block(offsets, heap, index,
+ block, mtr, &cursor);
+ father_block = btr_cur_get_block(&cursor);
+ father_page_zip = buf_block_get_page_zip(father_block);
+ father_page = buf_block_get_frame(father_block);
+
+ n_blocks = 0;
+
+ /* Store all ancestor pages so we can reset their
+ levels later on. We have to do all the searches on
+ the tree now because later on, after we've replaced
+ the first level, the tree is in an inconsistent state
+		and cannot be searched. */
+ for (b = father_block;
+ buf_block_get_page_no(b) != root_page_no; ) {
+ ut_a(n_blocks < BTR_MAX_LEVELS);
+
+ offsets = btr_page_get_father_block(offsets, heap,
+ index, b,
+ mtr, &cursor);
+
+ blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+ }
+
+ lift_father_up = (n_blocks && page_level == 0);
+ if (lift_father_up) {
+			/* The father page is then also the only page on its
+			level, and it is not the root. We must lift up the
+			father page first, because records may be lifted from
+			a leaf page directly only into the root. The segment
+			from which a page is freed is chosen based on its
+			page_level (== 0 or != 0); if the father's level were
+			changed here from != 0 to == 0, the later freeing of
+			the page would not find the allocation to be freed. */
+
+ block = father_block;
+ page = buf_block_get_frame(block);
+ page_level = btr_page_get_level(page, mtr);
+
+ ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ father_block = blocks[0];
+ father_page_zip = buf_block_get_page_zip(father_block);
+ father_page = buf_block_get_frame(father_block);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Make the father empty */
+ btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+ page_level++;
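+
+	/* The father page was re-created at the level of its former
+	child; the loop further below lowers each remaining ancestor by
+	one level as well */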
+
+ /* Copy the records to the father page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || !page_copy_rec_list_end(father_block, block,
+ page_get_infimum_rec(page),
+ index, mtr)) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(father_page_zip);
+ ut_a(page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(father_page_zip, father_page,
+ page_zip, page, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+
+ btr_search_move_or_delete_hash_entries(father_block, block,
+ index);
+ }
+
+ btr_blob_dbg_remove(page, index, "btr_lift_page_up");
+ lock_update_copy_and_discard(father_block, block);
+
+ /* Go upward to root page, decrementing levels by one. */
+ for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+ page_t* page = buf_block_get_frame(blocks[i]);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]);
+
+ ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
+
+ btr_page_set_level(page, page_zip, page_level, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* We play it safe and reset the free bits for the father */
+ if (!dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(father_block);
+ }
+ ut_ad(page_validate(father_page, index));
+ ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+ return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first with its immediate left brother, if such a
+brother exists and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If the cursor is on the leaf level, mtr must also hold x-latches on
+the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index;
+ ulint space;
+ ulint zip_size;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* merge_block;
+ page_t* merge_page = NULL;
+ page_zip_des_t* merge_page_zip;
+ ibool is_left;
+ buf_block_t* block;
+ page_t* page;
+ btr_cur_t father_cursor;
+ mem_heap_t* heap;
+ ulint* offsets;
+ ulint nth_rec = 0; /* remove bogus warning */
+ DBUG_ENTER("btr_compress");
+
+ block = btr_cur_get_block(cursor);
+ page = btr_cur_get_page(cursor);
+ index = btr_cur_get_index(cursor);
+
+ btr_assert_not_corrupted(block, index);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
+
+ left_page_no = btr_page_get_prev(page, mtr);
+ right_page_no = btr_page_get_next(page, mtr);
+
+#ifdef UNIV_DEBUG
+ if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
+ ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page)));
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = mem_heap_create(100);
+ offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+ &father_cursor);
+
+ if (adjust) {
+ nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+ ut_ad(nth_rec > 0);
+ }
+
+ if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
+ /* The page is the only one on the level, lift the records
+ to the father */
+
+ merge_block = btr_lift_page_up(index, block, mtr);
+ goto func_exit;
+ }
+
+ /* Decide the page to which we try to merge and which will inherit
+ the locks */
+
+ is_left = btr_can_merge_with_page(cursor, left_page_no,
+ &merge_block, mtr);
+
+ DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);
+
+	if (!is_left
+ && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
+ mtr)) {
+ goto err_exit;
+ }
+
+ merge_page = buf_block_get_frame(merge_block);
+
+#ifdef UNIV_BTR_DEBUG
+ if (is_left) {
+ ut_a(btr_page_get_next(merge_page, mtr)
+ == buf_block_get_page_no(block));
+ } else {
+ ut_a(btr_page_get_prev(merge_page, mtr)
+ == buf_block_get_page_no(block));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ ut_ad(page_validate(merge_page, index));
+
+ merge_page_zip = buf_block_get_page_zip(merge_block);
+#ifdef UNIV_ZIP_DEBUG
+ if (merge_page_zip) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+ ut_a(page_zip_validate(merge_page_zip, merge_page, index));
+ ut_a(page_zip_validate(page_zip, page, index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Move records to the merge page */
+ if (is_left) {
+ rec_t* orig_pred = page_copy_rec_list_start(
+ merge_block, block, page_get_supremum_rec(page),
+ index, mtr);
+
+ if (!orig_pred) {
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, index, mtr);
+
+ btr_node_ptr_delete(index, block, mtr);
+ lock_update_merge_left(merge_block, orig_pred, block);
+
+ if (adjust) {
+ nth_rec += page_rec_get_n_recs_before(orig_pred);
+ }
+ } else {
+ rec_t* orig_succ;
+ ibool compressed;
+ dberr_t err;
+ btr_cur_t cursor2;
+ /* father cursor pointing to node ptr
+ of the right sibling */
+#ifdef UNIV_BTR_DEBUG
+ byte fil_page_prev[4];
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_get_father(index, merge_block, mtr, &cursor2);
+
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* The function page_zip_compress(), which will be
+ invoked by page_copy_rec_list_end() below,
+ requires that FIL_PAGE_PREV be FIL_NULL.
+ Clear the field, but prepare to restore it. */
+#ifdef UNIV_BTR_DEBUG
+ memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+#endif /* UNIV_BTR_DEBUG */
+#if FIL_NULL != 0xffffffff
+# error "FIL_NULL != 0xffffffff"
+#endif
+ memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
+ }
+
+ orig_succ = page_copy_rec_list_end(merge_block, block,
+ page_get_infimum_rec(page),
+ cursor->index, mtr);
+
+ if (!orig_succ) {
+ ut_a(merge_page_zip);
+#ifdef UNIV_BTR_DEBUG
+ if (left_page_no == FIL_NULL) {
+ /* FIL_PAGE_PREV was restored from
+ merge_page_zip. */
+ ut_a(!memcmp(fil_page_prev,
+ merge_page + FIL_PAGE_PREV, 4));
+ }
+#endif /* UNIV_BTR_DEBUG */
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+#ifdef UNIV_BTR_DEBUG
+ if (merge_page_zip && left_page_no == FIL_NULL) {
+
+ /* Restore FIL_PAGE_PREV in order to avoid an assertion
+ failure in btr_level_list_remove(), which will set
+ the field again to FIL_NULL. Even though this makes
+ merge_page and merge_page_zip inconsistent for a
+ split second, it is harmless, because the pages
+ are X-latched. */
+ memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, index, mtr);
+
+ /* Replace the address of the old child node (= page) with the
+ address of the merge page to the right */
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_rec(&father_cursor),
+ btr_cur_get_page_zip(&father_cursor),
+ offsets, right_page_no, mtr);
+
+ compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2,
+ BTR_CREATE_FLAG,
+ RB_NONE, mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&cursor2, FALSE, mtr);
+ }
+
+ lock_update_merge_right(merge_block, orig_succ, block);
+ }
+
+ btr_blob_dbg_remove(page, index, "btr_compress");
+
+ if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. This has to be done in a
+ separate mini-transaction that is committed before the
+ main mini-transaction. We cannot update the insert
+ buffer bitmap in this mini-transaction, because
+ btr_compress() can be invoked recursively without
+ committing the mini-transaction in between. Since
+ insert buffer bitmap pages have a lower rank than
+ B-tree pages, we must not access other pages in the
+ same mini-transaction after accessing an insert buffer
+ bitmap page. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (zip_size) {
+ /* Because the free bits may be incremented
+ and we cannot update the insert buffer bitmap
+ in the same mini-transaction, the only safe
+ thing we can do here is the pessimistic
+ approach: reset the free bits. */
+ ibuf_reset_free_bits(merge_block);
+ } else {
+ /* On uncompressed pages, the free bits will
+ never increase here. Thus, it is safe to
+ write the bits accurately in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(merge_block,
+ UNIV_PAGE_SIZE,
+ ULINT_UNDEFINED);
+ }
+ }
+
+ ut_ad(page_validate(merge_page, index));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
+ index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+func_exit:
+ mem_heap_free(heap);
+
+ if (adjust) {
+ ut_ad(nth_rec > 0);
+ btr_cur_position(
+ index,
+ page_rec_get_nth(merge_block->frame, nth_rec),
+ merge_block, cursor);
+ }
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
+
+ DBUG_RETURN(TRUE);
+
+err_exit:
+ /* We play it safe and reset the free bits. */
+ if (zip_size
+ && merge_page
+ && page_is_leaf(merge_page)
+ && !dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(merge_block);
+ }
+
+ mem_heap_free(heap);
+ DBUG_RETURN(FALSE);
+}
+
+/*************************************************************//**
+Discards a page that is the only page on its level. This will empty
+the whole B-tree, leaving just an empty root page. This function
+should never be reached, because btr_compress(), which is invoked in
+delete operations, calls btr_lift_page_up() to flatten the B-tree. */
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_level = 0;
+ trx_id_t max_trx_id;
+
+ /* Save the PAGE_MAX_TRX_ID from the leaf page. */
+ max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
+
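+	/* Walk up towards the root, freeing each page on the way; every
+	page visited is the only one on its level and contains exactly
+	one record */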
+ while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+ btr_cur_t cursor;
+ buf_block_t* father;
+ const page_t* page = buf_block_get_frame(block);
+
+ ut_a(page_get_n_recs(page) == 1);
+ ut_a(page_level == btr_page_get_level(page, mtr));
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_search_drop_page_hash_index(block);
+
+ btr_page_get_father(index, block, mtr, &cursor);
+ father = btr_cur_get_block(&cursor);
+
+ lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ block = father;
+ page_level++;
+ }
+
+ /* block is the root page, which must be empty, except
+ for the node pointer to the (now discarded) block(s). */
+
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+ const ulint space = dict_index_get_space(index);
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (!dict_index_is_clust(index)) {
+ /* We play it safe and reset the free bits for the root */
+ ibuf_reset_free_bits(block);
+
+ ut_a(max_trx_id);
+ page_set_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ max_trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ ulint space;
+ ulint zip_size;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* merge_block;
+ page_t* merge_page;
+ buf_block_t* block;
+ page_t* page;
+ rec_t* node_ptr;
+
+ block = btr_cur_get_block(cursor);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ MONITOR_INC(MONITOR_INDEX_DISCARD);
+
+ /* Decide the page which will inherit the locks */
+
+ left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
+ right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
+
+ if (left_page_no != FIL_NULL) {
+ merge_block = btr_block_get(space, zip_size, left_page_no,
+ RW_X_LATCH, index, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else if (right_page_no != FIL_NULL) {
+ merge_block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, index, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else {
+ btr_discard_only_page_on_level(index, block, mtr);
+
+ return;
+ }
+
+ page = buf_block_get_frame(block);
+ ut_a(page_is_comp(merge_page) == page_is_comp(page));
+ btr_search_drop_page_hash_index(block);
+
+ if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
+
+ /* We have to mark the leftmost node pointer on the right
+ side page as the predefined minimum record */
+ node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
+
+ ut_ad(page_rec_is_user_rec(node_ptr));
+
+ /* This will make page_zip_validate() fail on merge_page
+ until btr_level_list_remove() completes. This is harmless,
+ because everything will take place within a single
+ mini-transaction and because writing to the redo log
+ is an atomic operation (performed by mtr_commit()). */
+ btr_set_min_rec_mark(node_ptr, mtr);
+ }
+
+ btr_node_ptr_delete(index, block, mtr);
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, index, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* merge_page_zip
+ = buf_block_get_page_zip(merge_block);
+ ut_a(!merge_page_zip
+ || page_zip_validate(merge_page_zip, merge_page, index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (left_page_no != FIL_NULL) {
+ lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
+ block);
+ } else {
+ lock_update_discard(merge_block,
+ lock_get_min_heap_no(merge_block),
+ block);
+ }
+
+ btr_blob_dbg_remove(page, index, "btr_discard_page");
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+}
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+{
+ page_t* root;
+ fseg_header_t* seg;
+ mtr_t mtr;
+
+ if (dict_index_is_ibuf(index)) {
+ fputs("Sorry, cannot print info of an ibuf tree:"
+ " use ibuf functions\n", stderr);
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ root = btr_root_get(index, &mtr);
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+
+ if (!dict_index_is_univ(index)) {
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/************************************************************//**
+Prints recursively index tree pages. */
+static
+void
+btr_print_recursive(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ ulint width, /*!< in: print this many entries from start
+ and end */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page = buf_block_get_frame(block);
+ page_cur_t cursor;
+ ulint n_recs;
+ ulint i = 0;
+ mtr_t mtr2;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
+ (ulong) btr_page_get_level(page, mtr),
+ (ulong) buf_block_get_page_no(block));
+
+ page_print(block, index, width, width);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ while (!page_cur_is_after_last(&cursor)) {
+
+ if (page_is_leaf(page)) {
+
+ /* If this is the leaf level, do nothing */
+
+ } else if ((i <= width) || (i >= n_recs - width)) {
+
+ const rec_t* node_ptr;
+
+ mtr_start(&mtr2);
+
+ node_ptr = page_cur_get_rec(&cursor);
+
+ *offsets = rec_get_offsets(node_ptr, index, *offsets,
+ ULINT_UNDEFINED, heap);
+ btr_print_recursive(index,
+ btr_node_ptr_get_child(node_ptr,
+ index,
+ *offsets,
+ &mtr2),
+ width, heap, offsets, &mtr2);
+ mtr_commit(&mtr2);
+ }
+
+ page_cur_move_to_next(&cursor);
+ i++;
+ }
+}
+
+/**************************************************************//**
+Prints directories and other info of all nodes in the tree. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+{
+ mtr_t mtr;
+ buf_block_t* root;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ fputs("--------------------------\n"
+ "INDEX TREE PRINT\n", stderr);
+
+ mtr_start(&mtr);
+
+ root = btr_root_block_get(index, RW_X_LATCH, &mtr);
+
+ btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ btr_validate_index(index, 0);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ ulint* offsets;
+ btr_cur_t cursor;
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
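+
+	/* The root page has no father whose node pointer could be
+	checked */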
+ if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
+
+ return(TRUE);
+ }
+
+ heap = mem_heap_create(256);
+ offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+ &cursor);
+
+ if (page_is_leaf(page)) {
+
+ goto func_exit;
+ }
+
+ tuple = dict_index_build_node_ptr(
+ index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
+ btr_page_get_level(page, mtr));
+
+ ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
+func_exit:
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Display identification information for a record. */
+static
+void
+btr_index_rec_validate_report(
+/*==========================*/
+ const page_t* page, /*!< in: index page */
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index) /*!< in: index */
+{
+ fputs("InnoDB: Record in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fprintf(stderr, ", page %lu, at offset %lu\n",
+ page_get_page_no(page), (ulint) page_offset(rec));
+}
+
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+{
+ ulint len;
+ ulint n;
+ ulint i;
+ const page_t* page;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = page_align(rec);
+
+ if (dict_index_is_univ(index)) {
+ /* The insert buffer index tree can contain records from any
+ other index: we cannot check the number of fields or
+ their length */
+
+ return(TRUE);
+ }
+
+ if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
+ (ulong) !!page_is_comp(page),
+ (ulong) dict_table_is_comp(index->table));
+
+ return(FALSE);
+ }
+
+ n = dict_index_get_n_fields(index);
+
+ if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) {
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
+ (ulong) rec_get_n_fields_old(rec), (ulong) n);
+
+ if (dump_on_error) {
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_old(stderr, rec);
+ putc('\n', stderr);
+ }
+ return(FALSE);
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < n; i++) {
+ ulint fixed_size = dict_col_get_fixed_size(
+ dict_index_get_nth_col(index, i), page_is_comp(page));
+
+ rec_get_nth_field_offs(offsets, i, &len);
+
+ /* Note that if fixed_size != 0, it equals the
+ length of a fixed-size column in the clustered index.
+ A prefix index of the column is of fixed, but different
+ length. When fixed_size == 0, prefix_len is the maximum
+ length of the prefix index column. */
+
+ if ((dict_index_get_nth_field(index, i)->prefix_len == 0
+ && len != UNIV_SQL_NULL && fixed_size
+ && len != fixed_size)
+ || (dict_index_get_nth_field(index, i)->prefix_len > 0
+ && len != UNIV_SQL_NULL
+ && len
+ > dict_index_get_nth_field(index, i)->prefix_len)) {
+
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr,
+ "InnoDB: field %lu len is %lu,"
+ " should be %lu\n",
+ (ulong) i, (ulong) len, (ulong) fixed_size);
+
+ if (dump_on_error) {
+ buf_page_print(page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(FALSE);
+ }
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/************************************************************//**
+Checks the size and number of fields in records based on the definition of
+the index.
+@return TRUE if ok */
+static
+ibool
+btr_index_page_validate(
+/*====================*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index) /*!< in: index */
+{
+ page_cur_t cur;
+ ibool ret = TRUE;
+#ifndef DBUG_OFF
+ ulint nth = 1;
+#endif /* !DBUG_OFF */
+
+ page_cur_set_before_first(block, &cur);
+
+ /* Directory slot 0 should only contain the infimum record. */
+ DBUG_EXECUTE_IF("check_table_rec_next",
+ ut_a(page_rec_get_nth_const(
+ page_cur_get_page(&cur), 0)
+ == cur.rec);
+ ut_a(page_dir_slot_get_n_owned(
+ page_dir_get_nth_slot(
+ page_cur_get_page(&cur), 0))
+ == 1););
+
+ page_cur_move_to_next(&cur);
+
+ for (;;) {
+ if (page_cur_is_after_last(&cur)) {
+
+ break;
+ }
+
+ if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
+
+ return(FALSE);
+ }
+
+ /* Verify that page_rec_get_nth_const() is correctly
+ retrieving each record. */
+ DBUG_EXECUTE_IF("check_table_rec_next",
+ ut_a(cur.rec == page_rec_get_nth_const(
+ page_cur_get_page(&cur),
+ page_rec_get_n_recs_before(
+ cur.rec)));
+ ut_a(nth++ == page_rec_get_n_recs_before(
+ cur.rec)););
+
+ page_cur_move_to_next(&cur);
+ }
+
+ return(ret);
+}
+
+/************************************************************//**
+Report an error on one page of an index tree. */
+static
+void
+btr_validate_report1(
+/*=================*/
+	const dict_index_t*	index,	/*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block) /*!< in: index page */
+{
+ fprintf(stderr, "InnoDB: Error in page %lu of ",
+ buf_block_get_page_no(block));
+ dict_index_name_print(stderr, NULL, index);
+ if (level) {
+ fprintf(stderr, ", index tree level %lu", level);
+ }
+ putc('\n', stderr);
+}
+
+/************************************************************//**
+Report an error on two pages of an index tree. */
+static
+void
+btr_validate_report2(
+/*=================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block1, /*!< in: first index page */
+ const buf_block_t* block2) /*!< in: second index page */
+{
+ fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
+ buf_block_get_page_no(block1),
+ buf_block_get_page_no(block2));
+ dict_index_name_print(stderr, NULL, index);
+ if (level) {
+ fprintf(stderr, ", index tree level %lu", level);
+ }
+ putc('\n', stderr);
+}
+
+/************************************************************//**
+Validates index tree level.
+@return true if ok */
+static
+bool
+btr_validate_level(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ const trx_t* trx, /*!< in: transaction or NULL */
+ ulint level) /*!< in: level number */
+{
+ ulint space;
+ ulint space_flags;
+ ulint zip_size;
+ buf_block_t* block;
+ page_t* page;
+ buf_block_t* right_block = 0; /* remove warning */
+ page_t* right_page = 0; /* remove warning */
+ page_t* father_page;
+ btr_cur_t node_cur;
+ btr_cur_t right_node_cur;
+ rec_t* rec;
+ ulint right_page_no;
+ ulint left_page_no;
+ page_cur_t cursor;
+ dtuple_t* node_ptr_tuple;
+ bool ret = true;
+ mtr_t mtr;
+ mem_heap_t* heap = mem_heap_create(256);
+ fseg_header_t* seg;
+ ulint* offsets = NULL;
+ ulint* offsets2= NULL;
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip;
+#endif /* UNIV_ZIP_DEBUG */
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ block = btr_root_block_get(index, RW_X_LATCH, &mtr);
+ page = buf_block_get_frame(block);
+ seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ fil_space_get_latch(space, &space_flags);
+
+ if (zip_size != dict_tf_get_zip_size(space_flags)) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Flags mismatch: table=%lu, tablespace=%lu",
+ (ulint) index->table->flags, (ulint) space_flags);
+
+ mtr_commit(&mtr);
+
+ return(false);
+ }
+
+ while (level != btr_page_get_level(page, &mtr)) {
+ const rec_t* node_ptr;
+
+ if (fseg_page_is_free(seg,
+ block->page.space, block->page.offset)) {
+
+ btr_validate_report1(index, level, block);
+
+ ib_logf(IB_LOG_LEVEL_WARN, "page is free");
+
+ ret = false;
+ }
+
+ ut_a(space == buf_block_get_space(block));
+ ut_a(space == page_get_space_id(page));
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(!page_is_leaf(page));
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ node_ptr = page_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
+ page = buf_block_get_frame(block);
+ }
+
+ /* Now we are on the desired level. Loop through the pages on that
+ level. */
+
+ if (level == 0) {
+ /* Leaf pages are managed in their own file segment. */
+ seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF;
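+		/* This rewinds seg from the PAGE_BTR_SEG_TOP header to
+		the PAGE_BTR_SEG_LEAF header on the root page */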
+ }
+
+loop:
+ mem_heap_empty(heap);
+ offsets = offsets2 = NULL;
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_a(block->page.space == space);
+
+ if (fseg_page_is_free(seg, block->page.space, block->page.offset)) {
+
+ btr_validate_report1(index, level, block);
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free");
+ ret = false;
+
+ } else if (btr_page_get_index_id(page) != index->id) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Page index id " IB_ID_FMT " != data dictionary "
+ "index id " IB_ID_FMT,
+ btr_page_get_index_id(page), index->id);
+
+ ret = false;
+
+ } else if (!page_validate(page, index)) {
+
+ btr_validate_report1(index, level, block);
+ ret = false;
+
+ } else if (level == 0 && !btr_index_page_validate(block, index)) {
+
+		/* We are on level 0. Check that the records have the
+		right number of fields and that the field lengths are
+		correct. */
+
+ ret = false;
+ }
+
+ ut_a(btr_page_get_level(page, &mtr) == level);
+
+ right_page_no = btr_page_get_next(page, &mtr);
+ left_page_no = btr_page_get_prev(page, &mtr);
+
+ ut_a(!page_is_empty(page)
+ || (level == 0
+ && page_get_page_no(page) == dict_index_get_page(index)));
+
+ if (right_page_no != FIL_NULL) {
+ const rec_t* right_rec;
+ right_block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, index, &mtr);
+ right_page = buf_block_get_frame(right_block);
+ if (btr_page_get_prev(right_page, &mtr)
+ != page_get_page_no(page)) {
+
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT"
+ " or FIL_PAGE_PREV links\n", stderr);
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ ret = false;
+ }
+
+ if (page_is_comp(right_page) != page_is_comp(page)) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: 'compact' flag mismatch\n", stderr);
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ right_rec = page_rec_get_next(page_get_infimum_rec(
+ right_page));
+ offsets = rec_get_offsets(rec, index,
+ offsets, ULINT_UNDEFINED, &heap);
+ offsets2 = rec_get_offsets(right_rec, index,
+ offsets2, ULINT_UNDEFINED, &heap);
+ if (cmp_rec_rec(rec, right_rec, offsets, offsets2,
+ index) >= 0) {
+
+ btr_validate_report2(index, level, block, right_block);
+
+ fputs("InnoDB: records in wrong order"
+ " on adjacent pages\n", stderr);
+
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(right_page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ ret = false;
+ }
+ }
+
+ if (level > 0 && left_page_no == FIL_NULL) {
+ ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page)));
+ }
+
+ if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+
+ /* Check father node pointers */
+
+ rec_t* node_ptr;
+
+ offsets = btr_page_get_father_block(offsets, heap, index,
+ block, &mtr, &node_cur);
+ father_page = btr_cur_get_page(&node_cur);
+ node_ptr = btr_cur_get_rec(&node_cur);
+
+ btr_cur_position(
+ index, page_rec_get_prev(page_get_supremum_rec(page)),
+ block, &node_cur);
+ offsets = btr_page_get_father_node_ptr(offsets, heap,
+ &node_cur, &mtr);
+
+ if (node_ptr != btr_cur_get_rec(&node_cur)
+ || btr_node_ptr_get_child_page_no(node_ptr, offsets)
+ != buf_block_get_page_no(block)) {
+
+ btr_validate_report1(index, level, block);
+
+ fputs("InnoDB: node pointer to the page is wrong\n",
+ stderr);
+
+ buf_page_print(father_page, 0, BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: node ptr ", stderr);
+ rec_print(stderr, node_ptr, index);
+
+ rec = btr_cur_get_rec(&node_cur);
+ fprintf(stderr, "\n"
+ "InnoDB: node ptr child page n:o %lu\n",
+ (ulong) btr_node_ptr_get_child_page_no(
+ rec, offsets));
+
+ fputs("InnoDB: record on page ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+
+ if (!page_is_leaf(page)) {
+ node_ptr_tuple = dict_index_build_node_ptr(
+ index,
+ page_rec_get_next(page_get_infimum_rec(page)),
+ 0, heap, btr_page_get_level(page, &mtr));
+
+ if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
+ offsets)) {
+ const rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ btr_validate_report1(index, level, block);
+
+ buf_page_print(father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fputs("InnoDB: Error: node ptrs differ"
+ " on levels > 0\n"
+ "InnoDB: node ptr ", stderr);
+ rec_print_new(stderr, node_ptr, offsets);
+ fputs("InnoDB: first rec ", stderr);
+ rec_print(stderr, first_rec, index);
+ putc('\n', stderr);
+ ret = false;
+
+ goto node_ptr_fails;
+ }
+ }
+
+ if (left_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_next(
+ page_get_infimum_rec(father_page)));
+ ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
+ }
+
+ if (right_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_prev(
+ page_get_supremum_rec(father_page)));
+ ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
+ } else {
+ const rec_t* right_node_ptr
+ = page_rec_get_next(node_ptr);
+
+ offsets = btr_page_get_father_block(
+ offsets, heap, index, right_block,
+ &mtr, &right_node_cur);
+ if (right_node_ptr
+ != page_get_supremum_rec(father_page)) {
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != right_node_ptr) {
+ ret = false;
+ fputs("InnoDB: node pointer to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(
+ father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ right_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ }
+ } else {
+ page_t* right_father_page
+ = btr_cur_get_page(&right_node_cur);
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != page_rec_get_next(
+ page_get_infimum_rec(
+ right_father_page))) {
+ ret = false;
+ fputs("InnoDB: node pointer 2 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(
+ father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ right_father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ right_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ }
+
+ if (page_get_page_no(right_father_page)
+ != btr_page_get_next(father_page, &mtr)) {
+
+ ret = false;
+ fputs("InnoDB: node pointer 3 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(
+ father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ right_father_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(
+ right_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ }
+ }
+ }
+ }
+
+node_ptr_fails:
+ /* Commit the mini-transaction to release the latch on 'page'.
+ Re-acquire the latch on right_page, which will become 'page'
+ on the next loop. The page has already been checked. */
+ mtr_commit(&mtr);
+
+ if (trx_is_interrupted(trx)) {
+ /* On interrupt, return the current status. */
+ } else if (right_page_no != FIL_NULL) {
+
+ mtr_start(&mtr);
+
+ block = btr_block_get(
+ space, zip_size, right_page_no,
+ RW_X_LATCH, index, &mtr);
+
+ page = buf_block_get_frame(block);
+
+ goto loop;
+ }
+
+ mem_heap_free(heap);
+
+ return(ret);
+}
+
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return true if ok */
+UNIV_INTERN
+bool
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or NULL */
+{
+	/* Full-text indexes are implemented by auxiliary tables,
+	not by the B-tree */
+ if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+ return(true);
+ }
+
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ bool ok = true;
+ page_t* root = btr_root_get(index, &mtr);
+ ulint n = btr_page_get_level(root, &mtr);
+
+ for (ulint i = 0; i <= n; ++i) {
+
+ if (!btr_validate_level(index, trx, n - i)) {
+ ok = false;
+ break;
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ return(ok);
+}
+
+/**************************************************************//**
+Checks if the page in the cursor can be merged with the given page.
+If necessary, reorganizes the merge page.
+@return TRUE if the pages can be merged */
+UNIV_INTERN
+ibool
+btr_can_merge_with_page(
+/*====================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge */
+ ulint page_no, /*!< in: a sibling page */
+ buf_block_t** merge_block, /*!< out: the merge block */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ dict_index_t* index;
+ page_t* page;
+ ulint space;
+ ulint zip_size;
+ ulint n_recs;
+ ulint data_size;
+ ulint max_ins_size_reorg;
+ ulint max_ins_size;
+ buf_block_t* mblock;
+ page_t* mpage;
+ DBUG_ENTER("btr_can_merge_with_page");
+
+ if (page_no == FIL_NULL) {
+ goto error;
+ }
+
+ index = btr_cur_get_index(cursor);
+ page = btr_cur_get_page(cursor);
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ mblock = btr_block_get(space, zip_size, page_no, RW_X_LATCH, index,
+ mtr);
+ mpage = buf_block_get_frame(mblock);
+
+ n_recs = page_get_n_recs(page);
+ data_size = page_get_data_size(page);
+
+ max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
+ mpage, n_recs);
+
+ if (data_size > max_ins_size_reorg) {
+ goto error;
+ }
+
+	/* If the compression padding heuristic indicates that merging
+	would make the page too densely packed, i.e. likely to cause a
+	compression failure, then do not merge the pages. */
+ if (zip_size && page_is_leaf(mpage)
+ && (page_get_data_size(mpage) + data_size
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+
+ goto error;
+ }
+
+ max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+ if (data_size > max_ins_size) {
+
+ /* We have to reorganize mpage */
+
+ if (!btr_page_reorganize_block(
+ false, page_zip_level, mblock, index, mtr)) {
+
+ goto error;
+ }
+
+ max_ins_size = page_get_max_insert_size(mpage, n_recs);
+
+ ut_ad(page_validate(mpage, index));
+ ut_ad(max_ins_size == max_ins_size_reorg);
+
+ if (data_size > max_ins_size) {
+
+ /* Add fault tolerance, though this should
+ never happen */
+
+ goto error;
+ }
+ }
+
+ *merge_block = mblock;
+ DBUG_RETURN(TRUE);
+
+error:
+ *merge_block = NULL;
+ DBUG_RETURN(FALSE);
+}
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
new file mode 100644
index 00000000000..1611fb6394c
--- /dev/null
+++ b/storage/innobase/btr/btr0cur.cc
@@ -0,0 +1,5644 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0cur.cc
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+for every modify or insert of a clustered index record.
+
+ NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve 2 x the height of the index tree
+in pages in the tablespace before we start the operation, because
+if leaf splitting has been started, it is difficult to undo, except
+by crashing the database and doing a roll-forward.
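+(For example, if the tree height is 3, at least 2 x 3 = 6 pages must be
+reserved.)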
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0cur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#include "row0upd.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "buf0lru.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "row0log.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "trx0roll.h" /* trx_is_recv() */
+#include "que0que.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "zlib.h"
+
+/** Buffered B-tree operation types, introduced as part of delete buffering. */
+enum btr_op_t {
+ BTR_NO_OP = 0, /*!< Not buffered */
+ BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */
+ BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */
+ BTR_DELETE_OP, /*!< Purge a delete-marked record */
+ BTR_DELMARK_OP /*!< Mark a record for deletion */
+};
+
+#ifdef UNIV_DEBUG
+/** If the following is set to TRUE, this module prints a lot of
+trace information about individual record operations */
+UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
+#endif /* UNIV_DEBUG */
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint btr_cur_n_non_sea = 0;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint btr_cur_n_sea = 0;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint btr_cur_n_sea_old = 0;
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+UNIV_INTERN uint btr_cur_limit_optimistic_insert_debug = 0;
+#endif /* UNIV_DEBUG */
+
+/** During an optimistic insert, if the record does not fit but at least
+this much space can be released by reorganizing the page, the page is
+reorganized */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
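+/* (Illustration, assuming the default 16 KiB page size; UNIV_PAGE_SIZE is
+configurable: 16384 / 32 = 512 bytes.) */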
+
+/** The structure of a BLOB part header */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
+ part header, in bytes */
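+
+/* Thus a BLOB part page stores, at the part header: bytes 0..3 the length
+of the BLOB data on this page, bytes 4..7 the page number of the next part
+(or FIL_NULL if there is none), followed by the BLOB data itself. */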
+
+/** Estimates table-level stats from a sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table-wide stats from the sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty)\
+ (((value) * (ib_int64_t) index->stat_n_leaf_pages \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
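+/* Worked example with illustrative numbers: value = 100, sample = 10,
+index->stat_n_leaf_pages = 1000, ext_size = 0 and not_empty = 1 gives
+(100 * 1000 + 10 - 1 + 0 + 1) / (10 + 0) = 10001 as the table-wide
+estimate. */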
+
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** A BLOB field reference full of zero bytes, for use in assertions and
+tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE] = {
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0,
+};
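+
+/* The 20 zero bytes match BTR_EXTERN_FIELD_REF_SIZE: a field reference
+stores the space id, page number and byte offset of the first BLOB part,
+followed by the 8-byte externally stored length, whose most significant
+bits carry the ownership and inheritance flags. */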
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a record that
+is not delete-marked always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height); /*!< in: root node height in tree */
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in: record */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+The following function is used to set the deleted bit of a record. */
+UNIV_INLINE
+void
+btr_rec_set_deleted_flag(
+/*=====================*/
+ rec_t* rec, /*!< in/out: physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page (or NULL) */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ if (page_rec_is_comp(rec)) {
+ rec_set_deleted_flag_new(rec, page_zip, flag);
+ } else {
+ ut_ad(!page_zip);
+ rec_set_deleted_flag_old(rec, flag);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*==================== B-TREE SEARCH =========================*/
+
+/********************************************************************//**
+Latches the leaf page or pages requested. */
+static
+void
+btr_cur_latch_leaves(
+/*=================*/
+ page_t* page, /*!< in: leaf page where the search
+ converged */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the leaf */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in: cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint mode;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* get_block;
+
+ ut_ad(page && mtr);
+
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
+ get_block = btr_block_get(
+ space, zip_size, page_no, mode, cursor->index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ return;
+ case BTR_MODIFY_TREE:
+ /* x-latch also brothers from left to right */
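+		/* (Latching the siblings in a fixed left-to-right order
+		avoids deadlocks between concurrent tree-modifying
+		operations.) */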
+ left_page_no = btr_page_get_prev(page, mtr);
+
+ if (left_page_no != FIL_NULL) {
+ get_block = btr_block_get(
+ space, zip_size, left_page_no,
+ RW_X_LATCH, cursor->index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_next(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ get_block = btr_block_get(
+ space, zip_size, page_no,
+ RW_X_LATCH, cursor->index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+
+ right_page_no = btr_page_get_next(page, mtr);
+
+ if (right_page_no != FIL_NULL) {
+ get_block = btr_block_get(
+ space, zip_size, right_page_no,
+ RW_X_LATCH, cursor->index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_prev(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ return;
+
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
+ /* latch also left brother */
+ left_page_no = btr_page_get_prev(page, mtr);
+
+ if (left_page_no != FIL_NULL) {
+ get_block = btr_block_get(
+ space, zip_size,
+ left_page_no, mode, cursor->index, mtr);
+ cursor->left_block = get_block;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_next(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ get_block = btr_block_get(
+ space, zip_size, page_no, mode, cursor->index, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ return;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then both
+cursor->up_match and cursor->low_match will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
+the search tuple should be performed in the B-tree. InnoDB inserts
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
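+
+For example (illustrative only): searching for key 5 with PAGE_CUR_LE in a
+leaf containing user records (3, 7) leaves the cursor on 3; an insert then
+places 5 immediately after the cursor, giving (3, 5, 7). */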
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ Inserts should always be made using
+ PAGE_CUR_LE to search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if has_search_latch
+ is != 0, we maybe do not have a latch set
+ on the cursor page, we assume
+ the caller uses his search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: info on the latch mode the
+ caller currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ buf_block_t* block;
+ ulint space;
+ buf_block_t* guess;
+ ulint height;
+ ulint page_no;
+ ulint up_match;
+ ulint up_bytes;
+ ulint low_match;
+ ulint low_bytes;
+ ulint savepoint;
+ ulint rw_latch;
+ ulint page_mode;
+ ulint buf_mode;
+ ulint estimate;
+ ulint zip_size;
+ page_cur_t* page_cursor;
+ btr_op_t btr_op;
+ ulint root_height = 0; /* remove warning */
+
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* info;
+#endif
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+	/* Currently, PAGE_CUR_LE is the only search mode used for searches
+	ending at the upper levels */
+
+ ut_ad(level == 0 || mode == PAGE_CUR_LE);
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->page != FIL_NULL);
+
+ UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
+ UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
+ UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
+ UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
+#ifdef UNIV_DEBUG
+ cursor->up_match = ULINT_UNDEFINED;
+ cursor->low_match = ULINT_UNDEFINED;
+#endif
+
+ ibool s_latch_by_caller;
+
+ s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
+
+ ut_ad(!s_latch_by_caller
+ || mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set. */
+ switch (UNIV_EXPECT(latch_mode
+ & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
+ 0)) {
+ case 0:
+ btr_op = BTR_NO_OP;
+ break;
+ case BTR_INSERT:
+ btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
+ ? BTR_INSERT_IGNORE_UNIQUE_OP
+ : BTR_INSERT_OP;
+ break;
+ case BTR_DELETE:
+ btr_op = BTR_DELETE_OP;
+ ut_a(cursor->purge_node);
+ break;
+ case BTR_DELETE_MARK:
+ btr_op = BTR_DELMARK_OP;
+ break;
+ default:
+ /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
+ should be specified at a time */
+ ut_error;
+ }
+
+ /* Operations on the insert buffer tree cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
+ /* Operations on the clustered index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
+
+ estimate = latch_mode & BTR_ESTIMATE;
+
+ /* Turn the flags unrelated to the latch mode off. */
+ latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ ut_ad(!s_latch_by_caller
+ || latch_mode == BTR_SEARCH_LEAF
+ || latch_mode == BTR_MODIFY_LEAF);
+
+ cursor->flag = BTR_CUR_BINARY;
+ cursor->index = index;
+
+#ifndef BTR_CUR_ADAPT
+ guess = NULL;
+#else
+ info = btr_search_get_info(index);
+
+ guess = info->root_guess;
+
+#ifdef BTR_CUR_HASH_ADAPT
+
+# ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+# endif
+ if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
+ && latch_mode <= BTR_MODIFY_LEAF
+ && info->last_hash_succ
+ && !estimate
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ && mode != PAGE_CUR_LE_OR_EXTENDS
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ /* If !has_search_latch, we do a dirty read of
+ btr_search_enabled below, and btr_search_guess_on_hash()
+ will have to check it again. */
+ && UNIV_LIKELY(btr_search_enabled)
+ && btr_search_guess_on_hash(index, info, tuple, mode,
+ latch_mode, cursor,
+ has_search_latch, mtr)) {
+
+ /* Search using the hash index succeeded */
+
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ btr_cur_n_sea++;
+
+ return;
+ }
+# endif /* BTR_CUR_HASH_ADAPT */
+#endif /* BTR_CUR_ADAPT */
+ btr_cur_n_non_sea++;
+
+ /* If the hash search did not succeed, do binary search down the
+ tree */
+
+ if (has_search_latch) {
+ /* Release possible search latch to obey latching order */
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
+
+ savepoint = mtr_set_savepoint(mtr);
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ break;
+ case BTR_CONT_MODIFY_TREE:
+ /* Do nothing */
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ break;
+ default:
+ if (!s_latch_by_caller) {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ space = dict_index_get_space(index);
+ page_no = dict_index_get_page(index);
+
+ up_match = 0;
+ up_bytes = 0;
+ low_match = 0;
+ low_bytes = 0;
+
+ height = ULINT_UNDEFINED;
+
+ /* We use these modified search modes on non-leaf levels of the
+ B-tree. These let us end up in the right B-tree leaf. In that leaf
+ we use the original search mode. */
+
+ switch (mode) {
+ case PAGE_CUR_GE:
+ page_mode = PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ page_mode = PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ page_mode = mode;
+ break;
+ }
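+
+	/* (Rationale: a node pointer record carries the smallest key of its
+	child page. E.g. for a PAGE_CUR_GE search for key 5 over node
+	pointers (1, 4, 8), PAGE_CUR_L selects the pointer 4, whose subtree
+	covers keys in [4, 8) and therefore contains the position for
+	key 5.) */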
+
+ /* Loop and search until we arrive at the desired level */
+
+search_loop:
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
+
+ if (height != 0) {
+ /* We are about to fetch the root or a non-leaf page. */
+ } else if (latch_mode <= BTR_MODIFY_LEAF) {
+ rw_latch = latch_mode;
+
+ if (btr_op != BTR_NO_OP
+ && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
+
+ /* Try to buffer the operation if the leaf
+ page is not in the buffer pool. */
+
+ buf_mode = btr_op == BTR_DELETE_OP
+ ? BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ }
+ }
+
+ zip_size = dict_table_zip_size(index->table);
+
+retry_page_get:
+ block = buf_page_get_gen(
+ space, zip_size, page_no, rw_latch, guess, buf_mode,
+ file, line, mtr);
+
+ if (block == NULL) {
+		/* This must be a search to perform an insert, delete-mark,
+		or delete; try using the insert/delete buffer */
+
+ ut_ad(height == 0);
+ ut_ad(cursor->thr);
+
+ switch (btr_op) {
+ case BTR_INSERT_OP:
+ case BTR_INSERT_IGNORE_UNIQUE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
+ space, zip_size, page_no,
+ cursor->thr)) {
+
+ cursor->flag = BTR_CUR_INSERT_TO_IBUF;
+
+ goto func_exit;
+ }
+ break;
+
+ case BTR_DELMARK_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
+ index, space, zip_size,
+ page_no, cursor->thr)) {
+
+ cursor->flag = BTR_CUR_DEL_MARK_IBUF;
+
+ goto func_exit;
+ }
+
+ break;
+
+ case BTR_DELETE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+
+ if (!row_purge_poss_sec(cursor->purge_node,
+ index, tuple)) {
+
+ /* The record cannot be purged yet. */
+ cursor->flag = BTR_CUR_DELETE_REF;
+ } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
+ index, space, zip_size,
+ page_no,
+ cursor->thr)) {
+
+ /* The purge was buffered. */
+ cursor->flag = BTR_CUR_DELETE_IBUF;
+ } else {
+ /* The purge could not be buffered. */
+ buf_pool_watch_unset(space, page_no);
+ break;
+ }
+
+ buf_pool_watch_unset(space, page_no);
+ goto func_exit;
+
+ default:
+ ut_error;
+ }
+
+	/* Insert to the insert/delete buffer did not succeed; we
+	must read the page from disk. */
+
+ buf_mode = BUF_GET;
+
+ goto retry_page_get;
+ }
+
+ block->check_index_page_at_flush = TRUE;
+ page = buf_block_get_frame(block);
+
+ if (rw_latch != RW_NO_LATCH) {
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ buf_block_dbg_add_level(
+ block, dict_index_is_ibuf(index)
+ ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
+ }
+
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ root_height = height;
+ cursor->tree_height = root_height + 1;
+
+#ifdef BTR_CUR_ADAPT
+ if (block != guess) {
+ info->root_guess = block;
+ }
+#endif
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH) {
+
+ btr_cur_latch_leaves(
+ page, space, zip_size, page_no, latch_mode,
+ cursor, mtr);
+ }
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ break;
+ default:
+ if (!s_latch_by_caller) {
+ /* Release the tree s-latch */
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+ }
+
+ page_mode = mode;
+ }
+
+ page_cur_search_with_match(
+ block, index, tuple, page_mode, &up_match, &up_bytes,
+ &low_match, &low_bytes, page_cursor);
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ /* If this is the desired level, leave the loop */
+
+ ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
+ mtr));
+
+ if (level != height) {
+
+ const rec_t* node_ptr;
+ ut_ad(height > 0);
+
+ height--;
+ guess = NULL;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+
+ offsets = rec_get_offsets(
+ node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
+
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+
+ if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
+ /* We're doing a search on an ibuf tree and we're one
+ level above the leaf page. */
+
+ ut_ad(level == 0);
+
+ buf_mode = BUF_GET;
+ rw_latch = RW_NO_LATCH;
+ goto retry_page_get;
+ }
+
+ goto search_loop;
+ }
+
+ if (level != 0) {
+ /* x-latch the page */
+ buf_block_t* child_block = btr_block_get(
+ space, zip_size, page_no, RW_X_LATCH, index, mtr);
+
+ page = buf_block_get_frame(child_block);
+ btr_assert_not_corrupted(child_block, index);
+ } else {
+ cursor->low_match = low_match;
+ cursor->low_bytes = low_bytes;
+ cursor->up_match = up_match;
+ cursor->up_bytes = up_bytes;
+
+#ifdef BTR_CUR_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We
+ will properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a
+ page hash index, while holding btr_search_latch. */
+ if (btr_search_enabled) {
+ btr_search_info_update(index, cursor);
+ }
+#endif
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ }
+
+func_exit:
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (has_search_latch) {
+
+ rw_lock_s_lock(&btr_search_latch);
+ }
+}
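+
+/* Illustrative usage sketch (not part of the original source; shown only
+to clarify the API): positioning a cursor for an optimistic leaf insert
+might look like
+
+	btr_cur_t	cursor;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				    BTR_MODIFY_LEAF, &cursor, 0,
+				    __FILE__, __LINE__, &mtr);
+	... insert next to the cursor ...
+	mtr_commit(&mtr);
+*/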
+
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ ulint level, /*!< in: level to search for
+ (0=leaf). */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_cur_t* page_cursor;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint height;
+ ulint root_height = 0; /* remove warning */
+ rec_t* node_ptr;
+ ulint estimate;
+ ulint savepoint;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ estimate = latch_mode & BTR_ESTIMATE;
+ latch_mode &= ~BTR_ESTIMATE;
+
+ ut_ad(level != ULINT_UNDEFINED);
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched the leaf node */
+
+ savepoint = mtr_set_savepoint(mtr);
+
+ switch (latch_mode) {
+ case BTR_CONT_MODIFY_TREE:
+ break;
+ case BTR_MODIFY_TREE:
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ break;
+ case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
+ case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+ break;
+ default:
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ page_no = dict_index_get_page(index);
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+ block = buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ page = buf_block_get_frame(block);
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ block->check_index_page_at_flush = TRUE;
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ root_height = height;
+ ut_a(height >= level);
+ } else {
+ /* TODO: flag the index corrupted if this fails */
+ ut_ad(height == btr_page_get_level(page, mtr));
+ }
+
+ if (height == level) {
+ btr_cur_latch_leaves(
+ page, space, zip_size, page_no,
+ latch_mode & ~BTR_ALREADY_S_LATCHED,
+ cursor, mtr);
+
+ if (height == 0) {
+ /* In versions <= 3.23.52 we had
+ forgotten to release the tree latch
+ here. If in an index scan we had to
+ scan far to find a record visible to
+ the current transaction, that could
+ starve others waiting for the tree
+ latch. */
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ case BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED:
+ case BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED:
+ break;
+ default:
+ /* Release the tree s-latch */
+
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+ }
+ }
+
+ if (from_left) {
+ page_cur_set_before_first(block, page_cursor);
+ } else {
+ page_cur_set_after_last(block, page_cursor);
+ }
+
+ if (height == level) {
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height,
+ root_height);
+ }
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ if (from_left) {
+ page_cur_move_to_next(page_cursor);
+ } else {
+ page_cur_move_to_prev(page_cursor);
+ }
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint height;
+ rec_t* node_ptr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ break;
+ default:
+ ut_ad(latch_mode != BTR_CONT_MODIFY_TREE);
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ page_no = dict_index_get_page(index);
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+
+ block = buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ page = buf_block_get_frame(block);
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ }
+
+ if (height == 0) {
+ btr_cur_latch_leaves(page, space, zip_size, page_no,
+ latch_mode, cursor, mtr);
+ }
+
+ page_cur_open_on_rnd_user_rec(block, page_cursor);
+
+ if (height == 0) {
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/*==================== B-TREE INSERT =========================*/
+
+/*************************************************************//**
+Inserts a record if there is enough space, or if enough space can
+be freed by reorganizing. Differs from btr_cur_optimistic_insert in that
+no heuristic is applied as to whether it pays to use CPU time for
+reorganizing the page or not.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the inserted record if the insert succeeds, else NULL */
+static __attribute__((nonnull, warn_unused_result))
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
+ have been stored to tuple */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_cur_t* page_cursor;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Now, try the insert */
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+
+ /* If the record did not fit, reorganize.
+ For compressed pages, page_cur_tuple_insert()
+ attempted this already. */
+ if (!rec && !page_cur_get_page_zip(page_cursor)
+ && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+ rec = page_cur_tuple_insert(
+ page_cursor, tuple, cursor->index,
+ offsets, heap, n_ext, mtr);
+ }
+
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ return(rec);
+}
+
+/*************************************************************//**
+For an insert, checks the locks and does the undo logging if desired.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+	ibool*		inherit)/*!< out: TRUE if the newly inserted record
+				should possibly inherit LOCK_GAP type locks
+				from the successor record */
+{
+ dict_index_t* index;
+ dberr_t err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr;
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+
+ err = lock_rec_insert_check_and_lock(flags, rec,
+ btr_cur_get_block(cursor),
+ index, thr, mtr, inherit);
+
+ if (err != DB_SUCCESS
+ || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
+
+ return(err);
+ }
+
+ err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
+ thr, index, entry,
+ NULL, 0, NULL, NULL,
+ &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Now we can fill in the roll ptr field in entry */
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+
+ row_upd_index_entry_sys_field(entry, index,
+ DATA_ROLL_PTR, roll_ptr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Report information about a transaction. */
+static
+void
+btr_cur_trx_report(
+/*===============*/
+ trx_id_t trx_id, /*!< in: transaction id */
+ const dict_index_t* index, /*!< in: index */
+ const char* op) /*!< in: operation */
+{
+ fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ", trx_id);
+ fputs(op, stderr);
+ dict_index_name_print(stderr, NULL, index);
+ putc('\n', stderr);
+}
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ page_t* page;
+ rec_t* dummy;
+ ibool leaf;
+ ibool reorg;
+ ibool inherit = TRUE;
+ ulint zip_size;
+ ulint rec_size;
+ dberr_t err;
+
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = cursor->index;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(dtuple_check_typed(entry));
+
+ zip_size = buf_block_get_zip_size(block);
+#ifdef UNIV_DEBUG_VALGRIND
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr)->id, index, "insert ");
+ dtuple_print(stderr, entry);
+ }
+#endif /* UNIV_DEBUG */
+
+ leaf = page_is_leaf(page);
+
+ /* Calculate the record size when entry is converted to a record */
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+
+ if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+ dtuple_get_n_fields(entry), zip_size)) {
+
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+ }
+
+ if (zip_size) {
+ /* Estimate the free space of an empty compressed page.
+ Subtract one byte for the encoded heap_no in the
+ modification log. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index->n_fields, zip_size);
+ ulint n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ ut_ad(dict_table_is_comp(index->table));
+
+ if (free_space_zip == 0) {
+too_big:
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(
+ index, entry, big_rec_vec);
+ }
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ /* Subtract one byte for the encoded heap_no in the
+ modification log. */
+ free_space_zip--;
+
+ /* There should be enough room for two node pointer
+ records on an empty non-leaf page. This prevents
+ infinite page splits. */
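+		/* (With REC_NODE_PTR_SIZE = 4 and REC_N_NEW_EXTRA_BYTES = 5,
+		the check below requires each node pointer record, adjusted
+		for its 2-byte dense page directory slot, to fit into half
+		of the empty compressed page.) */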
+
+ if (entry->n_fields >= n_uniq
+ && (REC_NODE_PTR_SIZE
+ + rec_get_converted_size_comp_prefix(
+ index, entry->fields, n_uniq, NULL)
+ /* On a compressed page, there is
+ a two-byte entry in the dense
+ page directory for every record.
+ But there is no record header. */
+ - (REC_N_NEW_EXTRA_BYTES - 2)
+ > free_space_zip / 2)) {
+ goto too_big;
+ }
+ }
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
+ goto fail);
+
+ if (leaf && zip_size
+ && (page_get_data_size(page) + rec_size
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+		/* If the compression padding heuristic indicates that the
+		insertion would make the page too densely packed, i.e.
+		likely to cause a compression failure, then do not attempt
+		an optimistic insertion. */
+fail:
+ err = DB_FAIL;
+fail_err:
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
+
+ if (page_has_garbage(page)) {
+ if ((max_size < rec_size
+ || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
+ && page_get_n_recs(page) > 1
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+ } else if (max_size < rec_size) {
+ goto fail;
+ }
+
+ /* If there have been many consecutive inserts to the
+ clustered index leaf page of an uncompressed table, check if
+ we have to split the page to reserve enough free space for
+ future updates of records. */
+
+ if (leaf && !zip_size && dict_index_is_clust(index)
+ && page_get_n_recs(page) >= 2
+ && dict_index_get_space_reserve() + rec_size > max_size
+ && (btr_page_get_split_rec_to_right(cursor, &dummy)
+ || btr_page_get_split_rec_to_left(cursor, &dummy))) {
+ goto fail;
+ }
+
+ /* Check locks and write to the undo log, if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ goto fail_err;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Now, try the insert */
+
+ {
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+ *rec = page_cur_tuple_insert(page_cursor, entry, index,
+ offsets, heap, n_ext, mtr);
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+ }
+
+ if (*rec) {
+ } else if (zip_size) {
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing. */
+ if (leaf && !dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(block);
+ }
+
+ goto fail;
+ } else {
+ ut_ad(!reorg);
+
+ /* If the record did not fit, reorganize */
+ if (!btr_page_reorganize(page_cursor, index, mtr)) {
+ ut_ad(0);
+ goto fail;
+ }
+
+ ut_ad(page_get_max_insert_size(page, 1) == max_size);
+
+ reorg = TRUE;
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, index,
+ offsets, heap, n_ext, mtr);
+
+ if (UNIV_UNLIKELY(!*rec)) {
+ fputs("InnoDB: Error: cannot insert tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs(" into ", stderr);
+ dict_index_name_print(stderr, thr_get_trx(thr), index);
+ fprintf(stderr, "\nInnoDB: max insert size %lu\n",
+ (ulong) max_size);
+ ut_error;
+ }
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
+ btr_search_update_hash_node_on_insert(cursor);
+ } else {
+ btr_search_update_hash_on_insert(cursor);
+ }
+#endif
+
+ if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
+
+ lock_update_insert(block, *rec);
+ }
+
+ if (leaf && !dict_index_is_clust(index)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (zip_size) {
+ /* Update the bits in the same mini-transaction. */
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ /* Decrement the bits in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(
+ block, max_size,
+ rec_size + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+on the brothers of the page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dict_index_t* index = cursor->index;
+ ulint zip_size = dict_table_zip_size(index->table);
+ big_rec_t* big_rec_vec = NULL;
+ dberr_t err;
+ ibool inherit = FALSE;
+ ibool success;
+ ulint n_reserved = 0;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ *big_rec = NULL;
+
+ ut_ad(mtr_memo_contains(mtr,
+ dict_index_get_lock(btr_cur_get_index(cursor)),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+
+ cursor->flag = BTR_CUR_BINARY;
+
+ /* Check locks and write to undo log, if specified */
+
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the insert will not fail because
+ of lack of space */
+
+ ulint n_extents = cursor->tree_height / 16 + 3;
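+		/* (For example, a tree of height 3 reserves
+		3 / 16 + 3 = 3 extents here.) */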
+
+ success = fsp_reserve_free_extents(&n_reserved, index->space,
+ n_extents, FSP_NORMAL, mtr);
+ if (!success) {
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
+ dict_table_is_comp(index->table),
+ dtuple_get_n_fields(entry),
+ zip_size)) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ if (UNIV_LIKELY_NULL(big_rec_vec)) {
+ /* This should never happen, but we handle
+ the situation in a robust manner. */
+ ut_ad(0);
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+ if (big_rec_vec == NULL) {
+
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(index->space,
+ n_reserved);
+ }
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
+ if (dict_index_get_page(index)
+ == buf_block_get_page_no(btr_cur_get_block(cursor))) {
+
+ /* The page is the root page */
+ *rec = btr_root_raise_and_insert(
+ flags, cursor, offsets, heap, entry, n_ext, mtr);
+ } else {
+ *rec = btr_page_split_and_insert(
+ flags, cursor, offsets, heap, entry, n_ext, mtr);
+ }
+
+ ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ /* The cursor might be moved to the other page,
+ and the max trx id field should be updated after
+ the cursor was fixed. */
+ if (!dict_index_is_clust(index)) {
+ page_update_max_trx_id(
+ btr_cur_get_block(cursor),
+ btr_cur_get_page_zip(cursor),
+ thr_get_trx(thr)->id, mtr);
+ }
+ if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
+ || btr_page_get_prev(
+ buf_block_get_frame(
+ btr_cur_get_block(cursor)), mtr)
+ == FIL_NULL) {
+			/* After a split-and-insert, lock_update_insert()
+			must always be called. */
+ inherit = TRUE;
+ }
+ }
+
+#ifdef BTR_CUR_ADAPT
+ btr_search_update_hash_on_insert(cursor);
+#endif
+ if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
+
+ lock_update_insert(btr_cur_get_block(cursor), *rec);
+ }
+
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*==================== B-TREE UPDATE =========================*/
+
+/*************************************************************//**
+For an update, checks the locks and does the undo logging.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INLINE __attribute__((warn_unused_result, nonnull(2,3,6,7)))
+dberr_t
+btr_cur_upd_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on record to update */
+ const ulint* offsets,/*!< in: rec_get_offsets() on cursor */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ roll_ptr_t* roll_ptr)/*!< out: roll pointer */
+{
+ dict_index_t* index;
+ const rec_t* rec;
+ dberr_t err;
+
+ ut_ad(thr || (flags & BTR_NO_LOCKING_FLAG));
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!dict_index_is_clust(index)) {
+ ut_ad(dict_index_is_online_ddl(index)
+ == !!(flags & BTR_CREATE_FLAG));
+
+ /* We do undo logging only when we update a clustered index
+ record */
+ return(lock_sec_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec,
+ index, thr, mtr));
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ err = lock_clust_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec, index,
+ offsets, thr);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* Append the info about the update in the undo log */
+
+ return(trx_undo_report_row_operation(
+ flags, TRX_UNDO_MODIFY_OP, thr,
+ index, NULL, update,
+ cmpl_info, rec, offsets, roll_ptr));
+}
+
+/***********************************************************//**
+Writes a redo log record of updating a record in-place. */
+UNIV_INTERN
+void
+btr_cur_update_in_place_log(
+/*========================*/
+ ulint flags, /*!< in: flags */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr, /*!< in: roll ptr */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ const page_t* page = page_align(rec);
+ ut_ad(flags < 256);
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
+ ? MLOG_COMP_REC_UPDATE_IN_PLACE
+ : MLOG_REC_UPDATE_IN_PLACE,
+ 1 + DATA_ROLL_PTR_LEN + 14 + 2
+ + MLOG_BUF_MARGIN);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery */
+ return;
+ }
+
+	/* For secondary indexes, we could skip writing the dummy system
+	fields to the redo log, but that would require changing the redo
+	log parsing of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE
+	or adding a new redo log record type. For now, just write dummy
+	system fields to the redo log if we are updating a secondary
+	index record. */
+ mach_write_to_1(log_ptr, flags);
+ log_ptr++;
+
+ if (dict_index_is_clust(index)) {
+ log_ptr = row_upd_write_sys_vals_to_log(
+ index, trx_id, roll_ptr, log_ptr, mtr);
+ } else {
+ /* Dummy system fields for a secondary index */
+ /* TRX_ID Position */
+ log_ptr += mach_write_compressed(log_ptr, 0);
+ /* ROLL_PTR */
+ trx_write_roll_ptr(log_ptr, 0);
+ log_ptr += DATA_ROLL_PTR_LEN;
+ /* TRX_ID */
+ log_ptr += mach_ull_write_compressed(log_ptr, 0);
+ }
+
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ row_upd_index_write_log(update, log_ptr, mtr);
+}
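+
+/* A sketch of the redo payload written above, after the header emitted
+by mlog_open_and_write_index(): one flags byte; the system fields
+(the TRX_ID field position as a compressed ulint, a
+DATA_ROLL_PTR_LEN-byte roll ptr, and the trx id in compressed form),
+written either by row_upd_write_sys_vals_to_log() or as dummy zeros for
+a secondary index; the 2-byte page offset of the record; and finally
+the update vector from row_upd_index_write_log().
+btr_cur_parse_update_in_place() below reads the fields back in the
+same order. */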
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index) /*!< in: index corresponding to page */
+{
+ ulint flags;
+ rec_t* rec;
+ upd_t* update;
+ ulint pos;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint rec_offset;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ rec_offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(rec_offset <= UNIV_PAGE_SIZE);
+
+ heap = mem_heap_create(256);
+
+ ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
+
+ if (!ptr || !page) {
+
+ goto func_exit;
+ }
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+ rec = page + rec_offset;
+
+ /* We do not need to reserve btr_search_latch, as the page is only
+ being recovered, and there cannot be a hash index to it. */
+
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
+ pos, trx_id, roll_ptr);
+ }
+
+ row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if there is enough space
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+UNIV_INTERN
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+ dict_index_t* index, /*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+ ulint* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ const page_t* page = page_cur_get_page(cursor);
+
+ ut_ad(page_zip == page_cur_get_page_zip(cursor));
+ ut_ad(page_zip);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(true);
+ }
+
+ if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+ /* The page has been freshly compressed, so
+ reorganizing it will not help. */
+ return(false);
+ }
+
+ if (create && page_is_leaf(page)
+ && (length + page_get_data_size(page)
+ >= dict_index_zip_pad_optimal_page_size(index))) {
+ return(false);
+ }
+
+ if (!btr_page_reorganize(cursor, index, mtr)) {
+ goto out_of_space;
+ }
+
+ rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
+
+ /* After recompressing a page, we must make sure that the free
+ bits in the insert buffer bitmap will not exceed the free
+ space on the page. Because this function will not attempt
+ recompression unless page_zip_available() fails above, it is
+ safe to reset the free bits if page_zip_available() fails
+ again, below. The free bits can safely be reset in a separate
+ mini-transaction. If page_zip_available() succeeds below, we
+ can be sure that the btr_page_reorganize() above did not reduce
+ the free space available on the page. */
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(true);
+ }
+
+out_of_space:
+ ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
+
+ /* Out of space: reset the free bits. */
+ if (!dict_index_is_clust(index) && page_is_leaf(page)) {
+ ibuf_reset_free_bits(page_cur_get_block(cursor));
+ }
+
+ return(false);
+}
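+
+/* Usage sketch for the function above: both call sites below
+(btr_cur_update_in_place() and btr_cur_optimistic_update()) map a
+false return to DB_ZIP_OVERFLOW, and after a true return they re-read
+the cursor record, because a btr_page_reorganize() performed here may
+have moved the record within the page. */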
+
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+UNIV_INTERN
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ page_zip_des_t* page_zip;
+ dberr_t err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr = 0;
+ ulint was_delete_marked;
+ ibool is_hashed;
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_get_type(btr_cur_get_page(cursor)) == FIL_PAGE_INDEX);
+ ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops) {
+ btr_cur_trx_report(trx_id, index, "update ");
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page_zip = buf_block_get_page_zip(block);
+
+ /* Check that enough space is available on the compressed page. */
+ if (page_zip) {
+ if (!btr_cur_update_alloc_zip(
+ page_zip, btr_cur_get_page_cur(cursor),
+ index, offsets, rec_offs_size(offsets),
+ false, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ rec = btr_cur_get_rec(cursor);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_rec_sys_fields(rec, NULL, index, offsets,
+ thr_get_trx(thr), roll_ptr);
+ }
+
+ was_delete_marked = rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)));
+
+ is_hashed = (block->index != NULL);
+
+ if (is_hashed) {
+		/* TODO: Can we skip this if none of the first
+		index->search_info->curr_n_fields fields
+		are being updated? */
+
+		/* The function row_upd_changes_ord_field_binary works only
+		if the update vector was built for a clustered index; we must
+		NOT call it if the index is a secondary one. */
+
+ if (!dict_index_is_clust(index)
+ || row_upd_changes_ord_field_binary(index, update, thr,
+ NULL, NULL)) {
+
+ /* Remove possible hash index pointer to this record */
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ rw_lock_x_lock(&btr_search_latch);
+ }
+
+ row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+
+ if (is_hashed) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ btr_cur_update_in_place_log(flags, rec, index, update,
+ trx_id, roll_ptr, mtr);
+
+ if (was_delete_marked
+ && !rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)))) {
+ /* The new updated record owns its possible externally
+ stored fields */
+
+ btr_cur_unmark_extern_fields(page_zip,
+ rec, index, offsets, mtr);
+ }
+
+ ut_ad(err == DB_SUCCESS);
+
+func_exit:
+ if (page_zip
+ && !(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)
+ && page_is_leaf(buf_block_get_frame(block))) {
+ /* Update the free bits in the insert buffer. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. We assume here that the ordering
+fields of the record do not change.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+{
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ ulint max_size;
+ ulint new_rec_size;
+ ulint old_rec_size;
+ dtuple_t* new_entry;
+ roll_ptr_t roll_ptr;
+ ulint i;
+ ulint n_ext;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(btr_page_get_index_id(page) == index->id);
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ ULINT_UNDEFINED, heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, *offsets)
+ || trx_is_recv(thr_get_trx(thr)));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops) {
+ btr_cur_trx_report(trx_id, index, "update ");
+ rec_print_new(stderr, rec, *offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
+
+ /* The simplest and the most common case: the update does not
+ change the size of any field and none of the updated fields is
+ externally stored in rec or update, and there is enough space
+ on the compressed page to log the update. */
+
+ return(btr_cur_update_in_place(
+ flags, cursor, *offsets, update,
+ cmpl_info, thr, trx_id, mtr));
+ }
+
+ if (rec_offs_any_extern(*offsets)) {
+any_extern:
+	/* Externally stored fields are handled by the pessimistic
+	update path */
+
+ return(DB_OVERFLOW);
+ }
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
+
+ goto any_extern;
+ }
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ if (!*heap) {
+ *heap = mem_heap_create(
+ rec_offs_size(*offsets)
+ + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
+ }
+
+ new_entry = row_rec_to_index_entry(rec, index, *offsets,
+ &n_ext, *heap);
+ /* We checked above that there are no externally stored fields. */
+ ut_a(!n_ext);
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr.
+ Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ FALSE, *heap);
+ old_rec_size = rec_offs_size(*offsets);
+ new_rec_size = rec_get_converted_size(index, new_entry, 0);
+
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ if (!btr_cur_update_alloc_zip(
+ page_zip, page_cursor, index, *offsets,
+ new_rec_size, true, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ rec = page_cur_get_rec(page_cursor);
+ }
+
+ if (UNIV_UNLIKELY(new_rec_size
+ >= (page_get_free_space_of_empty(page_is_comp(page))
+ / 2))) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_get_data_size(page)
+ - old_rec_size + new_rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* The page would become too empty */
+ err = DB_UNDERFLOW;
+ goto func_exit;
+ }
+
+ /* We do not attempt to reorganize if the page is compressed.
+ This is because the page may fail to compress after reorganization. */
+ max_size = page_zip
+ ? page_get_max_insert_size(page, 1)
+ : (old_rec_size
+ + page_get_max_insert_size_after_reorganize(page, 1));
+
+ if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
+ && (max_size >= new_rec_size))
+ || (page_get_n_recs(page) <= 1))) {
+
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+
+ /* There was not enough space, or it did not pay to
+ reorganize: for simplicity, we decide what to do assuming a
+ reorganization is needed, though it might not be necessary */
+
+ err = DB_OVERFLOW;
+ goto func_exit;
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ /* We may need to update the IBUF_BITMAP_FREE
+ bits after a reorganize that was done in
+ btr_cur_update_alloc_zip(). */
+ goto func_exit;
+ }
+
+ /* Ok, we may do the replacement. Store on the page infimum the
+ explicit locks on rec, before deleting rec (see the comment in
+ btr_cur_pessimistic_update). */
+
+ lock_rec_store_on_page_infimum(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+ page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+
+ page_cur_move_to_prev(page_cursor);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+ roll_ptr);
+ row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+ trx_id);
+ }
+
+ /* There are no externally stored columns in new_entry */
+ rec = btr_cur_insert_if_possible(
+ cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
+ ut_a(rec); /* <- We calculated above the insert would fit */
+
+ /* Restore the old explicit lock state on the record */
+
+ lock_rec_restore_from_page_infimum(block, rec, block);
+
+ page_cur_move_to_next(page_cursor);
+ ut_ad(err == DB_SUCCESS);
+
+func_exit:
+ if (page_zip
+ && !(flags & BTR_KEEP_IBUF_BITMAP)
+ && !dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ /* Update the free bits in the insert buffer. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ return(err);
+}
+
+/*************************************************************//**
+If, in a split, a new supremum record was created as the predecessor of the
+updated record, the supremum record must inherit exactly the locks on the
+updated record. In the split it may have inherited locks from the successor
+of the updated record, which is not correct. This function restores the
+right locks for the new supremum. */
+static
+void
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+ buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: updated record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+
+ page = buf_block_get_frame(block);
+
+ if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
+ /* Updated record is not the first user record on its page */
+
+ return;
+ }
+
+ space = buf_block_get_space(block);
+ zip_size = buf_block_get_zip_size(block);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ ut_ad(prev_page_no != FIL_NULL);
+ prev_block = buf_page_get_with_no_latch(space, zip_size,
+ prev_page_no, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* We must already have an x-latch on prev_block! */
+ ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
+
+ lock_rec_reset_and_inherit_gap_locks(prev_block, block,
+ PAGE_HEAP_NO_SUPREMUM,
+ page_rec_get_heap_no(rec));
+}
+
+/*************************************************************//**
+Check if the total length of the modified blob for the row is within 10%
+of the total redo log size. This constraint on the blob length is to
+avoid overwriting the redo logs beyond the last checkpoint lsn.
+@return DB_SUCCESS or DB_TOO_BIG_FOR_REDO. */
+static
+dberr_t
+btr_check_blob_limit(const big_rec_t* big_rec_vec)
+{
+ const ib_uint64_t redo_size = srv_n_log_files * srv_log_file_size
+ * UNIV_PAGE_SIZE;
+ const ib_uint64_t redo_10p = redo_size / 10;
+ ib_uint64_t total_blob_len = 0;
+ dberr_t err = DB_SUCCESS;
+
+ /* Calculate the total number of bytes for blob data */
+ for (ulint i = 0; i < big_rec_vec->n_fields; i++) {
+ total_blob_len += big_rec_vec->fields[i].len;
+ }
+
+ if (total_blob_len > redo_10p) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "The total blob data"
+ " length (" UINT64PF ") is greater than"
+ " 10%% of the total redo log size (" UINT64PF
+ "). Please increase total redo log size.",
+ total_blob_len, redo_size);
+ err = DB_TOO_BIG_FOR_REDO;
+ }
+
+ return(err);
+}
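+
+/* For illustration only, a worked example of the limit computed above
+(the settings are hypothetical, not defaults): with srv_n_log_files = 2
+and srv_log_file_size = 3072 (in pages of UNIV_PAGE_SIZE = 16384 bytes,
+matching the multiplication above),
+
+	redo_size = 2 * 3072 * 16384 = 100663296 bytes (96 MiB)
+	redo_10p  = 100663296 / 10   = 10066329 bytes (~9.6 MiB)
+
+so an update whose big_rec_t fields total more than ~9.6 MiB would fail
+with DB_TOO_BIG_FOR_REDO until the redo log is enlarged. */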
+
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is allowed to
+				also contain trx id and roll ptr fields,
+				but those values have no effect */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be
+ committed before latching any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ page_cur_t* page_cursor;
+ dberr_t err;
+ dberr_t optim_err;
+ roll_ptr_t roll_ptr;
+ ibool was_first;
+ ulint n_reserved = 0;
+ ulint n_ext;
+
+ *offsets = NULL;
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+ index = cursor->index;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
+ || dict_index_is_clust(index));
+ ut_ad(thr_get_trx(thr)->id == trx_id
+ || (flags & ~BTR_KEEP_POS_FLAG)
+ == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
+
+ err = optim_err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_IBUF_BITMAP,
+ cursor, offsets, offsets_heap, update,
+ cmpl_info, thr, trx_id, mtr);
+
+ switch (err) {
+ case DB_ZIP_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_OVERFLOW:
+ break;
+ default:
+ err_exit:
+ /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
+ For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
+ already reset by btr_cur_update_alloc_zip() if the
+ page was recompressed. */
+ if (page_zip
+ && optim_err != DB_ZIP_OVERFLOW
+ && !dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ return(err);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
+ update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ if (optim_err == DB_OVERFLOW) {
+ ulint reserve_flag;
+
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the update will not fail because
+ of lack of space */
+
+ ulint n_extents = cursor->tree_height / 16 + 3;
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ reserve_flag = FSP_CLEANING;
+ } else {
+ reserve_flag = FSP_NORMAL;
+ }
+
+ if (!fsp_reserve_free_extents(&n_reserved, index->space,
+ n_extents, reserve_flag, mtr)) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto err_exit;
+ }
+ }
+
+ rec = btr_cur_get_rec(cursor);
+
+ *offsets = rec_get_offsets(
+ rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
+
+ dtuple_t* new_entry = row_rec_to_index_entry(
+ rec, index, *offsets, &n_ext, entry_heap);
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr. If the
+ clustered index record is delete-marked, then its externally
+ stored fields cannot have been purged yet, because then the
+ purge would also have removed the clustered index record
+ itself. Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ FALSE, entry_heap);
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+ roll_ptr);
+ row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+ trx_id);
+ }
+
+ if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(*offsets)) {
+ /* We are in a transaction rollback undoing a row
+ update: we must free possible externally stored fields
+ which got new values in the update, if they are not
+ inherited values. They can be inherited if we have
+ updated the primary key to another value, and then
+		updated it back again. */
+
+ ut_ad(big_rec_vec == NULL);
+
+ btr_rec_free_updated_extern_fields(
+ index, rec, page_zip, *offsets, update,
+ trx_is_recv(thr_get_trx(thr))
+ ? RB_RECOVERY : RB_NORMAL, mtr);
+ }
+
+	/* We have to set appropriate extern storage bits in the new
+	record to be inserted: we have to remember which fields were
+	stored externally */
+
+ ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+ n_ext += btr_push_update_extern_fields(new_entry, update, entry_heap);
+
+ if (page_zip) {
+ ut_ad(page_is_comp(page));
+ if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ TRUE,
+ dict_index_get_n_fields(index),
+ page_zip_get_size(page_zip))) {
+
+ goto make_external;
+ }
+ } else if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ page_is_comp(page), 0, 0)) {
+make_external:
+ big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ /* We cannot goto return_after_reservations,
+ because we may need to update the
+ IBUF_BITMAP_FREE bits, which was suppressed by
+ BTR_KEEP_IBUF_BITMAP. */
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(
+ index->space, n_reserved);
+ }
+
+ err = DB_TOO_BIG_RECORD;
+ goto err_exit;
+ }
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+ }
+
+ if (big_rec_vec) {
+
+ err = btr_check_blob_limit(big_rec_vec);
+
+ if (err != DB_SUCCESS) {
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(
+ index->space, n_reserved);
+ }
+ goto err_exit;
+ }
+ }
+
+ /* Store state of explicit locks on rec on the page infimum record,
+ before deleting rec. The page infimum acts as a dummy carrier of the
+ locks, taking care also of lock releases, before we can move the locks
+	back on the actual record. There is a special case: the insert
+	may cause a call of btr_root_raise_and_insert if we are on the
+	root page. Therefore, in the lock system, we cannot delete the
+	lock structs set on the root page even if the root page carries
+	just node pointers. */
+
+ lock_rec_store_on_page_infimum(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+
+ page_cur_move_to_prev(page_cursor);
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry,
+ offsets, offsets_heap, n_ext, mtr);
+
+ if (rec) {
+ page_cursor->rec = rec;
+
+ lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+ rec, block);
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ btr_cur_unmark_extern_fields(
+ page_zip, rec, index, *offsets, mtr);
+ }
+
+ bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
+
+ if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
+ if (adjust) {
+ rec_offs_make_valid(
+ page_cursor->rec, index, *offsets);
+ }
+ } else if (page_zip &&
+ !dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ /* Update the free bits in the insert buffer.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ err = DB_SUCCESS;
+ goto return_after_reservations;
+ } else {
+ /* If the page is compressed and it initially
+ compresses very well, and there is a subsequent insert
+ of a badly-compressing record, it is possible for
+		btr_cur_optimistic_update() to return DB_UNDERFLOW and
+		btr_cur_insert_if_possible() to return NULL. */
+ ut_a(page_zip || optim_err != DB_UNDERFLOW);
+
+ /* Out of space: reset the free bits.
+ This is the same block which was skipped by
+ BTR_KEEP_IBUF_BITMAP. */
+ if (!dict_index_is_clust(index) && page_is_leaf(page)) {
+ ibuf_reset_free_bits(block);
+ }
+ }
+
+ if (big_rec_vec) {
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(flags & BTR_KEEP_POS_FLAG);
+
+ /* btr_page_split_and_insert() in
+ btr_cur_pessimistic_insert() invokes
+ mtr_memo_release(mtr, index->lock, MTR_MEMO_X_LOCK).
+ We must keep the index->lock when we created a
+ big_rec, so that row_upd_clust_rec() can store the
+ big_rec in the same mini-transaction. */
+
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ }
+
+ /* Was the record to be updated positioned as the first user
+ record on its page? */
+ was_first = page_cur_is_before_first(page_cursor);
+
+ /* Lock checks and undo logging were already performed by
+ btr_cur_upd_lock_and_undo(). We do not try
+ btr_cur_optimistic_insert() because
+ btr_cur_insert_if_possible() already failed above. */
+
+ err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ cursor, offsets, offsets_heap,
+ new_entry, &rec,
+ &dummy_big_rec, n_ext, NULL, mtr);
+ ut_a(rec);
+ ut_a(err == DB_SUCCESS);
+ ut_a(dummy_big_rec == NULL);
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ page_cursor->rec = rec;
+
+ if (dict_index_is_sec_or_ibuf(index)) {
+ /* Update PAGE_MAX_TRX_ID in the index page header.
+ It was not updated by btr_cur_pessimistic_insert()
+ because of BTR_NO_LOCKING_FLAG. */
+ buf_block_t* rec_block;
+
+ rec_block = btr_cur_get_block(cursor);
+
+ page_update_max_trx_id(rec_block,
+ buf_block_get_page_zip(rec_block),
+ trx_id, mtr);
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ buf_block_t* rec_block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+ page = buf_block_get_frame(rec_block);
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip = buf_block_get_page_zip(rec_block);
+
+ btr_cur_unmark_extern_fields(page_zip,
+ rec, index, *offsets, mtr);
+ }
+
+ lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+ rec, block);
+
+ /* If necessary, restore also the correct lock state for a new,
+ preceding supremum record created in a page split. While the old
+ record was nonexistent, the supremum might have inherited its locks
+ from a wrong record. */
+
+ if (!was_first) {
+ btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
+ rec, mtr);
+ }
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/****************************************************************//**
+Writes the redo log record for delete marking or unmarking of an index
+record. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_clust_rec_log(
+/*===============================*/
+ rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index of the record */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index,
+ page_rec_is_comp(rec)
+ ? MLOG_COMP_REC_CLUST_DELETE_MARK
+ : MLOG_REC_CLUST_DELETE_MARK,
+ 1 + 1 + DATA_ROLL_PTR_LEN
+ + 14 + 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery */
+ return;
+ }
+
+ *log_ptr++ = 0;
+ *log_ptr++ = 1;
+
+ log_ptr = row_upd_write_sys_vals_to_log(
+ index, trx_id, roll_ptr, log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+}
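+
+/* Sketch of the payload written above, as consumed by
+btr_cur_parse_del_mark_set_clust_rec() below: a flags byte (always 0
+here), a byte with the delete-mark value (always 1 here), the system
+fields (position, roll ptr, trx id) from
+row_upd_write_sys_vals_to_log(), and the 2-byte page offset of the
+record. */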
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index) /*!< in: index corresponding to page */
+{
+ ulint flags;
+ ulint val;
+ ulint pos;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint offset;
+ rec_t* rec;
+
+ ut_ad(!page
+ || !!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_1(ptr);
+ ptr++;
+ val = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (page) {
+ rec = page + offset;
+
+ /* We do not need to reserve btr_search_latch, as the page
+ is only being recovered, and there cannot be a hash index to
+ it. Besides, these fields are being updated in place
+ and the adaptive hash index does not depend on them. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ row_upd_rec_sys_fields_in_recovery(
+ rec, page_zip,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ pos, trx_id, roll_ptr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ buf_block_t* block, /*!< in/out: buffer block of the record */
+ rec_t* rec, /*!< in/out: record */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ roll_ptr_t roll_ptr;
+ dberr_t err;
+ page_zip_des_t* page_zip;
+ trx_t* trx;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(buf_block_get_frame(block) == page_align(rec));
+ ut_ad(page_is_leaf(page_align(rec)));
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr)->id, index, "del mark ");
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+
+ err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
+ rec, index, offsets, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = trx_undo_report_row_operation(0, TRX_UNDO_MODIFY_OP, thr,
+ index, NULL, NULL, 0, rec, offsets,
+ &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* The btr_search_latch is not needed here, because
+ the adaptive hash index does not depend on the delete-mark
+ and the delete-mark is being updated in place. */
+
+ page_zip = buf_block_get_page_zip(block);
+
+ btr_blob_dbg_set_deleted_flag(rec, index, offsets, TRUE);
+ btr_rec_set_deleted_flag(rec, page_zip, TRUE);
+
+ trx = thr_get_trx(thr);
+
+ if (dict_index_is_online_ddl(index)) {
+ row_log_table_delete(rec, index, offsets, NULL);
+ }
+
+ row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
+
+ btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
+ roll_ptr, mtr);
+
+ return(err);
+}
+
+/****************************************************************//**
+Writes the redo log record for a delete mark setting of a secondary
+index record. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_sec_rec_log(
+/*=============================*/
+ rec_t* rec, /*!< in: record */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ ut_ad(val <= 1);
+
+ log_ptr = mlog_open(mtr, 11 + 1 + 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
+ mach_write_to_1(log_ptr, val);
+ log_ptr++;
+
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+}
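+
+/* The secondary-index variant above is simpler: after the initial log
+record header (at most 11 bytes, written by
+mlog_write_initial_log_record_fast()), only the delete-mark value byte
+and the 2-byte page offset follow, matching the
+mlog_open(mtr, 11 + 1 + 2) reservation. */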
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint val;
+ ulint offset;
+ rec_t* rec;
+
+ if (end_ptr < ptr + 3) {
+
+ return(NULL);
+ }
+
+ val = mach_read_from_1(ptr);
+ ptr++;
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (page) {
+ rec = page + offset;
+
+ /* We do not need to reserve btr_search_latch, as the page
+ is only being recovered, and there cannot be a hash index to
+ it. Besides, the delete-mark flag is being updated in place
+ and the adaptive hash index does not depend on it. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ ulint flags, /*!< in: locking flag */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ dberr_t err;
+
+ block = btr_cur_get_block(cursor);
+ rec = btr_cur_get_rec(cursor);
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr)->id, cursor->index,
+ "del mark ");
+ rec_print(stderr, rec, cursor->index);
+ }
+#endif /* UNIV_DEBUG */
+
+ err = lock_sec_rec_modify_check_and_lock(flags,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, thr, mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ ut_ad(!!page_rec_is_comp(rec)
+ == dict_table_is_comp(cursor->index->table));
+
+ /* We do not need to reserve btr_search_latch, as the
+ delete-mark flag is being updated in place and the adaptive
+ hash index does not depend on it. */
+ btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
+
+ btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Sets a secondary index record's delete mark to the given value. This
+function is only used by the insert buffer merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_set_deleted_flag_for_ibuf(
+/*==============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ /* We do not need to reserve btr_search_latch, as the page
+ has just been read to the buffer pool and there cannot be
+ a hash index to it. Besides, the delete-mark flag is being
+ updated in place and the adaptive hash index does not depend
+ on it. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+
+ btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ ibool adjust, /*!< in: TRUE if should adjust the
+ cursor position even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr_memo_contains(mtr,
+ dict_index_get_lock(btr_cur_get_index(cursor)),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ return(btr_cur_compress_recommendation(cursor, mtr)
+ && btr_compress(cursor, adjust, mtr));
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete_func(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
+ delete; cursor stays valid: if deletion
+ succeeds, on function exit it points to the
+ successor of the deleted record */
+#ifdef UNIV_DEBUG
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+#endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool no_compress_needed;
+ rec_offs_init(offsets_);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ /* This is intended only for leaf page deletions */
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+ ut_ad(!dict_index_is_online_ddl(cursor->index)
+ || dict_index_is_clust(cursor->index)
+ || (flags & BTR_CREATE_FLAG));
+
+ rec = btr_cur_get_rec(cursor);
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ no_compress_needed = !rec_offs_any_extern(offsets)
+ && btr_cur_can_delete_without_compress(
+ cursor, rec_offs_size(offsets), mtr);
+
+ if (no_compress_needed) {
+
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ lock_update_delete(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+ if (page_zip) {
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, cursor->index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* On compressed pages, the IBUF_BITMAP_FREE
+ space is not affected by deleting (purging)
+ records, because it is defined as the minimum
+ of space available *without* reorganize, and
+ space available in the modification log. */
+ } else {
+ const ulint max_ins
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+
+ /* The change buffer does not handle inserts
+ into non-leaf pages, into clustered indexes,
+ or into the change buffer. */
+ if (page_is_leaf(page)
+ && !dict_index_is_clust(cursor->index)
+ && !dict_index_is_ibuf(cursor->index)) {
+ ibuf_update_free_bits_low(block, max_ins, mtr);
+ }
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(no_compress_needed);
+}
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+			extents so that the operation is known
+			to succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_index_t* index;
+ rec_t* rec;
+ ulint n_reserved = 0;
+ ibool success;
+ ibool ret = FALSE;
+ ulint level;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ if (!has_reserved_extents) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the node pointer updates will
+ not fail because of lack of space */
+
+ ulint n_extents = cursor->tree_height / 32 + 1;
+
+ success = fsp_reserve_free_extents(&n_reserved,
+ index->space,
+ n_extents,
+ FSP_CLEANING, mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+
+ return(FALSE);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+ rec = btr_cur_get_rec(cursor);
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (rec_offs_any_extern(offsets)) {
+ btr_rec_free_externally_stored_fields(index,
+ rec, offsets, page_zip,
+ rb_ctx, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
+ && UNIV_UNLIKELY(dict_index_get_page(index)
+ != buf_block_get_page_no(block))) {
+
+ /* If there is only one record, drop the whole page in
+ btr_discard_page, if this is not the root page */
+
+ btr_discard_page(cursor, mtr);
+
+ ret = TRUE;
+
+ goto return_after_reservations;
+ }
+
+ if (flags == 0) {
+ lock_update_delete(block, rec);
+ }
+
+ level = btr_page_get_level(page, mtr);
+
+ if (level > 0
+ && UNIV_UNLIKELY(rec == page_rec_get_next(
+ page_get_infimum_rec(page)))) {
+
+ rec_t* next_rec = page_rec_get_next(rec);
+
+ if (btr_page_get_prev(page, mtr) == FIL_NULL) {
+
+ /* If we delete the leftmost node pointer on a
+ non-leaf level, we must mark the new leftmost node
+ pointer as the predefined minimum record */
+
+ /* This will make page_zip_validate() fail until
+ page_cur_delete_rec() completes. This is harmless,
+ because everything will take place within a single
+ mini-transaction and because writing to the redo log
+ is an atomic operation (performed by mtr_commit()). */
+ btr_set_min_rec_mark(next_rec, mtr);
+ } else {
+ /* Otherwise, if we delete the leftmost node pointer
+ on a page, we have to change the father node pointer
+ so that it is equal to the new leftmost node pointer
+ on the page */
+
+ btr_node_ptr_delete(index, block, mtr);
+
+ dtuple_t* node_ptr = dict_index_build_node_ptr(
+ index, next_rec, buf_block_get_page_no(block),
+ heap, level);
+
+ btr_insert_on_non_leaf_level(
+ flags, index, level + 1, node_ptr, mtr);
+ }
+ }
+
+ btr_search_update_hash_on_delete(cursor);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_ad(btr_check_node_ptr(index, block, mtr));
+
+return_after_reservations:
+ *err = DB_SUCCESS;
+
+ mem_heap_free(heap);
+
+ if (ret == FALSE) {
+ ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
+ }
+
+ if (n_reserved > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height) /*!< in: root node height in tree */
+{
+ btr_path_t* slot;
+ const rec_t* rec;
+ const page_t* page;
+
+ ut_a(cursor->path_arr);
+
+ if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
+ /* Do nothing; return empty path */
+
+ slot = cursor->path_arr;
+ slot->nth_rec = ULINT_UNDEFINED;
+
+ return;
+ }
+
+ if (height == 0) {
+ /* Mark end of slots for path */
+ slot = cursor->path_arr + root_height + 1;
+ slot->nth_rec = ULINT_UNDEFINED;
+ }
+
+ rec = btr_cur_get_rec(cursor);
+
+ slot = cursor->path_arr + (root_height - height);
+
+ page = page_align(rec);
+
+ slot->nth_rec = page_rec_get_n_recs_before(rec);
+ slot->n_recs = page_get_n_recs(page);
+ slot->page_no = page_get_page_no(page);
+ slot->page_level = btr_page_get_level_low(page);
+}
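+
+/* Example of the slot layout built above: in a tree of height 3
+(root_height == 2), the search fills path_arr[0] for the root page
+(height 2), path_arr[1] for the middle level and path_arr[2] for the
+leaf, and marks path_arr[3].nth_rec = ULINT_UNDEFINED as the end of
+the slots. */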
+
+/*******************************************************************//**
+Estimate the number of rows between slot1 and slot2 for any level on a
+B-tree. This function starts from slot1->page and reads a few pages to
+the right, counting their records. If we reach slot2->page quickly then
+we know exactly how many records there are between slot1 and slot2 and
+we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly,
+we calculate the average number of records in the pages scanned so far,
+assume that all pages that we did not scan up to slot2->page contain
+the same number of records, and multiply that average by the number of
+pages between slot1->page and slot2->page (which is
+n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
+@return number of rows (exact or estimated) */
+static
+ib_int64_t
+btr_estimate_n_rows_in_range_on_level(
+/*==================================*/
+ dict_index_t* index, /*!< in: index */
+ btr_path_t* slot1, /*!< in: left border */
+ btr_path_t* slot2, /*!< in: right border */
+ ib_int64_t n_rows_on_prev_level, /*!< in: number of rows
+ on the previous level for the
+					same descent paths; used to
+					determine the number of pages
+ on this level */
+ ibool* is_n_rows_exact) /*!< out: TRUE if the returned
+ value is exact i.e. not an
+ estimation */
+{
+ ulint space;
+ ib_int64_t n_rows;
+ ulint n_pages_read;
+ ulint page_no;
+ ulint zip_size;
+ ulint level;
+
+ space = dict_index_get_space(index);
+
+ n_rows = 0;
+ n_pages_read = 0;
+
+ /* Assume by default that we will scan all pages between
+ slot1->page_no and slot2->page_no */
+ *is_n_rows_exact = TRUE;
+
+ /* add records from slot1->page_no which are to the right of
+ the record which serves as a left border of the range, if any */
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_rows += slot1->n_recs - slot1->nth_rec;
+ }
+
+ /* add records from slot2->page_no which are to the left of
+	the record which serves as a right border of the range, if any */
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
+
+ /* count the records in the pages between slot1->page_no and
+	slot2->page_no (non-inclusive), if any */
+
+ zip_size = fil_space_get_zip_size(space);
+
+	/* Do not read more than this number of pages, so that this code,
+	which only produces an estimate, does not hurt performance. If we
+	read this many pages before reaching slot2->page_no, we estimate
+	the average from the pages scanned so far. */
+# define N_PAGES_READ_LIMIT 10
+
+ page_no = slot1->page_no;
+ level = slot1->page_level;
+
+ do {
+ mtr_t mtr;
+ page_t* page;
+ buf_block_t* block;
+
+ mtr_start(&mtr);
+
+ /* Fetch the page. Because we are not holding the
+ index->lock, the tree may have changed and we may be
+ attempting to read a page that is no longer part of
+ the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
+ silence a debug assertion about this. */
+ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+ NULL, BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr);
+
+ page = buf_block_get_frame(block);
+
+ /* It is possible that the tree has been reorganized in the
+ meantime and this is a different page. If this happens the
+ calculated estimate will be bogus, which is not fatal as
+ this is only an estimate. We are sure that a page with
+ page_no exists because InnoDB never frees pages, only
+ reuses them. */
+ if (fil_page_get_type(page) != FIL_PAGE_INDEX
+ || btr_page_get_index_id(page) != index->id
+ || btr_page_get_level_low(page) != level) {
+
+ /* The page got reused for something else */
+ mtr_commit(&mtr);
+ goto inexact;
+ }
+
+ /* It is possible but highly unlikely that the page was
+ originally written by an old version of InnoDB that did
+ not initialize FIL_PAGE_TYPE on other than B-tree pages.
+ For example, this could be an almost-empty BLOB page
+ that happens to contain the magic values in the fields
+ that we checked above. */
+
+ n_pages_read++;
+
+ if (page_no != slot1->page_no) {
+ /* Do not count the records on slot1->page_no,
+ we already counted them before this loop. */
+ n_rows += page_get_n_recs(page);
+ }
+
+ page_no = btr_page_get_next(page, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (n_pages_read == N_PAGES_READ_LIMIT
+ || page_no == FIL_NULL) {
+			/* Either we read too many pages, or we reached
+			the end of the level without passing through
+			slot2->page_no; the tree must have changed in
+			the meantime. */
+ goto inexact;
+ }
+
+ } while (page_no != slot2->page_no);
+
+ return(n_rows);
+
+inexact:
+
+ *is_n_rows_exact = FALSE;
+
+	/* We stopped before reaching slot2->page_no. */
+
+ if (n_pages_read > 0) {
+ /* The number of pages on this level is
+ n_rows_on_prev_level, multiply it by the
+ average number of recs per page so far */
+ n_rows = n_rows_on_prev_level
+ * n_rows / n_pages_read;
+ } else {
+ /* The tree changed before we could even
+ start with slot1->page_no */
+ n_rows = 10;
+ }
+
+ return(n_rows);
+}
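+
+/* A worked example of the inexact path above (illustrative numbers,
+not taken from any real workload): suppose the level above reported
+n_rows_on_prev_level = 100, i.e. this level is believed to hold about
+100 pages, and we managed to read n_pages_read = 4 pages holding
+n_rows = 360 records before giving up. The average is 90 records per
+page, so the returned estimate is 100 * 360 / 4 = 9000 rows, flagged
+as inexact via *is_n_rows_exact == FALSE. */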
+
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */
+ ulint mode1, /*!< in: search mode for range start */
+ const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */
+ ulint mode2) /*!< in: search mode for range end */
+{
+ btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
+ btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
+ btr_cur_t cursor;
+ btr_path_t* slot1;
+ btr_path_t* slot2;
+ ibool diverged;
+ ibool diverged_lot;
+ ulint divergence_level;
+ ib_int64_t n_rows;
+ ibool is_n_rows_exact;
+ ulint i;
+ mtr_t mtr;
+ ib_int64_t table_n_rows;
+
+ table_n_rows = dict_table_get_n_rows(index->table);
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path1;
+
+ if (dtuple_get_n_fields(tuple1) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ } else {
+ btr_cur_open_at_index_side(true, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path2;
+
+ if (dtuple_get_n_fields(tuple2) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ } else {
+ btr_cur_open_at_index_side(false, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ /* We have the path information for the range in path1 and path2 */
+
+ n_rows = 1;
+ is_n_rows_exact = TRUE;
+ diverged = FALSE; /* This becomes true when the path is not
+ the same any more */
+ diverged_lot = FALSE; /* This becomes true when the paths are
+ not the same or adjacent any more */
+ divergence_level = 1000000; /* This is the level where paths diverged
+ a lot */
+ for (i = 0; ; i++) {
+ ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+ slot1 = path1 + i;
+ slot2 = path2 + i;
+
+ if (slot1->nth_rec == ULINT_UNDEFINED
+ || slot2->nth_rec == ULINT_UNDEFINED) {
+
+ if (i > divergence_level + 1 && !is_n_rows_exact) {
+ /* In trees whose height is > 1 our algorithm
+ tends to underestimate: multiply the estimate
+ by 2: */
+
+ n_rows = n_rows * 2;
+ }
+
+ DBUG_EXECUTE_IF("bug14007649", return(n_rows););
+
+ /* Do not estimate the number of rows in the range
+ to over 1 / 2 of the estimated rows in the whole
+ table */
+
+ if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
+
+ n_rows = table_n_rows / 2;
+
+ /* If there are just 0 or 1 rows in the table,
+ then we estimate all rows are in the range */
+
+ if (n_rows == 0) {
+ n_rows = table_n_rows;
+ }
+ }
+
+ return(n_rows);
+ }
+
+ if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+ diverged = TRUE;
+
+ if (slot1->nth_rec < slot2->nth_rec) {
+ n_rows = slot2->nth_rec - slot1->nth_rec;
+
+ if (n_rows > 1) {
+ diverged_lot = TRUE;
+ divergence_level = i;
+ }
+ } else {
+ /* It is possible that
+ slot1->nth_rec >= slot2->nth_rec
+ if, for example, we have a single page
+ tree which contains (inf, 5, 6, supr)
+ and we select where x > 20 and x < 30;
+ in this case slot1->nth_rec will point
+ to the supr record and slot2->nth_rec
+ will point to 6 */
+ n_rows = 0;
+ }
+
+ } else if (diverged && !diverged_lot) {
+
+ if (slot1->nth_rec < slot1->n_recs
+ || slot2->nth_rec > 1) {
+
+ diverged_lot = TRUE;
+ divergence_level = i;
+
+ n_rows = 0;
+
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_rows += slot1->n_recs
+ - slot1->nth_rec;
+ }
+
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
+ }
+ } else if (diverged_lot) {
+
+ n_rows = btr_estimate_n_rows_in_range_on_level(
+ index, slot1, slot2, n_rows,
+ &is_n_rows_exact);
+ }
+ }
+}
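+
+/* A worked example of the divergence tracking above (illustrative):
+for a condition such as x > 10 AND x < 20 on a two-level tree, path1
+and path2 share the root slot. If the two node pointers chosen on the
+root page are adjacent (slot2->nth_rec == slot1->nth_rec + 1), n_rows
+stays 1 and diverged_lot remains FALSE; on the leaf level the estimate
+is then the count of records to the right of slot1 plus those to the
+left of slot2. Once the paths are no longer the same or adjacent
+(diverged_lot), each further level is estimated with
+btr_estimate_n_rows_in_range_on_level(). */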
+
+/*******************************************************************//**
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns that uniquely determine
+ an index entry */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size may cover all fields or
+ only the first "n_unique" ones */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in the
+array index->stat_n_non_null_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ dict_index_t* index) /*!< in: index */
+{
+ btr_cur_t cursor;
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ulint matched_fields;
+ ulint matched_bytes;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ ullint n_sample_pages; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ ulint i;
+ ulint j;
+ ullint add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint* offsets_rec = NULL;
+ ulint* offsets_next_rec = NULL;
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(ib_uint64_t));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ } else {
+ n_sample_pages = 1;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+
+ /* We sample some pages in the index to get an estimate */
+
+ for (i = 0; i < n_sample_pages; i++) {
+ mtr_start(&mtr);
+
+ btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ if (!page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ matched_fields = 0;
+ matched_bytes = 0;
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec_with_match(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields,
+ &matched_bytes);
+
+ for (j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ ulint* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ if (btr_page_get_prev(page, &mtr) != FIL_NULL
+ || btr_page_get_next(page, &mtr) != FIL_NULL) {
+
+ n_diff[n_cols - 1]++;
+ }
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ for (j = 0; j < n_cols; j++) {
+ index->stat_n_diff_key_vals[j]
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick. But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that: */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ index->stat_n_diff_key_vals[j] += add_on;
+
+ index->stat_n_sample_sizes[j] = n_sample_pages;
+
+ /* Update the stat_n_non_null_key_vals[] with our
+ sampled result. stat_n_non_null_key_vals[] is created
+ and initialized to zero in dict_index_add_to_cache(),
+ along with stat_n_diff_key_vals[] array */
+ if (n_not_null != NULL) {
+ index->stat_n_non_null_key_vals[j] =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+ }
+
+ mem_heap_free(heap);
+}
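+
+/* A worked example of the scaling above (illustrative numbers):
+with n_sample_pages = 20, total_external_size = 0 and
+index->stat_n_leaf_pages = 2000, a sampled border count of
+n_diff[j] = 500 is scaled up by BTR_TABLE_STATS_FROM_SAMPLE() to
+roughly 500 * 2000 / 20 = 50000 distinct values, and
+add_on = 2000 / (10 * 20) = 10 (well below the n_sample_pages cap)
+is added to compensate for borders that fall between sampled pages. */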
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/***********************************************************//**
+Gets the offset of the pointer to the externally stored part of a field.
+@return offset of the pointer to the externally stored part */
+static
+ulint
+btr_rec_get_field_ref_offs(
+/*=======================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: index of the external field */
+{
+ ulint field_ref_offs;
+ ulint local_len;
+
+ ut_a(rec_offs_nth_extern(offsets, n));
+ field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
+ ut_a(local_len != UNIV_SQL_NULL);
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/** Gets a pointer to the externally stored part of a field.
+@param rec record
+@param offsets rec_get_offsets(rec)
+@param n index of the externally stored field
+@return pointer to the externally stored part */
+#define btr_rec_get_field_ref(rec, offsets, n) \
+ ((rec) + btr_rec_get_field_ref_offs(offsets, n))
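+
+/* For reference, the 20-byte field reference located above has the
+following layout (a sketch; the symbolic offsets are defined in the
+btr0cur.h header):
+ BTR_EXTERN_SPACE_ID 4 bytes: space id of the first BLOB page
+ BTR_EXTERN_PAGE_NO 4 bytes: page number of the first BLOB page
+ BTR_EXTERN_OFFSET 4 bytes: offset on that page where the
+ externally stored part begins
+ BTR_EXTERN_LEN 8 bytes: data length; the 4 high bytes are
+ unused because a BLOB is capped at 4 GB */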
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const ulint* offsets)
+{
+ ulint n_fields;
+ ulint total_extern_len = 0;
+ ulint i;
+
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(0);
+ }
+
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ ulint extern_len = mach_read_from_4(
+ btr_rec_get_field_ref(rec, offsets, i)
+ + BTR_EXTERN_LEN + 4);
+
+ total_extern_len += ut_calc_align(extern_len,
+ UNIV_PAGE_SIZE);
+ }
+ }
+
+ return(total_extern_len / UNIV_PAGE_SIZE);
+}
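+
+/* A worked example (illustrative, assuming UNIV_PAGE_SIZE == 16384):
+a record with two externally stored fields of 70000 and 10 bytes
+contributes ut_calc_align(70000, 16384) = 81920 plus
+ut_calc_align(10, 16384) = 16384 bytes, so the function returns
+(81920 + 16384) / 16384 = 6 pages. */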
+
+/*******************************************************************//**
+Sets the ownership bit of an externally stored field in a record. */
+static
+void
+btr_cur_set_ownership_of_extern_field(
+/*==================================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: clustered index record */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint i, /*!< in: field number */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ byte* data;
+ ulint local_len;
+ ulint byte_val;
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+ ut_ad(rec_offs_nth_extern(offsets, i));
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
+
+ if (val) {
+ byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
+ } else {
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
+ }
+
+ if (page_zip) {
+ mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+ page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
+ } else if (mtr != NULL) {
+
+ mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
+ MLOG_1BYTE, mtr);
+ } else {
+ mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+ }
+
+ btr_blob_dbg_owner(rec, index, offsets, i, val);
+}
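+
+/* Note the inverted encoding used above: a field is owned by the
+record when BTR_EXTERN_OWNER_FLAG is clear, so passing val == TRUE
+clears the flag (mark as owned) and val == FALSE sets it (disown). */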
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field. */
+UNIV_INTERN
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(mtr);
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)
+ && !upd_get_field_by_field_no(update, i)) {
+ btr_cur_set_ownership_of_extern_field(
+ page_zip, rec, index, offsets, i, FALSE, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ ulint n;
+ ulint i;
+
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ n = rec_offs_n_fields(offsets);
+
+ if (!rec_offs_any_extern(offsets)) {
+
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ btr_cur_set_ownership_of_extern_field(
+ page_zip, rec, index, offsets, i, TRUE, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const upd_t* update, /*!< in: update vector */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint n_pushed = 0;
+ ulint n;
+ const upd_field_t* uf;
+
+ ut_ad(tuple);
+ ut_ad(update);
+
+ uf = update->fields;
+ n = upd_get_n_fields(update);
+
+ for (; n--; uf++) {
+ if (dfield_is_ext(&uf->new_val)) {
+ dfield_t* field
+ = dtuple_get_nth_field(tuple, uf->field_no);
+
+ if (!dfield_is_ext(field)) {
+ dfield_set_ext(field);
+ n_pushed++;
+ }
+
+ switch (uf->orig_len) {
+ byte* data;
+ ulint len;
+ byte* buf;
+ case 0:
+ break;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(field, (byte*) dfield_get_data(field)
+ + dfield_get_len(field)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(field);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = (byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ buf = (byte*) mem_heap_alloc(heap,
+ uf->orig_len);
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE);
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(field, buf, uf->orig_len);
+ dfield_set_ext(field);
+ }
+ }
+ }
+
+ return(n_pushed);
+}
+
+/*******************************************************************//**
+Returns the length of a BLOB part stored on the header page.
+@return part length */
+static
+ulint
+btr_blob_get_part_len(
+/*==================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
+}
+
+/*******************************************************************//**
+Returns the page number where the next BLOB part is stored.
+@return page number or FIL_NULL if no more pages */
+static
+ulint
+btr_blob_get_next_page_no(
+/*======================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
+}
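+
+/* The two getters above imply this layout for an uncompressed BLOB
+page (offsets relative to FIL_PAGE_DATA; see also the stores in
+btr_store_big_rec_extern_fields() below):
+ BTR_BLOB_HDR_PART_LEN 4 bytes: length of the part stored
+ on this page
+ BTR_BLOB_HDR_NEXT_PAGE_NO 4 bytes: next BLOB page, or FIL_NULL
+ for the last one
+ BTR_BLOB_HDR_SIZE... the BLOB data itself */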
+
+/*******************************************************************//**
+Deallocate a buffer block that was reserved for a BLOB part. */
+static
+void
+btr_blob_free(
+/*==========*/
+ buf_block_t* block, /*!< in: buffer block */
+ ibool all, /*!< in: TRUE=remove also the compressed page
+ if there is one */
+ mtr_t* mtr) /*!< in: mini-transaction to commit */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ mtr_commit(mtr);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* Only free the block if it is still allocated to
+ the same file page. */
+
+ if (buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE
+ && buf_block_get_space(block) == space
+ && buf_block_get_page_no(block) == page_no) {
+
+ if (!buf_LRU_free_page(&block->page, all)
+ && all && block->page.zip.data) {
+ /* Attempt to deallocate the uncompressed page
+ if the whole block cannot be deallocated. */
+
+ buf_LRU_free_page(&block->page, false);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE or DB_TOO_BIG_FOR_REDO */
+UNIV_INTERN
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree
+ MUST be X-latched */
+ buf_block_t* rec_block, /*!< in/out: block containing rec */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets
+ will not correspond to rec when
+ this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in: mtr containing the
+ latches to the clustered index */
+ enum blob_op op) /*!< in: operation code */
+{
+ ulint rec_page_no;
+ byte* field_ref;
+ ulint extern_len;
+ ulint store_len;
+ ulint page_no;
+ ulint space_id;
+ ulint zip_size;
+ ulint prev_page_no;
+ ulint hint_page_no;
+ ulint i;
+ mtr_t mtr;
+ mtr_t* alloc_mtr;
+ mem_heap_t* heap = NULL;
+ page_zip_des_t* page_zip;
+ z_stream c_stream;
+ buf_block_t** freed_pages = NULL;
+ ulint n_freed_pages = 0;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(btr_mtr);
+ ut_ad(mtr_memo_contains(btr_mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+ ut_a(dict_index_is_clust(index));
+
+ page_zip = buf_block_get_page_zip(rec_block);
+ ut_a(dict_table_zip_size(index->table)
+ == buf_block_get_zip_size(rec_block));
+
+ space_id = buf_block_get_space(rec_block);
+ zip_size = buf_block_get_zip_size(rec_block);
+ rec_page_no = buf_block_get_page_no(rec_block);
+ ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+
+ error = btr_check_blob_limit(big_rec_vec);
+
+ if (error != DB_SUCCESS) {
+ ut_ad(op == BTR_STORE_INSERT);
+ return(error);
+ }
+
+ if (page_zip) {
+ int err;
+
+ /* Zlib deflate needs 128 kilobytes for the default
+ window size, plus 512 << memLevel, plus a few
+ kilobytes for small objects. We use reduced memLevel
+ to limit the memory consumption, and preallocate the
+ heap, hoping to avoid memory fragmentation. */
+ heap = mem_heap_create(250000);
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, page_zip_level,
+ Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+ }
+
+ if (btr_blob_op_is_update(op)) {
+ /* Avoid reusing pages that have been previously freed
+ in btr_mtr. */
+ if (btr_mtr->n_freed_pages) {
+ if (heap == NULL) {
+ heap = mem_heap_create(
+ btr_mtr->n_freed_pages
+ * sizeof *freed_pages);
+ }
+
+ freed_pages = static_cast<buf_block_t**>(
+ mem_heap_alloc(
+ heap,
+ btr_mtr->n_freed_pages
+ * sizeof *freed_pages));
+ n_freed_pages = 0;
+ }
+
+ /* Because btr_mtr will be committed after mtr, it is
+ possible that the tablespace has been extended when
+ the B-tree record was updated or inserted, or it will
+ be extended while allocating pages for big_rec.
+
+ TODO: In mtr (not btr_mtr), write a redo log record
+ about extending the tablespace to its current size,
+ and remember the current size. Whenever the tablespace
+ grows as pages are allocated, write further redo log
+ records to mtr. (Currently tablespace extension is not
+ covered by the redo log. If it were, the record would
+ only be written to btr_mtr, which is committed after
+ mtr.) */
+ alloc_mtr = btr_mtr;
+ } else {
+ /* Use the local mtr for allocations. */
+ alloc_mtr = &mtr;
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must either be zero or they must be pointers to inherited
+ columns, owned by this record or an earlier record version. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ /* Either this must be an update in place,
+ or the BLOB must be inherited, or the BLOB pointer
+ must be zero (will be written in this function). */
+ ut_a(op == BTR_STORE_UPDATE
+ || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ || !memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ /* We store each field to pages allocated from the leaf node
+ file segment of the index tree and put the pointer to it in rec */
+
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ field_ref = btr_rec_get_field_ref(
+ rec, offsets, big_rec_vec->fields[i].field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* A zero BLOB pointer should have been initially inserted. */
+ ut_a(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ extern_len = big_rec_vec->fields[i].len;
+ UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
+ extern_len);
+
+ ut_a(extern_len > 0);
+
+ prev_page_no = FIL_NULL;
+
+ if (page_zip) {
+ int err = deflateReset(&c_stream);
+ ut_a(err == Z_OK);
+
+ c_stream.next_in = (Bytef*)
+ big_rec_vec->fields[i].data;
+ c_stream.avail_in = static_cast<uInt>(extern_len);
+ }
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+
+ mtr_start(&mtr);
+
+ if (prev_page_no == FIL_NULL) {
+ hint_page_no = 1 + rec_page_no;
+ } else {
+ hint_page_no = prev_page_no + 1;
+ }
+
+alloc_another:
+ block = btr_page_alloc(index, hint_page_no,
+ FSP_NO_DIR, 0, alloc_mtr, &mtr);
+ if (UNIV_UNLIKELY(block == NULL)) {
+ mtr_commit(&mtr);
+ error = DB_OUT_OF_FILE_SPACE;
+ goto func_exit;
+ }
+
+ if (rw_lock_get_x_lock_count(&block->lock) > 1) {
+ /* This page must have been freed in
+ btr_mtr previously. Put it aside, and
+ allocate another page for the BLOB data. */
+ ut_ad(alloc_mtr == btr_mtr);
+ ut_ad(btr_blob_op_is_update(op));
+ ut_ad(n_freed_pages < btr_mtr->n_freed_pages);
+ freed_pages[n_freed_pages++] = block;
+ goto alloc_another;
+ }
+
+ page_no = buf_block_get_page_no(block);
+ page = buf_block_get_frame(block);
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block;
+ page_t* prev_page;
+
+ prev_block = buf_page_get(space_id, zip_size,
+ prev_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(prev_block,
+ SYNC_EXTERN_STORAGE);
+ prev_page = buf_block_get_frame(prev_block);
+
+ if (page_zip) {
+ mlog_write_ulint(
+ prev_page + FIL_PAGE_NEXT,
+ page_no, MLOG_4BYTES, &mtr);
+ memcpy(buf_block_get_page_zip(
+ prev_block)
+ ->data + FIL_PAGE_NEXT,
+ prev_page + FIL_PAGE_NEXT, 4);
+ } else {
+ mlog_write_ulint(
+ prev_page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO,
+ page_no, MLOG_4BYTES, &mtr);
+ }
+
+ } else if (dict_index_is_online_ddl(index)) {
+ row_log_table_blob_alloc(index, page_no);
+ }
+
+ if (page_zip) {
+ int err;
+ page_zip_des_t* blob_page_zip;
+
+ /* Write FIL_PAGE_TYPE to the redo log
+ separately, before logging any other
+ changes to the page, so that the debug
+ assertions in
+ recv_parse_or_apply_log_rec_body() can
+ be made simpler. Before InnoDB Plugin
+ 1.0.4, the initialization of
+ FIL_PAGE_TYPE was logged as part of
+ the mlog_log_string() below. */
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE,
+ prev_page_no == FIL_NULL
+ ? FIL_PAGE_TYPE_ZBLOB
+ : FIL_PAGE_TYPE_ZBLOB2,
+ MLOG_2BYTES, &mtr);
+
+ c_stream.next_out = page
+ + FIL_PAGE_DATA;
+ c_stream.avail_out
+ = static_cast<uInt>(page_zip_get_size(page_zip))
+ - FIL_PAGE_DATA;
+
+ err = deflate(&c_stream, Z_FINISH);
+ ut_a(err == Z_OK || err == Z_STREAM_END);
+ ut_a(err == Z_STREAM_END
+ || c_stream.avail_out == 0);
+
+ /* Write the "next BLOB page" pointer */
+ mlog_write_ulint(page + FIL_PAGE_NEXT,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+ /* Initialize the unused "prev page" pointer */
+ mlog_write_ulint(page + FIL_PAGE_PREV,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+ /* Write a back pointer to the record
+ into the otherwise unused area. This
+ information could be useful in
+ debugging. Later, we might want to
+ implement the possibility to relocate
+ BLOB pages. Then, we would need to be
+ able to adjust the BLOB pointer in the
+ record. We do not store the heap
+ number of the record, because it can
+ change in page_zip_reorganize() or
+ btr_page_reorganize(). However, also
+ the page number of the record may
+ change when B-tree nodes are split or
+ merged. */
+ mlog_write_ulint(page
+ + FIL_PAGE_FILE_FLUSH_LSN,
+ space_id,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(page
+ + FIL_PAGE_FILE_FLUSH_LSN + 4,
+ rec_page_no,
+ MLOG_4BYTES, &mtr);
+
+ /* Zero out the unused part of the page. */
+ memset(page + page_zip_get_size(page_zip)
+ - c_stream.avail_out,
+ 0, c_stream.avail_out);
+ mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
+ page_zip_get_size(page_zip)
+ - FIL_PAGE_FILE_FLUSH_LSN,
+ &mtr);
+ /* Copy the page to compressed storage,
+ because it will be flushed to disk
+ from there. */
+ blob_page_zip = buf_block_get_page_zip(block);
+ ut_ad(blob_page_zip);
+ ut_ad(page_zip_get_size(blob_page_zip)
+ == page_zip_get_size(page_zip));
+ memcpy(blob_page_zip->data, page,
+ page_zip_get_size(page_zip));
+
+ if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+ goto next_zip_page;
+ }
+
+ if (alloc_mtr == &mtr) {
+ rec_block = buf_page_get(
+ space_id, zip_size,
+ rec_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(
+ rec_block,
+ SYNC_NO_ORDER_CHECK);
+ }
+
+ if (err == Z_STREAM_END) {
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN, 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN + 4,
+ c_stream.total_in);
+ } else {
+ memset(field_ref + BTR_EXTERN_LEN,
+ 0, 8);
+ }
+
+ if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_NEXT);
+ }
+
+ page_zip_write_blob_ptr(
+ page_zip, rec, index, offsets,
+ big_rec_vec->fields[i].field_no,
+ alloc_mtr);
+
+next_zip_page:
+ prev_page_no = page_no;
+
+ /* Commit mtr and release the
+ uncompressed page frame to save memory. */
+ btr_blob_free(block, FALSE, &mtr);
+
+ if (err == Z_STREAM_END) {
+ break;
+ }
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_BLOB,
+ MLOG_2BYTES, &mtr);
+
+ if (extern_len > (UNIV_PAGE_SIZE
+ - FIL_PAGE_DATA
+ - BTR_BLOB_HDR_SIZE
+ - FIL_PAGE_DATA_END)) {
+ store_len = UNIV_PAGE_SIZE
+ - FIL_PAGE_DATA
+ - BTR_BLOB_HDR_SIZE
+ - FIL_PAGE_DATA_END;
+ } else {
+ store_len = extern_len;
+ }
+
+ mlog_write_string(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_SIZE,
+ (const byte*)
+ big_rec_vec->fields[i].data
+ + big_rec_vec->fields[i].len
+ - extern_len,
+ store_len, &mtr);
+ mlog_write_ulint(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN,
+ store_len, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+
+ extern_len -= store_len;
+
+ if (alloc_mtr == &mtr) {
+ rec_block = buf_page_get(
+ space_id, zip_size,
+ rec_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(
+ rec_block,
+ SYNC_NO_ORDER_CHECK);
+ }
+
+ mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
+ MLOG_4BYTES, alloc_mtr);
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_LEN + 4,
+ big_rec_vec->fields[i].len
+ - extern_len,
+ MLOG_4BYTES, alloc_mtr);
+
+ if (prev_page_no == FIL_NULL) {
+ btr_blob_dbg_add_blob(
+ rec, big_rec_vec->fields[i]
+ .field_no, page_no, index,
+ "store");
+
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id, MLOG_4BYTES,
+ alloc_mtr);
+
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no, MLOG_4BYTES,
+ alloc_mtr);
+
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_DATA,
+ MLOG_4BYTES,
+ alloc_mtr);
+ }
+
+ prev_page_no = page_no;
+
+ mtr_commit(&mtr);
+
+ if (extern_len == 0) {
+ break;
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("btr_store_big_rec_extern",
+ error = DB_OUT_OF_FILE_SPACE;
+ goto func_exit;);
+ }
+
+func_exit:
+ if (page_zip) {
+ deflateEnd(&c_stream);
+ }
+
+ if (n_freed_pages) {
+ ulint i;
+
+ ut_ad(alloc_mtr == btr_mtr);
+ ut_ad(btr_blob_op_is_update(op));
+
+ for (i = 0; i < n_freed_pages; i++) {
+ btr_page_free_low(index, freed_pages[i], 0, alloc_mtr);
+ }
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ /* All pointers to externally stored columns in the record
+ must be valid. */
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+
+ field_ref = btr_rec_get_field_ref(rec, offsets, i);
+
+ /* The pointer must not be zero if the operation
+ succeeded. */
+ ut_a(0 != memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)
+ || error != DB_SUCCESS);
+ /* The column must not be disowned by this record. */
+ ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ return(error);
+}
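+
+/* A sketch of what the function above produces: for compressed tables
+the first page carries FIL_PAGE_TYPE_ZBLOB and the following ones
+FIL_PAGE_TYPE_ZBLOB2, linked through FIL_PAGE_NEXT, with the deflate
+stream payload starting at FIL_PAGE_DATA. For uncompressed tables each
+FIL_PAGE_TYPE_BLOB page instead carries its own part length and
+next-page pointer in the BLOB header at FIL_PAGE_DATA, in the layout
+sketched above btr_blob_free(). */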
+
+/*******************************************************************//**
+Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
+static
+void
+btr_check_blob_fil_page_type(
+/*=========================*/
+ ulint space_id, /*!< in: space id */
+ ulint page_no, /*!< in: page number */
+ const page_t* page, /*!< in: page */
+ ibool read) /*!< in: TRUE=read, FALSE=purge */
+{
+ ulint type = fil_page_get_type(page);
+
+ ut_a(space_id == page_get_space_id(page));
+ ut_a(page_no == page_get_page_no(page));
+
+ if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
+ ulint flags = fil_space_get_flags(space_id);
+
+#ifndef UNIV_DEBUG /* Improve debug test coverage */
+ if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
+ /* Old versions of InnoDB did not initialize
+ FIL_PAGE_TYPE on BLOB pages. Do not print
+ anything about the type mismatch when reading
+ a BLOB page that is in Antelope format.*/
+ return;
+ }
+#endif /* !UNIV_DEBUG */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: FIL_PAGE_TYPE=%lu"
+ " on BLOB %s space %lu page %lu flags %lx\n",
+ (ulong) type, read ? "read" : "purge",
+ (ulong) space_id, (ulong) page_no, (ulong) flags);
+ ut_error;
+ }
+}
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field.
+In a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /*!< in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr
+ containing the latch to data and an
+ X-latch to the index tree */
+{
+ page_t* page;
+ const ulint space_id = mach_read_from_4(
+ field_ref + BTR_EXTERN_SPACE_ID);
+ const ulint start_page = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ulint rec_zip_size = dict_table_zip_size(index->table);
+ ulint ext_zip_size;
+ ulint page_no;
+ ulint next_page_no;
+ mtr_t mtr;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* In the rollback, we may encounter a clustered index
+ record with some unwritten off-page columns. There is
+ nothing to free then. */
+ ut_a(rb_ctx != RB_NONE);
+ return;
+ }
+
+ ut_ad(space_id == index->space);
+
+ if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
+ ext_zip_size = fil_space_get_zip_size(space_id);
+ /* This must be an undo log record in the system tablespace,
+ that is, in row_purge_upd_exist_or_extern().
+ Currently, externally stored records are stored in the
+ same tablespace as the referring records. */
+ ut_ad(!page_get_space_id(page_align(field_ref)));
+ ut_ad(!rec);
+ ut_ad(!page_zip);
+ } else {
+ ext_zip_size = rec_zip_size;
+ }
+
+ if (!rec) {
+ /* This is a call from row_purge_upd_exist_or_extern(). */
+ ut_ad(!page_zip);
+ rec_zip_size = 0;
+ }
+
+#ifdef UNIV_BLOB_DEBUG
+ if (!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)
+ && !((field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+ && (rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY))) {
+ /* This off-page column will be freed.
+ Check that no references remain. */
+
+ btr_blob_dbg_t b;
+
+ b.blob_page_no = start_page;
+
+ if (rec) {
+ /* Remove the reference from the record to the
+ BLOB. If the BLOB were not freed, the
+ reference would be removed when the record is
+ removed. Freeing the BLOB will overwrite the
+ BTR_EXTERN_PAGE_NO in the field_ref of the
+ record with FIL_NULL, which would make the
+ btr_blob_dbg information inconsistent with the
+ record. */
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ btr_blob_dbg_rbt_delete(index, &b, "free");
+ }
+
+ btr_blob_dbg_assert_empty(index, b.blob_page_no);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
+ for (;;) {
+#ifdef UNIV_SYNC_DEBUG
+ buf_block_t* rec_block;
+#endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* ext_block;
+
+ mtr_start(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ rec_block =
+#endif /* UNIV_SYNC_DEBUG */
+ buf_page_get(page_get_space_id(page_align(field_ref)),
+ rec_zip_size,
+ page_get_page_no(page_align(field_ref)),
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+ page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (/* There is no external storage data */
+ page_no == FIL_NULL
+ /* This field does not own the externally stored field */
+ || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_OWNER_FLAG)
+ /* Rollback and inherited field */
+ || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
+ && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_INHERITED_FLAG))) {
+
+ /* Do not free */
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ if (page_no == start_page && dict_index_is_online_ddl(index)) {
+ row_log_table_blob_free(index, start_page);
+ }
+
+ ext_block = buf_page_get(space_id, ext_zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(ext_block);
+
+ if (ext_zip_size) {
+ /* Note that page_zip will be NULL
+ in row_purge_upd_exist_or_extern(). */
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ default:
+ ut_error;
+ }
+ next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ if (page_zip != NULL) {
+ mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no);
+ mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
+ 0);
+ page_zip_write_blob_ptr(page_zip, rec, index,
+ offsets, i, &mtr);
+ } else {
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_LEN + 4, 0,
+ MLOG_4BYTES, &mtr);
+ }
+ } else {
+ ut_a(!page_zip);
+ btr_check_blob_fil_page_type(space_id, page_no, page,
+ FALSE);
+
+ next_page_no = mach_read_from_4(
+ page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO);
+
+ /* We must supply the page level (= 0) as an argument
+ because we did not store it on the page (we save the
+ space overhead of an index page header). */
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ /* Zero out the BLOB length. If the server
+ crashes during the execution of this function,
+ trx_rollback_or_clean_all_recovered() could
+ dereference the half-deleted BLOB, fetching a
+ wrong prefix for the BLOB. */
+ mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
+ 0,
+ MLOG_4BYTES, &mtr);
+ }
+
+ /* Commit mtr and release the BLOB block to save memory. */
+ btr_blob_free(ext_block, TRUE, &mtr);
+ }
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+ /* Free possible externally stored fields in the record */
+
+ ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ btr_free_externally_stored_field(
+ index, btr_rec_get_field_ref(rec, offsets, i),
+ rec, offsets, page_zip, i, rb_ctx, mtr);
+ }
+ }
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+
+ /* Free possible externally stored fields in the record */
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ const upd_field_t* ufield = upd_get_nth_field(update, i);
+
+ if (rec_offs_nth_extern(offsets, ufield->field_no)) {
+ ulint len;
+ byte* data = rec_get_nth_field(
+ rec, offsets, ufield->field_no, &len);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ btr_free_externally_stored_field(
+ index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ rec, offsets, page_zip,
+ ufield->field_no, rb_ctx, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint space_id,/*!< in: space id of the BLOB pages */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ ulint copied_len = 0;
+
+ for (;;) {
+ mtr_t mtr;
+ buf_block_t* block;
+ const page_t* page;
+ const byte* blob_header;
+ ulint part_len;
+ ulint copy_len;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(block);
+
+ btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
+
+ blob_header = page + offset;
+ part_len = btr_blob_get_part_len(blob_header);
+ copy_len = ut_min(part_len, len - copied_len);
+
+ memcpy(buf + copied_len,
+ blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+ copied_len += copy_len;
+
+ page_no = btr_blob_get_next_page_no(blob_header);
+
+ mtr_commit(&mtr);
+
+ if (page_no == FIL_NULL || copy_len != part_len) {
+ UNIV_MEM_ASSERT_RW(buf, copied_len);
+ return(copied_len);
+ }
+
+ /* On BLOB pages other than the first, the BLOB header
+ is always at the start of the page data: */
+
+ offset = FIL_PAGE_DATA;
+
+ ut_ad(copied_len <= len);
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of a compressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_zblob_prefix(
+/*==================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: compressed BLOB page size */
+ ulint space_id,/*!< in: space id of the BLOB pages */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+ mem_heap_t* heap;
+ int err;
+ z_stream d_stream;
+
+ d_stream.next_out = buf;
+ d_stream.avail_out = static_cast<uInt>(len);
+ d_stream.next_in = Z_NULL;
+ d_stream.avail_in = 0;
+
+ /* Zlib inflate needs 32 kilobytes for the default
+ window size, plus a few kilobytes for small objects. */
+ heap = mem_heap_create(40000);
+ page_zip_set_alloc(&d_stream, heap);
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_ad(space_id);
+
+ err = inflateInit(&d_stream);
+ ut_a(err == Z_OK);
+
+ for (;;) {
+ buf_page_t* bpage;
+ ulint next_page_no;
+
+ /* There is no latch on bpage directly. Instead,
+ bpage is protected by the B-tree page latch that
+ is being held on the clustered index record, or,
+ in row_merge_copy_blobs(), by an exclusive table lock. */
+ bpage = buf_page_get_zip(space_id, zip_size, page_no);
+
+ if (UNIV_UNLIKELY(!bpage)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot load"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) page_no, (ulong) space_id);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (fil_page_get_type(bpage->zip.data) != page_type)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Unexpected type %lu of"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) fil_page_get_type(bpage->zip.data),
+ (ulong) page_no, (ulong) space_id);
+ ut_ad(0);
+ goto end_of_blob;
+ }
+
+ next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+ if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+ /* When the BLOB begins at page header,
+ the compressed data payload does not
+ immediately follow the next page pointer. */
+ offset = FIL_PAGE_DATA;
+ } else {
+ offset += 4;
+ }
+
+ d_stream.next_in = bpage->zip.data + offset;
+ d_stream.avail_in = static_cast<uInt>(zip_size - offset);
+
+ err = inflate(&d_stream, Z_NO_FLUSH);
+ switch (err) {
+ case Z_OK:
+ if (!d_stream.avail_out) {
+ goto end_of_blob;
+ }
+ break;
+ case Z_STREAM_END:
+ if (next_page_no == FIL_NULL) {
+ goto end_of_blob;
+ }
+ /* fall through */
+ default:
+inflate_error:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: inflate() of"
+ " compressed BLOB"
+ " page %lu space %lu returned %d (%s)\n",
+ (ulong) page_no, (ulong) space_id,
+ err, d_stream.msg);
+ /* fall through */
+ case Z_BUF_ERROR:
+ goto end_of_blob;
+ }
+
+ if (next_page_no == FIL_NULL) {
+ if (!d_stream.avail_in) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected end of"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) page_no,
+ (ulong) space_id);
+ } else {
+ err = inflate(&d_stream, Z_FINISH);
+ switch (err) {
+ case Z_STREAM_END:
+ case Z_BUF_ERROR:
+ break;
+ default:
+ goto inflate_error;
+ }
+ }
+
+end_of_blob:
+ buf_page_release_zip(bpage);
+ goto func_exit;
+ }
+
+ buf_page_release_zip(bpage);
+
+ /* On BLOB pages other than the first, the next-page
+ pointer is stored at FIL_PAGE_NEXT in the page header: */
+
+ page_no = next_page_no;
+ offset = FIL_PAGE_NEXT;
+ page_type = FIL_PAGE_TYPE_ZBLOB2;
+ }
+
+func_exit:
+ inflateEnd(&d_stream);
+ mem_heap_free(heap);
+ UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
+ return(d_stream.total_out);
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record that points to this BLOB must be protected by a
+lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_externally_stored_field_prefix_low(
+/*========================================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint space_id,/*!< in: space id of the first BLOB page */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ if (UNIV_UNLIKELY(len == 0)) {
+ return(0);
+ }
+
+ if (zip_size) {
+ return(btr_copy_zblob_prefix(buf, len, zip_size,
+ space_id, page_no, offset));
+ } else {
+ return(btr_copy_blob_prefix(buf, len, space_id,
+ page_no, offset));
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+ byte* buf, /*!< out: the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint local_len)/*!< in: length of data, in bytes */
+{
+ ulint space_id;
+ ulint page_no;
+ ulint offset;
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(local_len >= len)) {
+ memcpy(buf, data, len);
+ return(len);
+ }
+
+ memcpy(buf, data, local_len);
+ data += local_len;
+
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
+ /* The externally stored part of the column has been
+ (partially) deleted. Signal the half-deleted BLOB
+ to the caller. */
+
+ return(0);
+ }
+
+ space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
+
+ page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
+
+ offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
+
+ return(local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ len - local_len,
+ zip_size,
+ space_id, page_no,
+ offset));
+}
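+
+/* A usage sketch (hypothetical caller; the buffer size is arbitrary
+and only for illustration):
+
+ byte prefix[200];
+ ulint n = btr_copy_externally_stored_field_prefix(
+ prefix, sizeof(prefix), zip_size, data, local_len);
+ if (n == 0) {
+ // half-deleted BLOB: treat the column as unavailable
+ }
+
+Here data and local_len would come from rec_get_nth_field() on a
+clustered index record that is protected by a lock or a page latch. */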
+
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap. The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+UNIV_INTERN
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+ ulint* len, /*!< out: length of the whole field */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint local_len,/*!< in: length of data */
+ mem_heap_t* heap) /*!< in: mem heap */
+{
+ ulint space_id;
+ ulint page_no;
+ ulint offset;
+ ulint extern_len;
+ byte* buf;
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
+
+ page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
+
+ offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
+
+ /* Currently a BLOB cannot be bigger than 4 GB; we
+ leave the 4 upper bytes in the length field unused */
+
+ extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
+
+ buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
+
+ memcpy(buf, data, local_len);
+ *len = local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ extern_len,
+ zip_size,
+ space_id,
+ page_no, offset);
+
+ return(buf);
+}
+
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ const rec_t* rec, /*!< in: record in a clustered index;
+ must be protected by a lock or a page latch */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint no, /*!< in: field number */
+ ulint* len, /*!< out: length of the field */
+ mem_heap_t* heap) /*!< in: mem heap */
+{
+ ulint local_len;
+ const byte* data;
+
+ ut_a(rec_offs_nth_extern(offsets, no));
+
+ /* An externally stored field can contain some initial
+ data from the field, and in the last 20 bytes it has the
+ space id, page number, and offset where the rest of the
+ field data is stored, and the data length in addition to
+ the data stored locally. We may need to store some data
+ locally to get the local record length above the 128 byte
+ limit so that field offsets are stored in two bytes, and
+ the extern bit is available in those two bytes. */
+
+ data = rec_get_nth_field(rec, offsets, no, &local_len);
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY
+ (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return(NULL);
+ }
+
+ return(btr_copy_externally_stored_field(len, data,
+ zip_size, local_len, heap));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
new file mode 100644
index 00000000000..82a2b6dbf6b
--- /dev/null
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -0,0 +1,595 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.cc
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void)
+/*============================*/
+{
+ btr_pcur_t* pcur;
+
+ pcur = (btr_pcur_t*) mem_alloc(sizeof(btr_pcur_t));
+
+ pcur->btr_cur.index = NULL;
+ btr_pcur_init(pcur);
+
+ return(pcur);
+}
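+
+/* A minimal lifecycle sketch for callers (hypothetical usage, not code
+from this file): a cursor allocated here must eventually be released
+with btr_pcur_free_for_mysql() below, which also frees ::old_rec_buf.
+
+	btr_pcur_t*	pcur = btr_pcur_create_for_mysql();
+
+	... open the cursor, fetch records ...
+
+	btr_pcur_free_for_mysql(pcur);
+*/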
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+UNIV_INTERN
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in, out: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.index = NULL;
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_n_fields = 0;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+}
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor) /*!< in, own: persistent cursor */
+{
+ btr_pcur_reset(cursor);
+ mem_free(cursor);
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+ ulint offs;
+
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ page_cursor = btr_pcur_get_page_cur(cursor);
+
+ rec = page_cur_get_rec(page_cursor);
+ page = page_align(rec);
+ offs = page_offset(rec);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ if (page_is_empty(page)) {
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
+
+ ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_get_page_no(page) == index->page);
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+ }
+
+ return;
+ }
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ rec = page_rec_get_prev(rec);
+
+ cursor->rel_pos = BTR_PCUR_AFTER;
+
+ } else if (page_rec_is_infimum_low(offs)) {
+
+ rec = page_rec_get_next(rec);
+
+ cursor->rel_pos = BTR_PCUR_BEFORE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_ON;
+ }
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+ cursor->old_rec = dict_index_copy_rec_order_prefix(
+ index, rec, &cursor->old_n_fields,
+ &cursor->old_rec_buf, &cursor->buf_size);
+
+ cursor->block_when_stored = block;
+ cursor->modify_clock = buf_block_get_modify_clock(block);
+}
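+
+/* A sketch of the store/restore pattern this function enables
+(hypothetical caller code; compare btr_pcur_move_backward_from_page()
+below, which uses exactly this sequence): the stored position survives
+the release of all page latches across the mtr boundary.
+
+	btr_pcur_store_position(cursor, &mtr);
+	mtr_commit(&mtr);		-- all page latches released
+
+	... do work that is illegal while holding latches ...
+
+	mtr_start(&mtr);
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, cursor, &mtr);
+*/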
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is
+ copied */
+{
+ if (pcur_receive->old_rec_buf) {
+ mem_free(pcur_receive->old_rec_buf);
+ }
+
+ ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+ if (pcur_donate->old_rec_buf) {
+
+ pcur_receive->old_rec_buf = (byte*)
+ mem_alloc(pcur_donate->buf_size);
+
+ ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+ pcur_donate->buf_size);
+ pcur_receive->old_rec = pcur_receive->old_rec_buf
+ + (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+ }
+
+ pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
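+
+/* Note on the copy above: after the raw struct copy, old_rec would
+still point into the donor's buffer, so it is rebased to the same
+byte offset inside the receiver's freshly allocated old_rec_buf. */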
+
+/**************************************************************//**
+Restores the stored position of a persistent cursor, buffer-fixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ dtuple_t* tuple;
+ ulint mode;
+ ulint old_mode;
+ mem_heap_t* heap;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ if (UNIV_UNLIKELY
+ (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+ || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
+ btr_cur_open_at_index_side(
+ cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+ index, latch_mode,
+ btr_pcur_get_btr_cur(cursor), 0, mtr);
+
+ cursor->latch_mode = latch_mode;
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->block_when_stored = btr_pcur_get_block(cursor);
+
+ return(FALSE);
+ }
+
+ ut_a(cursor->old_rec);
+ ut_a(cursor->old_n_fields);
+
+ if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF)
+ || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) {
+ /* Try optimistic restoration. */
+
+ if (buf_page_optimistic_get(latch_mode,
+ cursor->block_when_stored,
+ cursor->modify_clock,
+ file, line, mtr)) {
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->latch_mode = latch_mode;
+
+ buf_block_dbg_add_level(
+ btr_pcur_get_block(cursor),
+ dict_index_is_ibuf(index)
+ ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
+
+ if (cursor->rel_pos == BTR_PCUR_ON) {
+#ifdef UNIV_DEBUG
+ const rec_t* rec;
+ const ulint* offsets1;
+ const ulint* offsets2;
+ rec = btr_pcur_get_rec(cursor);
+
+ heap = mem_heap_create(256);
+ offsets1 = rec_get_offsets(
+ cursor->old_rec, index, NULL,
+ cursor->old_n_fields, &heap);
+ offsets2 = rec_get_offsets(
+ rec, index, NULL,
+ cursor->old_n_fields, &heap);
+
+ ut_ad(!cmp_rec_rec(cursor->old_rec,
+ rec, offsets1, offsets2,
+ index));
+ mem_heap_free(heap);
+#endif /* UNIV_DEBUG */
+ return(TRUE);
+ }
+			/* This is the same record as stored; it
+ may need to be adjusted for BTR_PCUR_BEFORE/AFTER,
+ depending on search mode and direction. */
+ if (btr_pcur_is_on_user_rec(cursor)) {
+ cursor->pos_state
+ = BTR_PCUR_IS_POSITIONED_OPTIMISTIC;
+ }
+ return(FALSE);
+ }
+ }
+
+ /* If optimistic restoration did not succeed, open the cursor anew */
+
+ heap = mem_heap_create(256);
+
+ tuple = dict_index_build_data_tuple(index, cursor->old_rec,
+ cursor->old_n_fields, heap);
+
+ /* Save the old search mode of the cursor */
+ old_mode = cursor->search_mode;
+
+ switch (cursor->rel_pos) {
+ case BTR_PCUR_ON:
+ mode = PAGE_CUR_LE;
+ break;
+ case BTR_PCUR_AFTER:
+ mode = PAGE_CUR_G;
+ break;
+ case BTR_PCUR_BEFORE:
+ mode = PAGE_CUR_L;
+ break;
+ default:
+ ut_error;
+ mode = 0;
+ }
+
+ btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+ cursor, 0, file, line, mtr);
+
+ /* Restore the old search mode */
+ cursor->search_mode = old_mode;
+
+ switch (cursor->rel_pos) {
+ case BTR_PCUR_ON:
+ if (btr_pcur_is_on_user_rec(cursor)
+ && !cmp_dtuple_rec(
+ tuple, btr_pcur_get_rec(cursor),
+ rec_get_offsets(btr_pcur_get_rec(cursor),
+ index, NULL,
+ ULINT_UNDEFINED, &heap))) {
+
+ /* We have to store the NEW value for
+ the modify clock, since the cursor can
+ now be on a different page! But we can
+ retain the value of old_rec */
+
+ cursor->block_when_stored =
+ btr_pcur_get_block(cursor);
+ cursor->modify_clock =
+ buf_block_get_modify_clock(
+ cursor->block_when_stored);
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+#ifdef UNIV_DEBUG
+ /* fall through */
+ case BTR_PCUR_BEFORE:
+ case BTR_PCUR_AFTER:
+ break;
+ default:
+ ut_error;
+#endif /* UNIV_DEBUG */
+ }
+
+ mem_heap_free(heap);
+
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
+ btr_pcur_store_position(cursor, mtr);
+
+ return(FALSE);
+}
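+
+/* Summary of how the stored relative position maps to the search mode
+used by the pessimistic restoration above:
+
+	rel_pos			mode		restored position
+	BTR_PCUR_ON		PAGE_CUR_LE	last record <= stored record
+	BTR_PCUR_AFTER		PAGE_CUR_G	first record > stored record
+	BTR_PCUR_BEFORE		PAGE_CUR_L	last record < stored record */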
+
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page. Releases the
+latch on the current page, and buffer-unfixes it. Note that there must not be
+modifications on the current page, as then the x-latch can be released only in
+mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint next_page_no;
+ ulint space;
+ ulint zip_size;
+ page_t* page;
+ buf_block_t* next_block;
+ page_t* next_page;
+
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_after_last_on_page(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ page = btr_pcur_get_page(cursor);
+ next_page_no = btr_page_get_next(page, mtr);
+ space = buf_block_get_space(btr_pcur_get_block(cursor));
+ zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor));
+
+ ut_ad(next_page_no != FIL_NULL);
+
+ next_block = btr_block_get(space, zip_size, next_page_no,
+ cursor->latch_mode,
+ btr_pcur_get_btr_cur(cursor)->index, mtr);
+ next_page = buf_block_get_frame(next_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == buf_block_get_page_no(btr_pcur_get_block(cursor)));
+#endif /* UNIV_BTR_DEBUG */
+ next_block->check_index_page_at_flush = TRUE;
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ cursor->latch_mode, mtr);
+
+ page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
+
+ page_check_dir(next_page);
+}
+
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record of the page.
+Commits mtr. Note that to prevent a possible deadlock, the operation
+first stores the position of the cursor, commits mtr, acquires the necessary
+latches and restores the cursor position again before returning. The
+alphabetical position of the cursor is guaranteed to be sensible on
+return, but it may happen that the cursor is not positioned on the last
+record of any page, because the structure of the tree may have changed
+during the time when the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first
+ record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint prev_page_no;
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint latch_mode;
+ ulint latch_mode2;
+
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_before_first_on_page(cursor));
+ ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr));
+
+ latch_mode = cursor->latch_mode;
+
+ if (latch_mode == BTR_SEARCH_LEAF) {
+
+ latch_mode2 = BTR_SEARCH_PREV;
+
+ } else if (latch_mode == BTR_MODIFY_LEAF) {
+
+ latch_mode2 = BTR_MODIFY_PREV;
+ } else {
+ latch_mode2 = 0; /* To eliminate compiler warning */
+ ut_error;
+ }
+
+ btr_pcur_store_position(cursor, mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(latch_mode2, cursor, mtr);
+
+ page = btr_pcur_get_page(cursor);
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ if (prev_page_no == FIL_NULL) {
+ } else if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ latch_mode, mtr);
+
+ page_cur_set_after_last(prev_block,
+ btr_pcur_get_page_cur(cursor));
+ } else {
+
+ /* The repositioned cursor did not end on an infimum record on
+ a page. Cursor repositioning acquired a latch also on the
+ previous page, but we do not need the latch: release it. */
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(prev_block, latch_mode, mtr);
+ }
+
+ cursor->latch_mode = latch_mode;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ if (btr_pcur_is_before_first_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_backward_from_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_prev_on_page(cursor);
+
+ return(TRUE);
+}
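+
+/* A minimal backward-scan sketch (hypothetical caller): keep calling
+btr_pcur_move_to_prev() until it reports that the cursor has reached
+'before first in tree'.
+
+	while (btr_pcur_move_to_prev(cursor, &mtr)) {
+		rec = btr_pcur_get_rec(cursor);
+		... process rec ...
+	}
+*/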
+
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition; in the case of PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
+ file, line, mtr);
+
+ if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ btr_pcur_move_to_next_user_rec(cursor, mtr);
+ }
+ } else {
+ ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L));
+
+ /* Not implemented yet */
+
+ ut_error;
+ }
+}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
new file mode 100644
index 00000000000..df70f8a1130
--- /dev/null
+++ b/storage/innobase/btr/btr0sea.cc
@@ -0,0 +1,2020 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file btr/btr0sea.cc
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "btr0sea.h"
+#ifdef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "ha0ha.h"
+
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch. */
+UNIV_INTERN char btr_search_enabled = TRUE;
+
+/** A dummy variable to fool the compiler */
+UNIV_INTERN ulint btr_search_this_is_zero = 0;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+UNIV_INTERN ulint btr_search_n_succ = 0;
+/** Number of failed adaptive hash index lookups */
+UNIV_INTERN ulint btr_search_n_hash_fail = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** padding to prevent other memory update
+hotspots from residing on the same memory
+cache line as btr_search_latch */
+UNIV_INTERN byte btr_sea_pad1[64];
+
+/** The latch protecting the adaptive search system: this latch protects the
+(1) positions of records on those pages where a hash index has been built.
+NOTE: It does not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+/* We will allocate the latch from dynamic memory to get it to the
+same DRAM page as other hotspot semaphores */
+UNIV_INTERN rw_lock_t* btr_search_latch_temp;
+
+/** padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte btr_sea_pad2[64];
+
+/** The adaptive hash index */
+UNIV_INTERN btr_search_sys_t* btr_search_sys;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register btr_search_sys with performance schema */
+UNIV_INTERN mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** If the number of records on the page divided by this parameter
+would have been successfully accessed using a hash index, the index
+is then built on the page, assuming the global limit has been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT 100
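+
+/* A worked example of these two limits (see
+btr_search_update_block_hash_info() below): on a page holding 160
+records, building a page hash index is recommended only after more
+than 160 / BTR_SEARCH_PAGE_BUILD_LIMIT = 10 searches on this page
+would have succeeded with the current recommended prefix, and the
+index-wide count of potentially successful hash searches has reached
+BTR_SEARCH_BUILD_LIMIT = 100. */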
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /*!< in: index for which to build, or NULL if
+ not known */
+ buf_block_t* block, /*!< in: index page, s- or x-latched */
+ ulint n_fields,/*!< in: hash this many full fields */
+ ulint n_bytes,/*!< in: hash this many bytes from the next
+ field */
+ ibool left_side);/*!< in: hash for searches from left side? */
+
+/*****************************************************************//**
+This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for the hash table heap in the btr
+search system. If not, allocates a free frame for the heap. This check makes
+it probable that, when we have reserved the btr search system latch and we
+need to allocate a new node to the hash table, it will succeed. However,
+the check does not guarantee success. */
+static
+void
+btr_search_check_free_space_in_heap(void)
+/*=====================================*/
+{
+ hash_table_t* table;
+ mem_heap_t* heap;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = btr_search_sys->hash_index;
+
+ heap = table->heap;
+
+ /* Note that we peek the value of heap->free_block without reserving
+	the latch: this is ok, because we do not guarantee that there will
+ be enough free space in the hash table. */
+
+ if (heap->free_block == NULL) {
+ buf_block_t* block = buf_block_alloc(NULL);
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (heap->free_block == NULL) {
+ heap->free_block = block;
+ } else {
+ buf_block_free(block);
+ }
+
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+}
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size) /*!< in: hash index hash table size */
+{
+ /* We allocate the search latch from dynamic memory:
+ see above at the global variable definition */
+
+ btr_search_latch_temp = (rw_lock_t*) mem_alloc(sizeof(rw_lock_t));
+
+ rw_lock_create(btr_search_latch_key, &btr_search_latch,
+ SYNC_SEARCH_SYS);
+
+ btr_search_sys = (btr_search_sys_t*)
+ mem_alloc(sizeof(btr_search_sys_t));
+
+ btr_search_sys->hash_index = ha_create(hash_size, 0,
+ MEM_HEAP_FOR_BTR_SEARCH, 0);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ btr_search_sys->hash_index->adaptive = TRUE;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+}
+
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void)
+/*=====================*/
+{
+ rw_lock_free(&btr_search_latch);
+ mem_free(btr_search_latch_temp);
+ btr_search_latch_temp = NULL;
+ mem_heap_free(btr_search_sys->hash_index->heap);
+ hash_table_free(btr_search_sys->hash_index);
+ mem_free(btr_search_sys);
+ btr_search_sys = NULL;
+}
+
+/********************************************************************//**
+Set index->ref_count = 0 on all indexes of a table. */
+static
+void
+btr_search_disable_ref_count(
+/*=========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ dict_index_t* index;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ for (index = dict_table_get_first_index(table); index;
+ index = dict_table_get_next_index(index)) {
+
+ index->search_info->ref_count = 0;
+ }
+}
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void)
+/*====================*/
+{
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+ rw_lock_x_lock(&btr_search_latch);
+
+ btr_search_enabled = FALSE;
+
+ /* Clear the index->search_info->ref_count of every index in
+ the data dictionary cache. */
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ btr_search_disable_ref_count(table);
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU); table;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ btr_search_disable_ref_count(table);
+ }
+
+ mutex_exit(&dict_sys->mutex);
+
+ /* Set all block->index = NULL. */
+ buf_pool_clear_hash_index();
+
+ /* Clear the adaptive hash index. */
+ hash_table_clear(btr_search_sys->hash_index);
+ mem_heap_empty(btr_search_sys->hash_index->heap);
+
+ rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void)
+/*====================*/
+{
+ rw_lock_x_lock(&btr_search_latch);
+
+ btr_search_enabled = TRUE;
+
+ rw_lock_x_unlock(&btr_search_latch);
+}
+
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ btr_search_t* info;
+
+ info = (btr_search_t*) mem_heap_alloc(heap, sizeof(btr_search_t));
+
+#ifdef UNIV_DEBUG
+ info->magic_n = BTR_SEARCH_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+ info->ref_count = 0;
+ info->root_guess = NULL;
+
+ info->hash_analysis = 0;
+ info->n_hash_potential = 0;
+
+ info->last_hash_succ = FALSE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ = 0;
+ info->n_hash_fail = 0;
+ info->n_patt_succ = 0;
+ info->n_searches = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ /* Set some sensible values */
+ info->n_fields = 1;
+ info->n_bytes = 0;
+
+ info->left_side = TRUE;
+
+ return(info);
+}
+
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+ btr_search_t* info) /*!< in: search info. */
+{
+ ulint ret;
+
+ ut_ad(info);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(&btr_search_latch);
+ ret = info->ref_count;
+ rw_lock_s_unlock(&btr_search_latch);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Updates the search info of an index about hash successes. NOTE that info
+is NOT protected by any semaphore, to save CPU time! Do not assume its fields
+are consistent. */
+static
+void
+btr_search_info_update_hash(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ dict_index_t* index;
+ ulint n_unique;
+ int cmp;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = cursor->index;
+
+ if (dict_index_is_ibuf(index)) {
+ /* So many deletes are performed on an insert buffer tree
+ that we do not consider a hash index useful on it: */
+
+ return;
+ }
+
+ n_unique = dict_index_get_n_unique_in_tree(index);
+
+ if (info->n_hash_potential == 0) {
+
+ goto set_new_recomm;
+ }
+
+ /* Test if the search would have succeeded using the recommended
+ hash prefix */
+
+ if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+ info->n_hash_potential++;
+
+ return;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->low_match, cursor->low_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto set_new_recomm;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->up_match, cursor->up_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto increment_potential;
+ }
+
+set_new_recomm:
+ /* We have to set a new recommendation; skip the hash analysis
+ for a while to avoid unnecessary CPU time usage when there is no
+ chance for success */
+
+ info->hash_analysis = 0;
+
+ cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+ cursor->low_match, cursor->low_bytes);
+ if (cmp == 0) {
+ info->n_hash_potential = 0;
+
+ /* For extra safety, we set some sensible values here */
+
+ info->n_fields = 1;
+ info->n_bytes = 0;
+
+ info->left_side = TRUE;
+
+ } else if (cmp > 0) {
+ info->n_hash_potential = 1;
+
+ if (cursor->up_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match < cursor->up_match) {
+
+ info->n_fields = cursor->low_match + 1;
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = cursor->low_match;
+ info->n_bytes = cursor->low_bytes + 1;
+ }
+
+ info->left_side = TRUE;
+ } else {
+ info->n_hash_potential = 1;
+
+ if (cursor->low_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match > cursor->up_match) {
+
+ info->n_fields = cursor->up_match + 1;
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = cursor->up_match;
+ info->n_bytes = cursor->up_bytes + 1;
+ }
+
+ info->left_side = FALSE;
+ }
+}
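+
+/* A worked example of the recommendation logic above (hypothetical
+numbers): if a search ends with cursor->low_match = 2 and
+cursor->up_match = 4 (and n_unique > 4), then cmp > 0 and the new
+recommendation is n_fields = low_match + 1 = 3, n_bytes = 0,
+left_side = TRUE: a 3-field prefix is the shortest one that matches
+the searched tuple on the upper side of the cursor but not on the
+lower side. */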
+
+/*********************************************************************//**
+Updates the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@return TRUE if building a (new) hash index on the block is recommended */
+static
+ibool
+btr_search_update_block_hash_info(
+/*==============================*/
+ btr_search_t* info, /*!< in: search info */
+ buf_block_t* block, /*!< in: buffer block */
+ btr_cur_t* cursor __attribute__((unused)))
+ /*!< in: cursor */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&block->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(cursor);
+
+ info->last_hash_succ = FALSE;
+
+ ut_a(buf_block_state_valid(block));
+ ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
+
+ if ((block->n_hash_helps > 0)
+ && (info->n_hash_potential > 0)
+ && (block->n_fields == info->n_fields)
+ && (block->n_bytes == info->n_bytes)
+ && (block->left_side == info->left_side)) {
+
+ if ((block->index)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+
+ /* The search would presumably have succeeded using
+ the hash index */
+
+ info->last_hash_succ = TRUE;
+ }
+
+ block->n_hash_helps++;
+ } else {
+ block->n_hash_helps = 1;
+ block->n_fields = info->n_fields;
+ block->n_bytes = info->n_bytes;
+ block->left_side = info->left_side;
+ }
+
+#ifdef UNIV_DEBUG
+ if (cursor->index->table->does_not_fit_in_memory) {
+ block->n_hash_helps = 0;
+ }
+#endif /* UNIV_DEBUG */
+
+ if ((block->n_hash_helps > page_get_n_recs(block->frame)
+ / BTR_SEARCH_PAGE_BUILD_LIMIT)
+ && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
+
+ if ((!block->index)
+ || (block->n_hash_helps
+ > 2 * page_get_n_recs(block->frame))
+ || (block->n_fields != block->curr_n_fields)
+ || (block->n_bytes != block->curr_n_bytes)
+ || (block->left_side != block->curr_left_side)) {
+
+ /* Build a new hash index on the page */
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index. */
+static
+void
+btr_search_update_hash_ref(
+/*=======================*/
+ btr_search_t* info, /*!< in: search info */
+ buf_block_t* block, /*!< in: buffer block where cursor positioned */
+ btr_cur_t* cursor) /*!< in: cursor */
+{
+ dict_index_t* index;
+ ulint fold;
+ const rec_t* rec;
+
+ ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(page_align(btr_cur_get_rec(cursor))
+ == buf_block_get_frame(block));
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_a(index == cursor->index);
+ ut_a(!dict_index_is_ibuf(index));
+
+ if ((info->n_hash_potential > 0)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return;
+ }
+
+ fold = rec_fold(rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields,
+ block->curr_n_bytes, index->id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ha_insert_for_fold(btr_search_sys->hash_index, fold,
+ block, rec);
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ buf_block_t* block;
+ ibool build_index;
+ ulint* params;
+ ulint* params2;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+
+ /* NOTE that the following two function calls do NOT protect
+ info or block->n_fields etc. with any semaphore, to save CPU time!
+ We cannot assume the fields are consistent when we return from
+ those functions! */
+
+ btr_search_info_update_hash(info, cursor);
+
+ build_index = btr_search_update_block_hash_info(info, block, cursor);
+
+ if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) {
+
+ btr_search_check_free_space_in_heap();
+ }
+
+ if (cursor->flag == BTR_CUR_HASH_FAIL) {
+ /* Update the hash node reference, if appropriate */
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_hash_fail++;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ btr_search_update_hash_ref(info, block, cursor);
+
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ if (build_index) {
+ /* Note that since we did not protect block->n_fields etc.
+ with any semaphore, the values can be inconsistent. We have
+ to check inside the function call that they make sense. We
+ also malloc an array and store the values there to make sure
+ the compiler does not let the function call parameters change
+ inside the called function. It might be that the compiler
+ would optimize the call just to pass pointers to block. */
+
+ params = (ulint*) mem_alloc(3 * sizeof(ulint));
+ params[0] = block->n_fields;
+ params[1] = block->n_bytes;
+ params[2] = block->left_side;
+
+ /* Make sure the compiler cannot deduce the values and do
+ optimizations */
+
+ params2 = params + btr_search_this_is_zero;
+
+ btr_search_build_page_hash_index(cursor->index,
+ block,
+ params2[0],
+ params2[1],
+ params2[2]);
+ mem_free(params);
+ }
+}
+
+/******************************************************************//**
+Checks if a guessed position for a tree cursor is right. Note that if
+mode is PAGE_CUR_LE, which is used in inserts, and the function returns
+TRUE, then cursor->up_match and cursor->low_match both have sensible values.
+@return TRUE if success */
+static
+ibool
+btr_search_check_guess(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: guessed cursor position */
+ ibool can_only_compare_to_cursor_rec,
+ /*!< in: if we do not have a latch on the page
+ of cursor, but only a latch on
+ btr_search_latch, then ONLY the columns
+ of the record UNDER the cursor are
+ protected, not the next or previous record
+ in the chain: we cannot look at the next or
+ previous record to check our guess! */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+ or PAGE_CUR_GE */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ rec_t* rec;
+ ulint n_unique;
+ ulint match;
+ ulint bytes;
+ int cmp;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool success = FALSE;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique_in_tree(cursor->index);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ match = 0;
+ bytes = 0;
+
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, rec,
+ offsets, &match, &bytes);
+
+ if (mode == PAGE_CUR_GE) {
+ if (cmp == 1) {
+ goto exit_func;
+ }
+
+ cursor->up_match = match;
+
+ if (match >= n_unique) {
+ success = TRUE;
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_LE) {
+ if (cmp == -1) {
+ goto exit_func;
+ }
+
+ cursor->low_match = match;
+
+ } else if (mode == PAGE_CUR_G) {
+ if (cmp != -1) {
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_L) {
+ if (cmp != 1) {
+ goto exit_func;
+ }
+ }
+
+ if (can_only_compare_to_cursor_rec) {
+ /* Since we could not determine if our guess is right just by
+ looking at the record under the cursor, return FALSE */
+ goto exit_func;
+ }
+
+ match = 0;
+ bytes = 0;
+
+ if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
+ rec_t* prev_rec;
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ prev_rec = page_rec_get_prev(rec);
+
+ if (page_rec_is_infimum(prev_rec)) {
+ success = btr_page_get_prev(page_align(prev_rec), mtr)
+ == FIL_NULL;
+
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(prev_rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec,
+ offsets, &match, &bytes);
+ if (mode == PAGE_CUR_GE) {
+ success = cmp == 1;
+ } else {
+ success = cmp != -1;
+ }
+
+ goto exit_func;
+ } else {
+ rec_t* next_rec;
+
+ ut_ad(!page_rec_is_supremum(rec));
+
+ next_rec = page_rec_get_next(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+ if (btr_page_get_next(page_align(next_rec), mtr)
+ == FIL_NULL) {
+
+ cursor->up_match = 0;
+ success = TRUE;
+ }
+
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec,
+ offsets, &match, &bytes);
+ if (mode == PAGE_CUR_LE) {
+ success = cmp == -1;
+ cursor->up_match = match;
+ } else {
+ success = cmp != 1;
+ }
+ }
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
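+
+/* In outline, the guess is accepted above only when the neighbouring
+record agrees with it (a summary of the code): for PAGE_CUR_GE the
+tuple must be strictly greater than the previous record (for
+PAGE_CUR_G, at least equal to it), or the cursor must sit on the
+leftmost user record of the index; symmetrically, for PAGE_CUR_LE the
+tuple must be strictly less than the next record (for PAGE_CUR_L, at
+most equal to it), or the cursor must sit on the rightmost user
+record of the index. */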
+
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ btr_search_t* info, /*!< in: index search info */
+ const dtuple_t* tuple, /*!< in: logical record */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that only if has_search_latch
+ is 0, we will have a latch set on
+ the cursor page, otherwise we assume
+					the caller uses its search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< out: tree cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_pool_t* buf_pool;
+ buf_block_t* block;
+ const rec_t* rec;
+ ulint fold;
+ index_id_t index_id;
+#ifdef notdefined
+ btr_cur_t cursor2;
+ btr_pcur_t pcur;
+#endif
+ ut_ad(index && info && tuple && cursor && mtr);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad((latch_mode == BTR_SEARCH_LEAF)
+ || (latch_mode == BTR_MODIFY_LEAF));
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ if (UNIV_UNLIKELY(info->n_hash_potential == 0)) {
+
+ return(FALSE);
+ }
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple)
+ < cursor->n_fields + (cursor->n_bytes > 0))) {
+
+ return(FALSE);
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(!btr_search_enabled)) {
+ goto failure_unlock;
+ }
+ }
+
+ ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX);
+ ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0);
+
+ rec = (rec_t*) ha_search_and_get_data(btr_search_sys->hash_index, fold);
+
+ if (UNIV_UNLIKELY(!rec)) {
+ goto failure_unlock;
+ }
+
+ block = buf_block_align(rec);
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+
+ if (UNIV_UNLIKELY(
+ !buf_page_get_known_nowait(latch_mode, block,
+ BUF_MAKE_YOUNG,
+ __FILE__, __LINE__,
+ mtr))) {
+ goto failure_unlock;
+ }
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+ }
+
+ if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+
+ btr_leaf_page_release(block, latch_mode, mtr);
+ }
+
+ goto failure;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, (rec_t*) rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on btr_search_latch, not on the
+ page, it only protects the columns of the record the cursor
+is positioned on. We cannot look at the next or the previous
+ record to determine if our guess for the cursor position is
+ right. */
+ if (UNIV_UNLIKELY(index_id != btr_page_get_index_id(block->frame))
+ || !btr_search_check_guess(cursor,
+ has_search_latch,
+ tuple, mode, mtr)) {
+ if (UNIV_LIKELY(!has_search_latch)) {
+ btr_leaf_page_release(block, latch_mode, mtr);
+ }
+
+ goto failure;
+ }
+
+ if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) {
+
+ info->n_hash_potential++;
+ }
+
+#ifdef notdefined
+ /* These lines of code can be used in a debug version to check
+ the correctness of the searched cursor position: */
+
+ info->last_hash_succ = FALSE;
+
+ /* Currently, does not work if the following fails: */
+ ut_ad(!has_search_latch);
+
+ btr_leaf_page_release(block, latch_mode, mtr);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ &cursor2, 0, mtr);
+ if (mode == PAGE_CUR_GE
+ && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) {
+
+ /* If mode is PAGE_CUR_GE, then the binary search
+ in the index tree may actually take us to the supremum
+ of the previous page */
+
+ info->last_hash_succ = FALSE;
+
+ btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode,
+ &pcur, mtr);
+ ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
+ } else {
+ ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+ }
+
+ /* NOTE that it is theoretically possible that the above assertions
+ fail if the page of the cursor gets removed from the buffer pool
+ meanwhile! Thus it might not be a bug. */
+#endif
+ info->last_hash_succ = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_succ++;
+#endif
+ if (UNIV_LIKELY(!has_search_latch)
+ && buf_page_peek_if_too_old(&block->page)) {
+
+ buf_page_make_young(&block->page);
+ }
+
+ /* Increment the page get statistics though we did not really
+ fix the page: for user info only */
+ buf_pool = buf_pool_from_bpage(&block->page);
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+
+ /*-------------------------------------------*/
+failure_unlock:
+ if (UNIV_LIKELY(!has_search_latch)) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+failure:
+ cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_fail++;
+
+ if (info->n_hash_succ > 0) {
+ info->n_hash_succ--;
+ }
+#endif
+ info->last_hash_succ = FALSE;
+
+ return(FALSE);
+}
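+
+/* The fast path above, in outline (a summary of the code, not an API
+contract):
+
+	fold = dtuple_fold(tuple, n_fields, n_bytes, index->id);
+	rec  = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+	then buffer-fix the block containing rec and verify the guess
+	with btr_search_check_guess() before trusting the position */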
+
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ buf_block_t* block) /*!< in: block containing index page,
+ s- or x-latched, or an index page
+ for which we know that
+ block->buf_fix_count == 0 or it is an
+ index page which has already been
+ removed from the buf_pool->page_hash
+ i.e.: it is in state
+ BUF_BLOCK_REMOVE_HASH */
+{
+ hash_table_t* table;
+ ulint n_fields;
+ ulint n_bytes;
+ const page_t* page;
+ const rec_t* rec;
+ ulint fold;
+ ulint prev_fold;
+ index_id_t index_id;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ ulint i;
+ mem_heap_t* heap;
+ const dict_index_t* index;
+ ulint* offsets;
+ btr_search_t* info;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Do a dirty check on block->index, return if the block is
+	not in the adaptive hash index. This is to avoid acquiring the
+	shared btr_search_latch, for performance reasons. */
+ if (!block->index) {
+ return;
+ }
+
+retry:
+ rw_lock_s_lock(&btr_search_latch);
+ index = block->index;
+
+ if (UNIV_LIKELY(!index)) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ return;
+ }
+
+ ut_a(!dict_index_is_ibuf(index));
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ /* The index is being created (bulk loaded). */
+ case ONLINE_INDEX_COMPLETE:
+ /* The index has been published. */
+ case ONLINE_INDEX_ABORTED:
+ /* Either the index creation was aborted due to an
+ error observed by InnoDB (in which case there should
+ not be any adaptive hash index entries), or it was
+ completed and then flagged aborted in
+ rollback_inplace_alter_table(). */
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* The index should have been dropped from the tablespace
+ already, and the adaptive hash index entries should have
+ been dropped as well. */
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ table = btr_search_sys->hash_index;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX)
+ || block->page.buf_fix_count == 0
+ || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
+#endif /* UNIV_SYNC_DEBUG */
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+
+ /* NOTE: The fields of block must not be accessed after
+ releasing btr_search_latch, as the index page might only
+ be s-latched! */
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ ut_a(n_fields + n_bytes > 0);
+
+ page = block->frame;
+ n_recs = page_get_n_recs(page);
+
+ /* Calculate and cache fold values into an array for fast deletion
+ from the hash index */
+
+ folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
+
+ n_cached = 0;
+
+ rec = page_get_infimum_rec(page);
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+ index_id = btr_page_get_index_id(page);
+
+ ut_a(index_id == index->id);
+
+ prev_fold = 0;
+
+ heap = NULL;
+ offsets = NULL;
+
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+ if (fold == prev_fold && prev_fold != 0) {
+
+ goto next_rec;
+ }
+
+ /* Remove all hash nodes pointing to this page from the
+ hash chain */
+
+ folds[n_cached] = fold;
+ n_cached++;
+next_rec:
+ rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ prev_fold = fold;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(!block->index)) {
+ /* Someone else has meanwhile dropped the hash index */
+
+ goto cleanup;
+ }
+
+ ut_a(block->index == index);
+
+ if (UNIV_UNLIKELY(block->curr_n_fields != n_fields)
+ || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) {
+
+ /* Someone else has meanwhile built a new hash index on the
+ page, with different parameters */
+
+ rw_lock_x_unlock(&btr_search_latch);
+
+ mem_free(folds);
+ goto retry;
+ }
+
+ for (i = 0; i < n_cached; i++) {
+
+ ha_remove_all_nodes_to_page(table, folds[i], page);
+ }
+
+ info = btr_search_get_info(block->index);
+ ut_a(info->ref_count > 0);
+ info->ref_count--;
+
+ block->index = NULL;
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
+
+cleanup:
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (UNIV_UNLIKELY(block->n_pointers)) {
+ /* Corruption */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Corruption of adaptive hash index."
+ " After dropping\n"
+ "InnoDB: the hash index to a page of %s,"
+ " still %lu hash nodes remain.\n",
+ index->name, (ulong) block->n_pointers);
+ rw_lock_x_unlock(&btr_search_latch);
+
+ ut_ad(btr_search_validate());
+ } else {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ rw_lock_x_unlock(&btr_search_latch);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ mem_free(folds);
+}
+
+/********************************************************************//**
+Drops a possible page hash index when a page is evicted from the buffer pool
+or freed in a file segment. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no) /*!< in: page number */
+{
+ buf_block_t* block;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* If the caller has a latch on the page, then the caller must
+ have a x-latch on the page and it must have already dropped
+ the hash index for the page. Because of the x-latch that we
+ are possibly holding, we cannot s-latch the page, but must
+ (recursively) x-latch it, even though we are only reading. */
+
+ block = buf_page_get_gen(space, zip_size, page_no, RW_X_LATCH, NULL,
+ BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr);
+
+ if (block && block->index) {
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+
+ btr_search_drop_page_hash_index(block);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /*!< in: index for which to build */
+ buf_block_t* block, /*!< in: index page, s- or x-latched */
+ ulint n_fields,/*!< in: hash this many full fields */
+ ulint n_bytes,/*!< in: hash this many bytes from the next
+ field */
+ ibool left_side)/*!< in: hash for searches from left side? */
+{
+ hash_table_t* table;
+ page_t* page;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint fold;
+ ulint next_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ rec_t** recs;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index);
+ ut_a(!dict_index_is_ibuf(index));
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(&btr_search_latch);
+
+ if (!btr_search_enabled) {
+ rw_lock_s_unlock(&btr_search_latch);
+ return;
+ }
+
+ table = btr_search_sys->hash_index;
+ page = buf_block_get_frame(block);
+
+ if (block->index && ((block->curr_n_fields != n_fields)
+ || (block->curr_n_bytes != n_bytes)
+ || (block->curr_left_side != left_side))) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ btr_search_drop_page_hash_index(block);
+ } else {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields + n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index) < n_fields
+ || (dict_index_get_n_unique_in_tree(index) == n_fields
+ && n_bytes > 0)) {
+ return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = (ulint*) mem_alloc(n_recs * sizeof(ulint));
+ recs = (rec_t**) mem_alloc(n_recs * sizeof(rec_t*));
+
+ n_cached = 0;
+
+ ut_a(index->id == btr_page_get_index_id(page));
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+
+ if (!page_rec_is_supremum(rec)) {
+ ut_a(n_fields <= rec_offs_n_fields(offsets));
+
+ if (n_bytes > 0) {
+ ut_a(n_fields < rec_offs_n_fields(offsets));
+ }
+ }
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ for (;;) {
+ next_rec = page_rec_get_next(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap();
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(!btr_search_enabled)) {
+ goto exit_func;
+ }
+
+ if (block->index && ((block->curr_n_fields != n_fields)
+ || (block->curr_n_bytes != n_bytes)
+ || (block->curr_left_side != left_side))) {
+ goto exit_func;
+ }
+
+ /* This counter is decremented every time we drop page
+ hash index entries and is incremented here. Since we can
+ rebuild hash index for a page that is already hashed, we
+ have to take care not to increment the counter in that
+ case. */
+ if (!block->index) {
+ index->search_info->ref_count++;
+ }
+
+ block->n_hash_helps = 0;
+
+ block->curr_n_fields = n_fields;
+ block->curr_n_bytes = n_bytes;
+ block->curr_left_side = left_side;
+ block->index = index;
+
+ for (i = 0; i < n_cached; i++) {
+
+ ha_insert_for_fold(table, folds[i], block, recs[i]);
+ }
+
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED);
+ MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
+exit_func:
+ rw_lock_x_unlock(&btr_search_latch);
+
+ mem_free(folds);
+ mem_free(recs);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
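+
+/*
+Illustrative sketch (not part of the original commit): the function
+above follows a two-phase pattern. The fold values are computed while
+holding only the page latch, and btr_search_latch is taken in
+exclusive mode only for the final batch insert:
+
+	compute folds[0..n_cached) and recs[0..n_cached);
+	rw_lock_x_lock(&btr_search_latch);
+	for (i = 0; i < n_cached; i++) {
+		ha_insert_for_fold(table, folds[i], block, recs[i]);
+	}
+	rw_lock_x_unlock(&btr_search_latch);
+
+This keeps the exclusive hold time on the contended btr_search_latch
+short.
+*/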
+
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_block is already
+hashed, then the hash index for block, if any, is dropped. If new_block is
+not hashed, and block is hashed, then a new hash index is built for
+new_block with the same parameters as for block (this often happens when a
+page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ buf_block_t* new_block, /*!< in: records are copied
+ to this page */
+ buf_block_t* block, /*!< in: index page from which
+ records were copied, and the
+ copied records will be deleted
+ from this page */
+ dict_index_t* index) /*!< in: record descriptor */
+{
+ ulint n_fields;
+ ulint n_bytes;
+ ibool left_side;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(&btr_search_latch);
+
+ ut_a(!new_block->index || new_block->index == index);
+ ut_a(!block->index || block->index == index);
+ ut_a(!(new_block->index || block->index)
+ || !dict_index_is_ibuf(index));
+
+ if (new_block->index) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ btr_search_drop_page_hash_index(block);
+
+ return;
+ }
+
+ if (block->index) {
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ left_side = block->curr_left_side;
+
+ new_block->n_fields = block->curr_n_fields;
+ new_block->n_bytes = block->curr_n_bytes;
+ new_block->left_side = left_side;
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ ut_a(n_fields + n_bytes > 0);
+
+ btr_search_build_page_hash_index(index, new_block, n_fields,
+ n_bytes, left_side);
+ ut_ad(n_fields == block->curr_n_fields);
+ ut_ad(n_bytes == block->curr_n_bytes);
+ ut_ad(left_side == block->curr_left_side);
+ return;
+ }
+
+ rw_lock_s_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ const rec_t* rec;
+ ulint fold;
+ dict_index_t* index;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap = NULL;
+ rec_offs_init(offsets_);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_a(index == cursor->index);
+ ut_a(block->curr_n_fields + block->curr_n_bytes > 0);
+ ut_a(!dict_index_is_ibuf(index));
+
+ table = btr_search_sys->hash_index;
+
+ rec = btr_cur_get_rec(cursor);
+
+ fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields, block->curr_n_bytes, index->id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (block->index) {
+ ut_a(block->index == index);
+
+ if (ha_search_and_delete_if_found(table, fold, rec)) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED);
+ } else {
+ MONITOR_INC(
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND);
+ }
+ }
+
+ rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page.
+If the cursor was positioned by the hash index and its parameters match
+those used to build the page hash index, the existing hash node is updated
+in place to point to the new record; otherwise the work is delegated to
+btr_search_update_hash_on_insert(). */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ dict_index_t* index;
+ rec_t* rec;
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ ut_a(cursor->index == index);
+ ut_a(!dict_index_is_ibuf(index));
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (!block->index) {
+
+ goto func_exit;
+ }
+
+ ut_a(block->index == index);
+
+ if ((cursor->flag == BTR_CUR_HASH)
+ && (cursor->n_fields == block->curr_n_fields)
+ && (cursor->n_bytes == block->curr_n_bytes)
+ && !block->curr_left_side) {
+
+ table = btr_search_sys->hash_index;
+
+ if (ha_search_and_update_if_found(
+ table, cursor->fold, rec, block,
+ page_rec_get_next(rec))) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+ }
+
+func_exit:
+ rw_lock_x_unlock(&btr_search_latch);
+ } else {
+ rw_lock_x_unlock(&btr_search_latch);
+
+ btr_search_update_hash_on_insert(cursor);
+ }
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ dict_index_t* index;
+ const rec_t* rec;
+ const rec_t* ins_rec;
+ const rec_t* next_rec;
+ ulint fold;
+ ulint ins_fold;
+	ulint		next_fold = 0; /* initialized only to silence a
+				compiler warning; it is assigned before
+				use unless next_rec is the supremum */
+ ulint n_fields;
+ ulint n_bytes;
+ ibool left_side;
+ ibool locked = FALSE;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = block->index;
+
+ if (!index) {
+
+ return;
+ }
+
+ btr_search_check_free_space_in_heap();
+
+ table = btr_search_sys->hash_index;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_a(index == cursor->index);
+ ut_a(!dict_index_is_ibuf(index));
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ left_side = block->curr_left_side;
+
+ ins_rec = page_rec_get_next_const(rec);
+ next_rec = page_rec_get_next_const(ins_rec);
+
+ offsets = rec_get_offsets(ins_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id);
+
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+ }
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+ } else {
+ if (left_side) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+
+ if (!btr_search_enabled) {
+ goto function_exit;
+ }
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+
+ goto check_next_rec;
+ }
+
+ if (fold != ins_fold) {
+
+ if (!locked) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+
+ if (!btr_search_enabled) {
+ goto function_exit;
+ }
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(table, fold, block, rec);
+ } else {
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+ }
+
+check_next_rec:
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ if (!locked) {
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+
+ if (!btr_search_enabled) {
+ goto function_exit;
+ }
+ }
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+
+ goto function_exit;
+ }
+
+ if (ins_fold != next_fold) {
+
+ if (!locked) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+
+ if (!btr_search_enabled) {
+ goto function_exit;
+ }
+ }
+
+ if (!left_side) {
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ /*
+ fputs("Hash insert for ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " fold %lu\n", ins_fold);
+ */
+ } else {
+ ha_insert_for_fold(table, next_fold, block, next_rec);
+ }
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (locked) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+}
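+
+/*
+Illustrative example (not part of the original commit): when
+left_side == FALSE, each group of records sharing a fold value is
+represented in the hash index by its rightmost member. If the new
+record ins_rec is inserted between rec and next_rec and its fold
+differs from both neighbours, the code above inserts rec under fold
+(rec has become the last member of its group) and ins_rec under
+ins_fold, preserving that invariant. When left_side == TRUE the
+leftmost member represents the group instead.
+*/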
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void)
+/*=====================*/
+{
+ ha_node_t* node;
+ ulint n_page_dumps = 0;
+ ibool ok = TRUE;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ /* How many cells to check before temporarily releasing
+ btr_search_latch. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ rw_lock_x_lock(&btr_search_latch);
+ buf_pool_mutex_enter_all();
+
+ cell_count = hash_get_n_cells(btr_search_sys->hash_index);
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release btr_search_latch every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+ buf_pool_mutex_exit_all();
+ rw_lock_x_unlock(&btr_search_latch);
+ os_thread_yield();
+ rw_lock_x_lock(&btr_search_latch);
+ buf_pool_mutex_enter_all();
+ }
+
+ node = (ha_node_t*)
+ hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_block_align((byte*) node->data);
+ const buf_block_t* hash_block;
+ buf_pool_t* buf_pool;
+ index_id_t page_index_id;
+
+ buf_pool = buf_pool_from_bpage((buf_page_t*) block);
+
+ if (UNIV_LIKELY(buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE)) {
+
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ hash_block = buf_block_hash_get(
+ buf_pool,
+ buf_block_get_space(block),
+ buf_block_get_page_no(block));
+ } else {
+ hash_block = NULL;
+ }
+
+ if (hash_block) {
+ ut_a(hash_block == block);
+ } else {
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first
+ removes the block from
+ buf_pool->page_hash by calling
+ buf_LRU_block_remove_hashed_page().
+ After that, it invokes
+ btr_search_drop_page_hash_index() to
+ remove the block from
+ btr_search_sys->hash_index. */
+
+ ut_a(buf_block_get_state(block)
+ == BUF_BLOCK_REMOVE_HASH);
+ }
+
+ ut_a(!dict_index_is_ibuf(block->index));
+
+ page_index_id = btr_page_get_index_id(block->frame);
+
+ offsets = rec_get_offsets(node->data,
+ block->index, offsets,
+ block->curr_n_fields
+ + (block->curr_n_bytes > 0),
+ &heap);
+
+ if (!block->index || node->fold
+ != rec_fold(node->data,
+ offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id)) {
+ const page_t* page = block->frame;
+
+ ok = FALSE;
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error in an adaptive hash"
+ " index pointer to page %lu\n"
+ "InnoDB: ptr mem address %p"
+ " index id %llu,"
+ " node fold %lu, rec fold %lu\n",
+ (ulong) page_get_page_no(page),
+ node->data,
+ (ullint) page_index_id,
+ (ulong) node->fold,
+ (ulong) rec_fold(node->data,
+ offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id));
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, node->data, offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %p,"
+ " n fields %lu, n bytes %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (void*) block->index,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_n_bytes,
+ (ulong) block->curr_left_side);
+
+ if (n_page_dumps < 20) {
+ buf_page_print(
+ page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ n_page_dumps++;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < cell_count; i += chunk_size) {
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ /* We release btr_search_latch every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ buf_pool_mutex_exit_all();
+ rw_lock_x_unlock(&btr_search_latch);
+ os_thread_yield();
+ rw_lock_x_lock(&btr_search_latch);
+ buf_pool_mutex_enter_all();
+ }
+
+ if (!ha_validate(btr_search_sys->hash_index, i, end_index)) {
+ ok = FALSE;
+ }
+ }
+
+ buf_pool_mutex_exit_all();
+ rw_lock_x_unlock(&btr_search_latch);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000000..958b3b5cfad
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,721 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "buf0buddy.h"
+#ifdef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+#undef THIS_MODULE
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a buf we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in the following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_LOG_SPACE_FIRST_ID
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller sized blocks, but its starting
+ address must still remain the start of a page frame, i.e.: it
+ cannot be the middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K, but the buddy's address will still be
+ the starting address of the first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but it is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written. */
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE (SRV_LOG_SPACE_FIRST_ID)
+
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE (0XFFFFFFFF)
+
+#if BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE
+# error "BUF_BUDDY_STAMP_FREE >= BUF_BUDDY_STAMP_NONFREE"
+#endif
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+	BUF_BUDDY_STATE_FREE,	/*!< The buddy is completely free */
+	BUF_BUDDY_STATE_USED,	/*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+#ifdef UNIV_DEBUG_VALGRIND
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ const size_t size = BUF_BUDDY_LOW << i;
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ UNIV_MEM_ASSERT_W(buf, size);
+ UNIV_MEM_INVALID(buf, size);
+}
+#else /* UNIV_DEBUG_VALGRIND */
+# define buf_buddy_mem_invalid(buf, i) ut_ad((i) <= BUF_BUDDY_SIZES)
+#endif /* UNIV_DEBUG_VALGRIND */
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE __attribute__((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(buf, static_cast<int>(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in/out] buf block to stamp
+@param[in] i block size */
+#define buf_buddy_stamp_nonfree(buf, i) do { \
+ buf_buddy_mem_invalid(buf, i); \
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4); \
+} while (0)
+#if BUF_BUDDY_STAMP_NONFREE != 0xffffffff
+# error "BUF_BUDDY_STAMP_NONFREE != 0xffffffff"
+#endif
+
+/**********************************************************************//**
+Get the buddy of a compressed page frame.
+@return pointer to the buddy of page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == UNIV_PAGE_SIZE);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
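+
+/*
+Illustrative sketch (not part of the original commit): because a block
+is always aligned to its own (power-of-two) size, the conditional above
+is equivalent to toggling the size bit of the address, i.e.
+
+	buddy = (void*) ((uintptr_t) page ^ size);
+
+For example, with size == 0x2000 (8K), the buddy of the block at frame
+offset 0x4000 is at 0x6000, and the buddy of 0x6000 is again 0x4000.
+*/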
+
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ ulint i;
+ CheckZipFree(ulint i) : i (i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_a(buf_buddy_stamp_is_free(elem));
+ ut_a(elem->stamp.size <= i);
+ }
+};
+
+#define BUF_BUDDY_LIST_VALIDATE(bp, i) \
+ UT_LIST_VALIDATE(list, buf_buddy_free_t, \
+ bp->zip_free[i], CheckZipFree(i))
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free i.e.: in the
+zip_free[].
+@return true if free */
+UNIV_INLINE
+bool
+buf_buddy_check_free(
+/*=================*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static __attribute__((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+	/* A block may be free but a fragment of it may still be in use.
+	To guard against that, we write the free block size, in terms of
+	the zip_free index, at the start of the stamped block. Note that
+	we can safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/**********************************************************************//**
+Add a block to the head of the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_buddy_free_t* buf, /*!< in,own: block to be freed */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_pool->zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], buf);
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+}
+
+/**********************************************************************//**
+Remove a block from the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(
+/*=======================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_buddy_free_t* buf, /*!< in,own: block to be freed */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_buddy_check_free(buf_pool, buf, i));
+
+ UT_LIST_REMOVE(list, buf_pool->zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/**********************************************************************//**
+Try to allocate a block from buf_pool->zip_free[].
+@return allocated block, or NULL if buf_pool->zip_free[] was empty */
+static
+buf_buddy_free_t*
+buf_buddy_alloc_zip(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+ buf_buddy_free_t* buf;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf_pool, buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(buf_pool, i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf->stamp.bytes
+ + (BUF_BUDDY_LOW << i));
+
+ ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+ buf_buddy_add_to_free(buf_pool, buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+ UNIV_MEM_TRASH(buf, ~i, BUF_BUDDY_STAMP_OFFSET);
+ UNIV_MEM_TRASH(BUF_BUDDY_STAMP_OFFSET + 4
+ + buf->stamp.bytes, ~i,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
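+
+/*
+Worked example (illustrative, not part of the original commit):
+a request for a 2K block when only one free 8K block exists recurses
+twice. The 8K block is removed from zip_free[], its upper 4K half is
+added to the 4K free list, the upper 2K half of the remaining 4K is
+added to the 2K free list, and the lowest 2K piece is returned:
+
+	[8K free]  ->  [2K returned][2K on zip_free][4K on zip_free]
+*/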
+
+/**********************************************************************//**
+Deallocate a buffer frame of UNIV_PAGE_SIZE. */
+static
+void
+buf_buddy_block_free(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf) /*!< in: buffer frame to deallocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
+
+ HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
+ && bpage->in_zip_hash && !bpage->in_page_hash),
+ ((buf_block_t*) bpage)->frame == buf);
+ ut_a(bpage);
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = FALSE);
+ HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+
+ ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
+ UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
+
+ block = (buf_block_t*) bpage;
+ mutex_enter(&block->mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+
+ ut_ad(buf_pool->buddy_n_frames > 0);
+ ut_d(buf_pool->buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ ut_a(block->frame);
+ ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
+
+ ut_ad(!block->page.in_page_hash);
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+
+ ut_d(buf_pool->buddy_n_frames++);
+}
+
+/**********************************************************************//**
+Allocate a block from a bigger object.
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf, /*!< in: a block that is free to use */
+ ulint i, /*!< in: index of
+ buf_pool->zip_free[] */
+ ulint j) /*!< in: size of buf as an index
+ of buf_pool->zip_free[] */
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists. */
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(buf_pool, zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
+The buf_pool_mutex may be released and reacquired.
+@return allocated block, never NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru) /*!< in: pointer to a variable that
+ will be assigned TRUE if storage was
+ allocated from the LRU list and
+ buf_pool->mutex was temporarily
+ released */
+{
+ buf_block_t* block;
+
+ ut_ad(lru);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = (buf_block_t*) buf_buddy_alloc_zip(buf_pool, i);
+
+ if (block) {
+ goto func_exit;
+ }
+ }
+
+ /* Try allocating from the buf_pool->free list. */
+ block = buf_LRU_get_free_only(buf_pool);
+
+ if (block) {
+
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ buf_pool_mutex_exit(buf_pool);
+ block = buf_LRU_get_free_block(buf_pool);
+ *lru = TRUE;
+ buf_pool_mutex_enter(buf_pool);
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ block = (buf_block_t*) buf_buddy_alloc_from(
+ buf_pool, block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+ buf_pool->buddy_stat[i].used++;
+ return(block);
+}
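+
+/*
+Usage sketch (illustrative, not part of the original commit): a caller
+must initialize the lru flag itself, because this function only ever
+sets it to TRUE:
+
+	ibool	lru = FALSE;
+	void*	buf = buf_buddy_alloc_low(buf_pool, i, &lru);
+
+	if (lru) {
+		// buf_pool->mutex was released and reacquired;
+		// revalidate any buf_pool state cached before the call
+	}
+*/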
+
+/**********************************************************************//**
+Try to relocate a block.
+@return true if relocated */
+static
+bool
+buf_buddy_relocate(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* src, /*!< in: block to relocate */
+ void* dst, /*!< in: free block to relocate to */
+ ulint i) /*!< in: index of
+ buf_pool->zip_free[] */
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+ ulint space;
+ ulint offset;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ UNIV_MEM_ASSERT_W(dst, size);
+
+ space = mach_read_from_4((const byte*) src
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ offset = mach_read_from_4((const byte*) src
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind warnings about conditional jump
+ on uninitialized value. */
+ UNIV_MEM_VALID(&space, sizeof space);
+ UNIV_MEM_VALID(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ ulint fold = buf_page_address_fold(space, offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ rw_lock_x_lock(hash_lock);
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool->page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ rw_lock_x_unlock(hash_lock);
+
+ return(false);
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+
+ rw_lock_x_unlock(hash_lock);
+
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ UNIV_MEM_ASSERT_W(src, size);
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_page_can_relocate(bpage)) {
+ /* Relocate the compressed page. */
+ ullint usec = ut_time_us(NULL);
+
+ ut_a(bpage->zip.data == src);
+
+		/* Note: This is potentially expensive; we need a better
+		solution here. We go with correctness for now. */
+ ::memcpy(dst, src, size);
+
+ bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+ rw_lock_x_unlock(hash_lock);
+
+ mutex_exit(block_mutex);
+
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool->buddy_stat[i];
+
+ ++buddy_stat->relocated;
+
+ buddy_stat->relocated_usec += ut_time_us(NULL) - usec;
+
+ return(true);
+ }
+
+ rw_lock_x_unlock(hash_lock);
+
+ mutex_exit(block_mutex);
+
+ return(false);
+}
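+
+/*
+Usage note (illustrative, not part of the original commit): the caller
+(buf_buddy_free_low() below) removes a free block of the same size from
+zip_free[] and then tries buf_buddy_relocate() to move the in-use buddy
+into it. Holding both the page_hash x-latch and the block mutex across
+the memcpy() ensures that no other thread can look up the bpage and
+dereference the old bpage->zip.data while the frame is being moved.
+*/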
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i) /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+{
+ buf_buddy_free_t* buddy;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(!mutex_own(&buf_pool->zip_mutex));
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(buf_pool->buddy_stat[i].used > 0);
+
+ buf_pool->buddy_stat[i].used--;
+recombine:
+ UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ buf_buddy_block_free(buf_pool, buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool_contains_zip(buf_pool, buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool->zip_free[i]) < 16) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buf_pool, buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool_contains_zip(buf_pool, buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(BUF_BUDDY_LIST_VALIDATE(buf_pool, i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool->zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(buf_pool, zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buf_pool, buddy, zip_buf, i)) {
+
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(buf_pool, zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(buf_pool,
+ reinterpret_cast<buf_buddy_free_t*>(buf),
+ i);
+}
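+
+/*
+Worked example (illustrative, not part of the original commit):
+freeing the 2K block from the allocation example reverses the split,
+provided the buddy is stamped free and at least 16 blocks are on the
+free list so that recombination is attempted. The 2K buddy is merged
+into a 4K block, the 4K buddy into an 8K block, and so on; once i
+reaches BUF_BUDDY_SIZES the whole UNIV_PAGE_SIZE frame is returned to
+the buffer pool through buf_buddy_block_free().
+*/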
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000000..a676d70a992
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,5431 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.cc
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0buddy.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "log0log.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0zip.h"
+#include "srv0mon.h"
+#include "buf0checksum.h"
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+Performance improvement:
+------------------------
+Thread scheduling in NT may be so slow that the OS wait mechanism should
+not be used even in waiting for disk reads to complete.
+Rather, we should put waiting query threads to the queue of
+waiting jobs, and let the OS thread do something useful while the i/o
+is processed. In this way we could remove most OS thread switches in
+an i/o-intensive benchmark like TPC-C.
+
+A possibility is to put a user space thread library between the database
+and NT. User space thread libraries might be very fast.
+
+SQL Server 7.0 can be configured to use 'fibers' which are lightweight
+threads in NT. These should be studied.
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool->mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool->mutex.
+
+The buf_pool->mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up on the idea of having a mutex
+for each control block because it seemed too complicated.
+
+A solution to reduce mutex contention of the buf_pool->mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool->mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+We intend to make the buffer buf_pool size on-line reconfigurable,
+that is, the buf_pool size can be changed without closing the database.
+Then the database administrator may adjust it to be bigger
+at night, for example. The control block array must
+contain enough control blocks for the maximum buffer buf_pool size
+which is used in the particular database.
+If the buf_pool size is cut, we exploit the virtual memory mechanism of
+the OS, and just refrain from using frames at high addresses. Then the OS
+can swap them to disk.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool->free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used by the read-ahead mechanism,
+and it can also be used when there is a scan of a full
+table which cannot fit in memory. Putting the pages near the
+end of the LRU list, we make sure that most of the buf_pool stays
+in the main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
+holds. The blocks in unzip_LRU will be in same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool->flush_list) contains the blocks
+holding file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+The access to this list is protected by buf_pool->flush_list_mutex.
+
+The chain of unmodified compressed blocks (buf_pool->zip_clean)
+contains the control blocks (buf_page_t) of those compressed pages
+that are not in buf_pool->flush_list and for which no uncompressed
+page has been allocated in the buffer pool. The control blocks for
+uncompressed pages are accessible via buf_block_t objects that are
+reachable via buf_pool->chunks[].
+
+The chains of free memory blocks (buf_pool->zip_free[]) are used by
+the buddy allocator (buf0buddy.cc) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
+blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leafs
+of a B-tree are scanned in an ascending or descending order.
+When a read page is the first time referenced in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
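+
+/*
+Illustrative sketch (not part of the original commit): with a linear
+read-ahead area of 64 pages, the "border" test described above can be
+written as
+
+	static ibool
+	is_linear_readahead_border(ulint page_no)
+	{
+		// a page is at a border if it is the first or
+		// the last page of its 64-page area
+		return(page_no % 64 == 0 || page_no % 64 == 63);
+	}
+
+The area size and the helper name are made up for illustration; the
+actual read-ahead logic lives in the read-ahead code (buf0rea.cc).
+*/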
+
+#ifndef UNIV_HOTBACKUP
+/** Sleep time in microseconds, used while waiting for a pending
+read of a page to complete */
+static const int WAIT_FOR_READ = 100;
+/** Number of attempts made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
+
+/** The buffer pools of the database */
+UNIV_INTERN buf_pool_t* buf_pool_ptr;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+static ulint buf_dbg_counter = 0; /*!< This is used to insert validation
+ operations in execution in the
+ debug version */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/** If this is set TRUE, the program prints info whenever
+read-ahead or flush occurs */
+UNIV_INTERN ibool buf_debug_prints = FALSE;
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_PFS_RWLOCK
+/* Keys to register buffer block related rwlocks and mutexes with
+performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+UNIV_INTERN mysql_pfs_key_t buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t buffer_block_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_mutex_key;
+UNIV_INTERN mysql_pfs_key_t buf_pool_zip_mutex_key;
+UNIV_INTERN mysql_pfs_key_t flush_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+
+/* Buffer block mutexes and rwlocks can be registered
+in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
+is defined, register buffer block mutex and rwlock
+in one group after their initialization. */
+# define PFS_GROUP_BUFFER_SYNC
+
+/* This define caps the number of mutexes/rwlocks can
+be registered with performance schema. Developers can
+modify this define if necessary. Please note, this would
+be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
+# define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER ULINT_MAX
+
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+
+/** Macro to determine whether the read or write counter is used,
+depending on the io_type */
+#define MONITOR_RW_COUNTER(io_type, counter) \
+ ((io_type == BUF_IO_READ) \
+ ? (counter##_READ) \
+ : (counter##_WRITTEN))
+
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INTERN
+lsn_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+ ulint i;
+ buf_page_t* bpage;
+ lsn_t lsn = 0;
+ lsn_t oldest_lsn = 0;
+
+ /* When we traverse all the flush lists we don't want another
+ thread to add a dirty page to any flush list. */
+ log_flush_order_mutex_enter();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (bpage != NULL) {
+ ut_ad(bpage->in_flush_list);
+ lsn = bpage->oldest_modification;
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ if (!oldest_lsn || oldest_lsn > lsn) {
+ oldest_lsn = lsn;
+ }
+ }
+
+ log_flush_order_mutex_exit();
+
+ /* The returned answer may be out of date: the flush_list can
+ change after the mutex has been released. */
+
+ return(oldest_lsn);
+}
+
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_list_len(
+/*===================*/
+ ulint* LRU_len, /*!< out: length of all LRU lists */
+ ulint* free_len, /*!< out: length of all free lists */
+ ulint* flush_list_len) /*!< out: length of all flush lists */
+{
+ ulint i;
+
+ *LRU_len = 0;
+ *free_len = 0;
+ *flush_list_len = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ *LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
+ *free_len += UT_LIST_GET_LEN(buf_pool->free);
+ *flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
+ }
+}
+
+/********************************************************************//**
+Get total list size in bytes from all buffer pools. */
+UNIV_INTERN
+void
+buf_get_total_list_size_in_bytes(
+/*=============================*/
+ buf_pools_list_size_t* buf_pools_list_size) /*!< out: list sizes
+ in all buffer pools */
+{
+ ut_ad(buf_pools_list_size);
+ memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+		/* We don't need mutex protection since this is
+		for statistics purposes only */
+ buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
+ buf_pools_list_size->unzip_LRU_bytes +=
+ UT_LIST_GET_LEN(buf_pool->unzip_LRU) * UNIV_PAGE_SIZE;
+ buf_pools_list_size->flush_list_bytes +=
+ buf_pool->stat.flush_list_bytes;
+ }
+}
+
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_stat(
+/*===============*/
+ buf_pool_stat_t* tot_stat) /*!< out: buffer pool stats */
+{
+ ulint i;
+
+ memset(tot_stat, 0, sizeof(*tot_stat));
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_stat_t*buf_stat;
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_stat = &buf_pool->stat;
+ tot_stat->n_page_gets += buf_stat->n_page_gets;
+ tot_stat->n_pages_read += buf_stat->n_pages_read;
+ tot_stat->n_pages_written += buf_stat->n_pages_written;
+ tot_stat->n_pages_created += buf_stat->n_pages_created;
+ tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
+ tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
+ tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
+ tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;
+
+ tot_stat->n_pages_not_made_young +=
+ buf_stat->n_pages_not_made_young;
+ }
+}
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INTERN
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance,
+ or NULL for round-robin selection
+ of the buffer pool */
+{
+ buf_block_t* block;
+ ulint index;
+ static ulint buf_pool_index;
+
+ if (buf_pool == NULL) {
+		/* We are allocating memory from any buffer pool; ensure
+		that we spread the load across all buffer pool instances. */
+ index = buf_pool_index++ % srv_buf_pool_instances;
+ buf_pool = buf_pool_from_array(index);
+ }
+
+ block = buf_LRU_get_free_block(buf_pool);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ return(block);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Checks if a page is all zeroes.
+@return true if the page is all zeroes */
+bool
+buf_page_is_zeroes(
+/*===============*/
+ const byte* read_buf, /*!< in: a database page */
+ const ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+{
+ const ulint page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+ for (ulint i = 0; i < page_size; i++) {
+ if (read_buf[i] != 0) {
+ return(false);
+ }
+ }
+ return(true);
+}
+
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ bool check_lsn, /*!< in: true if we need to check
+ and complain about the LSN */
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+{
+ ulint checksum_field1;
+ ulint checksum_field2;
+ ibool crc32_inited = FALSE;
+ ib_uint32_t crc32 = ULINT32_UNDEFINED;
+
+ if (!zip_size
+ && memcmp(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+
+ /* Stored log sequence numbers at the start and the end
+ of page do not match */
+
+ return(TRUE);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (check_lsn && recv_lsn_checks_on) {
+ lsn_t current_lsn;
+
+ /* Since we are going to reset the page LSN during the import
+ phase it makes no sense to spam the log with error messages. */
+
+ if (log_peek_lsn(&current_lsn)
+ && current_lsn
+ < mach_read_from_8(read_buf + FIL_PAGE_LSN)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: page %lu log sequence number"
+ " " LSN_PF "\n"
+ "InnoDB: is in the future! Current system "
+ "log sequence number " LSN_PF ".\n"
+ "InnoDB: Your database may be corrupt or "
+ "you may have copied the InnoDB\n"
+ "InnoDB: tablespace but not the InnoDB "
+ "log files. See\n"
+ "InnoDB: " REFMAN
+ "forcing-innodb-recovery.html\n"
+ "InnoDB: for more information.\n",
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET),
+ (lsn_t) mach_read_from_8(
+ read_buf + FIL_PAGE_LSN),
+ current_lsn);
+ }
+ }
+#endif
+
+ /* Check whether the checksum fields have correct values */
+
+ if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
+ return(FALSE);
+ }
+
+ if (zip_size) {
+ return(!page_zip_verify_checksum(read_buf, zip_size));
+ }
+
+ checksum_field1 = mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ checksum_field2 = mach_read_from_4(
+ read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+#if FIL_PAGE_LSN % 8
+#error "FIL_PAGE_LSN must be 64 bit aligned"
+#endif
+
+ /* declare empty pages non-corrupted */
+ if (checksum_field1 == 0 && checksum_field2 == 0
+ && *reinterpret_cast<const ib_uint64_t*>(read_buf +
+ FIL_PAGE_LSN) == 0) {
+ /* make sure that the page is really empty */
+ for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
+ if (read_buf[i] != 0) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+ crc32 = buf_calc_page_crc32(read_buf);
+
+ return(checksum_field1 != crc32 || checksum_field2 != crc32);
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+
+ return(checksum_field1
+ != buf_calc_page_new_checksum(read_buf)
+ || checksum_field2
+ != buf_calc_page_old_checksum(read_buf));
+
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+
+ return(checksum_field1 != BUF_NO_CHECKSUM_MAGIC
+ || checksum_field2 != BUF_NO_CHECKSUM_MAGIC);
+
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ /* There are 3 valid formulas for
+ checksum_field2 (old checksum field):
+
+	1. Very old versions of InnoDB stored only an 8-byte LSN at
+	the start and the end of the page.
+
+ 2. InnoDB versions before MySQL 5.6.3 store the old formula
+ checksum (buf_calc_page_old_checksum()).
+
+ 3. InnoDB versions 5.6.3 and newer with
+ innodb_checksum_algorithm=strict_crc32|crc32 store CRC32. */
+
+	/* Since innodb_checksum_algorithm is not strict_*, allow any
+	of the algorithms to match for the old field. */
+
+ if (checksum_field2
+ != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+ && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) {
+
+			/* The checksum does not match any of the
+			fast-to-check values. First check the algorithm
+			selected for writing checksums, because we
+			assume that the chance of it matching is
+			higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32
+ && checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field2
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+
+ if (checksum_field2 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
+ }
+
+ /* old field is fine, check the new field */
+
+	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+	(always equal to 0) in FIL_PAGE_SPACE_OR_CHKSUM */
+
+ if (checksum_field1 != 0
+ && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
+
+		/* The checksum does not match any of the
+		fast-to-check values. First check the algorithm
+		selected for writing checksums, because we assume
+		that the chance of it matching is higher. */
+
+ if (srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_CRC32) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ } else {
+ ut_ad(srv_checksum_algorithm
+ == SRV_CHECKSUM_ALGORITHM_INNODB);
+
+ if (checksum_field1
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ if (!crc32_inited) {
+ crc32 = buf_calc_page_crc32(
+ read_buf);
+ crc32_inited = TRUE;
+ }
+
+ if (checksum_field1 != crc32) {
+ return(TRUE);
+ }
+ }
+ }
+ }
+
+ /* If CRC32 is stored in at least one of the fields, then the
+ other field must also be CRC32 */
+ if (crc32_inited
+ && ((checksum_field1 == crc32
+ && checksum_field2 != crc32)
+ || (checksum_field1 != crc32
+ && checksum_field2 == crc32))) {
+
+ return(TRUE);
+ }
+
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ /* should have returned FALSE earlier */
+ ut_error;
+	/* no default so the compiler will emit a warning if a new
+	enum value is added and not handled here */
+ }
+
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size, /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+ ulint flags) /*!< in: 0 or
+ BUF_PAGE_PRINT_NO_CRASH or
+ BUF_PAGE_PRINT_NO_FULL */
+
+{
+#ifndef UNIV_HOTBACKUP
+ dict_index_t* index;
+#endif /* !UNIV_HOTBACKUP */
+ ulint size = zip_size;
+
+ if (!size) {
+ size = UNIV_PAGE_SIZE;
+ }
+
+ if (!(flags & BUF_PAGE_PRINT_NO_FULL)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+ (ulong) size);
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+ }
+
+ if (zip_size) {
+ /* Print compressed page. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Compressed page type (" ULINTPF "); "
+ "stored checksum in field1 " ULINTPF "; "
+ "calculated checksums for field1: "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF "; "
+ "page LSN " LSN_PF "; "
+ "page number (if stored to page already) " ULINTPF "; "
+ "space id (if stored to page already) " ULINTPF "\n",
+ fil_page_get_type(read_buf),
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_checksum_algorithm_name(
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ page_zip_calc_checksum(read_buf, zip_size,
+ SRV_CHECKSUM_ALGORITHM_NONE),
+ mach_read_from_8(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: uncompressed page, "
+ "stored checksum in field1 " ULINTPF ", "
+ "calculated checksums for field1: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "stored checksum in field2 " ULINTPF ", "
+ "calculated checksums for field2: "
+ "%s " UINT32PF ", "
+ "%s " ULINTPF ", "
+ "%s " ULINTPF ", "
+
+ "page LSN " ULINTPF " " ULINTPF ", "
+ "low 4 bytes of LSN at page end " ULINTPF ", "
+ "page number (if stored to page already) " ULINTPF ", "
+ "space id (if created with >= MySQL-4.1.1 "
+ "and stored already) %lu\n",
+ mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_new_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_CRC32),
+ buf_calc_page_crc32(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_INNODB),
+ buf_calc_page_old_checksum(read_buf),
+ buf_checksum_algorithm_name(SRV_CHECKSUM_ALGORITHM_NONE),
+ BUF_NO_CHECKSUM_MAGIC,
+
+ mach_read_from_4(read_buf + FIL_PAGE_LSN),
+ mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+ mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT) {
+ fprintf(stderr,
+ "InnoDB: Page may be an insert undo log page\n");
+ } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE) {
+ fprintf(stderr,
+ "InnoDB: Page may be an update undo log page\n");
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ switch (fil_page_get_type(read_buf)) {
+ index_id_t index_id;
+ case FIL_PAGE_INDEX:
+ index_id = btr_page_get_index_id(read_buf);
+ fprintf(stderr,
+ "InnoDB: Page may be an index page where"
+ " index id is %llu\n",
+ (ullint) index_id);
+#ifndef UNIV_HOTBACKUP
+ index = dict_index_find_on_id_low(index_id);
+ if (index) {
+ fputs("InnoDB: (", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs(")\n", stderr);
+ }
+#endif /* !UNIV_HOTBACKUP */
+ break;
+ case FIL_PAGE_INODE:
+ fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ fputs("InnoDB: Page may be an insert buffer free list page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ fputs("InnoDB: Page may be a freshly allocated page\n",
+ stderr);
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ fputs("InnoDB: Page may be a system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ fputs("InnoDB: Page may be a transaction system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ fputs("InnoDB: Page may be a file space header page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ fputs("InnoDB: Page may be an extent descriptor page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ fputs("InnoDB: Page may be a BLOB page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ fputs("InnoDB: Page may be a compressed BLOB page\n",
+ stderr);
+ break;
+ }
+
+ ut_ad(flags & BUF_PAGE_PRINT_NO_CRASH);
+}
+
+#ifndef UNIV_HOTBACKUP
+
+# ifdef PFS_GROUP_BUFFER_SYNC
+/********************************************************************//**
+This function registers mutexes and rwlocks in buffer blocks with
+performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
+defined to be a value less than chunk->size, then only mutexes
+and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
+blocks are registered. */
+static
+void
+pfs_register_buffer_block(
+/*======================*/
+ buf_chunk_t* chunk) /*!< in/out: chunk of buffers */
+{
+ ulint i;
+ ulint num_to_register;
+ buf_block_t* block;
+
+ block = chunk->blocks;
+
+ num_to_register = ut_min(chunk->size,
+ PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);
+
+ for (i = 0; i < num_to_register; i++) {
+ ib_mutex_t* mutex;
+ rw_lock_t* rwlock;
+
+# ifdef UNIV_PFS_MUTEX
+ mutex = &block->mutex;
+ ut_a(!mutex->pfs_psi);
+ mutex->pfs_psi = (PSI_server)
+ ? PSI_server->init_mutex(buffer_block_mutex_key, mutex)
+ : NULL;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ rwlock = &block->lock;
+ ut_a(!rwlock->pfs_psi);
+ rwlock->pfs_psi = (PSI_server)
+ ? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
+ : NULL;
+
+# ifdef UNIV_SYNC_DEBUG
+ rwlock = &block->debug_latch;
+ ut_a(!rwlock->pfs_psi);
+ rwlock->pfs_psi = (PSI_server)
+ ? PSI_server->init_rwlock(buf_block_debug_latch_key,
+ rwlock)
+ : NULL;
+# endif /* UNIV_SYNC_DEBUG */
+
+# endif /* UNIV_PFS_RWLOCK */
+ block++;
+ }
+}
+# endif /* PFS_GROUP_BUFFER_SYNC */
+
+/********************************************************************//**
+Initializes a buffer control block when the buf_pool is created. */
+static
+void
+buf_block_init(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in: pointer to control block */
+ byte* frame) /*!< in: pointer to buffer frame */
+{
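+	/* UNIV_MEM_DESC registers the frame with the memory checker
+	(e.g. Valgrind) in instrumented builds; in other builds it
+	is a no-op. */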
+ UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
+
+ block->frame = frame;
+
+ block->page.buf_pool_index = buf_pool_index(buf_pool);
+ block->page.state = BUF_BLOCK_NOT_USED;
+ block->page.buf_fix_count = 0;
+ block->page.io_fix = BUF_IO_NONE;
+
+ block->modify_clock = 0;
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+#ifdef UNIV_DEBUG
+ block->page.in_page_hash = FALSE;
+ block->page.in_zip_hash = FALSE;
+ block->page.in_flush_list = FALSE;
+ block->page.in_free_list = FALSE;
+ block->page.in_LRU_list = FALSE;
+ block->in_unzip_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ page_zip_des_init(&block->page.zip);
+
+#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
+ /* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
+ of buffer block mutex/rwlock with performance schema. If
+ PFS_GROUP_BUFFER_SYNC is defined, skip the registration
+ since buffer block mutex/rwlock will be registered later in
+ pfs_register_buffer_block() */
+
+ mutex_create(PFS_NOT_INSTRUMENTED, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
+
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(PFS_NOT_INSTRUMENTED,
+ &block->debug_latch, SYNC_NO_ORDER_CHECK);
+# endif /* UNIV_SYNC_DEBUG */
+
+#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
+ mutex_create(buffer_block_mutex_key, &block->mutex, SYNC_BUF_BLOCK);
+ rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);
+
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(buf_block_debug_latch_key,
+ &block->debug_latch, SYNC_NO_ORDER_CHECK);
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */
+
+ ut_ad(rw_lock_validate(&(block->lock)));
+}
+
+/********************************************************************//**
+Allocates a chunk of buffer frames.
+@return chunk, or NULL on failure */
+static
+buf_chunk_t*
+buf_chunk_init(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_chunk_t* chunk, /*!< out: chunk of buffers */
+ ulint mem_size) /*!< in: requested size in bytes */
+{
+ buf_block_t* block;
+ byte* frame;
+ ulint i;
+
+ /* Round down to a multiple of page size,
+ although it already should be. */
+ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
+ /* Reserve space for the block descriptors. */
+ mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
+ + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+
+ chunk->mem_size = mem_size;
+ chunk->mem = os_mem_alloc_large(&chunk->mem_size);
+
+ if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+
+ return(NULL);
+ }
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ chunk->blocks = (buf_block_t*) chunk->mem;
+
+ /* Align a pointer to the first frame. Note that when
+ os_large_page_size is smaller than UNIV_PAGE_SIZE,
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+
+ frame = (byte*) ut_align(chunk->mem, UNIV_PAGE_SIZE);
+ chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
+ - (frame != chunk->mem);
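+
+	/* If ut_align() advanced the frame pointer, one frame fewer
+	fits in the chunk, hence the subtraction above. */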
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint size = chunk->size;
+
+ while (frame < (byte*) (chunk->blocks + size)) {
+ frame += UNIV_PAGE_SIZE;
+ size--;
+ }
+
+ chunk->size = size;
+ }
+
+	/* Initialize the block descriptors and assign a frame to each
+	of them (the memory was already mapped above). Then add the
+	blocks to the free list. */
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; ) {
+
+ buf_block_init(buf_pool, block, frame);
+ UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
+
+ /* Add the block to the free list */
+ UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));
+
+ ut_d(block->page.in_free_list = TRUE);
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+
+ block++;
+ frame += UNIV_PAGE_SIZE;
+ }
+
+#ifdef PFS_GROUP_BUFFER_SYNC
+ pfs_register_buffer_block(chunk);
+#endif
+ return(chunk);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the given buffer chunk that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+static
+buf_block_t*
+buf_chunk_contains_zip(
+/*===================*/
+ buf_chunk_t* chunk, /*!< in: chunk being checked */
+ const void* data) /*!< in: pointer to compressed page */
+{
+ buf_block_t* block;
+ ulint i;
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ if (block->page.zip.data == data) {
+
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const void* data) /*!< in: pointer to compressed page */
+{
+ ulint n;
+ buf_chunk_t* chunk = buf_pool->chunks;
+
+ ut_ad(buf_pool);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ for (n = buf_pool->n_chunks; n--; chunk++) {
+
+ buf_block_t* block = buf_chunk_contains_zip(chunk, data);
+
+ if (block) {
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block, or NULL if all freed */
+static
+const buf_block_t*
+buf_chunk_not_freed(
+/*================*/
+ buf_chunk_t* chunk) /*!< in: chunk being checked */
+{
+ buf_block_t* block;
+ ulint i;
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ ibool ready;
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* The uncompressed buffer pool should never
+ contain compressed block descriptors. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Skip blocks that are not being used for
+ file pages. */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ mutex_enter(&block->mutex);
+ ready = buf_flush_ready_for_replace(&block->page);
+ mutex_exit(&block->mutex);
+
+ if (!ready) {
+
+ return(block);
+ }
+
+ break;
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Set the buffer pool size variables after resizing it. */
+static
+void
+buf_pool_set_sizes(void)
+/*====================*/
+{
+ ulint i;
+ ulint curr_size = 0;
+
+ buf_pool_mutex_enter_all();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ curr_size += buf_pool->curr_pool_size;
+ }
+
+ srv_buf_pool_curr_size = curr_size;
+ srv_buf_pool_old_size = srv_buf_pool_size;
+
+ buf_pool_mutex_exit_all();
+}
+
+/********************************************************************//**
+Initialize a buffer pool instance.
+@return DB_SUCCESS if all goes well. */
+UNIV_INTERN
+ulint
+buf_pool_init_instance(
+/*===================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint buf_pool_size, /*!< in: size in bytes */
+ ulint instance_no) /*!< in: id of the instance */
+{
+ ulint i;
+ buf_chunk_t* chunk;
+
+ /* 1. Initialize general fields
+ ------------------------------- */
+ mutex_create(buf_pool_mutex_key,
+ &buf_pool->mutex, SYNC_BUF_POOL);
+ mutex_create(buf_pool_zip_mutex_key,
+ &buf_pool->zip_mutex, SYNC_BUF_BLOCK);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool_size > 0) {
+ buf_pool->n_chunks = 1;
+
+ buf_pool->chunks = chunk =
+ (buf_chunk_t*) mem_zalloc(sizeof *chunk);
+
+ UT_LIST_INIT(buf_pool->free);
+
+ if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
+			buf_pool_mutex_exit(buf_pool);
+
+			/* buf_pool itself points into the buf_pool_ptr
+			array and is released by buf_pool_free(); it must
+			not be freed here, and certainly not before its
+			mutex has been exited. */
+			mem_free(chunk);
+
+			return(DB_ERROR);
+ }
+
+ buf_pool->instance_no = instance_no;
+ buf_pool->old_pool_size = buf_pool_size;
+ buf_pool->curr_size = chunk->size;
+ buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+
+ /* Number of locks protecting page_hash must be a
+ power of two */
+ srv_n_page_hash_locks = static_cast<ulong>(
+ ut_2_power_up(srv_n_page_hash_locks));
+ ut_a(srv_n_page_hash_locks != 0);
+ ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);
+
+ buf_pool->page_hash = ha_create(2 * buf_pool->curr_size,
+ srv_n_page_hash_locks,
+ MEM_HEAP_FOR_PAGE_HASH,
+ SYNC_BUF_PAGE_HASH);
+
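+		/* zip_hash indexes the buf_block_t objects whose frames
+		have been lent to the compressed-page buddy allocator. */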
+ buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+
+ buf_pool->last_printout_time = ut_time();
+ }
+ /* 2. Initialize flushing fields
+ -------------------------------- */
+
+ mutex_create(flush_list_mutex_key, &buf_pool->flush_list_mutex,
+ SYNC_BUF_FLUSH_LIST);
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+ buf_pool->no_flush[i] = os_event_create();
+ }
+
+ buf_pool->watch = (buf_page_t*) mem_zalloc(
+ sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
+
+ /* All fields are initialized by mem_zalloc(). */
+
+ buf_pool->try_LRU_scan = TRUE;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Frees one buffer pool instance. */
+static
+void
+buf_pool_free_instance(
+/*===================*/
+ buf_pool_t* buf_pool) /* in,own: buffer pool instance
+ to free */
+{
+ buf_chunk_t* chunk;
+ buf_chunk_t* chunks;
+ buf_page_t* bpage;
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ while (bpage != NULL) {
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+ enum buf_page_state state = buf_page_get_state(bpage);
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
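+		/* Descriptors of BUF_BLOCK_FILE_PAGE blocks are embedded
+		in the chunks and are released below together with the
+		chunk memory; only separately allocated descriptors of
+		compressed-only pages are freed here. */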
+ if (state != BUF_BLOCK_FILE_PAGE) {
+ /* We must not have any dirty block except
+ when doing a fast shutdown. */
+ ut_ad(state == BUF_BLOCK_ZIP_PAGE
+ || srv_fast_shutdown == 2);
+ buf_page_free_descriptor(bpage);
+ }
+
+ bpage = prev_bpage;
+ }
+
+ mem_free(buf_pool->watch);
+ buf_pool->watch = NULL;
+
+ chunks = buf_pool->chunks;
+ chunk = chunks + buf_pool->n_chunks;
+
+ while (--chunk >= chunks) {
+ os_mem_free_large(chunk->mem, chunk->mem_size);
+ }
+
+ mem_free(buf_pool->chunks);
+ ha_clear(buf_pool->page_hash);
+ hash_table_free(buf_pool->page_hash);
+ hash_table_free(buf_pool->zip_hash);
+}
+
+/********************************************************************//**
+Creates the buffer pool.
+@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+dberr_t
+buf_pool_init(
+/*==========*/
+ ulint total_size, /*!< in: size of the total pool in bytes */
+ ulint n_instances) /*!< in: number of instances */
+{
+ ulint i;
+ const ulint size = total_size / n_instances;
+
+ ut_ad(n_instances > 0);
+ ut_ad(n_instances <= MAX_BUFFER_POOLS);
+ ut_ad(n_instances == srv_buf_pool_instances);
+
+ buf_pool_ptr = (buf_pool_t*) mem_zalloc(
+ n_instances * sizeof *buf_pool_ptr);
+
+ for (i = 0; i < n_instances; i++) {
+ buf_pool_t* ptr = &buf_pool_ptr[i];
+
+ if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
+
+ /* Free all the instances created so far. */
+ buf_pool_free(i);
+
+ return(DB_ERROR);
+ }
+ }
+
+ buf_pool_set_sizes();
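+
+	/* Make the LRU "old" sublist default to 3/8 (i.e. 37%) of the
+	list; the FALSE argument means that only the ratio is assigned
+	and the still empty LRU lists are not adjusted. */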
+ buf_LRU_old_ratio_update(100 * 3/ 8, FALSE);
+
+ btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+	ulint	n_instances)	/*!< in: number of instances to free */
+{
+ ulint i;
+
+ for (i = 0; i < n_instances; i++) {
+ buf_pool_free_instance(buf_pool_from_array(i));
+ }
+
+ mem_free(buf_pool_ptr);
+ buf_pool_ptr = NULL;
+}
+
+/********************************************************************//**
+Clears the adaptive hash index on all pages in the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_clear_hash_index(void)
+/*===========================*/
+{
+ ulint p;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!btr_search_enabled);
+
+ for (p = 0; p < srv_buf_pool_instances; p++) {
+ buf_pool_t* buf_pool = buf_pool_from_array(p);
+ buf_chunk_t* chunks = buf_pool->chunks;
+ buf_chunk_t* chunk = chunks + buf_pool->n_chunks;
+
+ while (--chunk >= chunks) {
+ buf_block_t* block = chunk->blocks;
+ ulint i = chunk->size;
+
+ for (; i--; block++) {
+ dict_index_t* index = block->index;
+
+ /* We can set block->index = NULL
+ when we have an x-latch on btr_search_latch;
+ see the comment in buf0buf.h */
+
+ if (!index) {
+ /* Not hashed */
+ continue;
+ }
+
+ block->index = NULL;
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ }
+ }
+ }
+}
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+{
+ buf_page_t* b;
+ ulint fold;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ ut_a(bpage->buf_fix_count == 0);
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage == buf_page_hash_get_low(buf_pool,
+ bpage->space,
+ bpage->offset,
+ fold));
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+#ifdef UNIV_DEBUG
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_ZIP_PAGE:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ memcpy(dpage, bpage, sizeof *dpage);
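+
+	/* The shallow copy above also duplicated the LRU and page_hash
+	links; these are fixed up below, while bpage->list must be
+	relocated by the caller (see the function comment). */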
+
+ ut_d(bpage->in_LRU_list = FALSE);
+ ut_d(bpage->in_page_hash = FALSE);
+
+ /* relocate buf_pool->LRU */
+ b = UT_LIST_GET_PREV(LRU, bpage);
+ UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
+ }
+
+ if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
+ buf_pool->LRU_old = dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+ } else {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ buf_page_set_old(dpage, buf_page_is_old(dpage));
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(UT_LIST_VALIDATE(
+ LRU, buf_page_t, buf_pool->LRU, CheckInLRUList()));
+
+ /* relocate buf_pool->page_hash */
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
+}
+
+/********************************************************************//**
+Determine if a block is a sentinel for a buffer pool watch.
+@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
+UNIV_INTERN
+ibool
+buf_pool_watch_is_sentinel(
+/*=======================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* We must also own the appropriate hash lock. */
+ ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
+ ut_ad(buf_page_in_file(bpage));
+
+ if (bpage < &buf_pool->watch[0]
+ || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {
+
+ ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
+ || bpage->zip.data != NULL);
+
+ return(FALSE);
+ }
+
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(bpage->buf_fix_count > 0);
+ return(TRUE);
+}
+
+/****************************************************************//**
+Add watch for the given page to be read in. Caller must have
+appropriate hash_lock for the bpage. This function may release the
+hash_lock and reacquire it.
+@return NULL if watch set, block if the page is in the buffer pool */
+UNIV_INTERN
+buf_page_t*
+buf_pool_watch_set(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+{
+ buf_page_t* bpage;
+ ulint i;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (bpage != NULL) {
+page_found:
+ if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+ /* The page was loaded meanwhile. */
+ return(bpage);
+ }
+
+ /* Add to an existing watch. */
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
+#else
+ ++bpage->buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+ return(NULL);
+ }
+
+ /* From this point this function becomes fairly heavy in terms
+ of latching. We acquire the buf_pool mutex as well as all the
+ hash_locks. buf_pool mutex is needed because any changes to
+ the page_hash must be covered by it and hash_locks are needed
+ because we don't want to read any stale information in
+ buf_pool->watch[]. However, it is not in the critical code path
+ as this function will be called only by the purge thread. */
+
+ /* To obey latching order first release the hash_lock. */
+ rw_lock_x_unlock(hash_lock);
+
+ buf_pool_mutex_enter(buf_pool);
+ hash_lock_x_all(buf_pool->page_hash);
+
+	/* We have to recheck that the page was not loaded, or a
+	watch set, by some other purge thread. This is because of
+	the small time window between releasing the hash_lock
+	above and acquiring the buf_pool mutex. */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ if (UNIV_LIKELY_NULL(bpage)) {
+ buf_pool_mutex_exit(buf_pool);
+ hash_unlock_x_all_but(buf_pool->page_hash, hash_lock);
+ goto page_found;
+ }
+
+ for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
+ bpage = &buf_pool->watch[i];
+
+ ut_ad(bpage->access_time == 0);
+ ut_ad(bpage->newest_modification == 0);
+ ut_ad(bpage->oldest_modification == 0);
+ ut_ad(bpage->zip.data == NULL);
+ ut_ad(!bpage->in_zip_hash);
+
+ switch (bpage->state) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* bpage is pointing to buf_pool->watch[],
+ which is protected by buf_pool->mutex.
+ Normally, buf_page_t objects are protected by
+ buf_block_t::mutex or buf_pool->zip_mutex or both. */
+
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = static_cast<ib_uint32_t>(space);
+ bpage->offset = static_cast<ib_uint32_t>(offset);
+ bpage->buf_fix_count = 1;
+
+ ut_d(bpage->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ fold, bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+ /* Once the sentinel is in the page_hash we can
+ safely release all locks except just the
+ relevant hash_lock */
+ hash_unlock_x_all_but(buf_pool->page_hash,
+ hash_lock);
+
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->buf_fix_count > 0);
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+ /* Allocation failed. Either the maximum number of purge
+ threads should never exceed BUF_POOL_WATCH_SIZE, or this code
+ should be modified to return a special non-NULL value and the
+ caller should purge the record directly. */
+ ut_error;
+
+ /* Fix compiler warning */
+ return(NULL);
+}
+
+/****************************************************************//**
+Remove the sentinel block for the watch before replacing it with a real block.
+buf_pool_watch_unset() or buf_pool_watch_occurred() will notice
+that the block has been replaced with the real block. */
+static
+void
+buf_pool_watch_remove(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint fold, /*!< in: buf_page_address_fold(
+ space, offset) */
+ buf_page_t* watch) /*!< in/out: sentinel for watch */
+{
+#ifdef UNIV_SYNC_DEBUG
+ /* We must also own the appropriate hash_bucket mutex. */
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, watch);
+ ut_d(watch->in_page_hash = FALSE);
+ watch->buf_fix_count = 0;
+ watch->state = BUF_BLOCK_POOL_WATCH;
+}
+
+/****************************************************************//**
+Stop watching if the page has been read in.
+buf_pool_watch_set(space,offset) must have returned NULL before. */
+UNIV_INTERN
+void
+buf_pool_watch_unset(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ulint fold = buf_page_address_fold(space, offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+	/* We only need to hold the buf_pool mutex in the case where
+	we end up calling buf_pool_watch_remove, but to obey the
+	latching order we acquire it here, before acquiring the
+	hash_lock. This should not cause too much grief as this
+	function is only ever called from the purge thread. */
+ buf_pool_mutex_enter(buf_pool);
+
+ rw_lock_x_lock(hash_lock);
+
+ /* The page must exist because buf_pool_watch_set() increments
+ buf_fix_count. */
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+ buf_block_unfix(reinterpret_cast<buf_block_t*>(bpage));
+ } else {
+
+ ut_ad(bpage->buf_fix_count > 0);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_decrement_uint32(&bpage->buf_fix_count, 1);
+#else
+ --bpage->buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+ if (bpage->buf_fix_count == 0) {
+ buf_pool_watch_remove(buf_pool, fold, bpage);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+}
+
+/****************************************************************//**
+Check if the page has been read in.
+This may only be called after buf_pool_watch_set(space,offset)
+has returned NULL and before invoking buf_pool_watch_unset(space,offset).
+@return FALSE if the given page was not read in, TRUE if it was */
+UNIV_INTERN
+ibool
+buf_pool_watch_occurred(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ ibool ret;
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ulint fold = buf_page_address_fold(space, offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool,
+ fold);
+
+ rw_lock_s_lock(hash_lock);
+
+ /* The page must exist because buf_pool_watch_set()
+ increments buf_fix_count. */
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
+ rw_lock_s_unlock(hash_lock);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage) /*!< in: buffer block of a file page */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ ut_a(buf_page_in_file(bpage));
+
+ buf_LRU_make_block_young(bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list if it is too old.
+This high-level function can be used to prevent an important page from
+slipping out of the buffer pool. */
+static
+void
+buf_page_make_young_if_needed(
+/*==========================*/
+ buf_page_t* bpage) /*!< in/out: buffer block of a
+ file page */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+#endif /* UNIV_DEBUG */
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_page_peek_if_too_old(bpage)) {
+ buf_page_make_young(bpage);
+ }
+}
+
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ block = (buf_block_t*) buf_page_hash_get(buf_pool, space, offset);
+
+ if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
+ block->check_index_page_at_flush = FALSE;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
+
+ if (bpage) {
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
+ /* bpage->file_page_was_freed can already hold
+ when this code is invoked from dict_drop_index_tree() */
+ bpage->file_page_was_freed = TRUE;
+ mutex_exit(block_mutex);
+ }
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when a previously freed file page is
+reallocated, so that the debug version no longer flags accesses to it.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ bpage = buf_page_hash_get_s_locked(buf_pool, space, offset,
+ &hash_lock);
+ if (bpage) {
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ mutex_enter(block_mutex);
+ rw_lock_s_unlock(hash_lock);
+ bpage->file_page_was_freed = FALSE;
+ mutex_exit(block_mutex);
+ }
+
+ return(bpage);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+/********************************************************************//**
+Attempts to discard the uncompressed frame of a compressed page. The
+caller should not be holding any mutexes when this function is called. */
+static
+void
+buf_block_try_discard_uncompressed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+	/* Since we need to acquire the buf_pool mutex to discard
+	the uncompressed frame, and the page_hash mutex resides
+	below the buf_pool mutex in the latching order, we must
+	first release the page_hash mutex. This means that the
+	block in question can move out of the page_hash. Therefore
+	we need to check again whether the block is still in the
+	page_hash. */
+ buf_pool_mutex_enter(buf_pool);
+
+ bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ if (bpage) {
+ buf_LRU_free_page(bpage, false);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ ib_mutex_t* block_mutex;
+ rw_lock_t* hash_lock;
+ ibool discard_attempted = FALSE;
+ ibool must_read;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ buf_pool->stat.n_page_gets++;
+
+ for (;;) {
+lookup:
+
+ /* The following call will also grab the page_hash
+ mutex if the page is found. */
+ bpage = buf_page_hash_get_s_locked(buf_pool, space,
+ offset, &hash_lock);
+ if (bpage) {
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+ break;
+ }
+
+ /* Page not in buf_pool: needs to be read from file */
+
+ ut_ad(!hash_lock);
+ buf_read_page(space, zip_size, offset);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ }
+
+ ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));
+
+ if (!bpage->zip.data) {
+ /* There is no compressed page. */
+err_exit:
+ rw_lock_s_unlock(hash_lock);
+ return(NULL);
+ }
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ block_mutex = &buf_pool->zip_mutex;
+ mutex_enter(block_mutex);
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(&bpage->buf_fix_count, 1);
+#else
+ ++bpage->buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+ goto got_block;
+ case BUF_BLOCK_FILE_PAGE:
+ /* Discard the uncompressed page frame if possible. */
+ if (!discard_attempted) {
+ rw_lock_s_unlock(hash_lock);
+ buf_block_try_discard_uncompressed(space, offset);
+ discard_attempted = TRUE;
+ goto lookup;
+ }
+
+ block_mutex = &((buf_block_t*) bpage)->mutex;
+
+ mutex_enter(block_mutex);
+
+ buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__);
+ goto got_block;
+ }
+
+ ut_error;
+ goto err_exit;
+
+got_block:
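+	/* Check whether the page is still being read in; if so, we
+	wait for the I/O to complete at the end of this function. */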
+ must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
+
+ rw_lock_s_unlock(hash_lock);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(!bpage->file_page_was_freed);
+#endif /* defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG */
+
+ buf_page_set_accessed(bpage);
+
+ mutex_exit(block_mutex);
+
+ buf_page_make_young_if_needed(bpage);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(bpage->buf_fix_count > 0);
+ ut_a(buf_page_in_file(bpage));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ if (must_read) {
+ /* Let us wait until the read operation
+ completes */
+
+ for (;;) {
+ enum buf_io_fix io_fix;
+
+ mutex_enter(block_mutex);
+ io_fix = buf_page_get_io_fix(bpage);
+ mutex_exit(block_mutex);
+
+ if (io_fix == BUF_IO_READ) {
+
+ os_thread_sleep(WAIT_FOR_READ);
+ } else {
+ break;
+ }
+ }
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage)) == 0);
+#endif
+ return(bpage);
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint size = page_zip_get_size(&block->page.zip);
+
+ ut_ad(buf_block_get_zip_size(block));
+ ut_a(buf_block_get_space(block) != 0);
+
+ if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: compressed page checksum mismatch"
+ " (space %u page %u): stored: %lu, crc32: %lu "
+ "innodb: %lu, none: %lu\n",
+ block->page.space, block->page.offset,
+ mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_CRC32),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_INNODB),
+ page_zip_calc_checksum(frame, size,
+ SRV_CHECKSUM_ALGORITHM_NONE));
+ return(FALSE);
+ }
+
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ if (page_zip_decompress(&block->page.zip,
+ block->frame, TRUE)) {
+ return(TRUE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: unable to decompress space %lu page %lu\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ return(FALSE);
+
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->frame, frame,
+ buf_block_get_zip_size(block));
+ return(TRUE);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unknown compressed page"
+ " type %lu\n",
+ fil_page_get_type(frame));
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block whose frame the pointer is pointing to, if found
+in this buffer pool instance.
+@return pointer to block */
+UNIV_INTERN
+buf_block_t*
+buf_block_align_instance(
+/*=====================*/
+	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance in
+					which the block resides */
+ const byte* ptr) /*!< in: pointer to a frame */
+{
+ buf_chunk_t* chunk;
+ ulint i;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
+ ulint offs;
+
+ if (UNIV_UNLIKELY(ptr < chunk->blocks->frame)) {
+
+ continue;
+ }
+ /* else */
+
+ offs = ptr - chunk->blocks->frame;
+
+ offs >>= UNIV_PAGE_SIZE_SHIFT;
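+
+		/* offs is now the index of the block within the chunk. */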
+
+ if (UNIV_LIKELY(offs < chunk->size)) {
+ buf_block_t* block = &chunk->blocks[offs];
+
+ /* The function buf_chunk_init() invokes
+ buf_block_init() so that block[n].frame ==
+ block->frame + n * UNIV_PAGE_SIZE. Check it. */
+ ut_ad(block->frame == page_align(ptr));
+#ifdef UNIV_DEBUG
+ /* A thread that updates these fields must
+ hold buf_pool->mutex and block->mutex. Acquire
+ only the latter. */
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These types should only be used in
+ the compressed buffer pool, whose
+ memory is allocated from
+ buf_pool->chunks, in UNIV_PAGE_SIZE
+ blocks flagged as BUF_BLOCK_MEMORY. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ /* Some data structures contain
+ "guess" pointers to file pages. The
+ file pages may have been freed and
+ reused. Do not complain. */
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ /* buf_LRU_block_remove_hashed_page()
+ will overwrite the FIL_PAGE_OFFSET and
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
+ 0xff and set the state to
+ BUF_BLOCK_REMOVE_HASH. */
+ ut_ad(page_get_space_id(page_align(ptr))
+ == 0xffffffff);
+ ut_ad(page_get_page_no(page_align(ptr))
+ == 0xffffffff);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(block->page.space
+ == page_get_space_id(page_align(ptr)));
+ ut_ad(block->page.offset
+ == page_get_page_no(page_align(ptr)));
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG */
+
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Gets the block whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr) /*!< in: pointer to a frame */
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_block_t* block;
+
+ block = buf_block_align_instance(
+ buf_pool_from_array(i), ptr);
+ if (block) {
+ return(block);
+ }
+ }
+
+ /* The block should always be found. */
+ ut_error;
+ return(NULL);
+}
+
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it. This function checks one of
+the buffer pool instances.
+@return TRUE if ptr belongs to a buf_block_t struct */
+static
+ibool
+buf_pointer_is_block_field_instance(
+/*================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const void* ptr) /*!< in: pointer not dereferenced */
+{
+ const buf_chunk_t* chunk = buf_pool->chunks;
+ const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ while (chunk < echunk) {
+ if (ptr >= (void*) chunk->blocks
+ && ptr < (void*) (chunk->blocks + chunk->size)) {
+
+ return(TRUE);
+ }
+
+ chunk++;
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it.
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr) /*!< in: pointer not dereferenced */
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ ibool found;
+
+ found = buf_pointer_is_block_field_instance(
+ buf_pool_from_array(i), ptr);
+ if (found) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Find out if a buffer block was created by buf_chunk_init().
+@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
+static
+ibool
+buf_block_is_uncompressed(
+/*======================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const buf_block_t* block) /*!< in: pointer to block,
+ not dereferenced */
+{
+ if ((((ulint) block) % sizeof *block) != 0) {
+ /* The pointer should be aligned. */
+ return(FALSE);
+ }
+
+ return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Return true if probe is enabled.
+@return true if probe enabled. */
+static
+bool
+buf_debug_execute_is_force_flush()
+/*==============================*/
+{
+ DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );
+
+	/* This is used during quiesce testing; we want to ensure
+	maximum buffering by the change buffer. */
+
+ if (srv_ibuf_disable_background_merge) {
+ return(true);
+ }
+
+ return(false);
+}
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/**
+Wait for the block to be read in.
+@param block The block to check */
+static
+void
+buf_wait_for_read(buf_block_t* block)
+{
+ /* Note: For the PAGE_ATOMIC_REF_COUNT case:
+
+	We are using the block->lock to check for the IO state (note
+	that the check itself is a dirty read).
+ We set the IO_READ state under the protection of the hash_lock
+ (and block->mutex). This is safe because another thread can only
+ access the block (and check for IO state) after the block has been
+ added to the page hashtable. */
+
+ if (buf_block_get_io_fix(block) == BUF_IO_READ) {
+
+ /* Wait until the read operation completes */
+
+ ib_mutex_t* mutex = buf_page_get_mutex(&block->page);
+
+ for (;;) {
+ buf_io_fix io_fix;
+
+ mutex_enter(mutex);
+
+ io_fix = buf_block_get_io_fix(block);
+
+ mutex_exit(mutex);
+
+ if (io_fix == BUF_IO_READ) {
+				/* Wait by temporarily taking an s-latch */
+ rw_lock_s_lock(&block->lock);
+ rw_lock_s_unlock(&block->lock);
+ } else {
+ break;
+ }
+ }
+ }
+}
+
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or
+ BUF_GET_IF_IN_POOL_OR_WATCH */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ ulint fold;
+ unsigned access_time;
+ ulint fix_type;
+ rw_lock_t* hash_lock;
+ ulint retries = 0;
+ buf_block_t* fix_block;
+ ib_mutex_t* fix_mutex = NULL;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case BUF_GET_NO_LATCH:
+ ut_ad(rw_latch == RW_NO_LATCH);
+ break;
+ case BUF_GET:
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ case BUF_GET_POSSIBLY_FREED:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+ ut_ad(zip_size == fil_space_get_zip_size(space));
+ ut_ad(ut_is_2pow(zip_size));
+#ifndef UNIV_LOG_DEBUG
+ ut_ad(!ibuf_inside(mtr)
+ || ibuf_page_low(space, zip_size, offset,
+ FALSE, file, line, NULL));
+#endif
+ buf_pool->stat.n_page_gets++;
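+
+	/* The fold of (space, offset) selects both the page_hash cell
+	and the hash_lock partition that protects it. */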
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+loop:
+ block = guess;
+
+ rw_lock_s_lock(hash_lock);
+
+ if (block != NULL) {
+
+ /* If the guess is a compressed page descriptor that
+ has been allocated by buf_page_alloc_descriptor(),
+ it may have been freed by buf_relocate(). */
+
+ if (!buf_block_is_uncompressed(buf_pool, block)
+ || offset != block->page.offset
+ || space != block->page.space
+ || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+
+ /* Our guess was bogus or things have changed
+ since. */
+ block = guess = NULL;
+ } else {
+ ut_ad(!block->page.in_zip_hash);
+ }
+ }
+
+ if (block == NULL) {
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+ }
+
+ if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+ rw_lock_s_unlock(hash_lock);
+ block = NULL;
+ }
+
+ if (block == NULL) {
+ /* Page not in buf_pool: needs to be read from file */
+
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ rw_lock_x_lock(hash_lock);
+ block = (buf_block_t*) buf_pool_watch_set(
+ space, offset, fold);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ /* We can release hash_lock after we
+ increment the fix count to make
+ sure that no state change takes place. */
+ fix_block = block;
+ buf_block_fix(fix_block);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_x_unlock(hash_lock);
+ goto got_block;
+ }
+
+ rw_lock_x_unlock(hash_lock);
+ }
+
+ if (mode == BUF_GET_IF_IN_POOL
+ || mode == BUF_PEEK_IF_IN_POOL
+ || mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(NULL);
+ }
+
+ if (buf_read_page(space, zip_size, offset)) {
+ buf_read_ahead_random(space, zip_size, offset,
+ ibuf_inside(mtr));
+
+ retries = 0;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+ DBUG_EXECUTE_IF(
+ "innodb_page_corruption_retries",
+ retries = BUF_PAGE_READ_MAX_RETRIES;
+ );
+ } else {
+ fprintf(stderr, "InnoDB: Error: Unable"
+ " to read tablespace %lu page no"
+ " %lu into the buffer pool after"
+ " %lu attempts\n"
+ "InnoDB: The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted.\n"
+ "InnoDB: You can try to fix this"
+ " problem by using"
+ " innodb_force_recovery.\n"
+ "InnoDB: Please see reference manual"
+ " for more details.\n"
+ "InnoDB: Aborting...\n",
+ space, offset,
+ BUF_PAGE_READ_MAX_RETRIES);
+
+ ut_error;
+ }
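+		/* The read is retried above because a transient I/O
+		problem may resolve itself; only after
+		BUF_PAGE_READ_MAX_RETRIES consecutive failures is the
+		page treated as irrecoverably corrupted. */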
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ goto loop;
+ } else {
+ fix_block = block;
+ }
+
+ buf_block_fix(fix_block);
+
+ /* Now safe to release page_hash mutex */
+ rw_lock_s_unlock(hash_lock);
+
+got_block:
+
+ fix_mutex = buf_page_get_mutex(&fix_block->page);
+
+ ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+
+ if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL) {
+
+ bool must_read;
+
+ {
+ buf_page_t* fix_page = &fix_block->page;
+
+ mutex_enter(fix_mutex);
+
+ buf_io_fix io_fix = buf_page_get_io_fix(fix_page);
+
+ must_read = (io_fix == BUF_IO_READ);
+
+ mutex_exit(fix_mutex);
+ }
+
+ if (must_read) {
+			/* The page is being read into the buffer
+			pool, but we cannot wait around for the read
+			to complete. */
+ buf_block_unfix(fix_block);
+
+ return(NULL);
+ }
+ }
+
+	switch (buf_block_get_state(fix_block)) {
+ buf_page_t* bpage;
+
+ case BUF_BLOCK_FILE_PAGE:
+ break;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ if (mode == BUF_PEEK_IF_IN_POOL) {
+ /* This mode is only used for dropping an
+ adaptive hash index. There cannot be an
+ adaptive hash index for a compressed-only
+ page, so do not bother decompressing the page. */
+ buf_block_unfix(fix_block);
+
+ return(NULL);
+ }
+
+ bpage = &block->page;
+
+ /* Note: We have already buffer fixed this block. */
+ if (bpage->buf_fix_count > 1
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+
+ /* This condition often occurs when the buffer
+ is not buffer-fixed, but I/O-fixed by
+ buf_page_init_for_read(). */
+ buf_block_unfix(fix_block);
+
+ /* The block is buffer-fixed or I/O-fixed.
+ Try again later. */
+ os_thread_sleep(WAIT_FOR_READ);
+
+ goto loop;
+ }
+
+		/* The block is already buffer-fixed (see above), so it
+		cannot be evicted or relocated while we are attempting
+		to allocate an uncompressed page for it. */
+
+ block = buf_LRU_get_free_block(buf_pool);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ rw_lock_x_lock(hash_lock);
+
+ /* Buffer-fixing prevents the page_hash from changing. */
+ ut_ad(bpage == buf_page_hash_get_low(
+ buf_pool, space, offset, fold));
+
+ buf_block_mutex_enter(block);
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ ut_ad(fix_block->page.buf_fix_count > 0);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_decrement_uint32(&fix_block->page.buf_fix_count, 1);
+#else
+ --fix_block->page.buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+ fix_block = block;
+
+ if (bpage->buf_fix_count > 0
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+
+ mutex_exit(&buf_pool->zip_mutex);
+ /* The block was buffer-fixed or I/O-fixed while
+ buf_pool->mutex was not held by this thread.
+ Free the block that was allocated and retry.
+ This should be extremely unlikely, for example,
+ if buf_page_get_zip() was invoked. */
+
+ buf_LRU_block_free_non_file_page(block);
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+ buf_block_mutex_exit(block);
+
+ /* Try again */
+ goto loop;
+ }
+
+ /* Move the compressed page from bpage to block,
+ and uncompress it. */
+
+ /* Note: this is the uncompressed block and it is not
+ accessible by other threads yet because it is not in
+ any list or hash table */
+ buf_relocate(bpage, &block->page);
+
+ buf_block_init_low(block);
+
+ /* Set after relocate(). */
+ block->page.buf_fix_count = 1;
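+		/* This is the buffer fix of the current thread; the
+		fix that we held on the compressed-only descriptor was
+		released above, before the relocation. */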
+
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ UNIV_MEM_DESC(&block->page.zip.data,
+ page_zip_get_size(&block->page.zip));
+
+ if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ UT_LIST_REMOVE(list, buf_pool->zip_clean,
+ &block->page);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_ad(!block->page.in_flush_list);
+ } else {
+ /* Relocate buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, &block->page);
+ }
+
+ /* Buffer-fix, I/O-fix, and X-latch the block
+ for the duration of the decompression.
+ Also add the block to the unzip_LRU list. */
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ buf_block_set_io_fix(block, BUF_IO_READ);
+ rw_lock_x_lock_inline(&block->lock, 0, file, line);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+ rw_lock_x_unlock(hash_lock);
+
+ ++buf_pool->n_pend_unzip;
+
+ mutex_exit(&buf_pool->zip_mutex);
+ buf_pool_mutex_exit(buf_pool);
+
+ access_time = buf_page_is_accessed(&block->page);
+
+ buf_block_mutex_exit(block);
+
+ buf_page_free_descriptor(bpage);
+
+ /* Decompress the page while not holding
+ buf_pool->mutex or block->mutex. */
+
+ /* Page checksum verification is already done when
+ the page is read from disk. Hence page checksum
+ verification is not necessary when decompressing the page. */
+ {
+ bool success = buf_zip_decompress(block, FALSE);
+ ut_a(success);
+ }
+
+ if (!recv_no_ibuf_operations) {
+ if (access_time) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, offset) == 0);
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+ } else {
+ ibuf_merge_or_delete_for_page(
+ block, space, offset, zip_size, TRUE);
+ }
+ }
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* Unfix and unlatch the block. */
+ buf_block_mutex_enter(fix_block);
+
+ buf_block_set_io_fix(fix_block, BUF_IO_NONE);
+
+ buf_block_mutex_exit(fix_block);
+
+ --buf_pool->n_pend_unzip;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ rw_lock_x_unlock(&block->lock);
+
+ break;
+
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+ ut_ad(block == fix_block);
+ ut_ad(fix_block->page.buf_fix_count > 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+
+ if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
+ && (ibuf_debug || buf_debug_execute_is_force_flush())) {
+
+ /* Try to evict the block from the buffer pool, to use the
+ insert buffer (change buffer) as much as possible. */
+
+ buf_pool_mutex_enter(buf_pool);
+
+ buf_block_unfix(fix_block);
+
+ /* Now we are only holding the buf_pool->mutex,
+ not block->mutex or hash_lock. Blocks cannot be
+ relocated or enter or exit the buf_pool while we
+ are holding the buf_pool->mutex. */
+
+ if (buf_LRU_free_page(&fix_block->page, true)) {
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ /* Set the watch, as it would have
+ been set if the page were not in the
+ buffer pool in the first place. */
+ block = (buf_block_t*) buf_pool_watch_set(
+ space, offset, fold);
+ } else {
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+ }
+
+ rw_lock_x_unlock(hash_lock);
+
+ if (block != NULL) {
+				/* Either the page has been read in, or
+				a watch was set on it in the window
+				after we released the buf_pool::mutex
+				and before we acquired the hash_lock
+				above. Try again. */
+ guess = block;
+ goto loop;
+ }
+
+ fprintf(stderr,
+ "innodb_change_buffering_debug evict %u %u\n",
+ (unsigned) space, (unsigned) offset);
+ return(NULL);
+ }
+
+ mutex_enter(&fix_block->mutex);
+
+ if (buf_flush_page_try(buf_pool, fix_block)) {
+ fprintf(stderr,
+ "innodb_change_buffering_debug flush %u %u\n",
+ (unsigned) space, (unsigned) offset);
+ guess = fix_block;
+ goto loop;
+ }
+
+ buf_block_mutex_exit(fix_block);
+
+ buf_block_fix(fix_block);
+
+ /* Failed to evict the page; change it directly */
+
+ buf_pool_mutex_exit(buf_pool);
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ ut_ad(fix_block->page.buf_fix_count > 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ /* We have already buffer fixed the page, and we are committed to
+ returning this page to the caller. Register for debugging. */
+ {
+ ibool ret;
+ ret = rw_lock_s_lock_nowait(&fix_block->debug_latch, file, line);
+ ut_a(ret);
+ }
+#endif /* UNIV_SYNC_DEBUG */
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(mode == BUF_GET_POSSIBLY_FREED
+ || !fix_block->page.file_page_was_freed);
+#endif
+ /* Check if this is the first access to the page */
+ access_time = buf_page_is_accessed(&fix_block->page);
+
+ /* This is a heuristic and we don't care about ordering issues. */
+ if (access_time == 0) {
+ buf_block_mutex_enter(fix_block);
+
+ buf_page_set_accessed(&fix_block->page);
+
+ buf_block_mutex_exit(fix_block);
+ }
+
+ if (mode != BUF_PEEK_IF_IN_POOL) {
+ buf_page_make_young_if_needed(&fix_block->page);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(fix_block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ /* We have to wait here because the IO_READ state was set
+ under the protection of the hash_lock and the block->mutex
+ but not the block->lock. */
+ buf_wait_for_read(fix_block);
+#endif /* PAGE_ATOMIC_REF_COUNT */
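+	/* Note: in the non-atomic case this wait is instead done in
+	the RW_NO_LATCH branch below; for RW_S_LATCH and RW_X_LATCH,
+	taking the page latch already blocks until the read (which
+	holds an X-latch on the frame) has completed. */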
+
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+
+#ifndef PAGE_ATOMIC_REF_COUNT
+ buf_wait_for_read(fix_block);
+#endif /* !PAGE_ATOMIC_REF_COUNT */
+
+ fix_type = MTR_MEMO_BUF_FIX;
+ break;
+
+ case RW_S_LATCH:
+ rw_lock_s_lock_inline(&fix_block->lock, 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ break;
+
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ break;
+ }
+
+ mtr_memo_push(mtr, fix_block, fix_type);
+
+ if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(
+ space, zip_size, offset, ibuf_inside(mtr));
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(fix_block),
+ buf_block_get_page_no(fix_block)) == 0);
+#endif
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(fix_block);
+}
+
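+/* Illustrative usage, as a sketch only: callers normally reach
+buf_page_get_gen() through the buf_page_get() macro in buf0buf.h, which
+supplies BUF_GET and __FILE__/__LINE__. A typical read access looks
+roughly like this:
+
+	mtr_t		mtr;
+	buf_block_t*	block;
+
+	mtr_start(&mtr);
+	block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, &mtr);
+	... read from buf_block_get_frame(block) ...
+	mtr_commit(&mtr);	(releases the S-latch and the buffer fix)
+
+The block stays buffer-fixed and latched until the mini-transaction
+commits, because buf_page_get_gen() registers it in the mtr memo. */
+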
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed buffer block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_pool_t* buf_pool;
+ unsigned access_time;
+ ibool success;
+ ulint fix_type;
+
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ access_time = buf_page_is_accessed(&block->page);
+
+ buf_page_set_accessed(&block->page);
+
+ mutex_exit(&block->mutex);
+
+ buf_page_make_young_if_needed(&block->page);
+
+ ut_ad(!ibuf_inside(mtr)
+ || ibuf_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block), NULL));
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (UNIV_UNLIKELY(!success)) {
+ buf_block_buf_fix_dec(block);
+
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else {
+ rw_lock_x_unlock(&(block->lock));
+ }
+
+ buf_block_buf_fix_dec(block);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+#endif
+
+ if (!access_time) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block),
+ ibuf_inside(mtr));
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ buf_pool = buf_pool_from_block(block);
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+}
+
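+/* A sketch of the optimistic pattern that the function below supports:
+a caller saves block and block->modify_clock while holding a latch on
+the page (as btr_pcur_store_position() does), releases the latch, and
+later calls buf_page_optimistic_get() with the saved values; if the page
+was modified or evicted in between, the modify clock check fails and the
+caller must look the page up again the slow way. */
+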
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done: for example, when a search in an adaptive hash index leads us to this
+frame.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_pool_t* buf_pool;
+ ibool success;
+ ulint fix_type;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
+ /* Another thread is just freeing the block from the LRU list
+ of the buffer pool: do not try to access this page; this
+ attempt to access the page can only come through the hash
+ index because when the buffer block state is ..._REMOVE_HASH,
+ we have already removed it from the page address hash table
+ of the buffer pool. */
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ buf_page_set_accessed(&block->page);
+
+ mutex_exit(&block->mutex);
+
+ buf_pool = buf_pool_from_block(block);
+
+ if (mode == BUF_MAKE_YOUNG) {
+ buf_page_make_young_if_needed(&block->page);
+ }
+
+ ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait_inline(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (!success) {
+ buf_block_buf_fix_dec(block);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ if (mode != BUF_KEEP_OLD) {
+ /* If mode == BUF_KEEP_OLD, we are executing an I/O
+ completion routine. Avoid a bogus assertion failure
+ when ibuf_merge_or_delete_for_page() is processing a
+ page that was just freed due to DROP INDEX, or
+ deleting a record from SYS_INDEXES. This check will be
+ skipped in recv_recover_page() as well. */
+
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+ }
+#endif
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((mode == BUF_KEEP_OLD)
+ || (ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0));
+#endif
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Given a tablespace id and a page number, tries to get that page. If the
+page is not in the buffer pool, it is not loaded and NULL is returned.
+Suitable for use while holding the lock_sys_t::mutex.
+@return pointer to a page or NULL */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ ibool success;
+ ulint fix_type;
+ buf_pool_t* buf_pool = buf_pool_get(space_id, page_no);
+ rw_lock_t* hash_lock;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ block = buf_block_hash_get_s_locked(buf_pool, space_id,
+ page_no, &hash_lock);
+
+ if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+ if (block) {
+ rw_lock_s_unlock(hash_lock);
+ }
+ return(NULL);
+ }
+
+ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));
+
+ mutex_enter(&block->mutex);
+ rw_lock_s_unlock(hash_lock);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_a(buf_block_get_space(block) == space_id);
+ ut_a(buf_block_get_page_no(block) == page_no);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_block_buf_fix_inc(block, file, line);
+ mutex_exit(&block->mutex);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+
+ if (!success) {
+ /* Let us try to get an X-latch. If the current thread
+ is holding an X-latch on the page, we cannot get an
+ S-latch. */
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ success = rw_lock_x_lock_func_nowait_inline(&block->lock,
+ file, line);
+ }
+
+ if (!success) {
+ buf_block_buf_fix_dec(block);
+
+ return(NULL);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ mutex_enter(&block->mutex);
+ ut_a(!block->page.file_page_was_freed);
+ mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ buf_pool->stat.n_page_gets++;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+
+ return(block);
+}
+
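+/* Callers normally use the buf_page_try_get() macro from buf0buf.h,
+which passes __FILE__ and __LINE__; for example, the lock system uses it
+to inspect a page without risking a wait while holding the
+lock_sys_t::mutex. */
+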
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_page_init_low(
+/*==============*/
+ buf_page_t* bpage) /*!< in: block to init */
+{
+ bpage->flush_type = BUF_FLUSH_LRU;
+ bpage->io_fix = BUF_IO_NONE;
+ bpage->buf_fix_count = 0;
+ bpage->freed_page_clock = 0;
+ bpage->access_time = 0;
+ bpage->newest_modification = 0;
+ bpage->oldest_modification = 0;
+ HASH_INVALIDATE(bpage, hash);
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ bpage->file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Initializes a page in the buffer pool. */
+static __attribute__((nonnull))
+void
+buf_page_init(
+/*==========*/
+ buf_pool_t* buf_pool,/*!< in/out: buffer pool */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint fold, /*!< in: buf_page_address_fold(space,offset) */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ buf_block_t* block) /*!< in/out: block to init */
+{
+ buf_page_t* hash_page;
+
+ ut_ad(buf_pool == buf_pool_get(space, offset));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_ad(mutex_own(&(block->mutex)));
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, fold),
+ RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Set the state of the block */
+ buf_block_set_file_page(block, space, offset);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ if (!space) {
+ /* Silence valid Valgrind warnings about uninitialized
+ data being written to data files. There are some unused
+ bytes on some pages that InnoDB does not initialize. */
+ UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ buf_block_init_low(block);
+
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ buf_page_init_low(&block->page);
+
+ /* Insert into the hash table of file pages */
+
+ hash_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (hash_page == NULL) {
+ /* Block not found in the hash table */
+ } else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
+ ib_uint32_t buf_fix_count = hash_page->buf_fix_count;
+
+ ut_a(buf_fix_count > 0);
+
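+		/* The watch sentinel can carry buffer fixes taken by
+		threads that set a watch on this page; transfer them to
+		the real block so that those references stay counted,
+		then remove the sentinel. */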
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(
+ &block->page.buf_fix_count, buf_fix_count);
+#else
+ block->page.buf_fix_count += ulint(buf_fix_count);
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+ buf_pool_watch_remove(buf_pool, fold, hash_page);
+ } else {
+ fprintf(stderr,
+ "InnoDB: Error: page %lu %lu already found"
+ " in the hash table: %p, %p\n",
+ (ulong) space,
+ (ulong) offset,
+ (const void*) hash_page, (const void*) block);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ mutex_exit(&block->mutex);
+ buf_pool_mutex_exit(buf_pool);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_error;
+ }
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(!block->page.in_page_hash);
+ ut_d(block->page.in_page_hash = TRUE);
+
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, &block->page);
+
+ if (zip_size) {
+ page_zip_set_size(&block->page.zip, zip_size);
+ }
+}
+
+/********************************************************************//**
+Initializes a page for reading into the buffer pool. If the page is
+(1) already in buf_pool, or
+(2) we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) the space is deleted or being deleted,
+then this function does nothing.
+Otherwise, it sets the io_fix flag to BUF_IO_READ and takes a non-recursive
+exclusive lock on the buffer frame. The io-handler must take care that the
+flag is cleared and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,
+ /*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ buf_page_t* bpage = NULL;
+ buf_page_t* watch_page;
+ rw_lock_t* hash_lock;
+ mtr_t mtr;
+ ulint fold;
+ ibool lru = FALSE;
+ void* data;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(buf_pool);
+
+ *err = DB_SUCCESS;
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+ /* It is a read-ahead within an ibuf routine */
+
+ ut_ad(!ibuf_bitmap_page(zip_size, offset));
+
+ ibuf_mtr_start(&mtr);
+
+ if (!recv_no_ibuf_operations
+ && !ibuf_page(space, zip_size, offset, &mtr)) {
+
+ ibuf_mtr_commit(&mtr);
+
+ return(NULL);
+ }
+ } else {
+ ut_ad(mode == BUF_READ_ANY_PAGE);
+ }
+
+ if (zip_size && !unzip && !recv_recovery_is_on()) {
+ block = NULL;
+ } else {
+ block = buf_LRU_get_free_block(buf_pool);
+ ut_ad(block);
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+ }
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ buf_pool_mutex_enter(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
+ if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
+ /* The page is already in the buffer pool. */
+ watch_page = NULL;
+err_exit:
+ rw_lock_x_unlock(hash_lock);
+ if (block) {
+ mutex_enter(&block->mutex);
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+ }
+
+ bpage = NULL;
+ goto func_exit;
+ }
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(
+ space, tablespace_version)) {
+ /* The page belongs to a space which has been
+ deleted or is being deleted. */
+ *err = DB_TABLESPACE_DELETED;
+
+ goto err_exit;
+ }
+
+ if (block) {
+ bpage = &block->page;
+
+ mutex_enter(&block->mutex);
+
+ ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+ buf_page_init(buf_pool, space, offset, fold, zip_size, block);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ /* Note: We set the io state without the protection of
+ the block->lock. This is because other threads cannot
+ access this block unless it is in the hash table. */
+
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+
+ /* We set a pass-type x-lock on the frame because then
+ the same thread which called for the read operation
+ (and is running now at this point of code) can wait
+ for the read to complete by waiting for the x-lock on
+ the frame; if the x-lock were recursive, the same
+ thread would illegally get the x-lock before the page
+ read is completed. The x-lock is cleared by the
+ io-handler thread. */
+
+ rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+
+#ifndef PAGE_ATOMIC_REF_COUNT
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+#endif /* !PAGE_ATOMIC_REF_COUNT */
+
+ if (zip_size) {
+ /* buf_pool->mutex may be released and
+ reacquired by buf_buddy_alloc(). Thus, we
+ must release block->mutex in order not to
+ break the latching order in the reacquisition
+ of buf_pool->mutex. We also must defer this
+ operation until after the block descriptor has
+ been added to buf_pool->LRU and
+ buf_pool->page_hash. */
+ mutex_exit(&block->mutex);
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = (page_zip_t*) data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU
+ after block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, TRUE);
+ }
+
+ mutex_exit(&block->mutex);
+ } else {
+ rw_lock_x_unlock(hash_lock);
+
+ /* The compressed page must be allocated before the
+ control block (bpage), in order to avoid the
+ invocation of buf_buddy_relocate_block() on
+ uninitialized data. */
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+
+ rw_lock_x_lock(hash_lock);
+
+ /* If buf_buddy_alloc() allocated storage from the LRU list,
+ it released and reacquired buf_pool->mutex. Thus, we must
+ check the page_hash again, as it may have been modified. */
+ if (UNIV_UNLIKELY(lru)) {
+
+ watch_page = buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+
+ if (UNIV_UNLIKELY(watch_page
+ && !buf_pool_watch_is_sentinel(buf_pool,
+ watch_page))) {
+
+ /* The block was added by some other thread. */
+ rw_lock_x_unlock(hash_lock);
+ watch_page = NULL;
+ buf_buddy_free(buf_pool, data, zip_size);
+
+ bpage = NULL;
+ goto func_exit;
+ }
+ }
+
+ bpage = buf_page_alloc_descriptor();
+
+ /* Initialize the buf_pool pointer. */
+ bpage->buf_pool_index = buf_pool_index(buf_pool);
+
+ page_zip_des_init(&bpage->zip);
+ page_zip_set_size(&bpage->zip, zip_size);
+ bpage->zip.data = (page_zip_t*) data;
+
+ mutex_enter(&buf_pool->zip_mutex);
+ UNIV_MEM_DESC(bpage->zip.data,
+ page_zip_get_size(&bpage->zip));
+
+ buf_page_init_low(bpage);
+
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = static_cast<ib_uint32_t>(space);
+ bpage->offset = static_cast<ib_uint32_t>(offset);
+
+#ifdef UNIV_DEBUG
+ bpage->in_page_hash = FALSE;
+ bpage->in_zip_hash = FALSE;
+ bpage->in_flush_list = FALSE;
+ bpage->in_free_list = FALSE;
+ bpage->in_LRU_list = FALSE;
+#endif /* UNIV_DEBUG */
+
+ ut_d(bpage->in_page_hash = TRUE);
+
+ if (watch_page != NULL) {
+
+ /* Preserve the reference count. */
+ ib_uint32_t buf_fix_count;
+
+ buf_fix_count = watch_page->buf_fix_count;
+
+ ut_a(buf_fix_count > 0);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(
+ &bpage->buf_fix_count, buf_fix_count);
+#else
+ bpage->buf_fix_count += buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+ ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
+ buf_pool_watch_remove(buf_pool, fold, watch_page);
+ }
+
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold,
+ bpage);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list, to the old blocks.
+ The zip_size is already set into the page zip */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ mutex_exit(&buf_pool->zip_mutex);
+ }
+
+ buf_pool->n_pend_reads++;
+func_exit:
+ buf_pool_mutex_exit(buf_pool);
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+
+ ibuf_mtr_commit(&mtr);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!bpage || buf_page_in_file(bpage));
+ return(bpage);
+}
+
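+/* For orientation: the usual caller of buf_page_init_for_read() is the
+read-ahead and single page read code in buf0rea.cc, which issues the
+actual file read after a successful init; on I/O completion,
+buf_page_io_complete() below clears the BUF_IO_READ fix and releases the
+X-latch that was taken here. */
+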
+/********************************************************************//**
+Initializes a page in the buffer pool. The page is usually not read
+from a file, even if it cannot be found in the buffer pool. This is one
+of the functions which perform a state transition on a block, NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ buf_frame_t* frame;
+ buf_block_t* block;
+ ulint fold;
+ buf_block_t* free_block = NULL;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ rw_lock_t* hash_lock;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(space || !zip_size);
+
+ free_block = buf_LRU_get_free_block(buf_pool);
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ buf_pool_mutex_enter(buf_pool);
+ rw_lock_x_lock(hash_lock);
+
+ block = (buf_block_t*) buf_page_hash_get_low(
+ buf_pool, space, offset, fold);
+
+ if (block
+ && buf_page_in_file(&block->page)
+ && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, offset) == 0);
+#endif
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ /* Page can be found in buf_pool */
+ buf_pool_mutex_exit(buf_pool);
+ rw_lock_x_unlock(hash_lock);
+
+ buf_block_free(free_block);
+
+ return(buf_page_get_with_no_latch(space, zip_size, offset, mtr));
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Creating space %lu page %lu to buffer\n",
+ (ulong) space, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = free_block;
+
+ mutex_enter(&block->mutex);
+
+ buf_page_init(buf_pool, space, offset, fold, zip_size, block);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, FALSE);
+
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ buf_pool->stat.n_pages_created++;
+
+ if (zip_size) {
+ void* data;
+ ibool lru;
+
+ /* Prevent race conditions during buf_buddy_alloc(),
+ which may release and reacquire buf_pool->mutex,
+ by IO-fixing and X-latching the block. */
+
+ buf_page_set_io_fix(&block->page, BUF_IO_READ);
+ rw_lock_x_lock(&block->lock);
+
+ mutex_exit(&block->mutex);
+ /* buf_pool->mutex may be released and reacquired by
+ buf_buddy_alloc(). Thus, we must release block->mutex
+ in order not to break the latching order in
+ the reacquisition of buf_pool->mutex. We also must
+ defer this operation until after the block descriptor
+ has been added to buf_pool->LRU and buf_pool->page_hash. */
+ data = buf_buddy_alloc(buf_pool, zip_size, &lru);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = (page_zip_t*) data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU after
+ block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ buf_page_set_io_fix(&block->page, BUF_IO_NONE);
+ rw_lock_x_unlock(&block->lock);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
+
+ buf_page_set_accessed(&block->page);
+
+ mutex_exit(&block->mutex);
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+
+ ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
+
+ frame = block->frame;
+
+ memset(frame + FIL_PAGE_PREV, 0xff, 4);
+ memset(frame + FIL_PAGE_NEXT, 0xff, 4);
+ mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+	/* Reset to zero the file flush lsn field in the page; if the first
+	page of an ibdata file is 'created' into the buffer pool by this
+	function, we lose the original contents of the file flush lsn stamp.
+	Then InnoDB could, in a crash recovery, print a big false corruption
+	warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
+
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ return(block);
+}
+
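+/* For orientation: buf_page_create() is used when a page is allocated
+afresh inside a mini-transaction, for example by the file space
+management code in fsp0fsp.cc; since the caller will reinitialize the
+whole page, its old contents need not be read from disk. */
+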
+/********************************************************************//**
+Monitors the buffer page read/write activity, and increments the
+corresponding counter value if the MONITOR_MODULE_BUF_PAGE
+(module_buf_page) module is enabled. */
+static
+void
+buf_page_monitor(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: pointer to the block */
+ enum buf_io_fix io_type)/*!< in: io_fix types */
+{
+ const byte* frame;
+ monitor_id_t counter;
+
+ /* If the counter module is not turned on, just return */
+ if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
+ return;
+ }
+
+ ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ frame = bpage->zip.data
+ ? bpage->zip.data
+ : ((buf_block_t*) bpage)->frame;
+
+ switch (fil_page_get_type(frame)) {
+ ulint level;
+
+ case FIL_PAGE_INDEX:
+ level = btr_page_get_level_low(frame);
+
+ /* Check if it is an index page for insert buffer */
+ if (btr_page_get_index_id(frame)
+ == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
+ }
+ } else {
+ if (level == 0) {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_LEAF_PAGE);
+ } else {
+ counter = MONITOR_RW_COUNTER(
+ io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ }
+ }
+ break;
+
+ case FIL_PAGE_UNDO_LOG:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ break;
+
+ case FIL_PAGE_INODE:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_FREE_LIST:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_FREELIST_PAGE);
+ break;
+
+ case FIL_PAGE_IBUF_BITMAP:
+ counter = MONITOR_RW_COUNTER(io_type,
+ MONITOR_IBUF_BITMAP_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_TRX_SYS:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_FSP_HDR:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_XDES:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_BLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ break;
+
+ case FIL_PAGE_TYPE_ZBLOB2:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ break;
+
+ default:
+ counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ }
+
+ MONITOR_INC_NOCHECK(counter);
+}
+
+/********************************************************************//**
+Marks the table belonging to the tablespace specified by bpage->space as
+corrupted. Also removes the bpage from the LRU list.
+@return TRUE if successful */
+static
+ibool
+buf_mark_space_corrupt(
+/*===================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+ ulint space = bpage->space;
+ ibool ret = TRUE;
+
+ /* First unfix and release lock on the bpage */
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(buf_page_get_mutex(bpage));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* Set BUF_IO_NONE before we remove the block from LRU list */
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(
+ &((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ mutex_exit(buf_page_get_mutex(bpage));
+
+ /* Find the table with specified space id, and mark it corrupted */
+ if (dict_set_corrupted_by_space(space)) {
+ buf_LRU_free_one_page(bpage);
+ } else {
+ ret = FALSE;
+ }
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool.
+@return true if successful */
+UNIV_INTERN
+bool
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ enum buf_io_fix io_type;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+
+ ut_a(buf_page_in_file(bpage));
+
+	/* We do not need to protect io_fix here by a mutex to read
+	it, because this is the only function where we can change the value
+	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+	ensures that this is the only thread that handles the i/o for this
+	block. */
+
+ io_type = buf_page_get_io_fix(bpage);
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ if (io_type == BUF_IO_READ) {
+ ulint read_page_no;
+ ulint read_space_id;
+ byte* frame;
+
+ if (buf_page_get_zip_size(bpage)) {
+ frame = bpage->zip.data;
+ buf_pool->n_pend_unzip++;
+ if (uncompressed
+ && !buf_zip_decompress((buf_block_t*) bpage,
+ FALSE)) {
+
+ buf_pool->n_pend_unzip--;
+ goto corrupt;
+ }
+ buf_pool->n_pend_unzip--;
+ } else {
+ ut_a(uncompressed);
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ /* If this page is not uninitialized and not in the
+ doublewrite buffer, then the page number and space id
+ should be the same as in block. */
+ read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
+ read_space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (bpage->space == TRX_SYS_SPACE
+ && buf_dblwr_page_inside(bpage->offset)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: reading page %lu\n"
+ "InnoDB: which is in the"
+ " doublewrite buffer!\n",
+ (ulong) bpage->offset);
+ } else if (!read_space_id && !read_page_no) {
+ /* This is likely an uninitialized page. */
+ } else if ((bpage->space
+ && bpage->space != read_space_id)
+ || bpage->offset != read_page_no) {
+ /* We did not compare space_id to read_space_id
+ if bpage->space == 0, because the field on the
+ page may contain garbage in MySQL < 4.1.1,
+ which only supported bpage->space == 0. */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: space id and page n:o"
+ " stored in the page\n"
+ "InnoDB: read in are %lu:%lu,"
+ " should be %lu:%lu!\n",
+ (ulong) read_space_id, (ulong) read_page_no,
+ (ulong) bpage->space,
+ (ulong) bpage->offset);
+ }
+
+ /* From version 3.23.38 up we store the page checksum
+ to the 4 first bytes of the page end lsn field */
+
+ if (buf_page_is_corrupted(true, frame,
+ buf_page_get_zip_size(bpage))) {
+
+ /* Not a real corruption if it was triggered by
+ error injection */
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+ if (bpage->space > TRX_SYS_SPACE
+ && buf_mark_space_corrupt(bpage)) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Simulated page corruption");
+ return(true);
+ }
+ goto page_not_corrupt;
+ ;);
+corrupt:
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ buf_page_print(frame, buf_page_get_zip_size(bpage),
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ fputs("InnoDB: It is also possible that"
+ " your operating\n"
+ "InnoDB: system has corrupted its"
+ " own file cache\n"
+ "InnoDB: and rebooting your computer"
+ " removes the\n"
+ "InnoDB: error.\n"
+ "InnoDB: If the corrupt page is an index page\n"
+ "InnoDB: you can also try to"
+ " fix the corruption\n"
+ "InnoDB: by dumping, dropping,"
+ " and reimporting\n"
+ "InnoDB: the corrupt table."
+ " You can use CHECK\n"
+ "InnoDB: TABLE to scan your"
+ " table for corruption.\n"
+ "InnoDB: See also "
+ REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
+				/* If the page space id is larger than
+				TRX_SYS_SPACE (0), we will attempt to mark
+				the corresponding table as corrupted,
+				instead of crashing the server. */
+ if (bpage->space > TRX_SYS_SPACE
+ && buf_mark_space_corrupt(bpage)) {
+ return(false);
+ } else {
+ fputs("InnoDB: Ending processing"
+ " because of"
+ " a corrupt database page.\n",
+ stderr);
+
+ ut_error;
+ }
+ }
+ }
+
+ DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+ page_not_corrupt: bpage = bpage; );
+
+ if (recv_recovery_is_on()) {
+ /* Pages must be uncompressed for crash recovery. */
+ ut_a(uncompressed);
+ recv_recover_page(TRUE, (buf_block_t*) bpage);
+ }
+
+ if (uncompressed && !recv_no_ibuf_operations) {
+ ibuf_merge_or_delete_for_page(
+ (buf_block_t*) bpage, bpage->space,
+ bpage->offset, buf_page_get_zip_size(bpage),
+ TRUE);
+ }
+ }
+
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(buf_page_get_mutex(bpage));
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (io_type == BUF_IO_WRITE || uncompressed) {
+ /* For BUF_IO_READ of compressed-only blocks, the
+ buffered operations will be merged by buf_page_get_gen()
+ after the block has been uncompressed. */
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+ }
+#endif
+	/* Because the thread which does the unlocking is not the same one
+	that did the locking, we use a pass value != 0 in unlock, which
+	simply removes the newest lock debug record, without checking the
+	thread id. */
+
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ switch (io_type) {
+ case BUF_IO_READ:
+ /* NOTE that the call to ibuf may have moved the ownership of
+ the x-latch to this OS thread: do not let this confuse you in
+ debugging! */
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+ buf_pool->stat.n_pages_read++;
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ break;
+
+ case BUF_IO_WRITE:
+ /* Write means a flush operation: call the completion
+ routine in the flush system */
+
+ buf_flush_write_complete(bpage);
+
+ if (uncompressed) {
+ rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ buf_pool->stat.n_pages_written++;
+
+ break;
+
+ default:
+ ut_error;
+ }
+
+ buf_page_monitor(bpage, io_type);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Has %s page space %lu page no %lu\n",
+ io_type == BUF_IO_READ ? "read" : "written",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif /* UNIV_DEBUG */
+
+ mutex_exit(buf_page_get_mutex(bpage));
+ buf_pool_mutex_exit(buf_pool);
+
+ return(true);
+}
+
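+/* For orientation: this completion routine is invoked from the I/O
+handler threads (via fil_aio_wait() in fil0fil.cc) for asynchronous
+requests, and by the issuing thread itself for synchronous reads. */
+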
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+static
+ibool
+buf_all_freed_instance(
+/*===================*/
+	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
+{
+ ulint i;
+ buf_chunk_t* chunk;
+
+ ut_ad(buf_pool);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ const buf_block_t* block = buf_chunk_not_freed(chunk);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ fprintf(stderr,
+ "Page %lu %lu still fixed or dirty\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ ut_error;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Invalidates file pages in one buffer pool instance */
+static
+void
+buf_pool_invalidate_instance(
+/*=========================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ulint i;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+
+		/* As this function is called during startup and
+		during the redo application phase of recovery, InnoDB
+		is single threaded (apart from IO helper threads) at
+		this stage. No new write batch can be in the
+		initialization stage at this point. */
+ ut_ad(buf_pool->init_flush[i] == FALSE);
+
+ /* However, it is possible that a write batch that has
+ been posted earlier is still not complete. For buffer
+ pool invalidation to proceed we must ensure there is NO
+ write activity happening. */
+ if (buf_pool->n_flush[i] > 0) {
+ buf_flush_t type = static_cast<buf_flush_t>(i);
+
+ buf_pool_mutex_exit(buf_pool);
+ buf_flush_wait_batch_end(buf_pool, type);
+ buf_pool_mutex_enter(buf_pool);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ ut_ad(buf_all_freed_instance(buf_pool));
+
+ buf_pool_mutex_enter(buf_pool);
+
+ while (buf_LRU_scan_and_free_block(buf_pool, TRUE)) {
+ }
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+
+ buf_pool->freed_page_clock = 0;
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+
+ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
+ buf_refresh_io_stats(buf_pool);
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void)
+/*=====================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_invalidate_instance(buf_pool_from_array(i));
+ }
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates data in one buffer pool instance
+@return TRUE */
+static
+ibool
+buf_pool_validate_instance(
+/*=======================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_page_t* b;
+ buf_chunk_t* chunk;
+ ulint i;
+ ulint n_lru_flush = 0;
+ ulint n_page_flush = 0;
+ ulint n_list_flush = 0;
+ ulint n_lru = 0;
+ ulint n_flush = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+ ulint fold = 0;
+ ulint space = 0;
+ ulint offset = 0;
+
+ ut_ad(buf_pool);
+
+ buf_pool_mutex_enter(buf_pool);
+ hash_lock_x_all(buf_pool->page_hash);
+
+ chunk = buf_pool->chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These should only occur on
+ zip_clean, zip_free[], or flush_list. */
+ ut_error;
+ break;
+
+ case BUF_BLOCK_FILE_PAGE:
+ space = buf_block_get_space(block);
+ offset = buf_block_get_page_no(block);
+ fold = buf_page_address_fold(space, offset);
+ ut_a(buf_page_hash_get_low(buf_pool,
+ space,
+ offset,
+ fold)
+ == &block->page);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(buf_page_get_io_fix(&block->page)
+ == BUF_IO_READ
+ || !ibuf_count_get(buf_block_get_space(
+ block),
+ buf_block_get_page_no(
+ block)));
+#endif
+ switch (buf_page_get_io_fix(&block->page)) {
+ case BUF_IO_NONE:
+ break;
+
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(
+ &block->page)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ goto assert_s_latched;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_page_flush++;
+assert_s_latched:
+ ut_a(rw_lock_is_locked(
+ &block->lock,
+ RW_LOCK_SHARED));
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ default:
+ ut_error;
+ }
+
+ break;
+
+ case BUF_IO_READ:
+
+ ut_a(rw_lock_is_locked(&block->lock,
+ RW_LOCK_EX));
+ break;
+
+ case BUF_IO_PIN:
+ break;
+ }
+
+ n_lru++;
+ break;
+
+ case BUF_BLOCK_NOT_USED:
+ n_free++;
+ break;
+
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* do nothing */
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ /* Check clean compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ case BUF_IO_PIN:
+ /* All clean blocks should be I/O-unfixed. */
+ break;
+ case BUF_IO_READ:
+ /* In buf_LRU_free_page(), we temporarily set
+ b->io_fix = BUF_IO_READ for a newly allocated
+ control block in order to prevent
+ buf_page_get_gen() from decompressing the block. */
+ break;
+ default:
+ ut_error;
+ break;
+ }
+
+ /* It is OK to read oldest_modification here because
+ we have acquired buf_pool->zip_mutex above which acts
+ as the 'block->mutex' for these bpages. */
+ ut_a(!b->oldest_modification);
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
+ n_lru++;
+ n_zip++;
+ }
+
+ /* Check dirty blocks. */
+
+ buf_flush_list_mutex_enter(buf_pool);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_flush_list);
+ ut_a(b->oldest_modification);
+ n_flush++;
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ n_lru++;
+ n_zip++;
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ case BUF_IO_PIN:
+ break;
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(b)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_page_flush++;
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ default:
+ ut_error;
+ }
+ break;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ fold = buf_page_address_fold(b->space, b->offset);
+ ut_a(buf_page_hash_get_low(buf_pool, b->space, b->offset,
+ fold) == b);
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+ hash_unlock_x_all(buf_pool->page_hash);
+ buf_flush_list_mutex_exit(buf_pool);
+
+ mutex_exit(&buf_pool->zip_mutex);
+
+ if (n_lru + n_free > buf_pool->curr_size + n_zip) {
+ fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
+ (ulong) n_lru, (ulong) n_free,
+ (ulong) buf_pool->curr_size, (ulong) n_zip);
+ ut_error;
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
+ if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+ fprintf(stderr, "Free list len %lu, free blocks %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) n_free);
+ ut_error;
+ }
+
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ ut_a(buf_LRU_validate());
+ ut_a(buf_flush_validate(buf_pool));
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the buffer buf_pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void)
+/*==============*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_validate_instance(buf_pool);
+ }
+ return(TRUE);
+}
+
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer buf_pool data structure for one instance. */
+static
+void
+buf_print_instance(
+/*===============*/
+ buf_pool_t* buf_pool)
+{
+ index_id_t* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ index_id_t id;
+ ulint n_found;
+ buf_chunk_t* chunk;
+ dict_index_t* index;
+
+ ut_ad(buf_pool);
+
+ size = buf_pool->curr_size;
+
+ index_ids = static_cast<index_id_t*>(
+ mem_alloc(size * sizeof *index_ids));
+
+ counts = static_cast<ulint*>(mem_alloc(sizeof(ulint) * size));
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
+
+ fprintf(stderr,
+ "buf_pool size %lu\n"
+ "database pages %lu\n"
+ "free pages %lu\n"
+ "modified database pages %lu\n"
+ "n pending decompressions %lu\n"
+ "n pending reads %lu\n"
+ "n pending flush LRU %lu list %lu single page %lu\n"
+ "pages made young %lu, not young %lu\n"
+ "pages read %lu, created %lu, written %lu\n",
+ (ulong) size,
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+ (ulong) buf_pool->n_pend_unzip,
+ (ulong) buf_pool->n_pend_reads,
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
+ (ulong) buf_pool->stat.n_pages_made_young,
+ (ulong) buf_pool->stat.n_pages_not_made_young,
+ (ulong) buf_pool->stat.n_pages_read,
+ (ulong) buf_pool->stat.n_pages_created,
+ (ulong) buf_pool->stat.n_pages_written);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (index_ids[j] == id) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ fprintf(stderr,
+ "Block count for index %llu in buffer is about %lu",
+ (ullint) index_ids[i],
+ (ulong) counts[i]);
+
+ if (index) {
+ putc(' ', stderr);
+ dict_index_name_print(stderr, NULL, index);
+ }
+
+ putc('\n', stderr);
+ }
+
+ mem_free(index_ids);
+ mem_free(counts);
+
+ ut_a(buf_pool_validate_instance(buf_pool));
+}
+
+/*********************************************************************//**
+Prints info of the buffer pool data structure. */
+UNIV_INTERN
+void
+buf_print(void)
+/*===========*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ buf_print_instance(buf_pool);
+ }
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number_instance(
+/*==================================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_page_t* b;
+ ulint i;
+ buf_chunk_t* chunk;
+ ulint fixed_pages_number = 0;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block;
+ ulint j;
+
+ block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE) {
+
+ continue;
+ }
+
+ mutex_enter(&block->mutex);
+
+ if (block->page.buf_fix_count != 0
+ || buf_page_get_io_fix(&block->page)
+ != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool->zip_mutex);
+
+ /* Traverse the lists of clean and dirty compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
+
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ }
+
+ buf_flush_list_mutex_enter(buf_pool);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_flush_list);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+ mutex_exit(&buf_pool->zip_mutex);
+ buf_pool_mutex_exit(buf_pool);
+
+ return(fixed_pages_number);
+}
+
+/*********************************************************************//**
+Returns the number of latched pages in all the buffer pools.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void)
+/*==============================*/
+{
+ ulint i;
+ ulint total_latched_pages = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ total_latched_pages += buf_get_latched_pages_number_instance(
+ buf_pool);
+ }
+
+ return(total_latched_pages);
+}
+
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Returns the number of pending buffer pool read I/Os.
+@return number of pending read I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_read_ios(void)
+/*============================*/
+{
+ ulint i;
+ ulint pend_ios = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ pend_ios += buf_pool_from_array(i)->n_pend_reads;
+ }
+
+ return(pend_ios);
+}
+
+/*********************************************************************//**
+Returns the percentage of modified pages in the buffer pool, relative
+to the number of database pages in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void)
+/*============================*/
+{
+ ulint ratio;
+ ulint lru_len = 0;
+ ulint free_len = 0;
+ ulint flush_list_len = 0;
+
+ buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
+
+ ratio = (100 * flush_list_len) / (1 + lru_len + free_len);
+
+ /* The "1 +" is there to avoid division by zero */
+
+ return(ratio);
+}
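+
+/* A worked instance of the formula above (illustrative values only):
+with flush_list_len = 300, lru_len = 900 and free_len = 99, the result
+is ratio = (100 * 300) / (1 + 900 + 99) = 30, i.e. about 30% of the
+cached database pages are modified. The "1 +" term only matters when
+all the lists are empty, where it yields 0 instead of dividing by zero. */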
+
+/*******************************************************************//**
+Aggregates one buffer pool's stats information into the total buffer pool stats */
+static
+void
+buf_stats_aggregate_pool_info(
+/*==========================*/
+ buf_pool_info_t* total_info, /*!< in/out: the buffer pool
+ info to store aggregated
+ result */
+ const buf_pool_info_t* pool_info) /*!< in: individual buffer pool
+ stats info */
+{
+ ut_a(total_info && pool_info);
+
+ /* Nothing to copy if total_info is the same as pool_info */
+ if (total_info == pool_info) {
+ return;
+ }
+
+ total_info->pool_size += pool_info->pool_size;
+ total_info->lru_len += pool_info->lru_len;
+ total_info->old_lru_len += pool_info->old_lru_len;
+ total_info->free_list_len += pool_info->free_list_len;
+ total_info->flush_list_len += pool_info->flush_list_len;
+ total_info->n_pend_unzip += pool_info->n_pend_unzip;
+ total_info->n_pend_reads += pool_info->n_pend_reads;
+ total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
+ total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
+ total_info->n_pages_made_young += pool_info->n_pages_made_young;
+ total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
+ total_info->n_pages_read += pool_info->n_pages_read;
+ total_info->n_pages_created += pool_info->n_pages_created;
+ total_info->n_pages_written += pool_info->n_pages_written;
+ total_info->n_page_gets += pool_info->n_page_gets;
+ total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
+ total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
+ total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
+ total_info->page_made_young_rate += pool_info->page_made_young_rate;
+ total_info->page_not_made_young_rate +=
+ pool_info->page_not_made_young_rate;
+ total_info->pages_read_rate += pool_info->pages_read_rate;
+ total_info->pages_created_rate += pool_info->pages_created_rate;
+ total_info->pages_written_rate += pool_info->pages_written_rate;
+ total_info->n_page_get_delta += pool_info->n_page_get_delta;
+ total_info->page_read_delta += pool_info->page_read_delta;
+ total_info->young_making_delta += pool_info->young_making_delta;
+ total_info->not_young_making_delta += pool_info->not_young_making_delta;
+ total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
+ total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
+ total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
+ total_info->unzip_lru_len += pool_info->unzip_lru_len;
+ total_info->io_sum += pool_info->io_sum;
+ total_info->io_cur += pool_info->io_cur;
+ total_info->unzip_sum += pool_info->unzip_sum;
+ total_info->unzip_cur += pool_info->unzip_cur;
+}
+/*******************************************************************//**
+Collects buffer pool stats information for a buffer pool. Also
+records aggregated stats if there is more than one buffer pool
+in the server */
+UNIV_INTERN
+void
+buf_stats_get_pool_info(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint pool_id, /*!< in: buffer pool ID */
+ buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info
+ to fill */
+{
+ buf_pool_info_t* pool_info;
+ time_t current_time;
+ double time_elapsed;
+
+ /* Find appropriate pool_info to store stats for this buffer pool */
+ pool_info = &all_pool_info[pool_id];
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
+
+ pool_info->pool_unique_id = pool_id;
+
+ pool_info->pool_size = buf_pool->curr_size;
+
+ pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ pool_info->old_lru_len = buf_pool->LRU_old_len;
+
+ pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
+
+ pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+ pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ pool_info->n_pend_reads = buf_pool->n_pend_reads;
+
+ pool_info->n_pending_flush_lru =
+ (buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->init_flush[BUF_FLUSH_LRU]);
+
+ pool_info->n_pending_flush_list =
+ (buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->init_flush[BUF_FLUSH_LIST]);
+
+ pool_info->n_pending_flush_single_page =
+ (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+ + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool->last_printout_time);
+
+ pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
+
+ pool_info->n_pages_not_made_young =
+ buf_pool->stat.n_pages_not_made_young;
+
+ pool_info->n_pages_read = buf_pool->stat.n_pages_read;
+
+ pool_info->n_pages_created = buf_pool->stat.n_pages_created;
+
+ pool_info->n_pages_written = buf_pool->stat.n_pages_written;
+
+ pool_info->n_page_gets = buf_pool->stat.n_page_gets;
+
+ pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
+ pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
+
+ pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
+
+ pool_info->page_made_young_rate =
+ (buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
+
+ pool_info->page_not_made_young_rate =
+ (buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
+
+ pool_info->pages_read_rate =
+ (buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read) / time_elapsed;
+
+ pool_info->pages_created_rate =
+ (buf_pool->stat.n_pages_created
+ - buf_pool->old_stat.n_pages_created) / time_elapsed;
+
+ pool_info->pages_written_rate =
+ (buf_pool->stat.n_pages_written
+ - buf_pool->old_stat.n_pages_written) / time_elapsed;
+
+ pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
+ - buf_pool->old_stat.n_page_gets;
+
+ if (pool_info->n_page_get_delta) {
+ pool_info->page_read_delta = buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read;
+
+ pool_info->young_making_delta =
+ buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young;
+
+ pool_info->not_young_making_delta =
+ buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young;
+ }
+ pool_info->pages_readahead_rnd_rate =
+ (buf_pool->stat.n_ra_pages_read_rnd
+ - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
+
+
+ pool_info->pages_readahead_rate =
+ (buf_pool->stat.n_ra_pages_read
+ - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
+
+ pool_info->pages_evicted_rate =
+ (buf_pool->stat.n_ra_pages_evicted
+ - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
+
+ pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ pool_info->io_sum = buf_LRU_stat_sum.io;
+
+ pool_info->io_cur = buf_LRU_stat_cur.io;
+
+ pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+ pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+ buf_refresh_io_stats(buf_pool);
+ buf_pool_mutex_exit(buf_pool);
+}
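+
+/* A worked instance of the *_rate fields filled in above (illustrative
+values only): if n_pages_read grew from 4000 to 7000 since the last
+printout and 30 seconds elapsed, then time_elapsed is about 30.001 and
+pages_read_rate = (7000 - 4000) / 30.001, roughly 100 pages/s. The
+0.001 offset guards against division by zero when two printouts land
+on the same second. */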
+
+/*********************************************************************//**
+Prints info of the buffer i/o for one buffer pool instance. */
+UNIV_INTERN
+void
+buf_print_io_instance(
+/*==================*/
+ buf_pool_info_t* pool_info, /*!< in: buffer pool info */
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ut_ad(pool_info);
+
+ fprintf(file,
+ "Buffer pool size %lu\n"
+ "Free buffers %lu\n"
+ "Database pages %lu\n"
+ "Old database pages %lu\n"
+ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ pool_info->pool_size,
+ pool_info->free_list_len,
+ pool_info->lru_len,
+ pool_info->old_lru_len,
+ pool_info->flush_list_len,
+ pool_info->n_pend_reads,
+ pool_info->n_pending_flush_lru,
+ pool_info->n_pending_flush_list,
+ pool_info->n_pending_flush_single_page);
+
+ fprintf(file,
+ "Pages made young %lu, not young %lu\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read %lu, created %lu, written %lu\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ pool_info->n_pages_made_young,
+ pool_info->n_pages_not_made_young,
+ pool_info->page_made_young_rate,
+ pool_info->page_not_made_young_rate,
+ pool_info->n_pages_read,
+ pool_info->n_pages_created,
+ pool_info->n_pages_written,
+ pool_info->pages_read_rate,
+ pool_info->pages_created_rate,
+ pool_info->pages_written_rate);
+
+ if (pool_info->n_page_get_delta) {
+ fprintf(file,
+ "Buffer pool hit rate %lu / 1000,"
+ " young-making rate %lu / 1000 not %lu / 1000\n",
+ (ulong) (1000 - (1000 * pool_info->page_read_delta
+ / pool_info->n_page_get_delta)),
+ (ulong) (1000 * pool_info->young_making_delta
+ / pool_info->n_page_get_delta),
+ (ulong) (1000 * pool_info->not_young_making_delta
+ / pool_info->n_page_get_delta));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s,"
+ " Random read ahead %.2f/s\n",
+
+ pool_info->pages_readahead_rate,
+ pool_info->pages_evicted_rate,
+ pool_info->pages_readahead_rnd_rate);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: %lu, unzip_LRU len: %lu\n"
+ "I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
+ pool_info->lru_len, pool_info->unzip_lru_len,
+ pool_info->io_sum, pool_info->io_cur,
+ pool_info->unzip_sum, pool_info->unzip_cur);
+}
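+
+/* A worked instance of the hit rate printed above (illustrative values
+only): with n_page_get_delta = 10000 and page_read_delta = 200, the
+printed hit rate is 1000 - (1000 * 200 / 10000) = 980 / 1000, i.e. 98%
+of the page requests since the last printout were served from the
+buffer pool without a disk read. */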
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ ulint i;
+ buf_pool_info_t* pool_info;
+ buf_pool_info_t* pool_info_total;
+
+ /* If srv_buf_pool_instances is greater than 1, allocate
+ one extra buf_pool_info_t, the last one stores
+ aggregated/total values from all pools */
+ if (srv_buf_pool_instances > 1) {
+ pool_info = (buf_pool_info_t*) mem_zalloc((
+ srv_buf_pool_instances + 1) * sizeof *pool_info);
+
+ pool_info_total = &pool_info[srv_buf_pool_instances];
+ } else {
+ ut_a(srv_buf_pool_instances == 1);
+
+ pool_info_total = pool_info =
+ static_cast<buf_pool_info_t*>(
+ mem_zalloc(sizeof *pool_info));
+ }
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch individual buffer pool info and calculate
+ aggregated stats along the way */
+ buf_stats_get_pool_info(buf_pool, i, pool_info);
+
+ /* If we have more than one buffer pool, store
+ the aggregated stats */
+ if (srv_buf_pool_instances > 1) {
+ buf_stats_aggregate_pool_info(pool_info_total,
+ &pool_info[i]);
+ }
+ }
+
+ /* Print the aggregate buffer pool info */
+ buf_print_io_instance(pool_info_total, file);
+
+ /* If there is more than one buffer pool, print each individual
+ pool's info */
+ if (srv_buf_pool_instances > 1) {
+ fputs("----------------------\n"
+ "INDIVIDUAL BUFFER POOL INFO\n"
+ "----------------------\n", file);
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ fprintf(file, "---BUFFER POOL %lu\n", i);
+ buf_print_io_instance(&pool_info[i], file);
+ }
+ }
+
+ mem_free(pool_info);
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(
+/*=================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ buf_pool->last_printout_time = ut_time();
+ buf_pool->old_stat = buf_pool->stat;
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats_all(void)
+/*==========================*/
+{
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_refresh_io_stats(buf_pool);
+ }
+}
+
+/**********************************************************************//**
+Checks whether all pages in all buffer pools are in a replaceable state.
+@return FALSE if not */
+UNIV_INTERN
+ibool
+buf_all_freed(void)
+/*===============*/
+{
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_all_freed_instance(buf_pool)) {
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Checks that there currently are no pending i/o operations for the buffer
+pool.
+@return number of pending i/o */
+UNIV_INTERN
+ulint
+buf_pool_check_no_pending_io(void)
+/*==============================*/
+{
+ ulint i;
+ ulint pending_io = 0;
+
+ buf_pool_mutex_enter_all();
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ const buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ pending_io += buf_pool->n_pend_reads
+ + buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
+ + buf_pool->n_flush[BUF_FLUSH_LIST];
+
+ }
+
+ buf_pool_mutex_exit_all();
+
+ return(pending_io);
+}
+
+#if 0
+Code currently not used
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void)
+/*=======================*/
+{
+ ulint len;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ len = UT_LIST_GET_LEN(buf_pool->free);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(len);
+}
+#endif
+
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a page in the buffer pool, for use in mysqlbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+ block->page.space = space;
+ block->page.offset = offset;
+
+ page_zip_des_init(&block->page.zip);
+
+ /* We assume that block->page.data has been allocated
+ with zip_size == UNIV_PAGE_SIZE. */
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_ad(ut_is_2pow(zip_size));
+ page_zip_set_size(&block->page.zip, zip_size);
+ if (zip_size) {
+ block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
new file mode 100644
index 00000000000..ec79bbe6be9
--- /dev/null
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0checksum.cc
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "fil0fil.h" /* FIL_* */
+#include "ut0crc32.h" /* ut_crc32() */
+#include "ut0rnd.h" /* ut_fold_binary() */
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srv0srv.h" /* SRV_CHECKSUM_* */
+#include "buf0types.h"
+
+/** the macro MYSQL_SYSVAR_ENUM() requires "long unsigned int" and if we
+use srv_checksum_algorithm_t here then we get a compiler error:
+ha_innodb.cc:12251: error: cannot convert 'srv_checksum_algorithm_t*' to
+ 'long unsigned int*' in initialization */
+UNIV_INTERN ulong srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_INNODB;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ib_uint32_t checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of the page because
+ there we store the old formula checksum. */
+
+ checksum = ut_crc32(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ return(checksum);
+}
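+
+/* A concrete reading of the two ut_crc32() calls above, assuming the
+usual header offsets (FIL_PAGE_OFFSET = 4, FIL_PAGE_FILE_FLUSH_LSN = 26,
+FIL_PAGE_DATA = 38, FIL_PAGE_END_LSN_OLD_CHKSUM = 8) and the default
+UNIV_PAGE_SIZE of 16384: the checksummed ranges are bytes [4, 26) and
+[38, 16376). The checksum field itself at offset 0, the flush LSN field
+at offset 26 and the trailing 8 bytes are all excluded, exactly as the
+comment explains. The offsets are quoted here only for illustration;
+fil0fil.h is authoritative. */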
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+ to the first pages of data files, we have to skip them in the page
+ checksum calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of the page because
+ there we store the old formula checksum. */
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+ srv_checksum_algorithm_t algo) /*!< in: algorithm */
+{
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ return("crc32");
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ return("innodb");
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return("none");
+ }
+
+ ut_error;
+ return(NULL);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000000..62222993622
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,1171 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+
+#ifndef UNIV_HOTBACKUP
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t buf_dblwr_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** The doublewrite buffer */
+UNIV_INTERN buf_dblwr_t* buf_dblwr = NULL;
+
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool buf_dblwr_being_created = FALSE;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+ ulint page_no) /*!< in: page number */
+{
+ if (buf_dblwr == NULL) {
+
+ return(FALSE);
+ }
+
+ if (page_no >= buf_dblwr->block1
+ && page_no < buf_dblwr->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ if (page_no >= buf_dblwr->block2
+ && page_no < buf_dblwr->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
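+
+/* A sketch of the range check above, assuming the layout that
+buf_dblwr_create() produces in a fresh system tablespace with 16KB
+pages (block1 = 64, block2 = 128, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE = 64):
+pages 64..127 and 128..191 are inside the doublewrite buffer, so
+buf_dblwr_page_inside(100) returns TRUE while buf_dblwr_page_inside(300)
+returns FALSE. The real block numbers are read from the TRX_SYS page
+and may differ; the values here are illustrative only. */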
+
+/****************************************************************//**
+Calls buf_page_get() on the TRX_SYS_PAGE and returns a pointer to the
+doublewrite buffer within it.
+@return pointer to the doublewrite buffer within the filespace header
+page. */
+UNIV_INLINE
+byte*
+buf_dblwr_get(
+/*==========*/
+ mtr_t* mtr) /*!< in/out: MTR to hold the page latch */
+{
+ buf_block_t* block;
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE);
+}
+
+/********************************************************************//**
+Flushes to the datafiles a batch of writes that have already been
+written to the dblwr buffer on disk. */
+UNIV_INLINE
+void
+buf_dblwr_sync_datafiles()
+/*======================*/
+{
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+ /* Wait until all async writes to tablespaces have been posted to
+ the OS */
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at database start. */
+static
+void
+buf_dblwr_init(
+/*===========*/
+ byte* doublewrite) /*!< in: pointer to the doublewrite buf
+ header on trx sys page */
+{
+ ulint buf_size;
+
+ buf_dblwr = static_cast<buf_dblwr_t*>(
+ mem_zalloc(sizeof(buf_dblwr_t)));
+
+ /* There are two blocks of the same size in the doublewrite
+ buffer. */
+ buf_size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+ /* There must be at least one buffer for single page writes
+ and one buffer for batch writes. */
+ ut_a(srv_doublewrite_batch_size > 0
+ && srv_doublewrite_batch_size < buf_size);
+
+ mutex_create(buf_dblwr_mutex_key,
+ &buf_dblwr->mutex, SYNC_DOUBLEWRITE);
+
+ buf_dblwr->b_event = os_event_create();
+ buf_dblwr->s_event = os_event_create();
+ buf_dblwr->first_free = 0;
+ buf_dblwr->s_reserved = 0;
+ buf_dblwr->b_reserved = 0;
+
+ buf_dblwr->block1 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+ buf_dblwr->block2 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+
+ buf_dblwr->in_use = static_cast<bool*>(
+ mem_zalloc(buf_size * sizeof(bool)));
+
+ buf_dblwr->write_buf_unaligned = static_cast<byte*>(
+ ut_malloc((1 + buf_size) * UNIV_PAGE_SIZE));
+
+ buf_dblwr->write_buf = static_cast<byte*>(
+ ut_align(buf_dblwr->write_buf_unaligned,
+ UNIV_PAGE_SIZE));
+
+ buf_dblwr->buf_block_arr = static_cast<buf_page_t**>(
+ mem_zalloc(buf_size * sizeof(void*)));
+}
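+
+/* A sketch of the alignment trick used for write_buf above (addresses
+made up for illustration): if ut_malloc() returns the unaligned address
+0x7f312a03, allocating one extra page and rounding up with ut_align()
+yields write_buf = 0x7f314000, the first UNIV_PAGE_SIZE-aligned address
+inside the allocation, with room for all buf_size pages after it. Page
+alignment is needed because the buffer is written to the data files
+with aligned file I/O. */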
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void)
+/*==================*/
+{
+ buf_block_t* block2;
+ buf_block_t* new_block;
+ byte* doublewrite;
+ byte* fseg_header;
+ ulint page_no;
+ ulint prev_page_no;
+ ulint i;
+ mtr_t mtr;
+
+ if (buf_dblwr) {
+ /* Already inited */
+
+ return;
+ }
+
+start_again:
+ mtr_start(&mtr);
+ buf_dblwr_being_created = TRUE;
+
+ doublewrite = buf_dblwr_get(&mtr);
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ buf_dblwr_init(doublewrite);
+
+ mtr_commit(&mtr);
+ buf_dblwr_being_created = FALSE;
+ return;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Doublewrite buffer not found: creating new");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your buffer pool size. Cannot continue "
+ "operation.");
+
+ exit(EXIT_FAILURE);
+ }
+
+ block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your tablespace size. "
+ "Cannot continue operation.");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(EXIT_FAILURE);
+ }
+
+ fseg_header = doublewrite + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ new_block = fseg_alloc_free_page(
+ fseg_header, prev_page_no + 1, FSP_UP, &mtr);
+ if (new_block == NULL) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create doublewrite buffer: you must "
+ "increase your tablespace size. "
+ "Cannot continue operation.");
+
+ exit(EXIT_FAILURE);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we read pages from the doublewrite
+ buffer at database startup, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ page_no = buf_block_get_page_no(new_block);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ if (((i + 1) & 15) == 0) {
+ /* rw_locks can only be recursively x-locked
+ 2048 times. (on 32 bit platforms,
+ (lint) 0 - (X_LOCK_DECR * 2049)
+ is no longer a negative number, and thus
+ lock_word becomes like a shared lock).
+ For 4k page size this loop will
+ lock the fseg header too many times. Since
+ this code is not done while any other threads
+ are active, restart the MTR occasionally. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ doublewrite = buf_dblwr_get(&mtr);
+ fseg_header = doublewrite
+ + TRX_SYS_DOUBLEWRITE_FSEG;
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+
+ /* Remove doublewrite pages from LRU */
+ buf_pool_invalidate();
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Doublewrite buffer created");
+
+ goto start_again;
+}
+
+/****************************************************************//**
+At database startup, initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in crash
+recovery, this function loads the pages from the doublewrite buffer into memory. */
+void
+buf_dblwr_init_or_load_pages(
+/*=========================*/
+ os_file_t file,
+ char* path,
+ bool load_corrupt_pages)
+{
+ byte* buf;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ ulint block1;
+ ulint block2;
+ byte* page;
+ ibool reset_space_ids = FALSE;
+ byte* doublewrite;
+ ulint space_id;
+ ulint i;
+ ulint block_bytes = 0;
+ recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
+
+ /* We do the file i/o past the buffer pool */
+
+ unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+ read_buf = static_cast<byte*>(
+ ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
+
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
+ off_t trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE;
+ os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE);
+
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created */
+
+ buf_dblwr_init(doublewrite);
+
+ block1 = buf_dblwr->block1;
+ block2 = buf_dblwr->block2;
+
+ buf = buf_dblwr->write_buf;
+ } else {
+ goto leave_func;
+ }
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ reset_space_ids = TRUE;
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Resetting space id's in the doublewrite buffer");
+ }
+
+ /* Read the pages from the doublewrite buffer to memory */
+
+ block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+ os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes);
+ os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE,
+ block_bytes);
+
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ ulint source_page_no;
+
+ if (reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ os_file_write(path, file, page,
+ source_page_no * UNIV_PAGE_SIZE,
+ UNIV_PAGE_SIZE);
+
+ } else if (load_corrupt_pages) {
+
+ recv_dblwr.add(page);
+ }
+
+ page += UNIV_PAGE_SIZE;
+ }
+
+ if (reset_space_ids) {
+ os_file_flush(file);
+ }
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Process the double write buffer pages. */
+void
+buf_dblwr_process()
+/*===============*/
+{
+ ulint space_id;
+ ulint page_no;
+ ulint page_no_dblwr = 0;
+ byte* page;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ recv_dblwr_t& recv_dblwr = recv_sys->dblwr;
+
+ unaligned_read_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+ read_buf = static_cast<byte*>(
+ ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
+
+ for (std::list<byte*>::iterator i = recv_dblwr.pages.begin();
+ i != recv_dblwr.pages.end(); ++i, ++page_no_dblwr ) {
+
+ page = *i;
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+ space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+
+ if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "A page in the doublewrite buffer is not "
+ "within space bounds; space id %lu "
+ "page number %lu, page %lu in "
+ "doublewrite buf.",
+ (ulong) space_id, (ulong) page_no,
+ page_no_dblwr);
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space_id);
+
+ /* Read in the actual page from the file */
+ fil_io(OS_FILE_READ, true, space_id, zip_size,
+ page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ read_buf, NULL);
+
+ /* Check if the page is corrupt */
+
+ if (buf_page_is_corrupted(true, read_buf, zip_size)) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page"
+ " corruption or a failed\n"
+ "InnoDB: file read of"
+ " space %lu page %lu.\n"
+ "InnoDB: Trying to recover it from"
+ " the doublewrite buffer.\n",
+ (ulong) space_id, (ulong) page_no);
+
+ if (buf_page_is_corrupted(true,
+ page, zip_size)) {
+ fprintf(stderr,
+ "InnoDB: Dump of the page:\n");
+ buf_page_print(
+ read_buf, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "InnoDB: Dump of"
+ " corresponding page"
+ " in doublewrite buffer:\n");
+ buf_page_print(
+ page, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fprintf(stderr,
+ "InnoDB: Also the page in the"
+ " doublewrite buffer"
+ " is corrupt.\n"
+ "InnoDB: Cannot continue"
+ " operation.\n"
+ "InnoDB: You can try to"
+ " recover the database"
+ " with the my.cnf\n"
+ "InnoDB: option:\n"
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
+ ut_error;
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, true, space_id,
+ zip_size, page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page, NULL);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Recovered the page from"
+ " the doublewrite buffer.");
+
+ } else if (buf_page_is_zeroes(read_buf, zip_size)) {
+
+ if (!buf_page_is_zeroes(page, zip_size)
+ && !buf_page_is_corrupted(true, page,
+ zip_size)) {
+
+ /* Database page contained only
+ zeroes, while a valid copy is
+ available in dblwr buffer. */
+
+ fil_io(OS_FILE_WRITE, true, space_id,
+ zip_size, page_no, 0,
+ zip_size ? zip_size
+ : UNIV_PAGE_SIZE,
+ page, NULL);
+ }
+ }
+ }
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Frees doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void)
+/*================*/
+{
+ /* Free the double write data structures. */
+ ut_a(buf_dblwr != NULL);
+ ut_ad(buf_dblwr->s_reserved == 0);
+ ut_ad(buf_dblwr->b_reserved == 0);
+
+ os_event_free(buf_dblwr->b_event);
+ os_event_free(buf_dblwr->s_event);
+ ut_free(buf_dblwr->write_buf_unaligned);
+ buf_dblwr->write_buf_unaligned = NULL;
+
+ mem_free(buf_dblwr->buf_block_arr);
+ buf_dblwr->buf_block_arr = NULL;
+
+ mem_free(buf_dblwr->in_use);
+ buf_dblwr->in_use = NULL;
+
+ mutex_free(&buf_dblwr->mutex);
+ mem_free(buf_dblwr);
+ buf_dblwr = NULL;
+}
+
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: buffer block descriptor */
+ buf_flush_t flush_type)/*!< in: flush type */
+{
+ if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ return;
+ }
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_LRU:
+ mutex_enter(&buf_dblwr->mutex);
+
+ ut_ad(buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->b_reserved > 0);
+ ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free);
+
+ buf_dblwr->b_reserved--;
+
+ if (buf_dblwr->b_reserved == 0) {
+ mutex_exit(&buf_dblwr->mutex);
+ /* This will finish the batch. Sync data files
+ to the disk. */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* We can now reuse the doublewrite memory buffer: */
+ buf_dblwr->first_free = 0;
+ buf_dblwr->batch_running = false;
+ os_event_set(buf_dblwr->b_event);
+ }
+
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ {
+ const ulint size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ulint i;
+ mutex_enter(&buf_dblwr->mutex);
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+ if (buf_dblwr->buf_block_arr[i] == bpage) {
+ buf_dblwr->s_reserved--;
+ buf_dblwr->buf_block_arr[i] = NULL;
+ buf_dblwr->in_use[i] = false;
+ break;
+ }
+ }
+
+ /* The block we are looking for must exist as a
+ reserved block. */
+ ut_a(i < size);
+ }
+ os_event_set(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ break;
+ case BUF_FLUSH_N_TYPES:
+ ut_error;
+ }
+}
+
+/********************************************************************//**
+Check the LSN values on the page. */
+static
+void
+buf_dblwr_check_page_lsn(
+/*=====================*/
+ const page_t* page) /*!< in: page to check */
+{
+ if (memcmp(page + (FIL_PAGE_LSN + 4),
+ page + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ 4)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The low 4 bytes of LSN fields do not match "
+ "(" ULINTPF " != " ULINTPF ")!"
+ " Noticed in the buffer pool.\n",
+ mach_read_from_4(
+ page + FIL_PAGE_LSN + 4),
+ mach_read_from_4(
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+ }
+}
+
+/********************************************************************//**
+Asserts when a corrupt block is found while writing out data to
+disk. */
+static
+void
+buf_dblwr_assert_on_corrupt_block(
+/*==============================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ buf_page_print(block->frame, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Apparent corruption of an"
+ " index page n:o %lu in space %lu\n"
+ "InnoDB: to be written to data file."
+ " We intentionally crash server\n"
+ "InnoDB: to prevent corrupt data"
+ " from ending up in data\n"
+ "InnoDB: files.\n",
+ (ulong) buf_block_get_page_no(block),
+ (ulong) buf_block_get_space(block));
+
+ ut_error;
+}
+
+/********************************************************************//**
+Check the LSN values on the page with which this block is associated.
+Also validate the page if the option is set. */
+static
+void
+buf_dblwr_check_block(
+/*==================*/
+ const buf_block_t* block) /*!< in: block to check */
+{
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed pages exists. */
+ return;
+ }
+
+ buf_dblwr_check_page_lsn(block->frame);
+
+ if (!block->check_index_page_at_flush) {
+ return;
+ }
+
+ if (page_is_comp(block->frame)) {
+ if (!page_simple_validate_new(block->frame)) {
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+ } else if (!page_simple_validate_old(block->frame)) {
+
+ buf_dblwr_assert_on_corrupt_block(block);
+ }
+}
+
+/********************************************************************//**
+Writes a page that has already been written to the doublewrite buffer
+to the datafile. It is the job of the caller to sync the datafile. */
+static
+void
+buf_dblwr_write_block_to_datafile(
+/*==============================*/
+ const buf_page_t* bpage, /*!< in: page to write */
+ bool sync) /*!< in: true if sync IO
+ is requested */
+{
+ ut_a(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ const ulint flags = sync
+ ? OS_FILE_WRITE
+ : OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER;
+
+ if (bpage->zip.data) {
+ fil_io(flags, sync, buf_page_get_space(bpage),
+ buf_page_get_zip_size(bpage),
+ buf_page_get_page_no(bpage), 0,
+ buf_page_get_zip_size(bpage),
+ (void*) bpage->zip.data,
+ (void*) bpage);
+
+ return;
+ }
+
+
+ const buf_block_t* block = (buf_block_t*) bpage;
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ buf_dblwr_check_page_lsn(block->frame);
+
+ fil_io(flags, sync, buf_block_get_space(block), 0,
+ buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+ (void*) block->frame, (void*) block);
+
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void)
+/*=================================*/
+{
+ byte* write_buf;
+ ulint first_free;
+ ulint len;
+
+ if (!srv_use_doublewrite_buf || buf_dblwr == NULL) {
+ /* Sync the writes to the disk. */
+ buf_dblwr_sync_datafiles();
+ return;
+ }
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. */
+
+ if (buf_dblwr->first_free == 0) {
+
+ mutex_exit(&buf_dblwr->mutex);
+
+ return;
+ }
+
+ if (buf_dblwr->batch_running) {
+ /* Another thread is running the batch right now. Wait
+ for it to finish. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ ut_a(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+
+ /* Disallow anyone else to post to doublewrite buffer or to
+ start another batch of flushing. */
+ buf_dblwr->batch_running = true;
+ first_free = buf_dblwr->first_free;
+
+ /* Now safe to release the mutex. Note that though no other
+ thread is allowed to post to the doublewrite batch flushing
+ but any threads working on single page flushes are allowed
+ to proceed. */
+ mutex_exit(&buf_dblwr->mutex);
+
+ write_buf = buf_dblwr->write_buf;
+
+ for (ulint len2 = 0, i = 0;
+ i < buf_dblwr->first_free;
+ len2 += UNIV_PAGE_SIZE, i++) {
+
+ const buf_block_t* block;
+
+ block = (buf_block_t*) buf_dblwr->buf_block_arr[i];
+
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed
+ pages exists. */
+ continue;
+ }
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block(block);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ buf_dblwr_check_page_lsn(write_buf + len2);
+ }
+
+ /* Write out the first block of the doublewrite buffer */
+ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+ buf_dblwr->first_free) * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block1, 0, len,
+ (void*) write_buf, NULL);
+
+ if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ /* No unwritten pages in the second block. */
+ goto flush;
+ }
+
+ /* Write out the second block of the doublewrite buffer. */
+ len = (buf_dblwr->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ * UNIV_PAGE_SIZE;
+
+ write_buf = buf_dblwr->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ buf_dblwr->block2, 0, len,
+ (void*) write_buf, NULL);
+
+flush:
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.add(buf_dblwr->first_free);
+ srv_stats.dblwr_writes.inc();
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. */
+
+ /* Up to this point first_free and buf_dblwr->first_free are
+ same because we have set the buf_dblwr->batch_running flag
+ disallowing any other thread to post any request but we
+ can't safely access buf_dblwr->first_free in the loop below.
+ This is so because it is possible that after we are done with
+ the last iteration and before we terminate the loop, the batch
+ gets finished in the IO helper thread and another thread posts
+ a new batch setting buf_dblwr->first_free to a higher value.
+ If this happens and we are using buf_dblwr->first_free in the
+ loop termination condition then we'll end up dispatching
+ the same block twice from two different threads. */
+ ut_ad(first_free == buf_dblwr->first_free);
+ for (ulint i = 0; i < first_free; i++) {
+ buf_dblwr_write_block_to_datafile(
+ buf_dblwr->buf_block_arr[i], false);
+ }
+
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system. We don't flush the files
+ at this point. We leave it to the IO helper thread to flush
+ datafiles when the whole batch has been processed. */
+ os_aio_simulated_wake_handler_threads();
+}
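+
+/* A worked instance of the two-block split above (illustrative values
+only, with TRX_SYS_DOUBLEWRITE_BLOCK_SIZE = 64): if first_free = 90,
+the first fil_io() writes the first 64 pages of write_buf to block1 and
+the second writes the remaining 26 pages to block2; if first_free = 50,
+only the first block is written and control jumps straight to the
+flush: label. */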
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+ buf_page_t* bpage) /*!< in: buffer block to write */
+{
+ ulint zip_size;
+
+ ut_a(buf_page_in_file(bpage));
+
+try_again:
+ mutex_enter(&buf_dblwr->mutex);
+
+ ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->batch_running) {
+
+ /* This is not nearly as bad as it looks. There is only
+ one page_cleaner thread, which does background flushing
+ in batches, so it is unlikely to be a contention
+ point. The only exception is when a user thread is
+ forced to do a flush batch because of a sync
+ checkpoint. */
+ ib_int64_t sig_count = os_event_reset(buf_dblwr->b_event);
+ mutex_exit(&buf_dblwr->mutex);
+
+ os_event_wait_low(buf_dblwr->b_event, sig_count);
+ goto try_again;
+ }
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ goto try_again;
+ }
+
+ zip_size = buf_page_get_zip_size(bpage);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+ /* Copy the compressed page and clear the rest. */
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ memcpy(buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * buf_dblwr->first_free,
+ ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+ }
+
+ buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage;
+
+ buf_dblwr->first_free++;
+ buf_dblwr->b_reserved++;
+
+ ut_ad(!buf_dblwr->batch_running);
+ ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved);
+ ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size);
+
+ if (buf_dblwr->first_free == srv_doublewrite_batch_size) {
+ mutex_exit(&(buf_dblwr->mutex));
+
+ buf_dblwr_flush_buffered_writes();
+
+ return;
+ }
+
+ mutex_exit(&(buf_dblwr->mutex));
+}
+
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, syncs it, then writes
+the page to the datafile and syncs the datafile. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ bool sync) /*!< in: true if sync IO requested */
+{
+ ulint n_slots;
+ ulint size;
+ ulint zip_size;
+ ulint offset;
+ ulint i;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_a(srv_use_doublewrite_buf);
+ ut_a(buf_dblwr != NULL);
+
+ /* The slots available for single page flushes run from
+ srv_doublewrite_batch_size to the end of the
+ buffer. */
+ size = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ ut_a(size > srv_doublewrite_batch_size);
+ n_slots = size - srv_doublewrite_batch_size;
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+
+ /* Check that the actual page in the buffer pool is
+ not corrupt and the LSN values are sane. */
+ buf_dblwr_check_block((buf_block_t*) bpage);
+
+ /* Check that the page as written to the doublewrite
+ buffer has sane LSN values. */
+ if (!bpage->zip.data) {
+ buf_dblwr_check_page_lsn(
+ ((buf_block_t*) bpage)->frame);
+ }
+ }
+
+retry:
+ mutex_enter(&buf_dblwr->mutex);
+ if (buf_dblwr->s_reserved == n_slots) {
+
+ /* All slots are reserved. */
+ ib_int64_t sig_count =
+ os_event_reset(buf_dblwr->s_event);
+ mutex_exit(&buf_dblwr->mutex);
+ os_event_wait_low(buf_dblwr->s_event, sig_count);
+
+ goto retry;
+ }
+
+ for (i = srv_doublewrite_batch_size; i < size; ++i) {
+
+ if (!buf_dblwr->in_use[i]) {
+ break;
+ }
+ }
+
+ /* We are guaranteed to find a slot. */
+ ut_a(i < size);
+ buf_dblwr->in_use[i] = true;
+ buf_dblwr->s_reserved++;
+ buf_dblwr->buf_block_arr[i] = bpage;
+
+ /* increment the doublewrite flushed pages counter */
+ srv_stats.dblwr_pages_written.inc();
+ srv_stats.dblwr_writes.inc();
+
+ mutex_exit(&buf_dblwr->mutex);
+
+ /* Let's see whether we are going to write to the first or the
+ second block of the doublewrite buffer. */
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ offset = buf_dblwr->block1 + i;
+ } else {
+ offset = buf_dblwr->block2 + i
+ - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ /* We deal with compressed and uncompressed pages a little
+ differently here. In case of uncompressed pages we can
+ directly write the block to the allocated slot in the
+ doublewrite buffer in the system tablespace and then after
+ syncing the system table space we can proceed to write the page
+ in the datafile.
+ In the case of a compressed page we first do a memcpy of the block
+ to the in-memory buffer of doublewrite before proceeding to
+ write it. This is so because we want to pad the remaining
+ bytes in the doublewrite page with zeros. */
+
+ zip_size = buf_page_get_zip_size(bpage);
+ if (zip_size) {
+ memcpy(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i,
+ bpage->zip.data, zip_size);
+ memset(buf_dblwr->write_buf + UNIV_PAGE_SIZE * i
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) (buf_dblwr->write_buf
+ + UNIV_PAGE_SIZE * i), NULL);
+ } else {
+ /* It is a regular page. Write it directly to the
+ doublewrite buffer */
+ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
+ offset, 0, UNIV_PAGE_SIZE,
+ (void*) ((buf_block_t*) bpage)->frame,
+ NULL);
+ }
+
+ /* Now flush the doublewrite buffer data to disk */
+ fil_flush(TRX_SYS_SPACE);
+
+ /* We know that the write has been flushed to disk now
+ and during recovery we will find it in the doublewrite buffer
+ blocks. Next do the write to the intended position. */
+ buf_dblwr_write_block_to_datafile(bpage, sync);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000000..467f817a2d1
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,621 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+
+#include <stdarg.h> /* va_* */
+#include <string.h> /* strerror() */
+
+#include "buf0buf.h" /* buf_pool_mutex_enter(), srv_buf_pool_instances */
+#include "buf0dump.h"
+#include "db0err.h"
+#include "dict0dict.h" /* dict_operation_lock */
+#include "os0file.h" /* OS_FILE_MAX_PATH */
+#include "os0sync.h" /* os_event* */
+#include "os0thread.h" /* os_thread_* */
+#include "srv0srv.h" /* srv_fast_shutdown, srv_buf_dump* */
+#include "srv0start.h" /* srv_shutdown_state */
+#include "sync0rw.h" /* rw_lock_s_lock() */
+#include "ut0byte.h" /* ut_ull_create() */
+#include "ut0sort.h" /* UT_SORT_FUNCTION_BODY */
+
+enum status_severity {
+ STATUS_INFO,
+ STATUS_NOTICE,
+ STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (UNIV_UNLIKELY(srv_shutdown_state \
+ != SRV_SHUTDOWN_NONE))
+
+/* Flags that tell the buffer pool dump/load thread which action it
+should take after being woken up. */
+static ibool buf_dump_should_start = FALSE;
+static ibool buf_load_should_start = FALSE;
+
+static ibool buf_load_abort_flag = FALSE;
+
+/* Used to temporarily store dump info, in order to avoid I/O while
+holding the buffer pool mutex during a dump, and also to sort the
+contents of the dump before reading the pages from disk during a load.
+We store the space id in the high 32 bits and the page no in the low
+32 bits. */
+typedef ib_uint64_t buf_dump_t;
+
+/* Aux macros to create buf_dump_t and to extract space and page from it */
+#define BUF_DUMP_CREATE(space, page) ut_ull_create(space, page)
+#define BUF_DUMP_SPACE(a) ((ulint) ((a) >> 32))
+#define BUF_DUMP_PAGE(a) ((ulint) ((a) & 0xFFFFFFFFUL))
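+/* A worked example of the packing: BUF_DUMP_CREATE(5, 17) yields
+0x0000000500000011; BUF_DUMP_SPACE() recovers 5 from the high word and
+BUF_DUMP_PAGE() recovers 17 from the low word. */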
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because the whole of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start()
+/*============*/
+{
+ buf_dump_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
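+
+/* For example (assuming the standard system variable names), the dump
+and load can be triggered from SQL with:
+SET GLOBAL innodb_buffer_pool_dump_now = ON;
+SET GLOBAL innodb_buffer_pool_load_now = ON; */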
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because the whole of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start()
+/*============*/
+{
+ buf_load_should_start = TRUE;
+ os_event_set(srv_buf_dump_event);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_dump_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according
+ to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ ut_vsnprintf(
+ export_vars.innodb_buffer_pool_dump_status,
+ sizeof(export_vars.innodb_buffer_pool_dump_status),
+ fmt, ap);
+
+ if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n",
+ export_vars.innodb_buffer_pool_dump_status);
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3). The value of this variable can be
+retrieved by:
+SELECT variable_value FROM information_schema.global_status WHERE
+variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS';
+or by:
+SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */
+static __attribute__((nonnull, format(printf, 2, 3)))
+void
+buf_load_status(
+/*============*/
+ enum status_severity severity,/*!< in: status severity */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: extra parameters according to fmt */
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ ut_vsnprintf(
+ export_vars.innodb_buffer_pool_load_status,
+ sizeof(export_vars.innodb_buffer_pool_load_status),
+ fmt, ap);
+
+ if (severity == STATUS_NOTICE || severity == STATUS_ERR) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n",
+ export_vars.innodb_buffer_pool_load_status);
+ }
+
+ va_end(ap);
+}
+
+/*****************************************************************//**
+Perform a buffer pool dump into the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_dump(
+/*=====*/
+ ibool obey_shutdown) /*!< in: quit if we are in a shutting down
+ state */
+{
+#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown)
+
+ char full_filename[OS_FILE_MAX_PATH];
+ char tmp_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ ulint i;
+ int ret;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ ut_snprintf(tmp_filename, sizeof(tmp_filename),
+ "%s.incomplete", full_filename);
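+
+	/* Note: dumping to a '.incomplete' file and rename()-ing it
+	over the final name at the very end (see below) makes the dump
+	atomic: a reader sees either the complete old dump or the
+	complete new one, never a partially written file. */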
+
+ buf_dump_status(STATUS_NOTICE, "Dumping buffer pool(s) to %s",
+ full_filename);
+
+ f = fopen(tmp_filename, "w");
+ if (f == NULL) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot open '%s' for writing: %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ /* walk through each buffer pool */
+ for (i = 0; i < srv_buf_pool_instances && !SHOULD_QUIT(); i++) {
+ buf_pool_t* buf_pool;
+ const buf_page_t* bpage;
+ buf_dump_t* dump;
+ ulint n_pages;
+ ulint j;
+
+ buf_pool = buf_pool_from_array(i);
+
+		/* Obtain the buf_pool mutex before allocating, since
+		UT_LIST_GET_LEN(buf_pool->LRU) could change under us */
+ buf_pool_mutex_enter(buf_pool);
+
+ n_pages = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ /* skip empty buffer pools */
+ if (n_pages == 0) {
+ buf_pool_mutex_exit(buf_pool);
+ continue;
+ }
+
+		dump = static_cast<buf_dump_t*>(
+			ut_malloc(n_pages * sizeof(*dump)));
+
+ if (dump == NULL) {
+ buf_pool_mutex_exit(buf_pool);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (n_pages * sizeof(*dump)),
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), j = 0;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), j++) {
+
+ ut_a(buf_page_in_file(bpage));
+
+ dump[j] = BUF_DUMP_CREATE(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage));
+ }
+
+ ut_a(j == n_pages);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) {
+ ret = fprintf(f, ULINTPF "," ULINTPF "\n",
+ BUF_DUMP_SPACE(dump[j]),
+ BUF_DUMP_PAGE(dump[j]));
+ if (ret < 0) {
+ ut_free(dump);
+ fclose(f);
+ buf_dump_status(STATUS_ERR,
+ "Cannot write to '%s': %s",
+ tmp_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+
+ if (j % 128 == 0) {
+ buf_dump_status(
+ STATUS_INFO,
+ "Dumping buffer pool "
+ ULINTPF "/" ULINTPF ", "
+ "page " ULINTPF "/" ULINTPF,
+ i + 1, srv_buf_pool_instances,
+ j + 1, n_pages);
+ }
+ }
+
+ ut_free(dump);
+ }
+
+ ret = fclose(f);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot close '%s': %s",
+ tmp_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+ ret = unlink(full_filename);
+ if (ret != 0 && errno != ENOENT) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot delete '%s': %s",
+ full_filename, strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ ret = rename(tmp_filename, full_filename);
+ if (ret != 0) {
+ buf_dump_status(STATUS_ERR,
+ "Cannot rename '%s' to '%s': %s",
+ tmp_filename, full_filename,
+ strerror(errno));
+ /* leave tmp_filename to exist */
+ return;
+ }
+ /* else */
+
+ /* success */
+
+ ut_sprintf_timestamp(now);
+
+ buf_dump_status(STATUS_NOTICE,
+ "Buffer pool(s) dump completed at %s", now);
+}
+
+/*****************************************************************//**
+Compare two buffer pool dump entries, used to sort the dump on
+space_no,page_no before loading in order to increase the chance for
+sequential IO.
+@return -1/0/1 if entry 1 is smaller/equal/bigger than entry 2 */
+static
+lint
+buf_dump_cmp(
+/*=========*/
+ const buf_dump_t d1, /*!< in: buffer pool dump entry 1 */
+ const buf_dump_t d2) /*!< in: buffer pool dump entry 2 */
+{
+ if (d1 < d2) {
+ return(-1);
+ } else if (d1 == d2) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
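+
+/* Because the space id occupies the high 32 bits of the packed value,
+comparing the 64-bit entries directly sorts the dump by (space, page):
+e.g. entry (space 5, page 100) sorts before (space 6, page 2). */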
+
+/*****************************************************************//**
+Sort a buffer pool dump on space_no, page_no. */
+static
+void
+buf_dump_sort(
+/*==========*/
+ buf_dump_t* dump, /*!< in/out: buffer pool dump to sort */
+ buf_dump_t* tmp, /*!< in/out: temp storage */
+ ulint low, /*!< in: lowest index (inclusive) */
+ ulint high) /*!< in: highest index (non-inclusive) */
+{
+ UT_SORT_FUNCTION_BODY(buf_dump_sort, dump, tmp, low, high,
+ buf_dump_cmp);
+}
+
+/*****************************************************************//**
+Perform a buffer pool load from the file specified by
+innodb_buffer_pool_filename. If any errors occur then the value of
+innodb_buffer_pool_load_status will be set accordingly, see buf_load_status().
+The dump filename can be specified by (relative to srv_data_home):
+SET GLOBAL innodb_buffer_pool_filename='filename'; */
+static
+void
+buf_load()
+/*======*/
+{
+ char full_filename[OS_FILE_MAX_PATH];
+ char now[32];
+ FILE* f;
+ buf_dump_t* dump;
+ buf_dump_t* dump_tmp;
+ ulint dump_n;
+ ulint total_buffer_pools_pages;
+ ulint i;
+ ulint space_id;
+ ulint page_no;
+ int fscanf_ret;
+
+ /* Ignore any leftovers from before */
+ buf_load_abort_flag = FALSE;
+
+ ut_snprintf(full_filename, sizeof(full_filename),
+ "%s%c%s", srv_data_home, SRV_PATH_SEPARATOR,
+ srv_buf_dump_filename);
+
+ buf_load_status(STATUS_NOTICE,
+ "Loading buffer pool(s) from %s", full_filename);
+
+ f = fopen(full_filename, "r");
+ if (f == NULL) {
+ buf_load_status(STATUS_ERR,
+ "Cannot open '%s' for reading: %s",
+ full_filename, strerror(errno));
+ return;
+ }
+ /* else */
+
+	/* First scan the file to estimate how many entries it contains.
+	This file is tiny (approx 500KB per 1GB of buffer pool), so
+	reading it twice is fine. */
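+	/* Back-of-the-envelope estimate, assuming 16K pages: a 1GB
+	buffer pool holds 65,536 pages, and at roughly 8 bytes per
+	"space,page\n" line the dump is about 0.5MB. */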
+ dump_n = 0;
+ while (fscanf(f, ULINTPF "," ULINTPF, &space_id, &page_no) == 2
+ && !SHUTTING_DOWN()) {
+ dump_n++;
+ }
+
+ if (!SHUTTING_DOWN() && !feof(f)) {
+ /* fscanf() returned != 2 */
+ const char* what;
+ if (ferror(f)) {
+ what = "reading";
+ } else {
+ what = "parsing";
+ }
+ fclose(f);
+ buf_load_status(STATUS_ERR, "Error %s '%s', "
+ "unable to load buffer pool (stage 1)",
+ what, full_filename);
+ return;
+ }
+
+	/* If the dump is larger than the buffer pool(s), then we ignore
+	the extra trailing entries. This can happen if a dump is made,
+	the buffer pool is then shrunk, and a load is attempted
+	afterwards. */
+ total_buffer_pools_pages = buf_pool_get_n_pages()
+ * srv_buf_pool_instances;
+ if (dump_n > total_buffer_pools_pages) {
+ dump_n = total_buffer_pools_pages;
+ }
+
+ dump = static_cast<buf_dump_t*>(ut_malloc(dump_n * sizeof(*dump)));
+
+ if (dump == NULL) {
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump)),
+ strerror(errno));
+ return;
+ }
+
+ dump_tmp = static_cast<buf_dump_t*>(
+ ut_malloc(dump_n * sizeof(*dump_tmp)));
+
+ if (dump_tmp == NULL) {
+ ut_free(dump);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Cannot allocate " ULINTPF " bytes: %s",
+ (ulint) (dump_n * sizeof(*dump_tmp)),
+ strerror(errno));
+ return;
+ }
+
+ rewind(f);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+ fscanf_ret = fscanf(f, ULINTPF "," ULINTPF,
+ &space_id, &page_no);
+
+ if (fscanf_ret != 2) {
+ if (feof(f)) {
+ break;
+ }
+ /* else */
+
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s', unable "
+ "to load buffer pool (stage 2)",
+ full_filename);
+ return;
+ }
+
+ if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) {
+ ut_free(dump);
+ ut_free(dump_tmp);
+ fclose(f);
+ buf_load_status(STATUS_ERR,
+ "Error parsing '%s': bogus "
+ "space,page " ULINTPF "," ULINTPF
+ " at line " ULINTPF ", "
+ "unable to load buffer pool",
+ full_filename,
+ space_id, page_no,
+ i);
+ return;
+ }
+
+ dump[i] = BUF_DUMP_CREATE(space_id, page_no);
+ }
+
+	/* Set dump_n to the actual number of initialized elements;
+	i can be smaller than dump_n here if the file got truncated after
+	we read it the first time. */
+ dump_n = i;
+
+ fclose(f);
+
+ if (dump_n == 0) {
+ ut_free(dump);
+ ut_sprintf_timestamp(now);
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s "
+ "(%s was empty)", now, full_filename);
+ return;
+ }
+
+ if (!SHUTTING_DOWN()) {
+ buf_dump_sort(dump, dump_tmp, 0, dump_n);
+ }
+
+ ut_free(dump_tmp);
+
+ for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) {
+
+ buf_read_page_async(BUF_DUMP_SPACE(dump[i]),
+ BUF_DUMP_PAGE(dump[i]));
+
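+		/* The page reads are posted asynchronously; waking the
+		handler threads only once per 64 posted reads lets the
+		simulated AIO code service them in batches rather than
+		one by one. */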
+ if (i % 64 == 63) {
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ if (i % 128 == 0) {
+ buf_load_status(STATUS_INFO,
+ "Loaded " ULINTPF "/" ULINTPF " pages",
+ i + 1, dump_n);
+ }
+
+ if (buf_load_abort_flag) {
+ buf_load_abort_flag = FALSE;
+ ut_free(dump);
+ buf_load_status(
+ STATUS_NOTICE,
+ "Buffer pool(s) load aborted on request");
+ return;
+ }
+ }
+
+ ut_free(dump);
+
+ ut_sprintf_timestamp(now);
+
+ buf_load_status(STATUS_NOTICE,
+ "Buffer pool(s) load completed at %s", now);
+}
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because the whole of MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort()
+/*============*/
+{
+ buf_load_abort_flag = TRUE;
+}
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and, when woken up, performs either a dump or a load, then
+sleeps again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+ srv_buf_dump_thread_active = TRUE;
+
+ buf_dump_status(STATUS_INFO, "not started");
+ buf_load_status(STATUS_INFO, "not started");
+
+ if (srv_buffer_pool_load_at_startup) {
+ buf_load();
+ }
+
+ while (!SHUTTING_DOWN()) {
+
+ os_event_wait(srv_buf_dump_event);
+
+ if (buf_dump_should_start) {
+ buf_dump_should_start = FALSE;
+ buf_dump(TRUE /* quit on shutdown */);
+ }
+
+ if (buf_load_should_start) {
+ buf_load_should_start = FALSE;
+ buf_load();
+ }
+
+ os_event_reset(srv_buf_dump_event);
+ }
+
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+		buf_dump(FALSE /* ignore the shutdown flag,
+			keep going even if we are in a shutdown state */);
+ }
+
+ srv_buf_dump_thread_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000000..f3437c202f6
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2674 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+#include "srv0mon.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/** Number of pages flushed through non flush_list flushes. */
+static ulint buf_lru_flush_page_count = 0;
+
+/** Flag indicating if the page_cleaner is in active state. This flag
+is set to TRUE by the page_cleaner thread when it is spawned and is set
+back to FALSE at shutdown by the page_cleaner as well. Therefore there
+is no need to protect it by a mutex. It is only ever read by the thread
+doing the shutdown. */
+UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE;
+
+/** An LRU flush batch is further divided into chunks of this size, to
+reduce the wait time for the threads waiting for a clean block */
+#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** If the LRU list of a buf_pool is shorter than this, LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on the free list. If the LRU list is very small, we can end
+up thrashing. */
+#define BUF_LRU_MIN_LEN 256
+
+/* @} */
+
+/******************************************************************//**
+Increases the flush_list size in bytes, by zip_size for a compressed
+page or by UNIV_PAGE_SIZE for an uncompressed page. Inline function. */
+static inline
+void
+incr_flush_list_size_in_bytes(
+/*==========================*/
+ buf_block_t* block, /*!< in: control block */
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+ ulint zip_size = page_zip_get_size(&block->page.zip);
+ buf_pool->stat.flush_list_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
+ ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
+
+/******************************************************************//**
+Validates the flush list some of the time.
+@return TRUE if ok or the check was skipped */
+static
+ibool
+buf_flush_validate_skip(
+/*====================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return(TRUE);
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ return(buf_flush_validate_low(buf_pool));
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/*******************************************************************//**
+Sets hazard pointer during flush_list iteration. */
+UNIV_INLINE
+void
+buf_flush_set_hp(
+/*=============*/
+ buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */
+ const buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+ ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL);
+ ut_ad(!bpage || buf_page_in_file(bpage));
+ ut_ad(!bpage || bpage->in_flush_list);
+ ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool);
+
+ buf_pool->flush_list_hp = bpage;
+}
+
+/*******************************************************************//**
+Checks if the given block is a hazard pointer
+@return true if bpage is hazard pointer */
+UNIV_INLINE
+bool
+buf_flush_is_hp(
+/*============*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ const buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ return(buf_pool->flush_list_hp == bpage);
+}
+
+/*******************************************************************//**
+Whenever we move a block in the flush_list (either to remove it or to
+relocate it) we check the hazard pointer set by some other thread
+doing the flush list scan. If the hazard pointer is the same as the
+one we are about to move, we set it to NULL to force a rescan
+in the thread doing the batch. */
+UNIV_INLINE
+void
+buf_flush_update_hp(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage) /*!< in: buffer control block */
+{
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ if (buf_flush_is_hp(buf_pool, bpage)) {
+ buf_flush_set_hp(buf_pool, NULL);
+ MONITOR_INC(MONITOR_FLUSH_HP_RESCAN);
+ }
+}
+
+/******************************************************************//**
+Inserts a block into the flush_rbt and returns a pointer to its
+predecessor, or NULL if there is no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+ buf_page_t* prev = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ buf_page_t** value;
+ value = rbt_value(buf_page_t*, p_node);
+ prev = *value;
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/*********************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+#ifdef UNIV_DEBUG
+ ibool ret = FALSE;
+#endif /* UNIV_DEBUG */
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+#ifdef UNIV_DEBUG
+ ret =
+#endif /* UNIV_DEBUG */
+ rbt_delete(buf_pool->flush_rbt, &bpage);
+
+ ut_ad(ret);
+}
+
+/*****************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1 = *(const buf_page_t**) p1;
+ const buf_page_t* b2 = *(const buf_page_t**) p2;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification > b1->oldest_modification) {
+ return(1);
+ } else if (b2->oldest_modification < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
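+
+/* E.g. two blocks with equal oldest_modification but spaces 3 and 7
+compare on the space id alone; only if both oldest_modification and
+space are equal does the page offset decide, which makes the
+<oldest_modification, space, offset> key unique per block. */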
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(
+ sizeof(buf_page_t*), buf_flush_block_cmp);
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ buf_flush_list_mutex_exit(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+ || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+ <= lsn));
+
+	/* If we are in recovery, then we need to update the flush
+	red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_list_mutex_exit(buf_pool);
+ buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
+ return;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(!block->page.in_flush_list);
+
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+ incr_flush_list_size_in_bytes(block, buf_pool);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn) /*!< in: oldest modification */
+{
+ buf_page_t* prev_b;
+ buf_page_t* b;
+
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(log_flush_order_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* The field in_LRU_list is protected by buf_pool->mutex, which
+ we are not holding. However, while a block is in the flush
+ list, it is dirty and cannot be discarded, not from the
+ page_hash or from the LRU list. At most, the uncompressed
+ page frame of a compressed block may be discarded or created
+ (copying the block->page to or from a buf_page_t that is
+ dynamically allocated from buf_buddy_alloc()). Because those
+ transitions hold block->mutex and the flush list mutex (via
+ buf_flush_relocate_on_flush_list()), there is no possibility
+ of a race condition in the assertions below. */
+ ut_ad(block->page.in_LRU_list);
+ ut_ad(block->page.in_page_hash);
+ /* buf_buddy_block_register() will take a block in the
+ BUF_BLOCK_MEMORY state, not a file page. */
+ ut_ad(!block->page.in_zip_hash);
+
+ ut_ad(!block->page.in_flush_list);
+ ut_d(block->page.in_flush_list = TRUE);
+ block->page.oldest_modification = lsn;
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ prev_b = NULL;
+
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(list, b);
+ }
+ }
+
+ if (prev_b == NULL) {
+ UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
+ } else {
+ UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
+ prev_b, &block->page);
+ }
+
+ incr_flush_list_size_in_bytes(block, buf_pool);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage) /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif /* UNIV_DEBUG */
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(bpage->in_LRU_list);
+
+ if (buf_page_in_file(bpage)) {
+
+ return(bpage->oldest_modification == 0
+ && bpage->buf_fix_count == 0
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: buffer block state %lu"
+ " in the LRU list!\n",
+ (ulong) buf_page_get_state(bpage));
+ ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+ putc('\n', stderr);
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Returns true if the block is modified and ready for flushing.
+@return true if can flush immediately */
+UNIV_INTERN
+bool
+buf_flush_ready_for_flush(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) */
+ buf_flush_t flush_type)/*!< in: type of flush */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif /* UNIV_DEBUG */
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+
+ if (bpage->oldest_modification == 0
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ return(false);
+ }
+
+ ut_ad(bpage->in_flush_list);
+
+ switch (flush_type) {
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ return(true);
+
+ case BUF_FLUSH_N_TYPES:
+ break;
+ }
+
+ ut_error;
+ return(false);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ulint zip_size;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(bpage->in_flush_list);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ /* Clean compressed pages should not be on the flush list */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ return;
+ case BUF_BLOCK_ZIP_DIRTY:
+ buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+ break;
+ }
+
+ /* If the flush_rbt is active then delete from there as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ zip_size = page_zip_get_size(&bpage->zip);
+ buf_pool->stat.flush_list_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+ bpage->oldest_modification = 0;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_skip(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_update_hp(buf_pool, bpage);
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale. We need to
+use the current list node (bpage) to do the list manipulation because
+the list pointers could have changed between the time that we copied
+the contents of bpage to the dpage and the flush list manipulation
+below. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ /* Must reside in the same buffer pool. */
+ ut_ad(buf_pool == buf_pool_from_bpage(dpage));
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* FIXME: At this point we have both buf_pool and flush_list
+ mutexes. Theoretically removal of a block from flush list is
+ only covered by flush_list mutex but currently we do
+ have buf_pool mutex in buf_flush_remove() therefore this block
+ is guaranteed to be in the flush list. We need to check if
+ this will work without the assumption of block removing code
+ having the buf_pool mutex. */
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+ because we assert on in_flush_list in comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low(buf_pool));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_flush_update_hp(buf_pool, bpage);
+ buf_flush_list_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_flush_t flush_type;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(bpage);
+
+ buf_flush_remove(bpage);
+
+ flush_type = buf_page_get_flush_type(bpage);
+ buf_pool->n_flush[flush_type]--;
+
+ /* fprintf(stderr, "n pending flush %lu\n",
+ buf_pool->n_flush[flush_type]); */
+
+ if (buf_pool->n_flush[flush_type] == 0
+ && buf_pool->init_flush[flush_type] == FALSE) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_dblwr_update(bpage, flush_type);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Calculate the checksum of a page from compressed table and update the page. */
+UNIV_INTERN
+void
+buf_flush_update_zip_checksum(
+/*==========================*/
+ buf_frame_t* page, /*!< in/out: Page to update */
+ ulint zip_size, /*!< in: Compressed page size */
+ lsn_t lsn) /*!< in: Lsn to stamp on the page */
+{
+ ut_a(zip_size > 0);
+
+ ib_uint32_t checksum = static_cast<ib_uint32_t>(
+ page_zip_calc_checksum(
+ page, zip_size,
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm)));
+
+ mach_write_to_8(page + FIL_PAGE_LSN, lsn);
+ memset(page + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+}
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ lsn_t newest_lsn) /*!< in: newest modification lsn
+ to the page */
+{
+ ib_uint32_t checksum = 0 /* silence bogus gcc warning */;
+
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip;
+ ulint zip_size;
+
+ page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ zip_size = page_zip_get_size(page_zip);
+
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, zip_size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+
+ buf_flush_update_zip_checksum(
+ page_zip->data, zip_size, newest_lsn);
+
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page to be written"
+ " seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ /* Write the newest modification lsn to the page header and trailer */
+ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
+
+ mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ newest_lsn);
+
+ /* Store the new formula checksum */
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ checksum = buf_calc_page_crc32(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ checksum = (ib_uint32_t) buf_calc_page_new_checksum(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+ checksum = (ib_uint32_t) buf_calc_page_old_checksum(page);
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ checksum = BUF_NO_CHECKSUM_MAGIC;
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
+ break;
+ /* no default so the compiler will emit a warning if new enum
+ is added and not handled here */
+ }
+
+ /* With the InnoDB checksum, we overwrite the first 4 bytes of
+ the end lsn field to store the old formula checksum. Since it
+ depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
+ be calculated after storing the new formula checksum.
+
+ In other cases we write the same value to both fields.
+ If CRC32 is used then it is faster to use that checksum
+ (calculated above) instead of calculating another one.
+ We can afford to store something other than
+ buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in
+ this field because the file will not be readable by old
+ versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */
+
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ checksum);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_dblwr_flush_buffered_writes after we have posted a batch of
+writes! */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ buf_flush_t flush_type, /*!< in: type of flush */
+ bool sync) /*!< in: true if sync IO request */
+{
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ page_t* frame = NULL;
+
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+#endif
+
+#ifdef UNIV_LOG_DEBUG
+ static ibool univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+ ut_ad(buf_page_in_file(bpage));
+
+ /* We are not holding buf_pool->mutex or block_mutex here.
+ Nevertheless, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it cannot be
+ relocated in the buffer pool or removed from flush_list or
+ LRU_list. */
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+ ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
+ ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+ ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+ if (!univ_log_debug_warned) {
+ univ_log_debug_warned = TRUE;
+ fputs("Warning: cannot force log to disk if"
+ " UNIV_LOG_DEBUG is defined!\n"
+ "Crash recovery will not work!\n",
+ stderr);
+ }
+#else
+ /* Force the log to the disk before writing the modified block */
+ log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ frame = bpage->zip.data;
+
+ ut_a(page_zip_verify_checksum(frame, zip_size));
+
+ mach_write_to_8(frame + FIL_PAGE_LSN,
+ bpage->newest_modification);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = bpage->zip.data;
+ if (!frame) {
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+ bpage->zip.data
+ ? &bpage->zip : NULL,
+ bpage->newest_modification);
+ break;
+ }
+
+ if (!srv_use_doublewrite_buf || !buf_dblwr) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ sync, buf_page_get_space(bpage), zip_size,
+ buf_page_get_page_no(bpage), 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ frame, bpage);
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+ buf_dblwr_write_single_page(bpage, sync);
+ } else {
+ ut_ad(!sync);
+ buf_dblwr_add_to_batch(bpage);
+ }
+
+ /* When doing single page flushing the IO is done synchronously
+ and we flush the changes to disk only for the tablespace we
+ are working on. */
+ if (sync) {
+ ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
+ fil_flush(buf_page_get_space(bpage));
+ buf_page_io_complete(bpage);
+ }
+
+ /* Increment the counter of I/O operations used
+ for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
+held upon entering this function, and they will be released by this
+function if it returns true.
+@return TRUE if the page was flushed */
+UNIV_INTERN
+bool
+buf_flush_page(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage, /*!< in: buffer control block */
+ buf_flush_t flush_type, /*!< in: type of flush */
+ bool sync) /*!< in: true if sync IO request */
+{
+ ut_ad(flush_type < BUF_FLUSH_N_TYPES);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ ut_ad(mutex_own(block_mutex));
+
+ ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+ bool is_uncompressed;
+
+ is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
+
+ ibool flush;
+ rw_lock_t* rw_lock;
+ bool no_fix_count = bpage->buf_fix_count == 0;
+
+ if (!is_uncompressed) {
+ flush = TRUE;
+ rw_lock = NULL;
+
+ } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)) {
+		/* This is a heuristic, to avoid expensive S-latch
+		attempts. */
+ flush = FALSE;
+ } else {
+
+ rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
+
+ if (flush_type != BUF_FLUSH_LIST) {
+ flush = rw_lock_s_lock_gen_nowait(
+ rw_lock, BUF_IO_WRITE);
+ } else {
+ /* Will S lock later */
+ flush = TRUE;
+ }
+ }
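+
+	/* To summarize the three cases above: a compressed-only page is
+	always flushable here; an uncompressed, buffer-fixed page is
+	skipped unless this is a flush-list flush; otherwise an LRU or
+	single-page flush proceeds only if the non-blocking S-latch
+	attempt succeeds, while a flush-list flush takes the S-latch
+	later, after the mutexes are released. */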
+
+ if (flush) {
+
+ /* We are committed to flushing by the time we get here */
+
+ buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+ buf_page_set_flush_type(bpage, flush_type);
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ os_event_reset(buf_pool->no_flush[flush_type]);
+ }
+
+ ++buf_pool->n_flush[flush_type];
+
+ mutex_exit(block_mutex);
+ buf_pool_mutex_exit(buf_pool);
+
+ if (flush_type == BUF_FLUSH_LIST
+ && is_uncompressed
+ && !rw_lock_s_lock_gen_nowait(rw_lock, BUF_IO_WRITE)) {
+			/* To avoid a possible deadlock involving the
+			doublewrite buffer, we should flush it, because
+			it might be holding another block->lock. */
+ buf_dblwr_flush_buffered_writes();
+
+ rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE);
+ }
+
+ /* Even though bpage is not protected by any mutex at this
+ point, it is safe to access bpage, because it is io_fixed and
+ oldest_modification != 0. Thus, it cannot be relocated in the
+ buffer pool or removed from flush_list or LRU_list. */
+
+ buf_flush_write_block_low(bpage, flush_type, sync);
+ }
+
+ return(flush);
+}
+
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: buf_pool->mutex and block->mutex must be held upon entering this
+function, and they will be released by this function after flushing.
+This is loosely based on buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_block_t* block) /*!< in/out: buffer control block */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(mutex_own(&block->mutex));
+
+ if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
+ return(FALSE);
+ }
+
+ /* The following call will release the buffer pool and
+ block mutex. */
+ return(buf_flush_page(
+ buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true));
+}
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/***********************************************************//**
+Check the page is in buffer pool and can be flushed.
+@return true if the page can be flushed. */
+static
+bool
+buf_flush_check_neighbor(
+/*=====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+{
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ bool ret;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU
+ || flush_type == BUF_FLUSH_LIST);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* We only want to flush pages from this buffer pool. */
+ bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ if (!bpage) {
+
+ buf_pool_mutex_exit(buf_pool);
+ return(false);
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ ret = false;
+ if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ ret = true;
+ }
+ mutex_exit(block_mutex);
+ }
+ buf_pool_mutex_exit(buf_pool);
+
+ return(ret);
+}
+
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+@return number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+ ulint n_flushed, /*!< in: number of pages
+ flushed so far in this batch */
+ ulint n_to_flush) /*!< in: maximum number of pages
+ we are allowed to flush */
+{
+ ulint i;
+ ulint low;
+ ulint high;
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
+ || srv_flush_neighbors == 0) {
+ /* If there is little space or neighbor flushing is
+ not enabled then just flush the victim. */
+ low = offset;
+ high = offset + 1;
+ } else {
+ /* When flushed, dirty blocks are searched in
+ neighborhoods of this size, and flushed along with the
+ original page. */
+
+ ulint buf_flush_area;
+
+ buf_flush_area = ut_min(
+ BUF_READ_AHEAD_AREA(buf_pool),
+ buf_pool->curr_size / 16);
+
+ low = (offset / buf_flush_area) * buf_flush_area;
+ high = (offset / buf_flush_area + 1) * buf_flush_area;
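+
+		/* E.g. assuming buf_flush_area == 64, a victim at page
+		offset 130 gives low == 128 and high == 192, i.e. the
+		64-page region containing the victim. */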
+
+ if (srv_flush_neighbors == 1) {
+			/* adjust 'low' and 'high' to the bounds of the
+			contiguous dirty area */
+ if (offset > low) {
+ for (i = offset - 1;
+ i >= low
+ && buf_flush_check_neighbor(
+ space, i, flush_type);
+ i--) {
+ /* do nothing */
+ }
+ low = i + 1;
+ }
+
+ for (i = offset + 1;
+ i < high
+ && buf_flush_check_neighbor(
+ space, i, flush_type);
+ i++) {
+ /* do nothing */
+ }
+ high = i;
+ }
+ }
+
+ /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+ if (high > fil_space_get_size(space)) {
+ high = fil_space_get_size(space);
+ }
+
+ ulint count = 0;
+
+ for (i = low; i < high; i++) {
+
+ if ((count + n_flushed) >= n_to_flush) {
+
+ /* We have already flushed enough pages and
+ should call it a day. There is, however, one
+ exception. If the page whose neighbors we
+ are flushing has not been flushed yet then
+ we'll try to flush the victim that we
+ selected originally. */
+ if (i <= offset) {
+ i = offset;
+ } else {
+ break;
+ }
+ }
+
+ buf_pool = buf_pool_get(space, i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* We only want to flush pages from this buffer pool. */
+ buf_page_t* bpage = buf_page_hash_get(buf_pool, space, i);
+
+ if (bpage == NULL) {
+
+ buf_pool_mutex_exit(buf_pool);
+ continue;
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ if (flush_type != BUF_FLUSH_LRU
+ || i == offset
+ || buf_page_is_old(bpage)) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)
+ && (i == offset || bpage->buf_fix_count == 0)
+ && buf_flush_page(
+ buf_pool, bpage, flush_type, false)) {
+
+ ++count;
+
+ continue;
+ }
+
+ mutex_exit(block_mutex);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ }
+
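+
+	/* Note: 'count' includes the flushed victim page itself; the
+	monitor update below records count - 1, i.e. only the
+	neighbors. */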
+ if (count > 0) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ (count - 1));
+ }
+
+ return(count);
+}
+
+/********************************************************************//**
+Check if the block is modified and ready for flushing. If the block
+is ready to flush then flush the page and try to flush its neighbors.
+
+@return TRUE if the buf_pool mutex was released during this function.
+This does not guarantee that some pages were written as well.
+The number of pages written is added to *count. */
+static
+ibool
+buf_flush_page_and_try_neighbors(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: buffer control block,
+ must be
+ buf_page_in_file(bpage) */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+ ulint n_to_flush, /*!< in: number of pages to
+ flush */
+ ulint* count) /*!< in/out: number of pages
+ flushed */
+{
+ ibool flushed;
+ ib_mutex_t* block_mutex;
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_flush_ready_for_flush(bpage, flush_type)) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_bpage(bpage);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ /* These fields are protected by both the
+ buffer pool mutex and block mutex. */
+ ulint space = buf_page_get_space(bpage);
+ ulint offset = buf_page_get_page_no(bpage);
+
+ mutex_exit(block_mutex);
+
+ /* Try to flush also all the neighbors */
+ *count += buf_flush_try_neighbors(
+ space, offset, flush_type, *count, n_to_flush);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ flushed = TRUE;
+
+ } else {
+ mutex_exit(block_mutex);
+ flushed = FALSE;
+ }
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(flushed);
+}
+
+/*******************************************************************//**
+This utility moves the uncompressed frames of pages to the free list.
+Note that this function does not actually flush any data to disk. It
+just detaches the uncompressed frames from the compressed pages at the
+tail of the unzip_LRU and puts those freed frames in the free list.
+Note that it is a best effort attempt and it is not guaranteed that
+after a call to this function there will be 'max' blocks in the free
+list.
+@return number of blocks moved to the free list. */
+static
+ulint
+buf_free_from_unzip_LRU_list_batch(
+/*===============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ buf_block_t* block;
+ ulint scanned = 0;
+ ulint count = 0;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+ while (block != NULL && count < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+
+ ++scanned;
+ if (buf_LRU_free_page(&block->page, false)) {
+ /* Block was freed. buf_pool->mutex potentially
+ released and reacquired */
+ ++count;
+ block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+
+ } else {
+
+ block = UT_LIST_GET_PREV(unzip_LRU, block);
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
+ }
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list.
+The calling thread is not allowed to own any latches on pages!
+It attempts to make 'max' blocks available in the free list. Note that
+it is a best effort attempt and it is not guaranteed that after a call
+to this function there will be 'max' blocks in the free list.
+@return number of blocks for which the write request was queued. */
+static
+ulint
+buf_flush_LRU_list_batch(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ buf_page_t* bpage;
+ ulint count = 0;
+ ulint scanned = 0;
+ ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
+ ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ while (bpage != NULL && count < max
+ && free_len < srv_LRU_scan_depth
+ && lru_len > BUF_LRU_MIN_LEN) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+ ibool evict;
+
+ mutex_enter(block_mutex);
+ evict = buf_flush_ready_for_replace(bpage);
+ mutex_exit(block_mutex);
+
+ ++scanned;
+
+ /* If the block is ready to be replaced we try to
+ free it i.e.: put it on the free list.
+ Otherwise we try to flush the block and its
+ neighbors. In this case we'll put it on the
+ free list in the next pass. We do this extra work
+ of putting blocks to the free list instead of
+ just flushing them because after every flush
+ we have to restart the scan from the tail of
+ the LRU list and if we don't clear the tail
+ of the flushed pages then the scan becomes
+ O(n*n). */
+ if (evict) {
+ if (buf_LRU_free_page(bpage, true)) {
+ /* buf_pool->mutex was potentially
+ released and reacquired. */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+ } else {
+ ulint space;
+ ulint offset;
+ buf_page_t* prev_bpage;
+
+ prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ /* Save the previous bpage */
+
+ if (prev_bpage != NULL) {
+ space = prev_bpage->space;
+ offset = prev_bpage->offset;
+ } else {
+ space = ULINT_UNDEFINED;
+ offset = ULINT_UNDEFINED;
+ }
+
+ if (!buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LRU, max, &count)) {
+
+ bpage = prev_bpage;
+ } else {
+ /* buf_pool->mutex was released.
+ reposition the iterator. Note: the
+ prev block could have been repositioned
+ too but that should be rare. */
+
+ if (prev_bpage != NULL) {
+
+ ut_ad(space != ULINT_UNDEFINED);
+ ut_ad(offset != ULINT_UNDEFINED);
+
+ prev_bpage = buf_page_hash_get(
+ buf_pool, space, offset);
+ }
+
+ bpage = prev_bpage;
+ }
+ }
+
+ free_len = UT_LIST_GET_LEN(buf_pool->free);
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+ }
+
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed, we factor in this value. */
+ buf_lru_flush_page_count += count;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (scanned) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ scanned);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+Flush and move pages from LRU or unzip_LRU list to the free list.
+Whether LRU or unzip_LRU is used depends on the state of the system.
+@return number of blocks for which either the write request was queued
+or in case of unzip_LRU the number of blocks actually moved to the
+free list */
+static
+ulint
+buf_do_LRU_batch(
+/*=============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint max) /*!< in: desired number of
+ blocks in the free_list */
+{
+ ulint count = 0;
+
+ if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+ count += buf_free_from_unzip_LRU_list_batch(buf_pool, max);
+ }
+
+ if (max > count) {
+ count += buf_flush_LRU_list_batch(buf_pool, max - count);
+ }
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush_list.
+The calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued */
+static
+ulint
+buf_do_flush_list_batch(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint min_n, /*!< in: wished minimum number
+ of blocks flushed (it is not
+ guaranteed that the actual
+ number is that big, though) */
+ lsn_t lsn_limit) /*!< all blocks whose
+ oldest_modification is smaller
+ than this should be flushed (if
+ their number does not exceed
+ min_n) */
+{
+ ulint count = 0;
+ ulint scanned = 0;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ /* Start from the end of the list looking for a suitable
+ block to be flushed. */
+ buf_flush_list_mutex_enter(buf_pool);
+ ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+ /* In order not to degenerate this scan to O(n*n) we attempt
+ to preserve the pointer to the previous block in the flush
+ list. To do so we declare it a hazard pointer. Any thread
+ working on the flush list must check the hazard pointer, and
+ if it is removing that same block it must reset the pointer. */
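+
+ /* Illustrative scenario (hypothetical threads A and B):
+ A saves prev, declares it the hazard pointer and releases
+ the flush list mutex to flush a page. If B then removes that
+ same block from the flush list, B sees the hazard pointer
+ naming it and resets the pointer to NULL. When A reacquires
+ the mutex, buf_flush_is_hp() fails and A restarts the scan
+ from the tail rather than follow a dangling pointer. */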
+ for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ count < min_n && bpage != NULL && len > 0
+ && bpage->oldest_modification < lsn_limit;
+ ++scanned) {
+
+ buf_page_t* prev;
+
+ ut_a(bpage->oldest_modification > 0);
+ ut_ad(bpage->in_flush_list);
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+ buf_flush_set_hp(buf_pool, prev);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+#ifdef UNIV_DEBUG
+ bool flushed =
+#endif /* UNIV_DEBUG */
+ buf_flush_page_and_try_neighbors(
+ bpage, BUF_FLUSH_LIST, min_n, &count);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad(flushed || buf_flush_is_hp(buf_pool, prev));
+
+ if (!buf_flush_is_hp(buf_pool, prev)) {
+ /* The hazard pointer was reset by some other
+ thread. Restart the scan. */
+ ut_ad(buf_flush_is_hp(buf_pool, NULL));
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ len = UT_LIST_GET_LEN(buf_pool->flush_list);
+ } else {
+ bpage = prev;
+ --len;
+ buf_flush_set_hp(buf_pool, NULL);
+ }
+
+ ut_ad(!bpage || bpage->in_flush_list);
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ scanned);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued */
+static
+ulint
+buf_flush_batch(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+ ulint min_n, /*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit) /*!< in: in the case of BUF_FLUSH_LIST
+ all blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+{
+ ulint count = 0;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((flush_type != BUF_FLUSH_LIST)
+ || sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* Note: The buffer pool mutex is released and reacquired within
+ the flush functions. */
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ count = buf_do_LRU_batch(buf_pool, min_n);
+ break;
+ case BUF_FLUSH_LIST:
+ count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
+ break;
+ default:
+ ut_error;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && count > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(count);
+}
+
+/******************************************************************//**
+Gather the aggregated stats for both flush list and LRU list flushing */
+static
+void
+buf_flush_common(
+/*=============*/
+ buf_flush_t flush_type, /*!< in: type of flush */
+ ulint page_count) /*!< in: number of pages flushed */
+{
+ buf_dblwr_flush_buffered_writes();
+
+ ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && page_count > 0) {
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) page_count);
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_stats.buf_pool_flushed.add(page_count);
+}
+
+/******************************************************************//**
+Start a buffer flush batch for LRU or flush list.
+@return TRUE if the batch was started, FALSE if a batch of the same
+type was already running */
+static
+ibool
+buf_flush_start(
+/*============*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool->n_flush[flush_type] > 0
+ || buf_pool->init_flush[flush_type] == TRUE) {
+
+ /* There is already a flush batch of the same type running */
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(FALSE);
+ }
+
+ buf_pool->init_flush[flush_type] = TRUE;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+End a buffer flush batch for LRU or flush list */
+static
+void
+buf_flush_end(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ buf_pool_mutex_enter(buf_pool);
+
+ buf_pool->init_flush[flush_type] = FALSE;
+
+ buf_pool->try_LRU_scan = TRUE;
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ buf_flush_t type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
+
+ if (buf_pool == NULL) {
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; ++i) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+ } else {
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ os_event_wait(buf_pool->no_flush[type]);
+ thd_wait_end(NULL);
+ }
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list and also
+puts replaceable clean pages from the end of the LRU list to the free
+list.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully. false if another batch
+of same type was already running. */
+static
+bool
+buf_flush_LRU(
+/*==========*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ ulint min_n, /*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ ulint* n_processed) /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
+{
+ ulint page_count;
+
+ if (n_processed) {
+ *n_processed = 0;
+ }
+
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+ return(false);
+ }
+
+ page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+ buf_flush_common(BUF_FLUSH_LRU, page_count);
+
+ if (n_processed) {
+ *n_processed = page_count;
+ }
+
+ return(true);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully for each buffer pool
+instance. false if another batch of same type was already running in
+at least one of the buffer pool instance */
+UNIV_INTERN
+bool
+buf_flush_list(
+/*===========*/
+ ulint min_n, /*!< in: wished minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+ ulint* n_processed) /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
+
+{
+ ulint i;
+ bool success = true;
+
+ if (n_processed) {
+ *n_processed = 0;
+ }
+
+ if (min_n != ULINT_MAX) {
+ /* Ensure that flushing is spread evenly amongst the
+ buffer pool instances. When min_n is ULINT_MAX
+ we need to flush everything up to the lsn limit
+ so no limit here. */
+ min_n = (min_n + srv_buf_pool_instances - 1)
+ / srv_buf_pool_instances;
+ }
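+
+ /* For example (illustrative numbers): min_n = 1000 with
+ srv_buf_pool_instances = 8 becomes (1000 + 7) / 8 = 125
+ pages per instance, i.e. a rounded-up even split. */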
+
+ /* Flush to lsn_limit in all buffer pool instances */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+ ulint page_count = 0;
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
+ /* We have two choices here. If lsn_limit was
+ specified then skipping an instance of buffer
+ pool means we cannot guarantee that all pages
+ up to lsn_limit have been flushed. We can
+ return right now with failure or we can try
+ to flush remaining buffer pools up to the
+ lsn_limit. We attempt to flush other buffer
+ pools based on the assumption that it will
+ help in the retry which will follow the
+ failure. */
+ success = false;
+
+ continue;
+ }
+
+ page_count = buf_flush_batch(
+ buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);
+
+ buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+
+ buf_flush_common(BUF_FLUSH_LIST, page_count);
+
+ if (n_processed) {
+ *n_processed += page_count;
+ }
+
+ if (page_count) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ page_count);
+ }
+ }
+
+ return(success);
+}
+
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from page_hash and LRU list and puts
+it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list i.e.:
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE if success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+{
+ ulint scanned;
+ buf_page_t* bpage;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
+
+ /* The following call will release the buffer pool
+ and block mutex. */
+
+ ibool flushed = buf_flush_page(
+ buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
+
+ if (flushed) {
+ /* buf_flush_page() will release the
+ block mutex */
+ break;
+ }
+ }
+
+ mutex_exit(block_mutex);
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+ scanned);
+
+ if (bpage == NULL) {
+ /* Can't find a single flushable page. */
+ buf_pool_mutex_exit(buf_pool);
+ return(FALSE);
+ }
+
+ ibool freed = FALSE;
+
+ /* At this point the page has been written to the disk.
+ Since we are not holding the buffer pool or block mutex,
+ we cannot use bpage safely. It may have been plucked out
+ of the LRU list by some other thread, or it may even have
+ been relocated in the case of a compressed page. We need to
+ restart the scan of the LRU list to remove the block from
+ the list and put it on the free list. */
+ buf_pool_mutex_enter(buf_pool);
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ ibool ready = buf_flush_ready_for_replace(bpage);
+
+ mutex_exit(block_mutex);
+
+ if (ready) {
+ bool evict_zip;
+
+ evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
+
+ freed = buf_LRU_free_page(bpage, evict_zip);
+
+ break;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ return(freed);
+}
+
+/*********************************************************************//**
+Clears up the tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The depth to which we scan each buffer pool is controlled by dynamic
+config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_flush_LRU_tail(void)
+/*====================*/
+{
+ ulint total_flushed = 0;
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+ ulint scan_depth;
+
+ /* srv_LRU_scan_depth can be an arbitrarily large value.
+ We cap it at the current LRU size. */
+ buf_pool_mutex_enter(buf_pool);
+ scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
+ buf_pool_mutex_exit(buf_pool);
+
+ scan_depth = ut_min(srv_LRU_scan_depth, scan_depth);
+
+ /* We divide LRU flush into smaller chunks because
+ there may be user threads waiting for the flush to
+ end in buf_LRU_get_free_block(). */
+ for (ulint j = 0;
+ j < scan_depth;
+ j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) {
+
+ ulint n_flushed = 0;
+
+ /* Currently page_cleaner is the only thread
+ that can trigger an LRU flush. It is possible
+ that a batch triggered during the last iteration is
+ still running. */
+ if (buf_flush_LRU(buf_pool,
+ PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE,
+ &n_flushed)) {
+
+ /* Allowed only one batch per
+ buffer pool instance. */
+ buf_flush_wait_batch_end(
+ buf_pool, BUF_FLUSH_LRU);
+ }
+
+ if (n_flushed) {
+ total_flushed += n_flushed;
+ } else {
+ /* Nothing to flush */
+ break;
+ }
+ }
+ }
+
+ if (total_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_COUNT,
+ MONITOR_LRU_BATCH_PAGES,
+ total_flushed);
+ }
+
+ return(total_flushed);
+}
+
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void)
+/*==============================*/
+{
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
+ || buf_pool->init_flush[BUF_FLUSH_LRU]) {
+
+ buf_pool_mutex_exit(buf_pool);
+ buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+ } else {
+ buf_pool_mutex_exit(buf_pool);
+ }
+ }
+}
+
+/*********************************************************************//**
+Flush a batch of dirty pages from the flush list
+@return number of pages flushed, 0 if no page is flushed or if another
+flush_list type batch is running */
+static
+ulint
+page_cleaner_do_flush_batch(
+/*========================*/
+ ulint n_to_flush, /*!< in: number of pages that
+ we should attempt to flush. */
+ lsn_t lsn_limit) /*!< in: LSN up to which flushing
+ must happen */
+{
+ ulint n_flushed;
+
+ buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
+
+ return(n_flushed);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on the number of dirty pages in
+the buffer pool.
+@return percent of io_capacity to flush to manage dirty page ratio */
+static
+ulint
+af_get_pct_for_dirty()
+/*==================*/
+{
+ ulint dirty_pct = buf_get_modified_ratio_pct();
+
+ if (dirty_pct > 0 && srv_max_buf_pool_modified_pct == 0) {
+ return(100);
+ }
+
+ ut_a(srv_max_dirty_pages_pct_lwm
+ <= srv_max_buf_pool_modified_pct);
+
+ if (srv_max_dirty_pages_pct_lwm == 0) {
+ /* The user has not set the option to preflush dirty
+ pages as we approach the high water mark. */
+ if (dirty_pct > srv_max_buf_pool_modified_pct) {
+ /* We have crossed the high water mark of dirty
+ pages. In this case we start flushing at 100% of
+ innodb_io_capacity. */
+ return(100);
+ }
+ } else if (dirty_pct > srv_max_dirty_pages_pct_lwm) {
+ /* We should start flushing pages gradually. */
+ return((dirty_pct * 100)
+ / (srv_max_buf_pool_modified_pct + 1));
+ }
+
+ return(0);
+}
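+
+/* A worked example for af_get_pct_for_dirty() above, with assumed,
+illustrative settings: srv_max_dirty_pages_pct_lwm = 10 and
+srv_max_buf_pool_modified_pct = 75. A dirty_pct of 30 exceeds the
+low water mark, so the function returns (30 * 100) / (75 + 1) = 39,
+i.e. flush at roughly 39% of innodb_io_capacity. */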
+
+/*********************************************************************//**
+Calculates if flushing is required based on the redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+ lsn_t age) /*!< in: current age of LSN. */
+{
+ lsn_t max_async_age;
+ lsn_t lsn_age_factor;
+ lsn_t af_lwm = (srv_adaptive_flushing_lwm
+ * log_get_capacity()) / 100;
+
+ if (age < af_lwm) {
+ /* No adaptive flushing. */
+ return(0);
+ }
+
+ max_async_age = log_get_max_modified_age_async();
+
+ if (age < max_async_age && !srv_adaptive_flushing) {
+ /* We have still not reached the max_async point and
+ the user has disabled adaptive flushing. */
+ return(0);
+ }
+
+ /* If we are here then we know that either:
+ 1) User has enabled adaptive flushing
+ 2) User may have disabled adaptive flushing but we have reached
+ max_async_age. */
+ lsn_age_factor = (age * 100) / max_async_age;
+
+ ut_ad(srv_max_io_capacity >= srv_io_capacity);
+ return(static_cast<ulint>(
+ ((srv_max_io_capacity / srv_io_capacity)
+ * (lsn_age_factor * sqrt((double)lsn_age_factor)))
+ / 7.5));
+}
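+
+/* A worked example for af_get_pct_for_lsn() above, with assumed,
+illustrative values: an age equal to max_async_age gives
+lsn_age_factor = 100, and with srv_max_io_capacity / srv_io_capacity
+= 2 the result is (2 * 100 * sqrt(100)) / 7.5 = 266, i.e. well over
+100% of innodb_io_capacity when redo space is nearly exhausted. */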
+
+/*********************************************************************//**
+This function is called approximately once every second by the
+page_cleaner thread. Based on various factors it decides if there is a
+need to do flushing. If flushing is needed it is performed and the
+number of pages flushed is returned.
+@return number of pages flushed */
+static
+ulint
+page_cleaner_flush_pages_if_needed(void)
+/*====================================*/
+{
+ static lsn_t lsn_avg_rate = 0;
+ static lsn_t prev_lsn = 0;
+ static lsn_t last_lsn = 0;
+ static ulint sum_pages = 0;
+ static ulint last_pages = 0;
+ static ulint prev_pages = 0;
+ static ulint avg_page_rate = 0;
+ static ulint n_iterations = 0;
+ lsn_t oldest_lsn;
+ lsn_t cur_lsn;
+ lsn_t age;
+ lsn_t lsn_rate;
+ ulint n_pages = 0;
+ ulint pct_for_dirty = 0;
+ ulint pct_for_lsn = 0;
+ ulint pct_total = 0;
+ int age_factor = 0;
+
+ cur_lsn = log_get_lsn();
+
+ if (prev_lsn == 0) {
+ /* First time around. */
+ prev_lsn = cur_lsn;
+ return(0);
+ }
+
+ if (prev_lsn == cur_lsn) {
+ return(0);
+ }
+
+ /* We update our variables every srv_flushing_avg_loops
+ iterations to smooth out transition in workload. */
+ if (++n_iterations >= srv_flushing_avg_loops) {
+
+ avg_page_rate = ((sum_pages / srv_flushing_avg_loops)
+ + avg_page_rate) / 2;
+
+ /* How much LSN we have generated since last call. */
+ lsn_rate = (cur_lsn - prev_lsn) / srv_flushing_avg_loops;
+
+ lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
+
+ prev_lsn = cur_lsn;
+
+ n_iterations = 0;
+
+ sum_pages = 0;
+ }
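+
+ /* For example (illustrative numbers): with
+ srv_flushing_avg_loops = 30, sum_pages = 3000 and a
+ previous avg_page_rate of 60, the new average is
+ ((3000 / 30) + 60) / 2 = 80 pages/sec, so a sudden burst
+ moves the rate only halfway towards its new level. */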
+
+ oldest_lsn = buf_pool_get_oldest_modification();
+
+ ut_ad(oldest_lsn <= log_get_lsn());
+
+ age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
+
+ pct_for_dirty = af_get_pct_for_dirty();
+ pct_for_lsn = af_get_pct_for_lsn(age);
+
+ pct_total = ut_max(pct_for_dirty, pct_for_lsn);
+
+ /* Cap the maximum IO capacity that we are going to use by
+ max_io_capacity. */
+ n_pages = (PCT_IO(pct_total) + avg_page_rate) / 2;
+
+ if (n_pages > srv_max_io_capacity) {
+ n_pages = srv_max_io_capacity;
+ }
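+
+ /* For example (illustrative numbers): pct_total = 60 with
+ srv_io_capacity = 200 gives PCT_IO(60) = 120; combined with
+ avg_page_rate = 80 this requests (120 + 80) / 2 = 100 pages,
+ subject to the srv_max_io_capacity cap above. */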
+
+ if (last_pages && cur_lsn - last_lsn > lsn_avg_rate / 2) {
+ age_factor = static_cast<int>(prev_pages / last_pages);
+ }
+
+ MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
+
+ prev_pages = n_pages;
+ n_pages = page_cleaner_do_flush_batch(
+ n_pages, oldest_lsn + lsn_avg_rate * (age_factor + 1));
+
+ last_lsn = cur_lsn;
+ last_pages = n_pages + 1;
+
+ MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
+ MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
+ MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
+
+ if (n_pages) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_pages);
+
+ sum_pages += n_pages;
+ }
+
+ return(n_pages);
+}
+
+/*********************************************************************//**
+Puts the page_cleaner thread to sleep if it has finished work in less
+than a second */
+static
+void
+page_cleaner_sleep_if_needed(
+/*=========================*/
+ ulint next_loop_time) /*!< in: time when next loop iteration
+ should start */
+{
+ ulint cur_time = ut_time_ms();
+
+ if (next_loop_time > cur_time) {
+ /* Get sleep interval in microseconds. We use
+ ut_min() to avoid long sleep in case of
+ wrap around. */
+ os_thread_sleep(ut_min(1000000,
+ (next_loop_time - cur_time)
+ * 1000));
+ }
+}
+
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ulint next_loop_time = ut_time_ms() + 1000;
+ ulint n_flushed = 0;
+ ulint last_activity = srv_get_activity_count();
+
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(buf_page_cleaner_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ buf_page_cleaner_is_active = TRUE;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ /* The page_cleaner skips sleep if the server is
+ idle and there are no pending IOs in the buffer pool
+ and there is work to do. */
+ if (srv_check_activity(last_activity)
+ || buf_get_n_pending_read_ios()
+ || n_flushed == 0) {
+ page_cleaner_sleep_if_needed(next_loop_time);
+ }
+
+ next_loop_time = ut_time_ms() + 1000;
+
+ if (srv_check_activity(last_activity)) {
+ last_activity = srv_get_activity_count();
+
+ /* Flush pages from end of LRU if required */
+ n_flushed = buf_flush_LRU_tail();
+
+ /* Flush pages from flush_list if required */
+ n_flushed += page_cleaner_flush_pages_if_needed();
+ } else {
+ n_flushed = page_cleaner_do_flush_batch(
+ PCT_IO(100),
+ LSN_MAX);
+
+ if (n_flushed) {
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
+ }
+ }
+ }
+
+ ut_ad(srv_shutdown_state > 0);
+ if (srv_fast_shutdown == 2) {
+ /* In very fast shutdown we simulate a crash of
+ the buffer pool. We are not required to do any flushing. */
+ goto thread_exit;
+ }
+
+ /* In case of normal and slow shutdown the page_cleaner thread
+ must wait for all other activity in the server to die down.
+ Note that we can start flushing the buffer pool as soon as the
+ server enters shutdown phase but we must stay alive long enough
+ to ensure that any work done by the master or purge threads is
+ also flushed.
+ During shutdown we pass through two stages. In the first stage,
+ when SRV_SHUTDOWN_CLEANUP is set other threads like the master
+ and the purge threads may be working as well. We start flushing
+ the buffer pool but can't be sure that no new pages are being
+ dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
+
+ do {
+ n_flushed = page_cleaner_do_flush_batch(PCT_IO(100), LSN_MAX);
+
+ /* We sleep only if there are no pages to flush */
+ if (n_flushed == 0) {
+ os_thread_sleep(100000);
+ }
+ } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
+
+ /* At this point all threads including the master and the purge
+ thread must have been suspended. */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+
+ /* We can now make a final sweep on flushing the buffer pool
+ and exit after we have cleaned the whole buffer pool.
+ It is important that we wait for any running batch that has
+ been triggered by us to finish. Otherwise we can end up
+ considering the end of that batch as the finish of our final
+ sweep, and we would come out of the loop leaving behind dirty
+ pages in the flush_list. */
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ buf_flush_wait_LRU_batch_end();
+
+ bool success;
+
+ do {
+
+ success = buf_flush_list(PCT_IO(100), LSN_MAX, &n_flushed);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ } while (!success || n_flushed > 0);
+
+ /* Some sanity checks */
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+ ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool = buf_pool_from_array(i);
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
+ }
+
+ /* We have lived our life. Time to die. */
+
+thread_exit:
+ buf_page_cleaner_is_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+
+/** Functor to validate the flush list. */
+struct Check {
+ void operator()(const buf_page_t* elem)
+ {
+ ut_a(elem->in_flush_list);
+ }
+};
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
+{
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
+
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list, Check());
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
+ while (bpage != NULL) {
+ const lsn_t om = bpage->oldest_modification;
+
+ ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
+
+ ut_ad(bpage->in_flush_list);
+
+ /* A page in buf_pool->flush_list can be in
+ BUF_BLOCK_REMOVE_HASH state. This happens when a page
+ is in the middle of being relocated. In that case the
+ original descriptor can have this state and still be
+ in the flush list waiting to acquire the
+ buf_pool->flush_list_mutex to complete the relocation. */
+ ut_a(buf_page_in_file(bpage)
+ || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
+ ut_a(om > 0);
+
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_page_t** prpage;
+
+ ut_a(rnode);
+ prpage = rbt_value(buf_page_t*, rnode);
+
+ ut_a(*prpage);
+ ut_a(*prpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
+ bpage = UT_LIST_GET_NEXT(list, bpage);
+
+ ut_a(!bpage || om >= bpage->oldest_modification);
+ }
+
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+ buf_pool_t* buf_pool) /*!< buffer pool instance */
+{
+ ibool ret;
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ret = buf_flush_validate_low(buf_pool);
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush
+list in a particular buffer pool.
+@return number of dirty pages present in a single buffer pool */
+UNIV_INTERN
+ulint
+buf_pool_get_dirty_pages_count(
+/*===========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint id) /*!< in: space id to check */
+
+{
+ ulint count = 0;
+
+ buf_pool_mutex_enter(buf_pool);
+ buf_flush_list_mutex_enter(buf_pool);
+
+ buf_page_t* bpage;
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+ bpage != 0;
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_flush_list);
+ ut_ad(bpage->oldest_modification > 0);
+
+ if (buf_page_get_space(bpage) == id) {
+ ++count;
+ }
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+ buf_pool_mutex_exit(buf_pool);
+
+ return(count);
+}
+
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush list.
+@return number of dirty pages present in all the buffer pools */
+UNIV_INTERN
+ulint
+buf_flush_get_dirty_pages_count(
+/*============================*/
+ ulint id) /*!< in: space id to check */
+
+{
+ ulint count = 0;
+
+ for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ count += buf_pool_get_dirty_pages_count(buf_pool, id);
+ }
+
+ return(count);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
new file mode 100644
index 00000000000..a1618020bca
--- /dev/null
+++ b/storage/innobase/buf/buf0lru.cc
@@ -0,0 +1,2745 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.cc
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "hash0hash.h"
+#include "os0sync.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+#include "srv0mon.h"
+#include "lock0lock.h"
+
+#include "ha_prototypes.h"
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+#define BUF_LRU_OLD_TOLERANCE 20
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN
+# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN"
+#endif
+
+/** When dropping the search hash index entries before deleting an ibd
+file, we build a local array of pages belonging to that tablespace
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE 1024
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static ibool buf_lru_switched_on_innodb_mon = FALSE;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_LRU_stat_update(). */
+#define BUF_LRU_STAT_N_INTERVAL 50
+
+/** Coefficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+#define BUF_LRU_IO_TO_UNZIP_FACTOR 50
+
+/** Sampled values of buf_LRU_stat_cur.
+Not protected by any mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Not protected by any mutex. */
+UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+UNIV_INTERN uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed.
+
+The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static __attribute__((nonnull, warn_unused_result))
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
+ buf_page_t* bpage, /*!< in: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+ bool zip); /*!< in: true if should remove also the
+ compressed page of an uncompressed page */
+/******************************************************************//**
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+ buf_block_t* block); /*!< in: block, must contain a file page and
+ be in a state where it can be freed */
+
+/******************************************************************//**
+Increases the LRU size in bytes: by zip_size for a compressed page,
+by UNIV_PAGE_SIZE for an uncompressed page. */
+static inline
+void
+incr_LRU_size_in_bytes(
+/*===================*/
+ buf_page_t* bpage, /*!< in: control block */
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ulint zip_size = page_zip_get_size(&bpage->zip);
+ buf_pool->stat.LRU_bytes += zip_size ? zip_size : UNIV_PAGE_SIZE;
+ ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size);
+}
+
+/******************************************************************//**
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list.
+@return TRUE if should use unzip_LRU */
+UNIV_INTERN
+ibool
+buf_LRU_evict_from_unzip_LRU(
+/*=========================*/
+ buf_pool_t* buf_pool)
+{
+ ulint io_avg;
+ ulint unzip_avg;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ /* If the unzip_LRU list is empty, we can only use the LRU. */
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
+ return(FALSE);
+ }
+
+ /* If unzip_LRU is at most 10% of the size of the LRU list,
+ then use the LRU. This slack allows us to keep hot
+ decompressed pages in the buffer pool. */
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
+ <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+ return(FALSE);
+ }
+
+ /* If eviction hasn't started yet, we assume by default
+that the workload is disk bound. */
+ if (buf_pool->freed_page_clock == 0) {
+ return(TRUE);
+ }
+
+ /* Calculate the average over past intervals, and add the values
+ of the current interval. */
+ io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.io;
+ unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.unzip;
+
+ /* Decide based on our formula. If the load is I/O bound
+ (unzip_avg is smaller than the weighted io_avg), evict an
+ uncompressed frame from unzip_LRU. Otherwise we assume that
+ the load is CPU bound and evict from the regular LRU. */
+ return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
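+
+/* A worked example for buf_LRU_evict_from_unzip_LRU() above, with
+assumed, illustrative counters: io_avg = 10 and unzip_avg = 400.
+Since 400 <= 10 * BUF_LRU_IO_TO_UNZIP_FACTOR (= 500), the load is
+treated as I/O bound and an uncompressed frame is evicted from the
+unzip_LRU; had unzip_avg been 600, the regular LRU would be used. */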
+
+/******************************************************************//**
+Attempts to drop page hash index on a batch of pages belonging to a
+particular space id. */
+static
+void
+buf_LRU_drop_page_hash_batch(
+/*=========================*/
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ const ulint* arr, /*!< in: array of page_no */
+ ulint count) /*!< in: number of entries in array */
+{
+ ulint i;
+
+ ut_ad(arr != NULL);
+ ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
+
+ for (i = 0; i < count; ++i) {
+ btr_search_drop_page_hash_when_freed(space_id, zip_size,
+ arr[i]);
+ }
+}
+
+/******************************************************************//**
+When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
+hash index entries belonging to that table. This function tries to
+do that in batch. Note that this is a 'best effort' attempt and does
+not guarantee that ALL hash entries will be removed. */
+static
+void
+buf_LRU_drop_page_hash_for_tablespace(
+/*==================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint id) /*!< in: space id */
+{
+ buf_page_t* bpage;
+ ulint* page_arr;
+ ulint num_entries;
+ ulint zip_size;
+
+ zip_size = fil_space_get_zip_size(id);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* Somehow, the tablespace does not exist. Nothing to drop. */
+ ut_ad(0);
+ return;
+ }
+
+ page_arr = static_cast<ulint*>(ut_malloc(
+ sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE));
+
+ buf_pool_mutex_enter(buf_pool);
+ num_entries = 0;
+
+scan_again:
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+ buf_page_t* prev_bpage;
+ ibool is_fixed;
+
+ prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
+ || bpage->space != id
+ || bpage->io_fix != BUF_IO_NONE) {
+ /* Compressed pages are never hashed.
+ Skip blocks of other tablespaces.
+ Skip I/O-fixed blocks (to be dealt with later). */
+next_page:
+ bpage = prev_bpage;
+ continue;
+ }
+
+ mutex_enter(&((buf_block_t*) bpage)->mutex);
+ is_fixed = bpage->buf_fix_count > 0
+ || !((buf_block_t*) bpage)->index;
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
+
+ if (is_fixed) {
+ goto next_page;
+ }
+
+ /* Store the page number so that we can drop the hash
+ index in a batch later. */
+ page_arr[num_entries] = bpage->offset;
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
+ ++num_entries;
+
+ if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
+ goto next_page;
+ }
+
+ /* Array full. We release the buf_pool->mutex to obey
+ the latching order. */
+ buf_pool_mutex_exit(buf_pool);
+
+ buf_LRU_drop_page_hash_batch(
+ id, zip_size, page_arr, num_entries);
+
+ num_entries = 0;
+
+ buf_pool_mutex_enter(buf_pool);
+
+ /* Note that we released the buf_pool mutex above
+ after reading the prev_bpage during processing of a
+ page_hash_batch (i.e.: when the array was full).
+ Because prev_bpage could belong to a compressed-only
+ block, it may have been relocated, and thus the
+ pointer cannot be trusted. Because bpage is of type
+ buf_block_t, it is safe to dereference.
+
+ bpage can change in the LRU list. This is OK because
+ this function is a 'best effort' to drop as many
+ search hash entries as possible and it does not
+ guarantee that ALL such entries will be dropped. */
+
+ /* If, however, bpage has been removed from LRU list
+ to the free list then we should restart the scan.
+ bpage->state is protected by buf_pool mutex. */
+ if (bpage
+ && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ goto scan_again;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ /* Drop any remaining batch of search hashed pages. */
+ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries);
+ ut_free(page_arr);
+}
+
+/******************************************************************//**
+While flushing (or removing dirty) pages from a tablespace we don't
+want to hog the CPU and resources. Release the buffer pool and block
+mutex and try to force a context switch. Then reacquire the same mutexes.
+The current page is "fixed" before the release of the mutexes and then
+"unfixed" again once we have reacquired the mutexes. */
+static __attribute__((nonnull))
+void
+buf_flush_yield(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_page_t* bpage) /*!< in/out: current page */
+{
+ ib_mutex_t* block_mutex;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_in_file(bpage));
+
+ block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+ /* "Fix" the block so that the position cannot be
+ changed after we release the buffer pool and
+ block mutexes. */
+ buf_page_set_sticky(bpage);
+
+ /* Now it is safe to release the buf_pool->mutex. */
+ buf_pool_mutex_exit(buf_pool);
+
+ mutex_exit(block_mutex);
+ /* Try and force a context switch. */
+ os_thread_yield();
+
+ buf_pool_mutex_enter(buf_pool);
+
+ mutex_enter(block_mutex);
+ /* "Unfix" the block now that we have both the
+ buffer pool and block mutex again. */
+ buf_page_unset_sticky(bpage);
+ mutex_exit(block_mutex);
+}
+
+/******************************************************************//**
+If we have hogged the resources for too long then release the buffer
+pool and flush list mutex and do a thread yield. Set the current page
+to "sticky" so that it is not relocated during the yield.
+@return true if yielded */
+static __attribute__((nonnull(1), warn_unused_result))
+bool
+buf_flush_try_yield(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_page_t* bpage, /*!< in/out: bpage to remove */
+ ulint processed) /*!< in: number of pages processed */
+{
+ /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+ loop we release buf_pool->mutex to let other threads
+ do their job but only if the block is not IO fixed. This
+ ensures that the block stays in its position in the
+ flush_list. */
+
+ if (bpage != NULL
+ && processed >= BUF_LRU_DROP_SEARCH_SIZE
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ /* Release the buffer pool and block mutex
+ to give the other threads a go. */
+
+ buf_flush_yield(buf_pool, bpage);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ /* Should not have been removed from the flush
+ list during the yield. However, this check is
+ not sufficient to catch a remove -> add. */
+
+ ut_ad(bpage->in_flush_list);
+
+ return(true);
+ }
+
+ return(false);
+}
+
+/******************************************************************//**
+Removes a single page from a given tablespace inside a specific
+buffer pool instance.
+@return true if page was removed. */
+static __attribute__((nonnull, warn_unused_result))
+bool
+buf_flush_or_remove_page(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_page_t* bpage, /*!< in/out: bpage to remove */
+ bool flush) /*!< in: flush to disk if true but
+ don't remove else remove without
+ flushing to disk */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_flush_list_mutex_own(buf_pool));
+
+ /* bpage->space and bpage->io_fix are protected by
+ buf_pool->mutex and block_mutex. It is safe to check
+ them while holding buf_pool->mutex only. */
+
+ if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+
+ /* We cannot remove this page during this scan
+ yet; maybe the system is currently reading it
+ in, or flushing the modifications to the file */
+ return(false);
+ }
+
+ /* We have to release the flush_list_mutex to obey the
+ latching order. We are however guaranteed that the page
+ will stay in the flush_list and won't be relocated because
+ buf_flush_remove() and buf_flush_relocate_on_flush_list()
+ need buf_pool->mutex as well. */
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ bool processed;
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ ut_ad(bpage->oldest_modification != 0);
+
+ if (!flush) {
+
+ buf_flush_remove(bpage);
+ processed = true;
+
+ } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)
+ && buf_flush_page(
+ buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false)) {
+
+ /* Wake possible simulated aio thread to actually
+ post the writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+ buf_pool_mutex_enter(buf_pool);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ return(true);
+
+ } else {
+ processed = false;
+ }
+
+ mutex_exit(block_mutex);
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+ ut_ad(!mutex_own(block_mutex));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(processed);
+}
+
+/******************************************************************//**
+Remove all dirty pages belonging to a given tablespace inside a specific
+buffer pool instance when we are deleting the data file(s) of that
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@retval DB_SUCCESS if all freed
+@retval DB_FAIL if not all freed
+@retval DB_INTERRUPTED if the transaction was interrupted */
+static __attribute__((nonnull(1), warn_unused_result))
+dberr_t
+buf_flush_or_remove_pages(
+/*======================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint id, /*!< in: target space id for which
+ to remove or flush pages */
+ bool flush, /*!< in: flush to disk if true but
+ don't remove else remove without
+ flushing to disk */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted, can be 0 */
+{
+ buf_page_t* prev;
+ buf_page_t* bpage;
+ ulint processed = 0;
+
+ buf_flush_list_mutex_enter(buf_pool);
+
+rescan:
+ bool all_freed = true;
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ bpage != NULL;
+ bpage = prev) {
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* Save the previous link because once we free the
+ page we can't rely on the links. */
+
+ prev = UT_LIST_GET_PREV(list, bpage);
+
+ if (buf_page_get_space(bpage) != id) {
+
+ /* Skip this block, as it does not belong to
+ the target space. */
+
+ } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush)) {
+
+ /* Remove was unsuccessful, we have to try again
+ by scanning the entire list from the end.
+ This also means that we never released the
+ buf_pool mutex. Therefore we can trust the prev
+ pointer.
+ buf_flush_or_remove_page() released the
+ flush list mutex but not the buf_pool mutex.
+ Therefore it is possible that a new page was
+ added to the flush list. For example, in case
+ where we are at the head of the flush list and
+ prev == NULL. That is OK because we have the
+ tablespace quiesced and no new pages for this
+ space-id should enter flush_list. This is
+ because the only callers of this function are
+ DROP TABLE and FLUSH TABLE FOR EXPORT.
+ We know that we'll have to do at least one more
+ scan but we don't break out of loop here and
+ try to do as much work as we can in this
+ iteration. */
+
+ all_freed = false;
+ } else if (flush) {
+
+ /* The processing was successful. During the
+ processing we released the buf_pool mutex
+ when calling buf_flush_page(). We cannot
+ trust the prev pointer. */
+ goto rescan;
+ }
+
+ ++processed;
+
+ /* Yield if we have hogged the CPU and mutexes for too long. */
+ if (buf_flush_try_yield(buf_pool, prev, processed)) {
+
+ /* Reset the batch size counter if we had to yield. */
+
+ processed = 0;
+ }
+
+#ifndef DBUG_OFF
+ if (flush) {
+ DBUG_EXECUTE_IF("ib_export_flush_crash",
+ static ulint n_pages;
+ if (++n_pages == 4) {DBUG_SUICIDE();});
+ }
+#endif /* !DBUG_OFF */
+
+ /* Checking whether trx has been interrupted is expensive;
+ we only want to do it every N iterations. */
+ if (!processed && trx && trx_is_interrupted(trx)) {
+ buf_flush_list_mutex_exit(buf_pool);
+ return(DB_INTERRUPTED);
+ }
+ }
+
+ buf_flush_list_mutex_exit(buf_pool);
+
+ return(all_freed ? DB_SUCCESS : DB_FAIL);
+}
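+
+/* The backward scan above depends on saving the prev link before the
+current node can be freed or relocated. A minimal standalone sketch of
+the pattern, using a generic list with hypothetical names: */
+struct sketch_list_node { sketch_list_node* prev; };
+
+extern void sketch_process_node(sketch_list_node* node);
+
+static void
+sketch_scan_from_tail(sketch_list_node* tail)
+{
+ sketch_list_node* prev;
+
+ for (sketch_list_node* node = tail; node != NULL; node = prev) {
+  /* Save the link first: after processing, the node may
+  no longer be valid. */
+  prev = node->prev;
+
+  sketch_process_node(node);
+ }
+}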
+
+/******************************************************************//**
+Remove or flush all the dirty pages that belong to a given tablespace
+inside a specific buffer pool instance. The pages will remain in the LRU
+list and will be evicted from the LRU list as they age and move towards
+the tail of the LRU list. */
+static __attribute__((nonnull(1)))
+void
+buf_flush_dirty_pages(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint id, /*!< in: space id */
+ bool flush, /*!< in: flush to disk if true otherwise
+ remove the pages without flushing */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
+{
+ dberr_t err;
+
+ do {
+ buf_pool_mutex_enter(buf_pool);
+
+ err = buf_flush_or_remove_pages(buf_pool, id, flush, trx);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ ut_ad(buf_flush_validate(buf_pool));
+
+ if (err == DB_FAIL) {
+ os_thread_sleep(2000);
+ }
+
+ /* DB_FAIL is a soft error; it means that the task was not
+ completed and needs to be retried. */
+
+ ut_ad(buf_flush_validate(buf_pool));
+
+ } while (err == DB_FAIL);
+
+ ut_ad(err == DB_INTERRUPTED
+ || buf_pool_get_dirty_pages_count(buf_pool, id) == 0);
+}
+
+/******************************************************************//**
+Remove all pages that belong to a given tablespace inside a specific
+buffer pool instance when we are DISCARDing the tablespace. */
+static __attribute__((nonnull))
+void
+buf_LRU_remove_all_pages(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint id) /*!< in: space id */
+{
+ buf_page_t* bpage;
+ ibool all_freed;
+
+scan_again:
+ buf_pool_mutex_enter(buf_pool);
+
+ all_freed = TRUE;
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ bpage != NULL;
+ /* No op */) {
+
+ rw_lock_t* hash_lock;
+ buf_page_t* prev_bpage;
+ ib_mutex_t* block_mutex = NULL;
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ /* bpage->space and bpage->io_fix are protected by
+ buf_pool->mutex and the block_mutex. It is safe to check
+ them while holding buf_pool->mutex only. */
+
+ if (buf_page_get_space(bpage) != id) {
+ /* Skip this block, as it does not belong to
+ the space that is being invalidated. */
+ goto next_page;
+ } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* We cannot remove this page during this scan
+ yet; maybe the system is currently reading it
+ in, or flushing the modifications to the file */
+
+ all_freed = FALSE;
+ goto next_page;
+ } else {
+ ulint fold = buf_page_address_fold(
+ bpage->space, bpage->offset);
+
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ rw_lock_x_lock(hash_lock);
+
+ block_mutex = buf_page_get_mutex(bpage);
+ mutex_enter(block_mutex);
+
+ if (bpage->buf_fix_count > 0) {
+
+ mutex_exit(block_mutex);
+
+ rw_lock_x_unlock(hash_lock);
+
+ /* We cannot remove this page during
+ this scan yet; maybe the system is
+ currently reading it in, or flushing
+ the modifications to the file */
+
+ all_freed = FALSE;
+
+ goto next_page;
+ }
+ }
+
+ ut_ad(mutex_own(block_mutex));
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Dropping space %lu page %lu\n",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ /* Do nothing, because the adaptive hash index
+ covers uncompressed pages only. */
+ } else if (((buf_block_t*) bpage)->index) {
+ ulint page_no;
+ ulint zip_size;
+
+ buf_pool_mutex_exit(buf_pool);
+
+ zip_size = buf_page_get_zip_size(bpage);
+ page_no = buf_page_get_page_no(bpage);
+
+ rw_lock_x_unlock(hash_lock);
+
+ mutex_exit(block_mutex);
+
+ /* Note that the following call will acquire
+ and release block->lock X-latch. */
+
+ btr_search_drop_page_hash_when_freed(
+ id, zip_size, page_no);
+
+ goto scan_again;
+ }
+
+ if (bpage->oldest_modification != 0) {
+
+ buf_flush_remove(bpage);
+ }
+
+ ut_ad(!bpage->in_flush_list);
+
+ /* Remove from the LRU list. */
+
+ if (buf_LRU_block_remove_hashed(bpage, true)) {
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+ } else {
+ ut_ad(block_mutex == &buf_pool->zip_mutex);
+ }
+
+ ut_ad(!mutex_own(block_mutex));
+
+#ifdef UNIV_SYNC_DEBUG
+ /* buf_LRU_block_remove_hashed() releases the hash_lock */
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+next_page:
+ bpage = prev_bpage;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ if (!all_freed) {
+ os_thread_sleep(20000);
+
+ goto scan_again;
+ }
+}
+
+/******************************************************************//**
+Remove pages belonging to a given tablespace inside a specific
+buffer pool instance when we are deleting the data file(s) of that
+tablespace. Only if buf_remove is BUF_REMOVE_FLUSH_NO_WRITE do the
+pages remain a part of the LRU, evicted from the list as they age
+towards its tail. */
+static __attribute__((nonnull(1)))
+void
+buf_LRU_remove_pages(
+/*=================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove, /*!< in: remove or flush strategy */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
+{
+ switch (buf_remove) {
+ case BUF_REMOVE_ALL_NO_WRITE:
+ buf_LRU_remove_all_pages(buf_pool, id);
+ break;
+
+ case BUF_REMOVE_FLUSH_NO_WRITE:
+ ut_a(trx == 0);
+ buf_flush_dirty_pages(buf_pool, id, false, NULL);
+ break;
+
+ case BUF_REMOVE_FLUSH_WRITE:
+ ut_a(trx != 0);
+ buf_flush_dirty_pages(buf_pool, id, true, trx);
+ /* Ensure that all asynchronous IO is completed. */
+ os_aio_wait_until_no_pending_writes();
+ fil_flush(id);
+ break;
+ }
+}
+
+/******************************************************************//**
+Flushes all dirty pages or removes all pages belonging
+to a given tablespace. A PROBLEM: if readahead is being started, what
+guarantees that it will not try to read in pages after this operation
+has completed? */
+UNIV_INTERN
+void
+buf_LRU_flush_or_remove_pages(
+/*==========================*/
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove, /*!< in: remove or flush strategy */
+ const trx_t* trx) /*!< to check if the operation must
+ be interrupted */
+{
+ ulint i;
+
+ /* Before we attempt to drop pages one by one we first
+ attempt to drop page hash index entries in batches to make
+ it more efficient. The batching is a best-effort attempt
+ and does not guarantee that all page hash entries will be
+ dropped. We get rid of the remaining page hash entries
+ one by one below. */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ switch (buf_remove) {
+ case BUF_REMOVE_ALL_NO_WRITE:
+ buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
+ break;
+
+ case BUF_REMOVE_FLUSH_NO_WRITE:
+ /* It is a DROP TABLE for a single table
+ tablespace. No AHI entries exist because
+ we already dealt with them when freeing up
+ extents. */
+ case BUF_REMOVE_FLUSH_WRITE:
+ /* We allow read-only queries against the
+ table, there is no need to drop the AHI entries. */
+ break;
+ }
+
+ buf_LRU_remove_pages(buf_pool, id, buf_remove, trx);
+ }
+}
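+
+/* An illustrative (hypothetical) wrapper showing which strategy each
+operation passes, inferred from the comments above: DISCARD TABLESPACE
+drops everything without writing, DROP TABLE removes dirty pages
+without writing, and FLUSH TABLES ... FOR EXPORT persists them. */
+static void
+sketch_invalidate_tablespace(ulint id, bool discard, bool for_export,
+ const trx_t* trx)
+{
+ if (discard) {
+  /* DISCARD TABLESPACE: remove all pages, no writes. */
+  buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_ALL_NO_WRITE, NULL);
+ } else if (for_export) {
+  /* FLUSH TABLES ... FOR EXPORT: write dirty pages out. */
+  buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
+ } else {
+  /* DROP TABLE: remove dirty pages without writing. */
+  buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_NO_WRITE, NULL);
+ }
+}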
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_page_t* b;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+
+ /* Find the first successor of bpage in the LRU list
+ that is in the zip_clean list. */
+ b = bpage;
+ do {
+ b = UT_LIST_GET_NEXT(LRU, b);
+ } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
+
+ /* Insert bpage before b, i.e., after the predecessor of b. */
+ if (b) {
+ b = UT_LIST_GET_PREV(list, b);
+ }
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage);
+ } else {
+ UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage);
+ }
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/******************************************************************//**
+Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean.
+@return TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_unzip_LRU_list(
+/*=============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ up to srv_LRU_scan_depth blocks. */
+{
+ buf_block_t* block;
+ ibool freed;
+ ulint scanned;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
+ return(FALSE);
+ }
+
+ for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU),
+ scanned = 1, freed = FALSE;
+ block != NULL && !freed
+ && (scan_all || scanned < srv_LRU_scan_depth);
+ ++scanned) {
+
+ buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU,
+ block);
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ freed = buf_LRU_free_page(&block->page, false);
+
+ block = prev_block;
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+ scanned);
+ return(freed);
+}
+
+/******************************************************************//**
+Try to free a clean page from the common LRU list.
+@return TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_common_LRU_list(
+/*==============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ up to srv_LRU_scan_depth blocks. */
+{
+ buf_page_t* bpage;
+ ibool freed;
+ ulint scanned;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU),
+ scanned = 1, freed = FALSE;
+ bpage != NULL && !freed
+ && (scan_all || scanned < srv_LRU_scan_depth);
+ ++scanned) {
+
+ unsigned accessed;
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU,
+ bpage);
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ accessed = buf_page_is_accessed(bpage);
+ freed = buf_LRU_free_page(bpage, true);
+ if (freed && !accessed) {
+ /* Keep track of pages that are evicted without
+ ever being accessed. This gives us a measure of
+ the effectiveness of readahead */
+ ++buf_pool->stat.n_ra_pages_evicted;
+ }
+
+ bpage = prev_bpage;
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ scanned);
+
+ return(freed);
+}
+
+/******************************************************************//**
+Try to free a replaceable block.
+@return TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_scan_and_free_block(
+/*========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ 'old' blocks. */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all)
+ || buf_LRU_free_from_common_LRU_list(
+ buf_pool, scan_all));
+}
+
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool in any instance is
+available. This can be used in heuristics to prevent huge transactions
+from eating up the whole buffer pool with their locks.
+@return TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void)
+/*==============================*/
+{
+ ulint i;
+ ibool ret = FALSE;
+
+ for (i = 0; i < srv_buf_pool_instances && !ret; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (!recv_recovery_on
+ && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU)
+ < buf_pool->curr_size / 4) {
+
+ ret = TRUE;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ }
+
+ return(ret);
+}
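+
+/* The heuristic above in isolation: an instance counts as "running
+out" when fewer than a quarter of its frames sit on the free list or
+the LRU list, the remainder being lock heaps, the AHI and other
+non-data objects. A minimal standalone restatement: */
+static bool
+sketch_running_out(ulint free_len, ulint lru_len, ulint curr_size)
+{
+ return(free_len + lru_len < curr_size / 4);
+}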
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, returns NULL.
+@return a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(
+/*==================*/
+ buf_pool_t* buf_pool)
+{
+ buf_block_t* block;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free);
+
+ if (block) {
+
+ ut_ad(block->page.in_free_list);
+ ut_d(block->page.in_free_list = FALSE);
+ ut_ad(!block->page.in_flush_list);
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!buf_page_in_file(&block->page));
+ UT_LIST_REMOVE(list, buf_pool->free, (&block->page));
+
+ mutex_enter(&block->mutex);
+
+ buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+ UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+
+ mutex_exit(&block->mutex);
+ }
+
+ return(block);
+}
+
+/******************************************************************//**
+Checks how much of the buf_pool is occupied by non-data objects like
+the AHI, lock heaps, etc. Depending on the size of the non-data objects
+this function will either assert or issue a warning and switch on the
+status monitor. */
+static
+void
+buf_LRU_check_size_of_non_data_objects(
+/*===================================*/
+ const buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: ERROR: over 95 percent of the buffer pool"
+ " is occupied by\n"
+ "InnoDB: lock heaps or the adaptive hash index!"
+ " Check that your\n"
+ "InnoDB: transactions do not set too many row locks.\n"
+ "InnoDB: Your buffer pool size is %lu MB."
+ " Maybe you should make\n"
+ "InnoDB: the buffer pool bigger?\n"
+ "InnoDB: We intentionally generate a seg fault"
+ " to print a stack trace\n"
+ "InnoDB: on Linux!\n",
+ (ulong) (buf_pool->curr_size
+ / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+ ut_error;
+
+ } else if (!recv_recovery_on
+ && (UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU))
+ < buf_pool->curr_size / 3) {
+
+ if (!buf_lru_switched_on_innodb_mon) {
+
+ /* Over 67 % of the buffer pool is occupied by lock
+ heaps or the adaptive hash index. This may be a memory
+ leak! */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: WARNING: over 67 percent of"
+ " the buffer pool is occupied by\n"
+ "InnoDB: lock heaps or the adaptive"
+ " hash index! Check that your\n"
+ "InnoDB: transactions do not set too many"
+ " row locks.\n"
+ "InnoDB: Your buffer pool size is %lu MB."
+ " Maybe you should make\n"
+ "InnoDB: the buffer pool bigger?\n"
+ "InnoDB: Starting the InnoDB Monitor to print"
+ " diagnostics, including\n"
+ "InnoDB: lock heap and hash index sizes.\n",
+ (ulong) (buf_pool->curr_size
+ / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+ buf_lru_switched_on_innodb_mon = TRUE;
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+ }
+ } else if (buf_lru_switched_on_innodb_mon) {
+
+ /* Switch off the InnoDB Monitor; this is a simple way
+ to stop the monitor if the situation becomes less urgent,
+ but may also surprise users if the user also switched on the
+ monitor! */
+
+ buf_lru_switched_on_innodb_mon = FALSE;
+ srv_print_innodb_monitor = FALSE;
+ }
+}
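+
+/* Worked example of the "%lu MB" computation in the messages above,
+assuming the default UNIV_PAGE_SIZE of 16384 bytes: 1024 * 1024 / 16384
+= 64 pages per megabyte, so a pool of curr_size = 8192 pages prints as
+8192 / 64 = 128 MB. */
+static ulint
+sketch_pool_size_in_mb(ulint curr_size_in_pages, ulint page_size)
+{
+ return(curr_size_in_pages / (1024 * 1024 / page_size));
+}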
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If the free list is empty, blocks are moved from the end
+of the LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in the LRU
+scan, we put it on the free list to be used (see also the standalone
+sketch after this function):
+* iteration 0:
+ * get a block from the free list; success: done
+ * if there is an LRU flush batch in progress:
+ * wait for batch to end: retry free list
+ * if buf_pool->try_LRU_scan is set
+ * scan LRU up to srv_LRU_scan_depth to find a clean block
+ * the above will put the block on free list
+ * success: retry the free list
+ * flush one dirty page from tail of LRU to disk
+ * the above will put the block on free list
+ * success: retry the free list
+* iteration 1:
+ * same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+ * same as iteration 1 but sleep 100ms
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+{
+ buf_block_t* block = NULL;
+ ibool freed = FALSE;
+ ulint n_iterations = 0;
+ ulint flush_failures = 0;
+ ibool mon_value_was = FALSE;
+ ibool started_monitor = FALSE;
+
+ MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+loop:
+ buf_pool_mutex_enter(buf_pool);
+
+ buf_LRU_check_size_of_non_data_objects(buf_pool);
+
+ /* If there is a block in the free list, take it */
+ block = buf_LRU_get_free_only(buf_pool);
+
+ if (block) {
+
+ buf_pool_mutex_exit(buf_pool);
+ ut_ad(buf_pool_from_block(block) == buf_pool);
+ memset(&block->page.zip, 0, sizeof block->page.zip);
+
+ if (started_monitor) {
+ srv_print_innodb_monitor =
+ static_cast<my_bool>(mon_value_was);
+ }
+
+ return(block);
+ }
+
+ if (buf_pool->init_flush[BUF_FLUSH_LRU]
+ && srv_use_doublewrite_buf
+ && buf_dblwr != NULL) {
+
+ /* If there is an LRU flush happening in the background
+ then we wait for it to end instead of trying a single
+ page flush. If, however, we are not using doublewrite
+ buffer then it is better to do our own single page
+ flush instead of waiting for LRU flush to end. */
+ buf_pool_mutex_exit(buf_pool);
+ buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
+ goto loop;
+ }
+
+ freed = FALSE;
+ if (buf_pool->try_LRU_scan || n_iterations > 0) {
+ /* If no block was in the free list, search from the
+ end of the LRU list and try to free a block there.
+ If we are doing this for the first time, we'll scan
+ only the tail of the LRU list; otherwise we scan the
+ whole LRU list. */
+ freed = buf_LRU_scan_and_free_block(buf_pool,
+ n_iterations > 0);
+
+ if (!freed && n_iterations == 0) {
+ /* Tell other threads that there is no point
+ in scanning the LRU list. This flag is set to
+ TRUE again when we flush a batch from this
+ buffer pool. */
+ buf_pool->try_LRU_scan = FALSE;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ if (freed) {
+ goto loop;
+
+ }
+
+ if (n_iterations > 20) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: difficult to find free blocks in\n"
+ "InnoDB: the buffer pool (%lu search iterations)!\n"
+ "InnoDB: %lu failed attempts to flush a page!"
+ " Consider\n"
+ "InnoDB: increasing the buffer pool size.\n"
+ "InnoDB: It is also possible that"
+ " in your Unix version\n"
+ "InnoDB: fsync is very slow, or"
+ " completely frozen inside\n"
+ "InnoDB: the OS kernel. Then upgrading to"
+ " a newer version\n"
+ "InnoDB: of your operating system may help."
+ " Look at the\n"
+ "InnoDB: number of fsyncs in diagnostic info below.\n"
+ "InnoDB: Pending flushes (fsync) log: %lu;"
+ " buffer pool: %lu\n"
+ "InnoDB: %lu OS file reads, %lu OS file writes,"
+ " %lu OS fsyncs\n"
+ "InnoDB: Starting InnoDB Monitor to print further\n"
+ "InnoDB: diagnostics to the standard output.\n",
+ (ulong) n_iterations,
+ (ulong) flush_failures,
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes,
+ (ulong) os_n_file_reads, (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
+
+ mon_value_was = srv_print_innodb_monitor;
+ started_monitor = TRUE;
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+ }
+
+ /* If we have scanned the whole LRU and still are unable to
+ find a free block then we should sleep here to let the
+ page_cleaner do an LRU batch for us.
+ TODO: It'd be better if we could signal the page_cleaner. Perhaps
+ we should use a timed wait for the page_cleaner. */
+ if (n_iterations > 1) {
+
+ os_thread_sleep(100000);
+ }
+
+ /* No free block was found: try to flush the LRU list.
+ This call will flush one page from the LRU and put it on the
+ free list. That means that the free block is up for grabs for
+ all user threads.
+ TODO: A more elegant way would have been to return the freed-up
+ block to the caller here, but the code that deals with
+ removing the block from page_hash and LRU_list is fairly
+ involved (particularly in the case of compressed pages). We
+ can do that in a separate patch sometime in the future. */
+ if (!buf_flush_single_page_from_LRU(buf_pool)) {
+ MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
+ ++flush_failures;
+ }
+
+ srv_stats.buf_pool_wait_free.add(n_iterations, 1);
+
+ n_iterations++;
+
+ goto loop;
+}
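+
+/* A condensed sketch of the escalation ladder documented above
+buf_LRU_get_free_block(); the helpers are hypothetical stand-ins for
+the InnoDB calls. Free list first, then an LRU scan that widens after
+the first failed iteration, then a single-page flush, with a sleep
+once the whole LRU has been scanned in vain. */
+extern buf_block_t* sketch_free_list_pop(void);
+extern bool sketch_lru_scan_and_free(bool scan_all);
+extern bool sketch_single_page_flush(void);
+
+static buf_block_t*
+sketch_get_free_block(void)
+{
+ ulint n_iterations = 0;
+
+ for (;;) {
+  buf_block_t* block = sketch_free_list_pop();
+
+  if (block != NULL) {
+   return(block); /* blocks only ever come from the free list */
+  }
+
+  /* Iteration 0 scans only the LRU tail; later iterations
+  scan the whole list. */
+  if (sketch_lru_scan_and_free(n_iterations > 0)) {
+   continue; /* a freed block landed on the free list */
+  }
+
+  if (n_iterations > 1) {
+   os_thread_sleep(100000); /* let the page_cleaner catch up */
+  }
+
+  sketch_single_page_flush(); /* frees one page from the LRU tail */
+  n_iterations++;
+ }
+}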
+
+/*******************************************************************//**
+Moves the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits. */
+UNIV_INLINE
+void
+buf_LRU_old_adjust_len(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool instance */
+{
+ ulint old_len;
+ ulint new_len;
+
+ ut_a(buf_pool->LRU_old);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+ ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)
+# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)"
+#endif
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ old_len = buf_pool->LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+ * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool->LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ for (;;) {
+ buf_page_t* LRU_old = buf_pool->LRU_old;
+
+ ut_a(LRU_old);
+ ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Update the LRU_old pointer if necessary */
+
+ if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
+
+ buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV(
+ LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+ old_len = ++buf_pool->LRU_old_len;
+ buf_page_set_old(LRU_old, TRUE);
+
+ } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
+
+ buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+ old_len = --buf_pool->LRU_old_len;
+ buf_page_set_old(LRU_old, FALSE);
+ } else {
+ return;
+ }
+ }
+}
+
+/*******************************************************************//**
+Initializes the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
+static
+void
+buf_LRU_old_init(
+/*=============*/
+ buf_pool_t* buf_pool)
+{
+ buf_page_t* bpage;
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
+
+ /* We first initialize all blocks in the LRU list as old and then use
+ the adjust function to move the LRU_old pointer to the right
+ position */
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(buf_page_in_file(bpage));
+ /* This loop temporarily violates the
+ assertions of buf_page_set_old(). */
+ bpage->old = TRUE;
+ }
+
+ buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU);
+ buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ buf_LRU_old_adjust_len(buf_pool);
+}
+
+/******************************************************************//**
+Remove a block from the unzip_LRU list if it belonged to the list. */
+static
+void
+buf_unzip_LRU_remove_block_if_needed(
+/*=================================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_block_t* block = (buf_block_t*) bpage;
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = FALSE);
+
+ UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Removes a block from the LRU list. */
+UNIV_INLINE
+void
+buf_LRU_remove_block(
+/*=================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ulint zip_size;
+
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_a(buf_page_in_file(bpage));
+
+ ut_ad(bpage->in_LRU_list);
+
+ /* If the LRU_old pointer is defined and points to just this block,
+ move it backward one step */
+
+ if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) {
+
+ /* Below: the previous block is guaranteed to exist,
+ because the LRU_old pointer is only allowed to differ
+ by BUF_LRU_OLD_TOLERANCE from strict
+ buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+ list length. */
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+ buf_pool->LRU_old = prev_bpage;
+ buf_page_set_old(prev_bpage, TRUE);
+
+ buf_pool->LRU_old_len++;
+ }
+
+ /* Remove the block from the LRU list */
+ UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+ ut_d(bpage->in_LRU_list = FALSE);
+
+ zip_size = page_zip_get_size(&bpage->zip);
+ buf_pool->stat.LRU_bytes -= zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+ buf_unzip_LRU_remove_block_if_needed(bpage);
+
+ /* If the LRU list is so short that LRU_old is not defined,
+ clear the "old" flags and return */
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU); bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ /* This loop temporarily violates the
+ assertions of buf_page_set_old(). */
+ bpage->old = FALSE;
+ }
+
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+
+ return;
+ }
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Update the LRU_old_len field if necessary */
+ if (buf_page_is_old(bpage)) {
+
+ buf_pool->LRU_old_len--;
+ }
+
+ /* Adjust the length of the old block list if necessary */
+ buf_LRU_old_adjust_len(buf_pool);
+}
+
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+
+ ut_ad(buf_pool);
+ ut_ad(block);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_d(block->in_unzip_LRU_list = TRUE);
+
+ if (old) {
+ UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block);
+ } else {
+ UT_LIST_ADD_FIRST(unzip_LRU, buf_pool->unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the end of the LRU list. Please make sure that the
+zip_size is already set in the page zip when invoking this function,
+so that we can get the correct zip_size from the buffer page when
+adding the block to the LRU. */
+UNIV_INLINE
+void
+buf_LRU_add_block_to_end_low(
+/*=========================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_a(buf_page_in_file(bpage));
+
+ ut_ad(!bpage->in_LRU_list);
+ UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage);
+ ut_d(bpage->in_LRU_list = TRUE);
+
+ incr_LRU_size_in_bytes(bpage, buf_pool);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ buf_page_set_old(bpage, TRUE);
+ buf_pool->LRU_old_len++;
+ buf_LRU_old_adjust_len(buf_pool);
+
+ } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init(buf_pool);
+ } else {
+ buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, TRUE);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the zip_size is
+already set in the page zip when invoking this function, so that we
+can get the correct zip_size from the buffer page when adding the
+block to the LRU. */
+UNIV_INLINE
+void
+buf_LRU_add_block_low(
+/*==================*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the old blocks
+ in the LRU list, else put to the start; if the
+ LRU list is very short, the block is added to
+ the start, regardless of this parameter */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(!bpage->in_LRU_list);
+
+ if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+ UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
+
+ bpage->freed_page_clock = buf_pool->freed_page_clock;
+ } else {
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old,
+ bpage);
+ buf_pool->LRU_old_len++;
+ }
+
+ ut_d(bpage->in_LRU_list = TRUE);
+
+ incr_LRU_size_in_bytes(bpage, buf_pool);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ buf_page_set_old(bpage, old);
+ buf_LRU_old_adjust_len(buf_pool);
+
+ } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init(buf_pool);
+ } else {
+ buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the zip_size is
+already set in the page zip when invoking this function, so that we
+can get the correct zip_size from the buffer page when adding the
+block to the LRU. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the start;
+ if the LRU list is very short, the block is
+ added to the start, regardless of this
+ parameter */
+{
+ buf_LRU_add_block_low(bpage, old);
+}
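+
+/* Illustrative call site (hypothetical, grounded in the parameter
+comment above): a page just read from disk enters the LRU with
+old == TRUE, i.e. in the "old" sublist at the midpoint, so that a
+one-off table scan cannot evict the hot young sublist. */
+static void
+sketch_add_freshly_read_page(buf_page_t* bpage)
+{
+ buf_LRU_add_block(bpage, TRUE); /* start in the old sublist */
+}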
+
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ if (bpage->old) {
+ buf_pool->stat.n_pages_made_young++;
+ }
+
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_low(bpage, FALSE);
+}
+
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_to_end_low(bpage);
+}
+
+/******************************************************************//**
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns true, it will temporarily
+release buf_pool->mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool->mutex and must not hold any
+buf_page_get_mutex() when calling this function.
+@return true if freed, false otherwise. */
+UNIV_INTERN
+bool
+buf_LRU_free_page(
+/*===============*/
+ buf_page_t* bpage, /*!< in: block to be freed */
+ bool zip) /*!< in: true if should remove also the
+ compressed page of an uncompressed page */
+{
+ buf_page_t* b = NULL;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ rw_lock_x_lock(hash_lock);
+ mutex_enter(block_mutex);
+
+ if (!buf_page_can_relocate(bpage)) {
+
+ /* Do not free buffer fixed or I/O-fixed blocks. */
+ goto func_exit;
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+ if (zip || !bpage->zip.data) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+
+ if (bpage->oldest_modification) {
+ goto func_exit;
+ }
+ } else if (bpage->oldest_modification > 0
+ && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+
+func_exit:
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(block_mutex);
+ return(false);
+
+ } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ b = buf_page_alloc_descriptor();
+ ut_a(b);
+ memcpy(b, bpage, sizeof *b);
+ }
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Putting space %lu page %lu to free list\n",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(buf_page_can_relocate(bpage));
+
+ if (!buf_LRU_block_remove_hashed(bpage, zip)) {
+ return(true);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ /* buf_LRU_block_remove_hashed() releases the hash_lock */
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+ && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
+ then it was a compressed page with an uncompressed frame and
+ we are interested in freeing only the uncompressed frame.
+ Therefore we have to reinsert the compressed page descriptor
+ into the LRU and page_hash (and possibly the flush_list).
+ If b == NULL then it was a regular page that has been freed. */
+
+ if (b) {
+ buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+
+ rw_lock_x_lock(hash_lock);
+ mutex_enter(block_mutex);
+
+ ut_a(!buf_page_hash_get_low(
+ buf_pool, b->space, b->offset, fold));
+
+ b->state = b->oldest_modification
+ ? BUF_BLOCK_ZIP_DIRTY
+ : BUF_BLOCK_ZIP_PAGE;
+ UNIV_MEM_DESC(b->zip.data,
+ page_zip_get_size(&b->zip));
+
+ /* The fields in_page_hash and in_LRU_list of
+ the to-be-freed block descriptor should have
+ been cleared in
+ buf_LRU_block_remove_hashed(), which
+ invokes buf_LRU_remove_block(). */
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->in_LRU_list);
+ /* bpage->state was BUF_BLOCK_FILE_PAGE because
+ b != NULL. The type cast below is thus valid. */
+ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+ /* The fields of bpage were copied to b before
+ buf_LRU_block_remove_hashed() was invoked. */
+ ut_ad(!b->in_zip_hash);
+ ut_ad(b->in_page_hash);
+ ut_ad(b->in_LRU_list);
+
+ HASH_INSERT(buf_page_t, hash,
+ buf_pool->page_hash, fold, b);
+
+ /* Insert b where bpage was in the LRU list. */
+ if (UNIV_LIKELY(prev_b != NULL)) {
+ ulint lru_len;
+
+ ut_ad(prev_b->in_LRU_list);
+ ut_ad(buf_page_in_file(prev_b));
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+ prev_b, b);
+
+ incr_LRU_size_in_bytes(b, buf_pool);
+
+ if (buf_page_is_old(b)) {
+ buf_pool->LRU_old_len++;
+ if (UNIV_UNLIKELY
+ (buf_pool->LRU_old
+ == UT_LIST_GET_NEXT(LRU, b))) {
+
+ buf_pool->LRU_old = b;
+ }
+ }
+
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+ ut_ad(buf_pool->LRU_old);
+ /* Adjust the length of the
+ old block list if necessary */
+ buf_LRU_old_adjust_len(buf_pool);
+ } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is now long
+ enough for LRU_old to become
+ defined: init it */
+ buf_LRU_old_init(buf_pool);
+ }
+#ifdef UNIV_LRU_DEBUG
+ /* Check that the "old" flag is consistent
+ in the block and its neighbours. */
+ buf_page_set_old(b, buf_page_is_old(b));
+#endif /* UNIV_LRU_DEBUG */
+ } else {
+ ut_d(b->in_LRU_list = FALSE);
+ buf_LRU_add_block_low(b, buf_page_is_old(b));
+ }
+
+ if (b->state == BUF_BLOCK_ZIP_PAGE) {
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ buf_LRU_insert_zip_clean(b);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ } else {
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
+ }
+
+ bpage->zip.data = NULL;
+ page_zip_set_size(&bpage->zip, 0);
+ mutex_exit(block_mutex);
+
+ /* Prevent buf_page_get_gen() from
+ decompressing the block while we release
+ buf_pool->mutex and block_mutex. */
+ block_mutex = buf_page_get_mutex(b);
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(b);
+ mutex_exit(block_mutex);
+
+ rw_lock_x_unlock(hash_lock);
+
+ } else {
+
+ /* There can be multiple threads doing an LRU scan to
+ free a block. The page_cleaner thread can be doing an
+ LRU batch whereas user threads can potentially be doing
+ multiple single page flushes. As we release
+ buf_pool->mutex below we need to make sure that no one
+ else considers this block as a victim for page
+ replacement. This block is already out of page_hash
+ and we are about to remove it from the LRU list and put
+ it on the free list. */
+ mutex_enter(block_mutex);
+ buf_page_set_sticky(bpage);
+ mutex_exit(block_mutex);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ /* Remove a possible adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed(). We need to flag
+ the contents of the page valid (which they still are) in
+ order to avoid bogus Valgrind warnings. */
+
+ UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ btr_search_drop_page_hash_index((buf_block_t*) bpage);
+ UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ if (b) {
+ ib_uint32_t checksum;
+ /* Compute and stamp the compressed page
+ checksum while not holding any mutex. The
+ block is already half-freed
+ (BUF_BLOCK_REMOVE_HASH) and removed from
+ buf_pool->page_hash, thus inaccessible by any
+ other thread. */
+
+ checksum = static_cast<ib_uint32_t>(
+ page_zip_calc_checksum(
+ b->zip.data,
+ page_zip_get_size(&b->zip),
+ static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm)));
+
+ mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+ checksum);
+ }
+
+ buf_pool_mutex_enter(buf_pool);
+
+ mutex_enter(block_mutex);
+ buf_page_unset_sticky(b != NULL ? b : bpage);
+ mutex_exit(block_mutex);
+
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+ return(true);
+}
+
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block) /*!< in: block, must not contain a file page */
+{
+ void* data;
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+
+ ut_ad(block);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(&block->mutex));
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_READY_FOR_USE:
+ break;
+ default:
+ ut_error;
+ }
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->n_pointers == 0);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ut_ad(!block->page.in_free_list);
+ ut_ad(!block->page.in_flush_list);
+ ut_ad(!block->page.in_LRU_list);
+
+ buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+
+ UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+#ifdef UNIV_DEBUG
+ /* Wipe contents of page to reveal possible stale pointers to it */
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#else
+ /* Wipe page_no and space_id */
+ memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+ memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4);
+#endif
+ data = block->page.zip.data;
+
+ if (data) {
+ block->page.zip.data = NULL;
+ mutex_exit(&block->mutex);
+ buf_pool_mutex_exit_forbid(buf_pool);
+
+ buf_buddy_free(
+ buf_pool, data, page_zip_get_size(&block->page.zip));
+
+ buf_pool_mutex_exit_allow(buf_pool);
+ mutex_enter(&block->mutex);
+ page_zip_set_size(&block->page.zip, 0);
+ }
+
+ UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page));
+ ut_d(block->page.in_free_list = TRUE);
+
+ UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE);
+}
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed.
+
+The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
+and the appropriate hash_lock. This function will release the
+buf_page_get_mutex() and the hash_lock.
+
+If a compressed page is freed other compressed pages may be relocated.
+@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static
+bool
+buf_LRU_block_remove_hashed(
+/*========================*/
+ buf_page_t* bpage, /*!< in: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+ bool zip) /*!< in: true if should remove also the
+ compressed page of an uncompressed page */
+{
+ ulint fold;
+ const buf_page_t* hashed_bpage;
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ rw_lock_t* hash_lock;
+
+ ut_ad(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+ hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ ut_a(bpage->buf_fix_count == 0);
+
+ buf_LRU_remove_block(bpage);
+
+ buf_pool->freed_page_clock += 1;
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+ UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t));
+ UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ buf_block_modify_clock_inc((buf_block_t*) bpage);
+ if (bpage->zip.data) {
+ const page_t* page = ((buf_block_t*) bpage)->frame;
+ const ulint zip_size
+ = page_zip_get_size(&bpage->zip);
+
+ ut_a(!zip || bpage->oldest_modification == 0);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page),
+ FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ if (!zip) {
+ /* InnoDB writes the data to the
+ uncompressed page frame. Copy it
+ to the compressed page, which will
+ be preserved. */
+ memcpy(bpage->zip.data, page,
+ zip_size);
+ }
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ case FIL_PAGE_INDEX:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(
+ &bpage->zip, page,
+ ((buf_block_t*) bpage)->index));
+#endif /* UNIV_ZIP_DEBUG */
+ break;
+ default:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page"
+ " to be evicted seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version"
+ " of the page:", stderr);
+ ut_print_buf(stderr, bpage->zip.data,
+ zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ break;
+ }
+ /* fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(bpage->oldest_modification == 0);
+ UNIV_MEM_ASSERT_W(bpage->zip.data,
+ page_zip_get_size(&bpage->zip));
+ break;
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+ hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->space,
+ bpage->offset, fold);
+
+ if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
+ fprintf(stderr,
+ "InnoDB: Error: page %lu %lu not found"
+ " in the hash table\n",
+ (ulong) bpage->space,
+ (ulong) bpage->offset);
+ if (hashed_bpage) {
+ fprintf(stderr,
+ "InnoDB: In hash table we find block"
+ " %p of %lu %lu which is not %p\n",
+ (const void*) hashed_bpage,
+ (ulong) hashed_bpage->space,
+ (ulong) hashed_bpage->offset,
+ (const void*) bpage);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ mutex_exit(buf_page_get_mutex(bpage));
+ rw_lock_x_unlock(hash_lock);
+ buf_pool_mutex_exit(buf_pool);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_error;
+ }
+
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(bpage->in_page_hash = FALSE);
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_flush_list);
+ ut_ad(!bpage->in_LRU_list);
+ ut_a(bpage->zip.data);
+ ut_a(buf_page_get_zip_size(bpage));
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ mutex_exit(&buf_pool->zip_mutex);
+ rw_lock_x_unlock(hash_lock);
+ buf_pool_mutex_exit_forbid(buf_pool);
+
+ buf_buddy_free(
+ buf_pool, bpage->zip.data,
+ page_zip_get_size(&bpage->zip));
+
+ buf_pool_mutex_exit_allow(buf_pool);
+ buf_page_free_descriptor(bpage);
+ return(false);
+
+ case BUF_BLOCK_FILE_PAGE:
+ memset(((buf_block_t*) bpage)->frame
+ + FIL_PAGE_OFFSET, 0xff, 4);
+ memset(((buf_block_t*) bpage)->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
+
+ if (buf_pool->flush_rbt == NULL) {
+ bpage->space = ULINT32_UNDEFINED;
+ bpage->offset = ULINT32_UNDEFINED;
+ }
+
+ /* Question: if we release the bpage and hash mutexes
+ here, then what protects us against:
+ 1) some other thread buffer-fixing this page;
+ 2) some other thread trying to read this page, not
+ finding it in the buffer pool, and attempting to read
+ it from disk?
+ Answer:
+ 1) This cannot happen because the page is no longer in
+ the page_hash. The only possibility is that, while
+ invalidating a tablespace, we buffer-fix the prev_page
+ in the LRU to avoid relocation during the scan. But that
+ is not possible because we are holding the buf_pool
+ mutex.
+
+ 2) This is not possible because in buf_page_init_for_read()
+ we look up the page_hash while holding the buf_pool
+ mutex; since we are holding the buf_pool mutex here, by
+ the time we release it in the caller we will have
+ inserted the compressed-only descriptor into the
+ page_hash. */
+ rw_lock_x_unlock(hash_lock);
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
+
+ if (zip && bpage->zip.data) {
+ /* Free the compressed page. */
+ void* data = bpage->zip.data;
+ bpage->zip.data = NULL;
+
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_flush_list);
+ ut_ad(!bpage->in_LRU_list);
+ buf_pool_mutex_exit_forbid(buf_pool);
+
+ buf_buddy_free(
+ buf_pool, data,
+ page_zip_get_size(&bpage->zip));
+
+ buf_pool_mutex_exit_allow(buf_pool);
+ page_zip_set_size(&bpage->zip, 0);
+ }
+
+ return(true);
+
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+ return(false);
+}
+
+/******************************************************************//**
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+ buf_block_t* block) /*!< in: block, must contain a file page and
+ be in a state where it can be freed */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+
+ mutex_enter(&block->mutex);
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ buf_LRU_block_free_non_file_page(block);
+ mutex_exit(&block->mutex);
+}
+
+/******************************************************************//**
+Remove one page from LRU list and put it to free list */
+UNIV_INTERN
+void
+buf_LRU_free_one_page(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const ulint fold = buf_page_address_fold(bpage->space,
+ bpage->offset);
+ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, fold);
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ ut_ad(buf_pool_mutex_own(buf_pool));
+
+ rw_lock_x_lock(hash_lock);
+ mutex_enter(block_mutex);
+
+ if (buf_LRU_block_remove_hashed(bpage, true)) {
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
+ }
+
+ /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+ && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!mutex_own(block_mutex));
+}
+
+/**********************************************************************//**
+Updates buf_pool->LRU_old_ratio for one buffer pool instance.
+@return updated old_pct */
+static
+uint
+buf_LRU_old_ratio_update_instance(
+/*==============================*/
+ buf_pool_t* buf_pool,/*!< in: buffer pool instance */
+ uint old_pct,/*!< in: Reserve this percentage of
+ the buffer pool for "old" blocks. */
+ ibool adjust) /*!< in: TRUE=adjust the LRU list;
+ FALSE=just assign buf_pool->LRU_old_ratio
+ during the initialization of InnoDB */
+{
+ uint ratio;
+
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+ if (ratio < BUF_LRU_OLD_RATIO_MIN) {
+ ratio = BUF_LRU_OLD_RATIO_MIN;
+ } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
+ ratio = BUF_LRU_OLD_RATIO_MAX;
+ }
+
+ if (adjust) {
+ buf_pool_mutex_enter(buf_pool);
+
+ if (ratio != buf_pool->LRU_old_ratio) {
+ buf_pool->LRU_old_ratio = ratio;
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU)
+ >= BUF_LRU_OLD_MIN_LEN) {
+
+ buf_LRU_old_adjust_len(buf_pool);
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ } else {
+ buf_pool->LRU_old_ratio = ratio;
+ }
+ /* the reverse of
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
+ return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
+}
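+
+/* Worked example of the fixed-point round trip above, assuming
+BUF_LRU_OLD_RATIO_DIV == 1024 and the default innodb_old_blocks_pct
+of 37: the percentage survives the conversion to 1/1024 units. */
+static uint
+sketch_ratio_round_trip(uint old_pct) /* e.g. 37 */
+{
+ uint ratio = old_pct * 1024 / 100; /* 37 -> 378, truncated */
+
+ return((uint) (ratio * 100 / 1024.0 + 0.5)); /* 378 -> 37.4 -> 37 */
+}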
+
+/**********************************************************************//**
+Updates buf_pool->LRU_old_ratio.
+@return updated old_pct */
+UNIV_INTERN
+ulint
+buf_LRU_old_ratio_update(
+/*=====================*/
+ uint old_pct,/*!< in: Reserve this percentage of
+ the buffer pool for "old" blocks. */
+ ibool adjust) /*!< in: TRUE=adjust the LRU list;
+ FALSE=just assign buf_pool->LRU_old_ratio
+ during the initialization of InnoDB */
+{
+ ulint i;
+ ulint new_ratio = 0;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ new_ratio = buf_LRU_old_ratio_update_instance(
+ buf_pool, old_pct, adjust);
+ }
+
+ return(new_ratio);
+}
+
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void)
+/*=====================*/
+{
+ ulint i;
+ buf_LRU_stat_t* item;
+ buf_pool_t* buf_pool;
+ ibool evict_started = FALSE;
+ buf_LRU_stat_t cur_stat;
+
+ /* If we haven't started eviction yet then don't update stats. */
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+
+ buf_pool = buf_pool_from_array(i);
+
+ if (buf_pool->freed_page_clock != 0) {
+ evict_started = TRUE;
+ break;
+ }
+ }
+
+ if (!evict_started) {
+ goto func_exit;
+ }
+
+ /* Update the index. */
+ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+ buf_LRU_stat_arr_ind++;
+ buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+ /* Add the current value and subtract the obsolete entry.
+ Since buf_LRU_stat_cur is not protected by any mutex,
+ it can change between being added to buf_LRU_stat_sum
+ and being copied to item. Assign it to a local variable
+ to make sure the same value is used for both
+ buf_LRU_stat_sum and item. */
+ cur_stat = buf_LRU_stat_cur;
+
+ buf_LRU_stat_sum.io += cur_stat.io - item->io;
+ buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip;
+
+ /* Put current entry in the array. */
+ memcpy(item, &cur_stat, sizeof *item);
+
+func_exit:
+ /* Clear the current entry. */
+ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
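+
+/* A minimal sketch (illustrative names and interval count) of the
+rolling-sum technique above: a ring buffer of N interval samples plus
+a running sum, updated by adding the newest sample and subtracting
+the entry it overwrites. */
+#define SKETCH_N_INTERVAL 50
+
+static buf_LRU_stat_t sketch_arr[SKETCH_N_INTERVAL];
+static ulint sketch_ind = 0;
+static buf_LRU_stat_t sketch_sum;
+
+static void
+sketch_stat_update(const buf_LRU_stat_t* cur)
+{
+ buf_LRU_stat_t* item = &sketch_arr[sketch_ind];
+
+ sketch_ind = (sketch_ind + 1) % SKETCH_N_INTERVAL;
+
+ /* The sum over the last N intervals changes by exactly the
+ difference between the newest and the evicted sample. */
+ sketch_sum.io += cur->io - item->io;
+ sketch_sum.unzip += cur->unzip - item->unzip;
+
+ *item = *cur; /* overwrite the oldest entry */
+}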
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list for one buffer pool instance. */
+static
+void
+buf_LRU_validate_instance(
+/*======================*/
+ buf_pool_t* buf_pool)
+{
+ buf_page_t* bpage;
+ buf_block_t* block;
+ ulint old_len;
+ ulint new_len;
+
+ ut_ad(buf_pool);
+ buf_pool_mutex_enter(buf_pool);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+ ut_a(buf_pool->LRU_old);
+ old_len = buf_pool->LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+ * buf_pool->LRU_old_ratio
+ / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool->LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+ ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+ ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+ }
+
+ UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU, CheckInLRUList());
+
+ old_len = 0;
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(bpage));
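+			/* fall through */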
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ break;
+ }
+
+ if (buf_page_is_old(bpage)) {
+ const buf_page_t* prev
+ = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next
+ = UT_LIST_GET_NEXT(LRU, bpage);
+
+ if (!old_len++) {
+ ut_a(buf_pool->LRU_old == bpage);
+ } else {
+ ut_a(!prev || buf_page_is_old(prev));
+ }
+
+ ut_a(!next || buf_page_is_old(next));
+ }
+ }
+
+ ut_a(buf_pool->LRU_old_len == old_len);
+
+ UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free, CheckInFreeList());
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(list, bpage)) {
+
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
+ }
+
+ UT_LIST_VALIDATE(
+ unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+ CheckUnzipLRUAndLRUList());
+
+ for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
+ block;
+ block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+ ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/**********************************************************************//**
+Validates the LRU list.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void)
+/*==================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ buf_LRU_validate_instance(buf_pool);
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list for one buffer pool instance. */
+UNIV_INTERN
+void
+buf_LRU_print_instance(
+/*===================*/
+ buf_pool_t* buf_pool)
+{
+ const buf_page_t* bpage;
+
+ ut_ad(buf_pool);
+ buf_pool_mutex_enter(buf_pool);
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+
+ mutex_enter(buf_page_get_mutex(bpage));
+ fprintf(stderr, "BLOCK space %lu page %lu ",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+
+ if (buf_page_is_old(bpage)) {
+ fputs("old ", stderr);
+ }
+
+ if (bpage->buf_fix_count) {
+ fprintf(stderr, "buffix count %lu ",
+ (ulong) bpage->buf_fix_count);
+ }
+
+ if (buf_page_get_io_fix(bpage)) {
+ fprintf(stderr, "io_fix %lu ",
+ (ulong) buf_page_get_io_fix(bpage));
+ }
+
+ if (bpage->oldest_modification) {
+ fputs("modif. ", stderr);
+ }
+
+ switch (buf_page_get_state(bpage)) {
+ const byte* frame;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = buf_block_get_frame((buf_block_t*) bpage);
+ fprintf(stderr, "\ntype %lu"
+ " index id %llu\n",
+ (ulong) fil_page_get_type(frame),
+ (ullint) btr_page_get_index_id(frame));
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ frame = bpage->zip.data;
+ fprintf(stderr, "\ntype %lu size %lu"
+ " index id %llu\n",
+ (ulong) fil_page_get_type(frame),
+ (ulong) buf_page_get_zip_size(bpage),
+ (ullint) btr_page_get_index_id(frame));
+ break;
+
+ default:
+ fprintf(stderr, "\n!state %lu!\n",
+ (ulong) buf_page_get_state(bpage));
+ break;
+ }
+
+ mutex_exit(buf_page_get_mutex(bpage));
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void)
+/*===============*/
+{
+ ulint i;
+ buf_pool_t* buf_pool;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool = buf_pool_from_array(i);
+ buf_LRU_print_instance(buf_pool);
+ }
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
new file mode 100644
index 00000000000..7c8369c0c09
--- /dev/null
+++ b/storage/innobase/buf/buf0rea.cc
@@ -0,0 +1,921 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0rea.h"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/** There must be at least this many pages in buf_pool in the area to start
+a random read-ahead */
+#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b) \
+ (5 + BUF_READ_AHEAD_AREA(b) / 8)
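+/* For example, if BUF_READ_AHEAD_AREA(b) evaluates to 64 pages, random
+read-ahead is triggered once 5 + 64 / 8 == 13 pages of the area have been
+accessed recently (see buf_read_ahead_random below). */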
+
+/** If the number of pending reads exceeds buf_pool->curr_size divided by
+the constant below, read-ahead is not done: this is to prevent flooding
+the buffer pool with i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
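+/* Illustrative arithmetic: with buf_pool->curr_size == 8192 pages and
+BUF_READ_AHEAD_PEND_LIMIT == 2, read-ahead is skipped whenever more than
+8192 / 2 == 4096 reads are already pending. */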
+
+/********************************************************************//**
+Unfixes the page, unlatches it,
+removes it from page_hash and removes it from the LRU list. */
+static
+void
+buf_read_page_handle_error(
+/*=======================*/
+ buf_page_t* bpage) /*!< in: pointer to the block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ const bool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+
+ /* First unfix and release lock on the bpage */
+ buf_pool_mutex_enter(buf_pool);
+ mutex_enter(buf_page_get_mutex(bpage));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
+ ut_ad(bpage->buf_fix_count == 0);
+
+ /* Set BUF_IO_NONE before we remove the block from LRU list */
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(
+ &((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ mutex_exit(buf_page_get_mutex(bpage));
+
+ /* remove the block from LRU list */
+ buf_LRU_free_one_page(bpage);
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+
+ buf_pool_mutex_exit(buf_pool);
+}
+
+/********************************************************************//**
+Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case it does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+@return 1 if a read request was queued, 0 if the page already resided
+in buf_pool, or if the page is in the doublewrite buffer blocks in
+which case it is never read into the pool, or if the tablespace does
+not exist or is being dropped */
+static
+ulint
+buf_read_page_low(
+/*==============*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ trying to read from a non-existent tablespace, or a
+ tablespace which is just now being dropped */
+ bool sync, /*!< in: true if synchronous aio is desired */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
+ ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+ at read-ahead functions) */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version, /*!< in: if the space memory object has
+ this timestamp different from what we are giving here,
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ ulint wake_later;
+ ibool ignore_nonexistent_pages;
+
+ *err = DB_SUCCESS;
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+ ignore_nonexistent_pages = mode & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+ mode &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
+ if (space == TRX_SYS_SPACE && buf_dblwr_page_inside(offset)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: trying to read"
+ " doublewrite buffer page %lu\n",
+ (ulong) offset);
+
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* Trx sys header is so low in the latching order that we play
+ safe and do not leave the i/o-completion to an asynchronous
+ i/o-thread. Ibuf bitmap pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
+ thread deadlocks. */
+
+ sync = true;
+ }
+
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
+ bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
+ tablespace_version, offset);
+ if (bpage == NULL) {
+
+ return(0);
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Posting read request for page %lu, sync %s\n",
+ (ulong) offset, sync ? "true" : "false");
+ }
+#endif
+
+ ut_ad(buf_page_in_file(bpage));
+
+ if (sync) {
+ thd_wait_begin(NULL, THD_WAIT_DISKIO);
+ }
+
+ if (zip_size) {
+ *err = fil_io(OS_FILE_READ | wake_later
+ | ignore_nonexistent_pages,
+ sync, space, zip_size, offset, 0, zip_size,
+ bpage->zip.data, bpage);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+
+ *err = fil_io(OS_FILE_READ | wake_later
+ | ignore_nonexistent_pages,
+ sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
+ ((buf_block_t*) bpage)->frame, bpage);
+ }
+
+ if (sync) {
+ thd_wait_end(NULL);
+ }
+
+ if (*err != DB_SUCCESS) {
+ if (ignore_nonexistent_pages || *err == DB_TABLESPACE_DELETED) {
+ buf_read_page_handle_error(bpage);
+ return(0);
+ }
+ /* else */
+ ut_error;
+ }
+
+ if (sync) {
+ /* The i/o is already completed when we arrive from
+ fil_read */
+ if (!buf_page_io_complete(bpage)) {
+ return(0);
+ }
+ }
+
+ return(1);
+}
+
+/********************************************************************//**
+Applies a random read-ahead in buf_pool if at least a threshold number of
+pages in the random read-ahead area have been accessed. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+UNIV_INTERN
+ulint
+buf_read_ahead_random(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes,
+ or 0 */
+ ulint offset, /*!< in: page number of a page which
+ the current thread wants to access */
+ ibool inside_ibuf) /*!< in: TRUE if we are inside ibuf
+ routine */
+{
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ib_int64_t tablespace_version;
+ ulint recent_blocks = 0;
+ ulint ibuf_mode;
+ ulint count;
+ ulint low, high;
+ dberr_t err;
+ ulint i;
+ const ulint buf_read_ahead_random_area
+ = BUF_READ_AHEAD_AREA(buf_pool);
+
+ if (!srv_random_read_ahead) {
+ /* Disabled by user */
+ return(0);
+ }
+
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* If it is an ibuf bitmap page or trx sys hdr, we do
+ no read-ahead, as that could break the ibuf page access
+ order */
+
+ return(0);
+ }
+
+	/* Remember the tablespace version before we ask for the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
+ low = (offset / buf_read_ahead_random_area)
+ * buf_read_ahead_random_area;
+ high = (offset / buf_read_ahead_random_area + 1)
+ * buf_read_ahead_random_area;
+ if (high > fil_space_get_size(space)) {
+
+ high = fil_space_get_size(space);
+ }
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ buf_pool_mutex_exit(buf_pool);
+
+ return(0);
+ }
+
+ /* Count how many blocks in the area have been recently accessed,
+ that is, reside near the start of the LRU list. */
+
+ for (i = low; i < high; i++) {
+ const buf_page_t* bpage =
+ buf_page_hash_get(buf_pool, space, i);
+
+ if (bpage
+ && buf_page_is_accessed(bpage)
+ && buf_page_peek_if_young(bpage)) {
+
+ recent_blocks++;
+
+ if (recent_blocks
+ >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {
+
+ buf_pool_mutex_exit(buf_pool);
+ goto read_ahead;
+ }
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ /* Do nothing */
+ return(0);
+
+read_ahead:
+ /* Read all the suitable blocks within the area */
+
+ if (inside_ibuf) {
+ ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+ } else {
+ ibuf_mode = BUF_READ_ANY_PAGE;
+ }
+
+ count = 0;
+
+ for (i = low; i < high; i++) {
+		/* It is only sensible to do read-ahead in the non-sync aio
+		mode: hence false for the sync parameter */
+
+ if (!ibuf_bitmap_page(zip_size, i)) {
+ count += buf_read_page_low(
+ &err, false,
+ ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+ space, zip_size, FALSE,
+ tablespace_version, i);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: in random"
+ " readahead trying to access\n"
+ "InnoDB: tablespace %lu page %lu,\n"
+ "InnoDB: but the tablespace does not"
+ " exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
+ }
+ }
+
+ /* In simulated aio we wake the aio handler threads only after
+ queuing all aio requests, in native aio the following call does
+ nothing: */
+
+ os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && (count > 0)) {
+ fprintf(stderr,
+ "Random read-ahead space %lu offset %lu pages %lu\n",
+ (ulong) space, (ulong) offset,
+ (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool->stat.n_ra_pages_read_rnd += count;
+ srv_stats.buf_pool_reads.add(count);
+ return(count);
+}
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset) /*!< in: page number */
+{
+ ib_int64_t tablespace_version;
+ ulint count;
+ dberr_t err;
+
+ tablespace_version = fil_space_get_version(space);
+
+ /* We do the i/o in the synchronous aio mode to save thread
+ switches: hence TRUE */
+
+ count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
+ zip_size, FALSE,
+ tablespace_version, offset);
+ srv_stats.buf_pool_reads.add(count);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to access"
+ " tablespace %lu page no. %lu,\n"
+ "InnoDB: but the tablespace does not exist"
+ " or is just being dropped.\n",
+ (ulong) space, (ulong) offset);
+ }
+
+ /* Increment number of I/O operations used for LRU policy. */
+ buf_LRU_stat_inc_io();
+
+ return(count > 0);
+}
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ ulint zip_size;
+ ib_int64_t tablespace_version;
+ ulint count;
+ dberr_t err;
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (zip_size == ULINT_UNDEFINED) {
+ return(FALSE);
+ }
+
+ tablespace_version = fil_space_get_version(space);
+
+ count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER
+ | BUF_READ_IGNORE_NONEXISTENT_PAGES,
+ space, zip_size, FALSE,
+ tablespace_version, offset);
+ srv_stats.buf_pool_reads.add(count);
+
+ /* We do not increment number of I/O operations used for LRU policy
+ here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
+	about evicting the uncompressed version of compressed pages from
+	the buffer pool. Since this function is called from the buffer
+	pool load, these IOs are deliberate and not part of the normal
+	workload, so we can ignore them in our heuristics. */
+
+ return(count > 0);
+}
+
+/********************************************************************//**
+Applies linear read-ahead if the page is a border page of a linear
+read-ahead area in the buf_pool and all the pages in the area have been
+accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+buffer-fixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if it is not, read-ahead is
+not applied. An uninitialized value may result in a useless read operation,
+but only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number; see NOTE 3 above */
+ ibool inside_ibuf) /*!< in: TRUE if we are inside ibuf routine */
+{
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+ ib_int64_t tablespace_version;
+ buf_page_t* bpage;
+ buf_frame_t* frame;
+ buf_page_t* pred_bpage = NULL;
+ ulint pred_offset;
+ ulint succ_offset;
+ ulint count;
+ int asc_or_desc;
+ ulint new_offset;
+ ulint fail_count;
+ ulint ibuf_mode;
+ ulint low, high;
+ dberr_t err;
+ ulint i;
+ const ulint buf_read_ahead_linear_area
+ = BUF_READ_AHEAD_AREA(buf_pool);
+ ulint threshold;
+
+ /* check if readahead is disabled */
+ if (!srv_read_ahead_threshold) {
+ return(0);
+ }
+
+ if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+ }
+
+ low = (offset / buf_read_ahead_linear_area)
+ * buf_read_ahead_linear_area;
+ high = (offset / buf_read_ahead_linear_area + 1)
+ * buf_read_ahead_linear_area;
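+	/* Worked example, assuming a 64-page read-ahead area:
+	offset == 255 gives low == 192 and high == 256, so offset is the
+	high border page (high - 1) and read-ahead may proceed, while
+	offset == 200 is not a border page and we return below. */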
+
+ if ((offset != low) && (offset != high - 1)) {
+ /* This is not a border page of the area: return */
+
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* If it is an ibuf bitmap page or trx sys hdr, we do
+ no read-ahead, as that could break the ibuf page access
+ order */
+
+ return(0);
+ }
+
+	/* Remember the tablespace version before we ask for the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ if (high > fil_space_get_size(space)) {
+ buf_pool_mutex_exit(buf_pool);
+ /* The area is not whole, return */
+
+ return(0);
+ }
+
+ if (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ buf_pool_mutex_exit(buf_pool);
+
+ return(0);
+ }
+
+ /* Check that almost all pages in the area have been accessed; if
+ offset == low, the accesses must be in a descending order, otherwise,
+ in an ascending order. */
+
+ asc_or_desc = 1;
+
+ if (offset == low) {
+ asc_or_desc = -1;
+ }
+
+ /* How many out of order accessed pages can we ignore
+ when working out the access pattern for linear readahead */
+ threshold = ut_min((64 - srv_read_ahead_threshold),
+ BUF_READ_AHEAD_AREA(buf_pool));
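+	/* For example, with the default srv_read_ahead_threshold of 56
+	(innodb_read_ahead_threshold) and a 64-page area, this gives
+	threshold == ut_min(64 - 56, 64) == 8 tolerated out-of-order
+	pages. */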
+
+ fail_count = 0;
+
+ for (i = low; i < high; i++) {
+ bpage = buf_page_hash_get(buf_pool, space, i);
+
+ if (bpage == NULL || !buf_page_is_accessed(bpage)) {
+ /* Not accessed */
+ fail_count++;
+
+ } else if (pred_bpage) {
+ /* Note that buf_page_is_accessed() returns
+ the time of the first access. If some blocks
+ of the extent existed in the buffer pool at
+ the time of a linear access pattern, the first
+ access times may be nonmonotonic, even though
+ the latest access times were linear. The
+			threshold (srv_read_ahead_threshold) should help
+ a little against this. */
+ int res = ut_ulint_cmp(
+ buf_page_is_accessed(bpage),
+ buf_page_is_accessed(pred_bpage));
+ /* Accesses not in the right order */
+ if (res != 0 && res != asc_or_desc) {
+ fail_count++;
+ }
+ }
+
+ if (fail_count > threshold) {
+ /* Too many failures: return */
+ buf_pool_mutex_exit(buf_pool);
+ return(0);
+ }
+
+ if (bpage && buf_page_is_accessed(bpage)) {
+ pred_bpage = bpage;
+ }
+ }
+
+ /* If we got this far, we know that enough pages in the area have
+ been accessed in the right order: linear read-ahead can be sensible */
+
+ bpage = buf_page_hash_get(buf_pool, space, offset);
+
+ if (bpage == NULL) {
+ buf_pool_mutex_exit(buf_pool);
+
+ return(0);
+ }
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ frame = bpage->zip.data;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = ((buf_block_t*) bpage)->frame;
+ break;
+ default:
+ ut_error;
+ break;
+ }
+
+ /* Read the natural predecessor and successor page addresses from
+ the page; NOTE that because the calling thread may have an x-latch
+ on the page, we do not acquire an s-latch on the page, this is to
+ prevent deadlocks. Even if we read values which are nonsense, the
+ algorithm will work. */
+
+ pred_offset = fil_page_get_prev(frame);
+ succ_offset = fil_page_get_next(frame);
+
+ buf_pool_mutex_exit(buf_pool);
+
+ if ((offset == low) && (succ_offset == offset + 1)) {
+
+ /* This is ok, we can continue */
+ new_offset = pred_offset;
+
+ } else if ((offset == high - 1) && (pred_offset == offset - 1)) {
+
+ /* This is ok, we can continue */
+ new_offset = succ_offset;
+ } else {
+ /* Successor or predecessor not in the right order */
+
+ return(0);
+ }
+
+ low = (new_offset / buf_read_ahead_linear_area)
+ * buf_read_ahead_linear_area;
+ high = (new_offset / buf_read_ahead_linear_area + 1)
+ * buf_read_ahead_linear_area;
+
+ if ((new_offset != low) && (new_offset != high - 1)) {
+ /* This is not a border page of the area: return */
+
+ return(0);
+ }
+
+ if (high > fil_space_get_size(space)) {
+ /* The area is not whole, return */
+
+ return(0);
+ }
+
+ /* If we got this far, read-ahead can be sensible: do it */
+
+ ibuf_mode = inside_ibuf
+ ? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
+ : BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;
+
+ count = 0;
+
+ /* Since Windows XP seems to schedule the i/o handler thread
+ very eagerly, and consequently it does not wait for the
+ full read batch to be posted, we use special heuristics here */
+
+ os_aio_simulated_put_read_threads_to_sleep();
+
+ for (i = low; i < high; i++) {
+ /* It is only sensible to do read-ahead in the non-sync
+		aio mode: hence false for the sync parameter */
+
+ if (!ibuf_bitmap_page(zip_size, i)) {
+ count += buf_read_page_low(
+ &err, false,
+ ibuf_mode,
+ space, zip_size, FALSE, tablespace_version, i);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: in"
+ " linear readahead trying to access\n"
+ "InnoDB: tablespace %lu page %lu,\n"
+ "InnoDB: but the tablespace does not"
+ " exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
+ }
+ }
+
+ /* In simulated aio we wake the aio handler threads only after
+ queuing all aio requests, in native aio the following call does
+ nothing: */
+
+ os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && (count > 0)) {
+ fprintf(stderr,
+ "LINEAR read-ahead space %lu offset %lu pages %lu\n",
+ (ulong) space, (ulong) offset, (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool->stat.n_ra_pages_read += count;
+ return(count);
+}
+
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ bool sync, /*!< in: true if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ const ulint* space_ids, /*!< in: array of space ids */
+ const ib_int64_t* space_versions,/*!< in: the spaces must have
+ this version number
+ (timestamp), otherwise we
+ discard the read; we use this
+ to cancel reads if DISCARD +
+ IMPORT may have changed the
+ tablespace size */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored) /*!< in: number of elements
+ in the arrays */
+{
+ ulint i;
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored < UNIV_PAGE_SIZE);
+#endif
+
+ for (i = 0; i < n_stored; i++) {
+ dberr_t err;
+ buf_pool_t* buf_pool;
+ ulint zip_size = fil_space_get_zip_size(space_ids[i]);
+
+ buf_pool = buf_pool_get(space_ids[i], page_nos[i]);
+
+ while (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ os_thread_sleep(500000);
+ }
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+ goto tablespace_deleted;
+ }
+
+ buf_read_page_low(&err, sync && (i + 1 == n_stored),
+ BUF_READ_ANY_PAGE, space_ids[i],
+ zip_size, TRUE, space_versions[i],
+ page_nos[i]);
+
+ if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
+tablespace_deleted:
+ /* We have deleted or are deleting the single-table
+ tablespace: remove the entries for that page */
+
+ ibuf_merge_or_delete_for_page(NULL, space_ids[i],
+ page_nos[i],
+ zip_size, FALSE);
+ }
+ }
+
+ os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Ibuf merge read-ahead space %lu pages %lu\n",
+ (ulong) space_ids[0], (ulong) n_stored);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in
+ bytes, or 0 */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored) /*!< in: number of page numbers
+ in the array */
+{
+ ib_int64_t tablespace_version;
+ ulint count;
+ dberr_t err;
+ ulint i;
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+ return;
+ }
+
+ tablespace_version = fil_space_get_version(space);
+
+ for (i = 0; i < n_stored; i++) {
+ buf_pool_t* buf_pool;
+
+ count = 0;
+
+ os_aio_print_debug = FALSE;
+ buf_pool = buf_pool_get(space, page_nos[i]);
+ while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+
+ os_aio_simulated_wake_handler_threads();
+ os_thread_sleep(10000);
+
+ count++;
+
+ if (count > 1000) {
+ fprintf(stderr,
+ "InnoDB: Error: InnoDB has waited for"
+ " 10 seconds for pending\n"
+ "InnoDB: reads to the buffer pool to"
+ " be finished.\n"
+ "InnoDB: Number of pending reads %lu,"
+ " pending pread calls %lu\n",
+ (ulong) buf_pool->n_pend_reads,
+ (ulong) os_file_n_pending_preads);
+
+ os_aio_print_debug = TRUE;
+ }
+ }
+
+ os_aio_print_debug = FALSE;
+
+ if ((i + 1 == n_stored) && sync) {
+ buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
+ zip_size, TRUE, tablespace_version,
+ page_nos[i]);
+ } else {
+ buf_read_page_low(&err, false, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER,
+ space, zip_size, TRUE,
+ tablespace_version, page_nos[i]);
+ }
+ }
+
+ os_aio_simulated_wake_handler_threads();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Recovery applies read-ahead pages %lu\n",
+ (ulong) n_stored);
+ }
+#endif /* UNIV_DEBUG */
+}
diff --git a/storage/innobase/compile-innodb b/storage/innobase/compile-innodb
new file mode 100755
index 00000000000..fa791282b28
--- /dev/null
+++ b/storage/innobase/compile-innodb
@@ -0,0 +1,25 @@
+#!/bin/sh
+#
+# Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+# Fifth Floor, Boston, MA 02110-1301 USA
+#
+
+# we assume this script is in storage/innobase/
+
+MYSQL_ROOT="$(dirname "${0}")/../.."
+
+cd "${MYSQL_ROOT}" || exit 1
+
+cmake . -DWITH_INNOBASE_STORAGE_ENGINE:BOOL=ON
+make -j"$(nproc)"
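+
+# Example invocation (illustrative; the script cd's to the tree root
+# itself, so it can be run from anywhere):
+#
+#   ./storage/innobase/compile-innodb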
diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc
new file mode 100644
index 00000000000..179de79b69f
--- /dev/null
+++ b/storage/innobase/data/data0data.cc
@@ -0,0 +1,750 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file data/data0data.cc
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "data0data.h"
+
+#ifdef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+
+#include <ctype.h>
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+UNIV_INTERN byte data_error;
+
+# ifndef UNIV_DEBUG_VALGRIND
+/** this is used to fool the compiler in dtuple_validate */
+UNIV_INTERN ulint data_dummy;
+# endif /* !UNIV_DEBUG_VALGRIND */
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, -1 if tuple1 is greater, equal, less, respectively,
+than tuple2 */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+ const dtuple_t* tuple1, /*!< in: tuple 1 */
+ const dtuple_t* tuple2) /*!< in: tuple 2 */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(tuple1 && tuple2);
+ ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple1));
+ ut_ad(dtuple_check_typed(tuple2));
+
+ n_fields = dtuple_get_n_fields(tuple1);
+
+ if (n_fields != dtuple_get_n_fields(tuple2)) {
+
+ return(n_fields < dtuple_get_n_fields(tuple2) ? -1 : 1);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ int cmp;
+ const dfield_t* field1 = dtuple_get_nth_field(tuple1, i);
+ const dfield_t* field2 = dtuple_get_nth_field(tuple2, i);
+
+ cmp = cmp_dfield_dfield(field1, field2);
+
+ if (cmp) {
+ return(cmp);
+ }
+ }
+
+ return(0);
+}
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(tuple);
+
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+}
+
+/**********************************************************//**
+Checks that a data field is typed.
+@return TRUE if ok */
+static
+ibool
+dfield_check_typed_no_assert(
+/*=========================*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MYSQL
+ || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+
+ fprintf(stderr,
+ "InnoDB: Error: data field type %lu, len %lu\n",
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) {
+ fprintf(stderr,
+ "InnoDB: Error: index entry has %lu fields\n",
+ (ulong) dtuple_get_n_fields(tuple));
+dump:
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, tuple);
+ putc('\n', stderr);
+
+ return(FALSE);
+ }
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_check_typed_no_assert(field)) {
+ goto dump;
+ }
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MYSQL
+ || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+
+ fprintf(stderr,
+ "InnoDB: Error: data field type %lu, len %lu\n",
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
+
+ ut_error;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ ut_a(dfield_check_typed(field));
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ /* We dereference all the data of each field to test
+ for memory traps */
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (!dfield_is_null(field)) {
+
+ const byte* data;
+
+ data = static_cast<const byte*>(dfield_get_data(field));
+#ifndef UNIV_DEBUG_VALGRIND
+ ulint j;
+
+ for (j = 0; j < len; j++) {
+
+ data_dummy += *data; /* fool the compiler not
+ to optimize out this
+ code */
+ data++;
+ }
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+ UNIV_MEM_ASSERT_RW(data, len);
+ }
+ }
+
+ ut_a(dtuple_check_typed(tuple));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint i;
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+ putc(isprint(c) ? c : ' ', stderr);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ break;
+ case DATA_INT:
+ ut_a(len == 4); /* only works for 32-bit integers */
+ fprintf(stderr, "%d", (int) mach_read_from_4(data));
+ break;
+ default:
+ ut_error;
+ }
+}
+
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. The hex string is
+also printed if the string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint prtype;
+ ulint i;
+ ibool print_also_hex;
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ prtype = dtype_get_prtype(dfield_get_type(dfield));
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ ib_id_t id;
+ case DATA_INT:
+ switch (len) {
+ ulint val;
+ case 1:
+ val = mach_read_from_1(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 2:
+ val = mach_read_from_2(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x8000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 3:
+ val = mach_read_from_3(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x800000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 4:
+ val = mach_read_from_4(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80000000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 6:
+ id = mach_read_from_6(data);
+ fprintf(stderr, "%llu", (ullint) id);
+ break;
+
+ case 7:
+ id = mach_read_from_7(data);
+ fprintf(stderr, "%llu", (ullint) id);
+ break;
+ case 8:
+ id = mach_read_from_8(data);
+ fprintf(stderr, "%llu", (ullint) id);
+ break;
+ default:
+ goto print_hex;
+ }
+ break;
+
+ case DATA_SYS:
+ switch (prtype & DATA_SYS_PRTYPE_MASK) {
+ case DATA_TRX_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "trx_id " TRX_ID_FMT, id);
+ break;
+
+ case DATA_ROLL_PTR:
+ id = mach_read_from_7(data);
+
+ fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
+ break;
+
+ case DATA_ROW_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "row_id " TRX_ID_FMT, id);
+ break;
+
+ default:
+ id = mach_ull_read_compressed(data);
+
+ fprintf(stderr, "mix_id " TRX_ID_FMT, id);
+ }
+ break;
+
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ print_also_hex = FALSE;
+
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+
+ if (!isprint(c)) {
+ print_also_hex = TRUE;
+
+ fprintf(stderr, "\\x%02x", (unsigned char) c);
+ } else {
+ putc(c, stderr);
+ }
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+
+ if (!print_also_hex) {
+ break;
+ }
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+ /* fall through */
+
+ case DATA_BINARY:
+ default:
+print_hex:
+ fputs(" Hex: ",stderr);
+
+ for (i = 0; i < len; i++) {
+ fprintf(stderr, "%02lx", (ulint) *data++);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ }
+}
+
+/*************************************************************//**
+Print a dfield value using ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+ FILE* f, /*!< in: output stream */
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ ulint len = dfield_get_len(dfield);
+ if (!dfield_is_null(dfield)) {
+ ulint print_len = ut_min(len, 1000);
+ ut_print_buf(f, dfield_get_data(dfield), print_len);
+ if (len != print_len) {
+ fprintf(f, "(total %lu bytes%s)",
+ (ulong) len,
+ dfield_is_ext(dfield) ? ", external" : "");
+ }
+ } else {
+ fputs(" SQL NULL", f);
+ }
+}
+
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_fields;
+ ulint i;
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+ fprintf(f, " %lu:", (ulong) i);
+
+ dfield_print_raw(f, dtuple_get_nth_field(tuple, i));
+
+ putc(';', f);
+ putc('\n', f);
+ }
+
+ ut_ad(dtuple_validate(tuple));
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry, or the index is not clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+{
+ mem_heap_t* heap;
+ big_rec_t* vector;
+ dfield_t* dfield;
+ dict_field_t* ifield;
+ ulint size;
+ ulint n_fields;
+ ulint local_len;
+ ulint local_prefix_len;
+
+ if (!dict_index_is_clust(index)) {
+ return(NULL);
+ }
+
+ if (dict_table_get_format(index->table) < UNIV_FORMAT_B) {
+ /* up to MySQL 5.1: store a 768-byte prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE
+ + DICT_ANTELOPE_MAX_INDEX_COL_LEN;
+ } else {
+ /* new-format table: do not store any BLOB prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ ut_a(dtuple_check_typed_no_assert(entry));
+
+ size = rec_get_converted_size(index, entry, *n_ext);
+
+ if (UNIV_UNLIKELY(size > 1000000000)) {
+ fprintf(stderr,
+ "InnoDB: Warning: tuple size very big: %lu\n",
+ (ulong) size);
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, entry);
+ putc('\n', stderr);
+ }
+
+ heap = mem_heap_create(size + dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t) + 1000);
+
+ vector = static_cast<big_rec_t*>(
+ mem_heap_alloc(heap, sizeof(big_rec_t)));
+
+ vector->heap = heap;
+
+ vector->fields = static_cast<big_rec_field_t*>(
+ mem_heap_alloc(
+ heap,
+ dtuple_get_n_fields(entry) * sizeof(big_rec_field_t)));
+
+ /* Decide which fields to shorten: the algorithm is to look for
+ a variable-length field that yields the biggest savings when
+ stored externally */
+
+ n_fields = 0;
+
+ while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
+ *n_ext),
+ dict_table_is_comp(index->table),
+ dict_index_get_n_fields(index),
+ dict_table_zip_size(index->table))) {
+ ulint i;
+ ulint longest = 0;
+ ulint longest_i = ULINT_MAX;
+ byte* data;
+ big_rec_field_t* b;
+
+ for (i = dict_index_get_n_unique_in_tree(index);
+ i < dtuple_get_n_fields(entry); i++) {
+ ulint savings;
+
+ dfield = dtuple_get_nth_field(entry, i);
+ ifield = dict_index_get_nth_field(index, i);
+
+ /* Skip fixed-length, NULL, externally stored,
+ or short columns */
+
+ if (ifield->fixed_len
+ || dfield_is_null(dfield)
+ || dfield_is_ext(dfield)
+ || dfield_get_len(dfield) <= local_len
+ || dfield_get_len(dfield)
+ <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+ goto skip_field;
+ }
+
+ savings = dfield_get_len(dfield) - local_len;
+
+ /* Check that there would be savings */
+ if (longest >= savings) {
+ goto skip_field;
+ }
+
+ /* In DYNAMIC and COMPRESSED format, store
+ locally any non-BLOB columns whose maximum
+ length does not exceed 256 bytes. This is
+ because there is no room for the "external
+ storage" flag when the maximum length is 255
+ bytes or less. This restriction trivially
+ holds in REDUNDANT and COMPACT format, because
+ there we always store locally columns whose
+ length is up to local_len == 788 bytes.
+ @see rec_init_offsets_comp_ordinary */
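+			/* For example (illustrative): a VARCHAR(255)
+			column in a DYNAMIC table is always kept in the
+			record, while a VARCHAR(300) or a BLOB column
+			may be moved to external storage. */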
+ if (ifield->col->mtype != DATA_BLOB
+ && ifield->col->len < 256) {
+ goto skip_field;
+ }
+
+ longest_i = i;
+ longest = savings;
+
+skip_field:
+ continue;
+ }
+
+ if (!longest) {
+ /* Cannot shorten more */
+
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ /* Move data from field longest_i to big rec vector.
+
+ We store the first bytes locally to the record. Then
+ we can calculate all ordering fields in all indexes
+ from locally stored data. */
+
+ dfield = dtuple_get_nth_field(entry, longest_i);
+ ifield = dict_index_get_nth_field(index, longest_i);
+ local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b = &vector->fields[n_fields];
+ b->field_no = longest_i;
+ b->len = dfield_get_len(dfield) - local_prefix_len;
+ b->data = (char*) dfield_get_data(dfield) + local_prefix_len;
+
+ /* Allocate the locally stored part of the column. */
+ data = static_cast<byte*>(mem_heap_alloc(heap, local_len));
+
+ /* Copy the local prefix. */
+ memcpy(data, dfield_get_data(dfield), local_prefix_len);
+ /* Clear the extern field reference (BLOB pointer). */
+ memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+#if 0
+ /* The following would fail the Valgrind checks in
+ page_cur_insert_rec_low() and page_cur_insert_rec_zip().
+ The BLOB pointers in the record will be initialized after
+ the record and the BLOBs have been written. */
+ UNIV_MEM_ALLOC(data + local_prefix_len,
+ BTR_EXTERN_FIELD_REF_SIZE);
+#endif
+
+ dfield_set_data(dfield, data, local_len);
+ dfield_set_ext(dfield);
+
+ n_fields++;
+ (*n_ext)++;
+ ut_ad(n_fields < dtuple_get_n_fields(entry));
+ }
+
+ vector->n_fields = n_fields;
+ return(vector);
+}
+
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index __attribute__((unused)), /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ big_rec_field_t* b = vector->fields;
+ const big_rec_field_t* const end = b + vector->n_fields;
+
+ for (; b < end; b++) {
+ dfield_t* dfield;
+ ulint local_len;
+
+ dfield = dtuple_get_nth_field(entry, b->field_no);
+ local_len = dfield_get_len(dfield);
+
+ ut_ad(dfield_is_ext(dfield));
+ ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Only in REDUNDANT and COMPACT format, we store
+ up to DICT_ANTELOPE_MAX_INDEX_COL_LEN (768) bytes
+ locally */
+ ut_ad(local_len <= DICT_ANTELOPE_MAX_INDEX_COL_LEN);
+
+ dfield_set_data(dfield,
+ (char*) b->data - local_len,
+ b->len + local_len);
+ }
+
+ mem_heap_free(vector->heap);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc
new file mode 100644
index 00000000000..0b9e08544a5
--- /dev/null
+++ b/storage/innobase/data/data0type.cc
@@ -0,0 +1,298 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.cc
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+
+#ifdef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation in this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+UNIV_INTERN ulint data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminmaxlen, /*!< in: minimum and maximum length of
+ a multi-byte character */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str) /*!< in: the string whose prefix
+ length is being determined */
+{
+ ulint mbminlen = DATA_MBMINLEN(mbminmaxlen);
+ ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen);
+
+ ut_a(data_len != UNIV_SQL_NULL);
+ ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen));
+
+ if (mbminlen != mbmaxlen) {
+ ut_a(!(prefix_len % mbmaxlen));
+ return(innobase_get_at_most_n_mbchars(
+ dtype_get_charset_coll(prtype),
+ prefix_len, data_len, str));
+ }
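+	/* For fixed-width charsets (mbminlen == mbmaxlen, e.g. latin1
+	or ucs2) the byte length of the prefix is prefix_len itself,
+	clamped to data_len below. */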
+
+ if (prefix_len < data_len) {
+
+ return(prefix_len);
+
+ }
+
+ return(data_len);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+ ulint mtype) /*!< in: InnoDB main data type code: DATA_CHAR, ... */
+{
+ if (mtype <= DATA_BLOB
+ || mtype == DATA_MYSQL
+ || mtype == DATA_VARMYSQL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype) /*!< in: precise type */
+{
+ if ((mtype == DATA_FIXBINARY)
+ || (mtype == DATA_BINARY)
+ || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype) /*!< in: precise type */
+{
+ if (dtype_is_string_type(mtype) == TRUE
+ && dtype_is_binary_string_type(mtype, prtype) == FALSE) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /*!< in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll) /*!< in: MySQL charset-collation code */
+{
+ ut_a(old_prtype < 256 * 256);
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
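+	/* For example, charset_coll == 8 (latin1_swedish_ci in MySQL)
+	occupies the third byte: 8 << 16 == 0x080000, leaving the low
+	two bytes for the old-style precise type. */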
+ return(old_prtype + (charset_coll << 16));
+}
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type) /*!< in: type struct to validate */
+{
+ ut_a(type);
+ ut_a(type->mtype >= DATA_VARCHAR);
+ ut_a(type->mtype <= DATA_MYSQL);
+
+ if (type->mtype == DATA_SYS) {
+ ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ ut_a(dtype_get_mbminlen(type) <= dtype_get_mbmaxlen(type));
+#endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+ const dtype_t* type) /*!< in: type */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint len;
+
+ ut_a(type);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+
+ switch (mtype) {
+ case DATA_VARCHAR:
+ fputs("DATA_VARCHAR", stderr);
+ break;
+
+ case DATA_CHAR:
+ fputs("DATA_CHAR", stderr);
+ break;
+
+ case DATA_BINARY:
+ fputs("DATA_BINARY", stderr);
+ break;
+
+ case DATA_FIXBINARY:
+ fputs("DATA_FIXBINARY", stderr);
+ break;
+
+ case DATA_BLOB:
+ fputs("DATA_BLOB", stderr);
+ break;
+
+ case DATA_INT:
+ fputs("DATA_INT", stderr);
+ break;
+
+ case DATA_MYSQL:
+ fputs("DATA_MYSQL", stderr);
+ break;
+
+ case DATA_SYS:
+ fputs("DATA_SYS", stderr);
+ break;
+
+ case DATA_FLOAT:
+ fputs("DATA_FLOAT", stderr);
+ break;
+
+ case DATA_DOUBLE:
+ fputs("DATA_DOUBLE", stderr);
+ break;
+
+ case DATA_DECIMAL:
+ fputs("DATA_DECIMAL", stderr);
+ break;
+
+ case DATA_VARMYSQL:
+ fputs("DATA_VARMYSQL", stderr);
+ break;
+
+ default:
+ fprintf(stderr, "type %lu", (ulong) mtype);
+ break;
+ }
+
+ len = type->len;
+
+ if ((type->mtype == DATA_SYS)
+ || (type->mtype == DATA_VARCHAR)
+ || (type->mtype == DATA_CHAR)) {
+ putc(' ', stderr);
+ if (prtype == DATA_ROW_ID) {
+ fputs("DATA_ROW_ID", stderr);
+ len = DATA_ROW_ID_LEN;
+ } else if (prtype == DATA_ROLL_PTR) {
+ fputs("DATA_ROLL_PTR", stderr);
+ len = DATA_ROLL_PTR_LEN;
+ } else if (prtype == DATA_TRX_ID) {
+ fputs("DATA_TRX_ID", stderr);
+ len = DATA_TRX_ID_LEN;
+ } else if (prtype == DATA_ENGLISH) {
+ fputs("DATA_ENGLISH", stderr);
+ } else {
+ fprintf(stderr, "prtype %lu", (ulong) prtype);
+ }
+ } else {
+ if (prtype & DATA_UNSIGNED) {
+ fputs(" DATA_UNSIGNED", stderr);
+ }
+
+ if (prtype & DATA_BINARY_TYPE) {
+ fputs(" DATA_BINARY_TYPE", stderr);
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ fputs(" DATA_NOT_NULL", stderr);
+ }
+ }
+
+ fprintf(stderr, " len %lu", (ulong) len);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
new file mode 100644
index 00000000000..1a1dd29a202
--- /dev/null
+++ b/storage/innobase/dict/dict0boot.cc
@@ -0,0 +1,512 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0boot.cc
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0boot.h"
+
+#ifdef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "dict0load.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ dict_hdr_t* header;
+
+ block = buf_page_get(DICT_HDR_SPACE, 0, DICT_HDR_PAGE_NO,
+ RW_X_LATCH, mtr);
+ header = DICT_HDR + buf_block_get_frame(block);
+
+ buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
+
+ return(header);
+}
+
+/**********************************************************************//**
+Returns a new table, index, or space id via the output parameters. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+ table_id_t* table_id, /*!< out: table id
+ (not assigned if NULL) */
+ index_id_t* index_id, /*!< out: index id
+ (not assigned if NULL) */
+ ulint* space_id) /*!< out: space id
+ (not assigned if NULL) */
+{
+ dict_hdr_t* dict_hdr;
+ ib_id_t id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ dict_hdr = dict_hdr_get(&mtr);
+
+ if (table_id) {
+ id = mach_read_from_8(dict_hdr + DICT_HDR_TABLE_ID);
+ id++;
+ mlog_write_ull(dict_hdr + DICT_HDR_TABLE_ID, id, &mtr);
+ *table_id = id;
+ }
+
+ if (index_id) {
+ id = mach_read_from_8(dict_hdr + DICT_HDR_INDEX_ID);
+ id++;
+ mlog_write_ull(dict_hdr + DICT_HDR_INDEX_ID, id, &mtr);
+ *index_id = id;
+ }
+
+ if (space_id) {
+ *space_id = mtr_read_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
+ MLOG_4BYTES, &mtr);
+ if (fil_assign_new_space_id(space_id)) {
+ mlog_write_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
+ *space_id, MLOG_4BYTES, &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+}
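+/* Usage sketch (illustrative): callers request only the ids they need and
+pass NULL for the rest, e.g.
+
+	table_id_t new_table_id;
+	dict_hdr_get_new_id(&new_table_id, NULL, NULL);
+
+as dict_build_table_def_step() in dict0crea.cc does for a new table. */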
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void)
+/*=======================*/
+{
+ dict_hdr_t* dict_hdr;
+ row_id_t id;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ id = dict_sys->row_id;
+
+ mtr_start(&mtr);
+
+ dict_hdr = dict_hdr_get(&mtr);
+
+ mlog_write_ull(dict_hdr + DICT_HDR_ROW_ID, id, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Creates the file page for the dictionary header. This function is
+called only at the database creation.
+@return TRUE on success */
+static
+ibool
+dict_hdr_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ dict_hdr_t* dict_header;
+ ulint root_page_no;
+
+ ut_ad(mtr);
+
+ /* Create the dictionary header file block in a new, allocated file
+ segment in the system tablespace */
+ block = fseg_create(DICT_HDR_SPACE, 0,
+ DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
+
+ ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block));
+
+ dict_header = dict_hdr_get(mtr);
+
+ /* Start counting row, table, index, and tree ids from
+ DICT_HDR_FIRST_ID */
+ mlog_write_ull(dict_header + DICT_HDR_ROW_ID,
+ DICT_HDR_FIRST_ID, mtr);
+
+ mlog_write_ull(dict_header + DICT_HDR_TABLE_ID,
+ DICT_HDR_FIRST_ID, mtr);
+
+ mlog_write_ull(dict_header + DICT_HDR_INDEX_ID,
+ DICT_HDR_FIRST_ID, mtr);
+
+ mlog_write_ulint(dict_header + DICT_HDR_MAX_SPACE_ID,
+ 0, MLOG_4BYTES, mtr);
+
+ /* Obsolete, but we must initialize it anyway. */
+ mlog_write_ulint(dict_header + DICT_HDR_MIX_ID_LOW,
+ DICT_HDR_FIRST_ID, MLOG_4BYTES, mtr);
+
+ /* Create the B-tree roots for the clustered indexes of the basic
+ system tables */
+
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_TABLES_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0,
+ DICT_TABLE_IDS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_COLUMNS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_INDEXES_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_FIELDS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+dict_boot(void)
+/*===========*/
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ dict_hdr_t* dict_hdr;
+ mem_heap_t* heap;
+ mtr_t mtr;
+ dberr_t error;
+
+	/* Be sure these constants do not ever change. To avoid bloat,
+	only check the *NUM_COLS* and *NUM_FIELDS* constants of each table */
+
+ ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
+ ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
+ ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
+ ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
+ ut_ad(DICT_NUM_COLS__SYS_INDEXES == 7);
+ ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 9);
+ ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
+ ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
+ ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
+ ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
+
+ mtr_start(&mtr);
+
+ /* Create the hash tables etc. */
+ dict_init();
+
+ heap = mem_heap_create(450);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ /* Get the dictionary header */
+ dict_hdr = dict_hdr_get(&mtr);
+
+	/* Because we only write new row ids to the disk-based data
+	structure (dictionary header) when the id is divisible by
+	DICT_HDR_ROW_ID_WRITE_MARGIN, recovery will not restore the
+	latest value of the row id counter. Therefore we advance the
+	counter at database startup to avoid overlapping values.
+	Note that the first time a user asks for a new row id after
+	startup, the counter is divisible by ..._MARGIN, so the value
+	is immediately flushed to the disk-based header. */
+
+ dict_sys->row_id = DICT_HDR_ROW_ID_WRITE_MARGIN
+ + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID),
+ DICT_HDR_ROW_ID_WRITE_MARGIN);
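+	/* Illustrative numbers, assuming the usual margin of 256: if the
+	header stores row id 1000003, ut_uint64_align_up() rounds it up to
+	1000192 and the in-memory counter restarts at 1000192 + 256
+	= 1000448, safely above any id handed out before the last
+	shutdown or crash. */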
+
+ /* Insert into the dictionary cache the descriptions of the basic
+ system tables */
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+ /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
+ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
+ /* The low order bit of TYPE is always set to 1. If the format
+ is UNIV_FORMAT_B or higher, this field matches table->flags. */
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. Currently, these flags include
+ DICT_TF2_TEMPORARY. */
+ dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+
+ table->id = DICT_TABLES_ID;
+
+ dict_table_add_to_cache(table, FALSE, heap);
+ dict_sys->sys_tables = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_TABLES", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 1);
+
+ dict_mem_index_add_field(index, "NAME", 0);
+
+ index->id = DICT_TABLES_ID;
+
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_TABLES,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ index = dict_mem_index_create("SYS_TABLES", "ID_IND",
+ DICT_HDR_SPACE, DICT_UNIQUE, 1);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_TABLE_IDS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_TABLE_IDS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);
+
+ table->id = DICT_COLUMNS_ID;
+
+ dict_table_add_to_cache(table, FALSE, heap);
+ dict_sys->sys_columns = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_COLUMNS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_COLUMNS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
+
+ table->id = DICT_INDEXES_ID;
+
+ dict_table_add_to_cache(table, FALSE, heap);
+ dict_sys->sys_indexes = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_INDEXES_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_INDEXES,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
+
+ table->id = DICT_FIELDS_ID;
+
+ dict_table_add_to_cache(table, FALSE, heap);
+ dict_sys->sys_fields = table;
+ mem_heap_free(heap);
+
+ index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "INDEX_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_FIELDS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_FIELDS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ mtr_commit(&mtr);
+
+ /*-------------------------*/
+
+ /* Initialize the insert buffer table and index for each tablespace */
+
+ ibuf_init_at_db_start();
+
+ dberr_t err = DB_SUCCESS;
+
+ if (srv_read_only_mode && !ibuf_is_empty()) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Change buffer must be empty when --innodb-read-only "
+ "is set!");
+
+ err = DB_ERROR;
+ } else {
+ /* Load definitions of other indexes on system tables */
+
+ dict_load_sys_table(dict_sys->sys_tables);
+ dict_load_sys_table(dict_sys->sys_columns);
+ dict_load_sys_table(dict_sys->sys_indexes);
+ dict_load_sys_table(dict_sys->sys_fields);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(err);
+}
+
+/*****************************************************************//**
+Inserts the rows describing the basic system tables into those tables
+themselves at database creation. */
+static
+void
+dict_insert_initial_data(void)
+/*==========================*/
+{
+ /* Does nothing yet */
+}
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+dict_create(void)
+/*=============*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ dict_hdr_create(&mtr);
+
+ mtr_commit(&mtr);
+
+ dberr_t err = dict_boot();
+
+ if (err == DB_SUCCESS) {
+ dict_insert_initial_data();
+ }
+
+ return(err);
+}
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
new file mode 100644
index 00000000000..30523ff2af4
--- /dev/null
+++ b/storage/innobase/dict/dict0crea.cc
@@ -0,0 +1,1845 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0crea.cc
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0crea.h"
+
+#ifdef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "pars0pars.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "ut0vec.h"
+#include "dict0priv.h"
+#include "fts0priv.h"
+#include "ha_prototypes.h"
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_TABLES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_tables_tuple(
+/*=========================*/
+ const dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_tables;
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+ ulint type;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ sys_tables = dict_sys->sys_tables;
+
+ entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_tables);
+
+ /* 0: NAME -----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__NAME);
+
+ dfield_set_data(dfield, table->name, ut_strlen(table->name));
+
+ /* 1: DB_TRX_ID added later */
+ /* 2: DB_ROLL_PTR added later */
+ /* 3: ID -------------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 4: N_COLS ---------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__N_COLS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, table->n_def
+ | ((table->flags & DICT_TF_COMPACT) << 31));
+ dfield_set_data(dfield, ptr, 4);
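+	/* Worked example (illustrative): a ROW_FORMAT=COMPACT table with
+	5 user columns stores N_COLS = 5 | (1 << 31) = 0x80000005, while a
+	REDUNDANT table stores plain 5; readers recover the row format
+	from the top bit, as noted for SYS_TABLES in dict_boot(). */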
+
+ /* 5: TYPE (table flags) -----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__TYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ /* Validate the table flags and convert them to what is saved in
+ SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to
+ SYS_TABLES.TYPE as 1. */
+ type = dict_tf_to_sys_tables_type(table->flags);
+ mach_write_to_4(ptr, type);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: MIX_ID (obsolete) ---------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__MIX_ID);
+
+ ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8));
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 7: MIX_LEN (additional flags) --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__MIX_LEN);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ /* Be sure all non-used bits are zero. */
+ ut_a(!(table->flags2 & ~DICT_TF2_BIT_MASK));
+ mach_write_to_4(ptr, table->flags2);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: CLUSTER_NAME ---------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__CLUSTER_ID);
+ dfield_set_null(dfield); /* not supported */
+
+ /* 9: SPACE ----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_TABLES__SPACE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, table->space);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*----------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_COLUMNS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_columns_tuple(
+/*==========================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint i, /*!< in: column number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_columns;
+ dtuple_t* entry;
+ const dict_col_t* column;
+ dfield_t* dfield;
+ byte* ptr;
+ const char* col_name;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ column = dict_table_get_nth_col(table, i);
+
+ sys_columns = dict_sys->sys_columns;
+
+ entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_columns);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: POS ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, i);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: NAME ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME);
+
+ col_name = dict_table_get_col_name(table, i);
+ dfield_set_data(dfield, col_name, ut_strlen(col_name));
+
+ /* 5: MTYPE --------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->mtype);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: PRTYPE -------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->prtype);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 7: LEN ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, column->len);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: PREC ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, 0/* unused */);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/***************************************************************//**
+Builds a table definition to insert.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+dict_build_table_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ tab_node_t* node) /*!< in: table create node */
+{
+ dict_table_t* table;
+ dtuple_t* row;
+ dberr_t error;
+ const char* path;
+ mtr_t mtr;
+ ulint space = 0;
+ bool use_tablespace;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = node->table;
+ use_tablespace = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE);
+
+ dict_hdr_get_new_id(&table->id, NULL, NULL);
+
+ thr_get_trx(thr)->table_id = table->id;
+
+ /* Always set this bit for all new created tables */
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ DICT_TF2_FLAG_UNSET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME););
+
+ if (use_tablespace) {
+ /* This table will not use the system tablespace.
+ Get a new space id. */
+ dict_hdr_get_new_id(NULL, NULL, &space);
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_out_of_space_ids",
+ space = ULINT_UNDEFINED;
+ );
+
+ if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) {
+ return(DB_ERROR);
+ }
+
+ /* We create a new single-table tablespace for the table.
+ We initially let it be 4 pages:
+ - page 0 is the fsp header and an extent descriptor page,
+ - page 1 is an ibuf bitmap page,
+ - page 2 is the first inode page,
+ - page 3 will contain the root of the clustered index of the
+ table we create here. */
+
+ path = table->data_dir_path ? table->data_dir_path
+ : table->dir_path_of_temp_table;
+
+ ut_ad(dict_table_get_format(table) <= UNIV_FORMAT_MAX);
+ ut_ad(!dict_table_zip_size(table)
+ || dict_table_get_format(table) >= UNIV_FORMAT_B);
+
+ error = fil_create_new_single_table_tablespace(
+ space, table->name, path,
+ dict_tf_to_fsp_flags(table->flags),
+ table->flags2,
+ FIL_IBD_FILE_INITIAL_SIZE);
+
+ table->space = (unsigned int) space;
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+
+ mtr_start(&mtr);
+
+ fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+
+ mtr_commit(&mtr);
+ } else {
+ /* Create in the system tablespace: disallow Barracuda
+ features by keeping only the first bit which says whether
+ the row format is redundant or compact */
+ table->flags &= DICT_TF_COMPACT;
+ }
+
+ row = dict_create_sys_tables_tuple(table, node->heap);
+
+ ins_node_set_new_row(node->tab_def, row);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds a column definition to insert. */
+static
+void
+dict_build_col_def_step(
+/*====================*/
+ tab_node_t* node) /*!< in: table create node */
+{
+ dtuple_t* row;
+
+ row = dict_create_sys_columns_tuple(node->table, node->col_no,
+ node->heap);
+ ins_node_set_new_row(node->col_def, row);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_INDEXES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_indexes_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_indexes;
+ dict_table_t* table;
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(index);
+ ut_ad(heap);
+
+ sys_indexes = dict_sys->sys_indexes;
+
+ table = dict_table_get_low(index->table_name);
+
+ entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_indexes);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__TABLE_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: ID ----------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: NAME --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__NAME);
+
+ dfield_set_data(dfield, index->name, ut_strlen(index->name));
+
+ /* 5: N_FIELDS ----------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__N_FIELDS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->n_fields);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 6: TYPE --------------------------*/
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__TYPE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->type);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 7: SPACE --------------------------*/
+
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__SPACE);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, index->space);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /* 8: PAGE_NO --------------------------*/
+
+ dfield = dtuple_get_nth_field(
+ entry, DICT_COL__SYS_INDEXES__PAGE_NO);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(ptr, FIL_NULL);
+
+ dfield_set_data(dfield, ptr, 4);
+
+ /*--------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_FIELDS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_fields_tuple(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint fld_no, /*!< in: field number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_fields;
+ dtuple_t* entry;
+ dict_field_t* field;
+ dfield_t* dfield;
+ byte* ptr;
+ ibool index_contains_column_prefix_field = FALSE;
+ ulint j;
+
+ ut_ad(index);
+ ut_ad(heap);
+
+ for (j = 0; j < index->n_fields; j++) {
+ if (dict_index_get_nth_field(index, j)->prefix_len > 0) {
+ index_contains_column_prefix_field = TRUE;
+ break;
+ }
+ }
+
+ field = dict_index_get_nth_field(index, fld_no);
+
+ sys_fields = dict_sys->sys_fields;
+
+ entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_fields);
+
+ /* 0: INDEX_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+
+ /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/
+
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS);
+
+ ptr = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ if (index_contains_column_prefix_field) {
+		/* If there are column prefix fields in the index, then
+		we store the number of the field in the 2 HIGH bytes
+		and the prefix length in the 2 LOW bytes. */
+
+ mach_write_to_4(ptr, (fld_no << 16) + field->prefix_len);
+ } else {
+		/* Else we store the number of the field in the 2 LOW bytes.
+		This is to keep the storage format compatible with
+		InnoDB versions < 4.0.14. */
+
+ mach_write_to_4(ptr, fld_no);
+ }
+
+ dfield_set_data(dfield, ptr, 4);
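+	/* Worked example (illustrative): with a prefix field present in
+	the index, field number 1 with a 20-byte prefix is stored as
+	POS = (1 << 16) + 20 = 65556; without any prefix fields the same
+	field stores POS = 1, matching the pre-4.0.14 format. */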
+
+ /* 2: DB_TRX_ID added later */
+ /* 3: DB_ROLL_PTR added later */
+ /* 4: COL_NAME -------------------------*/
+ dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME);
+
+ dfield_set_data(dfield, field->name,
+ ut_strlen(field->name));
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Creates the tuple with which the index entry is searched for writing the index
+tree root page number, if such a tree is created.
+@return the tuple for search */
+static
+dtuple_t*
+dict_create_search_tuple(
+/*=====================*/
+ const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES
+ table */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory for
+ the built tuple is allocated */
+{
+ dtuple_t* search_tuple;
+ const dfield_t* field1;
+ dfield_t* field2;
+
+ ut_ad(tuple && heap);
+
+ search_tuple = dtuple_create(heap, 2);
+
+ field1 = dtuple_get_nth_field(tuple, 0);
+ field2 = dtuple_get_nth_field(search_tuple, 0);
+
+ dfield_copy(field2, field1);
+
+ field1 = dtuple_get_nth_field(tuple, 1);
+ field2 = dtuple_get_nth_field(search_tuple, 1);
+
+ dfield_copy(field2, field1);
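+	/* The two copied fields are TABLE_ID and ID, the key columns of
+	the SYS_INDEXES clustered index, so the search tuple can position
+	a cursor on the row that was just inserted. */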
+
+ ut_ad(dtuple_validate(search_tuple));
+
+ return(search_tuple);
+}
+
+/***************************************************************//**
+Builds an index definition row to insert.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+dict_build_index_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ dtuple_t* row;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ table = dict_table_get_low(index->table_name);
+
+ if (table == NULL) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ if (!trx->table_id) {
+ /* Record only the first table id. */
+ trx->table_id = table->id;
+ }
+
+ node->table = table;
+
+ ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
+ || dict_index_is_clust(index));
+
+ dict_hdr_get_new_id(NULL, &index->id, NULL);
+
+ /* Inherit the space id from the table; we store all indexes of a
+ table in the same tablespace */
+
+ index->space = table->space;
+ node->page_no = FIL_NULL;
+ row = dict_create_sys_indexes_tuple(index, node->heap);
+ node->ind_row = row;
+
+ ins_node_set_new_row(node->ind_def, row);
+
+ /* Note that the index was created by this transaction. */
+ index->trx_id = trx->id;
+ ut_ad(table->def_trx_id <= trx->id);
+ table->def_trx_id = trx->id;
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds a field definition row to insert. */
+static
+void
+dict_build_field_def_step(
+/*======================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_index_t* index;
+ dtuple_t* row;
+
+ index = node->index;
+
+ row = dict_create_sys_fields_tuple(index, node->field_no, node->heap);
+
+ ins_node_set_new_row(node->field_def, row);
+}
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+dict_create_index_tree_step(
+/*========================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_index_t* index;
+ dict_table_t* sys_indexes;
+ dtuple_t* search_tuple;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ index = node->index;
+
+ sys_indexes = dict_sys->sys_indexes;
+
+ if (index->type == DICT_FTS) {
+ /* FTS index does not need an index tree */
+ return(DB_SUCCESS);
+ }
+
+ /* Run a mini-transaction in which the index tree is allocated for
+ the index and its root address is written to the index entry in
+ sys_indexes */
+
+ mtr_start(&mtr);
+
+ search_tuple = dict_create_search_tuple(node->ind_row, node->heap);
+
+ btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes),
+ search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ dberr_t err = DB_SUCCESS;
+ ulint zip_size = dict_table_zip_size(index->table);
+
+ if (node->index->table->ibd_file_missing
+ || dict_table_is_discarded(node->index->table)) {
+
+ node->page_no = FIL_NULL;
+ } else {
+ node->page_no = btr_create(
+ index->type, index->space, zip_size,
+ index->id, index, &mtr);
+
+ if (node->page_no == FIL_NULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+ }
+
+ DBUG_EXECUTE_IF("ib_import_create_index_failure_1",
+ node->page_no = FIL_NULL;
+ err = DB_OUT_OF_FILE_SPACE; );
+ }
+
+ page_rec_write_field(
+ btr_pcur_get_rec(&pcur), DICT_FLD__SYS_INDEXES__PAGE_NO,
+ node->page_no, &mtr);
+
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /*!< in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr) /*!< in: mtr having the latch on the record page */
+{
+ ulint root_page_no;
+ ulint space;
+ ulint zip_size;
+ const byte* ptr;
+ ulint len;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+
+ ut_ad(len == 4);
+
+ root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+
+ if (root_page_no == FIL_NULL) {
+ /* The tree has already been freed */
+
+ return;
+ }
+
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+
+ ut_ad(len == 4);
+
+ space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+ return;
+ }
+
+ /* We free all the pages but the root page first; this operation
+ may span several mini-transactions */
+
+ btr_free_but_not_root(space, zip_size, root_page_no);
+
+ /* Then we free the root page in the same mini-transaction where
+ we write FIL_NULL to the appropriate field in the SYS_INDEXES
+ record: this mini-transaction marks the B-tree totally freed */
+
+ btr_free_root(space, zip_size, root_page_no, mtr);
+
+ page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
+ FIL_NULL, mtr);
+}
+
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ dict_table_t* table, /*!< in: the table the index belongs to */
+ ulint space, /*!< in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr) /*!< in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+{
+ ulint root_page_no;
+ ibool drop = !space;
+ ulint zip_size;
+ ulint type;
+ index_id_t index_id;
+ rec_t* rec;
+ const byte* ptr;
+ ulint len;
+ dict_index_t* index;
+ bool has_been_dropped = false;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+ rec = btr_pcur_get_rec(pcur);
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+
+ ut_ad(len == 4);
+
+ root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+
+ if (drop && root_page_no == FIL_NULL) {
+ has_been_dropped = true;
+ drop = FALSE;
+ }
+
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+
+ ut_ad(len == 4);
+
+ if (drop) {
+ space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+ }
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+		/* It is a single table tablespace and the .ibd file is
+		missing: nothing to truncate */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Trying to TRUNCATE"
+ " a missing .ibd file of table %s!\n", table->name);
+ return(FIL_NULL);
+ }
+
+ ptr = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+ ut_ad(len == 4);
+ type = mach_read_from_4(ptr);
+
+ ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__ID, &len);
+ ut_ad(len == 8);
+ index_id = mach_read_from_8(ptr);
+
+ if (!drop) {
+
+ goto create;
+ }
+
+ /* We free all the pages but the root page first; this operation
+ may span several mini-transactions */
+
+ btr_free_but_not_root(space, zip_size, root_page_no);
+
+ /* Then we free the root page in the same mini-transaction where
+ we create the b-tree and write its new root page number to the
+ appropriate field in the SYS_INDEXES record: this mini-transaction
+ marks the B-tree totally truncated */
+
+ btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr);
+
+ btr_free_root(space, zip_size, root_page_no, mtr);
+create:
+ /* We will temporarily write FIL_NULL to the PAGE_NO field
+ in SYS_INDEXES, so that the database will not get into an
+ inconsistent state in case it crashes between the mtr_commit()
+ below and the following mtr_commit() call. */
+ page_rec_write_field(rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
+ FIL_NULL, mtr);
+
+ /* We will need to commit the mini-transaction in order to avoid
+ deadlocks in the btr_create() call, because otherwise we would
+ be freeing and allocating pages in the same mini-transaction. */
+ btr_pcur_store_position(pcur, mtr);
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ /* Find the index corresponding to this SYS_INDEXES record. */
+ for (index = UT_LIST_GET_FIRST(table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ if (index->id == index_id) {
+ if (index->type & DICT_FTS) {
+ return(FIL_NULL);
+ } else {
+ if (has_been_dropped) {
+ fprintf(stderr, " InnoDB: Trying to"
+ " TRUNCATE a missing index of"
+ " table %s!\n",
+ index->table->name);
+ }
+
+ root_page_no = btr_create(type, space, zip_size,
+ index_id, index, mtr);
+ index->page = (unsigned int) root_page_no;
+ return(root_page_no);
+ }
+ }
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Index %llu of table %s is missing\n"
+ "InnoDB: from the data dictionary during TRUNCATE!\n",
+ (ullint) index_id,
+ table->name);
+
+ return(FIL_NULL);
+}
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ bool commit) /*!< in: true if the commit node should be
+ added to the query graph */
+{
+ tab_node_t* node;
+
+ node = static_cast<tab_node_t*>(
+ mem_heap_alloc(heap, sizeof(tab_node_t)));
+
+ node->common.type = QUE_NODE_CREATE_TABLE;
+
+ node->table = table;
+
+ node->state = TABLE_BUILD_TABLE_DEF;
+ node->heap = mem_heap_create(256);
+
+ node->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables,
+ heap);
+ node->tab_def->common.parent = node;
+
+ node->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns,
+ heap);
+ node->col_def->common.parent = node;
+
+ if (commit) {
+ node->commit_node = trx_commit_node_create(heap);
+ node->commit_node->common.parent = node;
+ } else {
+ node->commit_node = 0;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Creates an index create graph.
+@return own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ dict_index_t* index, /*!< in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ bool commit) /*!< in: true if the commit node should be
+ added to the query graph */
+{
+ ind_node_t* node;
+
+ node = static_cast<ind_node_t*>(
+ mem_heap_alloc(heap, sizeof(ind_node_t)));
+
+ node->common.type = QUE_NODE_CREATE_INDEX;
+
+ node->index = index;
+
+ node->state = INDEX_BUILD_INDEX_DEF;
+ node->page_no = FIL_NULL;
+ node->heap = mem_heap_create(256);
+
+ node->ind_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_indexes, heap);
+ node->ind_def->common.parent = node;
+
+ node->field_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_fields, heap);
+ node->field_def->common.parent = node;
+
+ if (commit) {
+ node->commit_node = trx_commit_node_create(heap);
+ node->commit_node->common.parent = node;
+ } else {
+ node->commit_node = 0;
+ }
+
+ return(node);
+}
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ tab_node_t* node;
+ dberr_t err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<tab_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = TABLE_BUILD_TABLE_DEF;
+ }
+
+ if (node->state == TABLE_BUILD_TABLE_DEF) {
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = dict_build_table_def_step(thr, node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = TABLE_BUILD_COL_DEF;
+ node->col_no = 0;
+
+ thr->run_node = node->tab_def;
+
+ return(thr);
+ }
+
+ if (node->state == TABLE_BUILD_COL_DEF) {
+
+ if (node->col_no < (node->table)->n_def) {
+
+ dict_build_col_def_step(node);
+
+ node->col_no++;
+
+ thr->run_node = node->col_def;
+
+ return(thr);
+ } else {
+ node->state = TABLE_COMMIT_WORK;
+ }
+ }
+
+ if (node->state == TABLE_COMMIT_WORK) {
+
+ /* Table was correctly defined: do NOT commit the transaction
+ (CREATE TABLE does NOT do an implicit commit of the current
+ transaction) */
+
+ node->state = TABLE_ADD_TO_CACHE;
+
+ /* thr->run_node = node->commit_node;
+
+ return(thr); */
+ }
+
+ if (node->state == TABLE_ADD_TO_CACHE) {
+
+ dict_table_add_to_cache(node->table, TRUE, node->heap);
+
+ err = DB_SUCCESS;
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ind_node_t* node;
+ dberr_t err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<ind_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = INDEX_BUILD_INDEX_DEF;
+ }
+
+ if (node->state == INDEX_BUILD_INDEX_DEF) {
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+ err = dict_build_index_def_step(thr, node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = INDEX_BUILD_FIELD_DEF;
+ node->field_no = 0;
+
+ thr->run_node = node->ind_def;
+
+ return(thr);
+ }
+
+ if (node->state == INDEX_BUILD_FIELD_DEF) {
+
+ if (node->field_no < (node->index)->n_fields) {
+
+ dict_build_field_def_step(node);
+
+ node->field_no++;
+
+ thr->run_node = node->field_def;
+
+ return(thr);
+ } else {
+ node->state = INDEX_ADD_TO_CACHE;
+ }
+ }
+
+ if (node->state == INDEX_ADD_TO_CACHE) {
+
+ index_id_t index_id = node->index->id;
+
+ err = dict_index_add_to_cache(
+ node->table, node->index, FIL_NULL,
+ trx_is_strict(trx)
+ || dict_table_get_format(node->table)
+ >= UNIV_FORMAT_B);
+
+ node->index = dict_index_get_if_in_cache_low(index_id);
+ ut_a(!node->index == (err != DB_SUCCESS));
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = INDEX_CREATE_INDEX_TREE;
+ }
+
+ if (node->state == INDEX_CREATE_INDEX_TREE) {
+
+ err = dict_create_index_tree_step(node);
+
+ DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail",
+ err = DB_OUT_OF_MEMORY;);
+
+ if (err != DB_SUCCESS) {
+ /* If this is a FTS index, we will need to remove
+ it from fts->cache->indexes list as well */
+ if ((node->index->type & DICT_FTS)
+ && node->table->fts) {
+ fts_index_cache_t* index_cache;
+
+ rw_lock_x_lock(
+ &node->table->fts->cache->init_lock);
+
+ index_cache = (fts_index_cache_t*)
+ fts_find_index_cache(
+ node->table->fts->cache,
+ node->index);
+
+ if (index_cache->words) {
+ rbt_free(index_cache->words);
+ index_cache->words = 0;
+ }
+
+ ib_vector_remove(
+ node->table->fts->cache->indexes,
+ *reinterpret_cast<void**>(index_cache));
+
+ rw_lock_x_unlock(
+ &node->table->fts->cache->init_lock);
+ }
+
+ dict_index_remove_from_cache(node->table, node->index);
+ node->index = NULL;
+
+ goto function_exit;
+ }
+
+ node->index->page = node->page_no;
+ /* These should have been set in
+ dict_build_index_def_step() and
+ dict_index_add_to_cache(). */
+ ut_ad(node->index->trx_id == trx->id);
+ ut_ad(node->index->table->def_trx_id == trx->id);
+ node->state = INDEX_COMMIT_WORK;
+ }
+
+ if (node->state == INDEX_COMMIT_WORK) {
+
+ /* Index was correctly defined: do NOT commit the transaction
+ (CREATE INDEX does NOT currently do an implicit commit of
+ the current transaction) */
+
+ node->state = INDEX_CREATE_INDEX_TREE;
+
+ /* thr->run_node = node->commit_node;
+
+ return(thr); */
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/****************************************************************//**
+Check whether a system table exists. Additionally, if it exists,
+move it to the non-LRU end of the table LRU list. This is only used
+for system tables that can be upgraded or added to an older database,
+which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and
+SYS_DATAFILES.
+@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists
+but is not current, DB_TABLE_NOT_FOUND if it does not exist */
+static
+dberr_t
+dict_check_if_system_table_exists(
+/*==============================*/
+ const char* tablename, /*!< in: name of table */
+ ulint num_fields, /*!< in: number of fields */
+ ulint num_indexes) /*!< in: number of indexes */
+{
+ dict_table_t* sys_table;
+ dberr_t error = DB_SUCCESS;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ mutex_enter(&dict_sys->mutex);
+
+ sys_table = dict_table_get_low(tablename);
+
+ if (sys_table == NULL) {
+ error = DB_TABLE_NOT_FOUND;
+
+ } else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes
+ || sys_table->n_cols != num_fields) {
+ error = DB_CORRUPTION;
+
+ } else {
+ /* This table has already been created, and it is OK.
+ Ensure that it can't be evicted from the table LRU cache. */
+
+ dict_table_move_from_lru_to_non_lru(sys_table);
+ }
+
+ mutex_exit(&dict_sys->mutex);
+
+ return(error);
+}
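+/* Note on the expected field count (illustrative): callers pass
+DICT_NUM_FIELDS__<TABLE> + 1, e.g.
+
+	dict_check_if_system_table_exists(
+		"SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+
+because table->n_cols counts the user columns plus the three system
+columns (DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR), one more than the number of
+fields in the clustered index record. */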
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_foreign_constraint_tables(void)
+/*================================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_foreign_err;
+ dberr_t sys_foreign_cols_err;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ /* Note: The master thread has not been started at this point. */
+
+ sys_foreign_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+ sys_foreign_cols_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+
+ if (sys_foreign_err == DB_SUCCESS
+ && sys_foreign_cols_err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ trx = trx_allocate_for_mysql();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating foreign key sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_foreign_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_FOREIGN table.");
+ row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+ }
+
+ if (sys_foreign_cols_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_FOREIGN_COLS table.");
+
+ row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Creating foreign key constraint system tables.");
+
+ /* NOTE: in dict_load_foreigns we use the fact that
+ there are 2 secondary indexes on SYS_FOREIGN, and they
+ are defined just like below */
+
+ /* NOTE: when designing InnoDB's foreign key support in 2001, we made
+ an error and made the table names and the foreign key id of type
+ 'CHAR' (internally, really a VARCHAR). We should have made the type
+ VARBINARY, like in other InnoDB system tables, to get a clean
+ design. */
+
+ srv_file_per_table_backup = srv_file_per_table;
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace. */
+
+ srv_file_per_table = 0;
+
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+ " REF_NAME CHAR, N_COLS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN (ID);\n"
+ "CREATE INDEX FOR_IND"
+ " ON SYS_FOREIGN (FOR_NAME);\n"
+ "CREATE INDEX REF_IND"
+ " ON SYS_FOREIGN (REF_NAME);\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+ " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN_COLS (ID, POS);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Creation of SYS_FOREIGN and SYS_FOREIGN_COLS "
+			"has failed with error %lu. "
+			"Dropping incompletely created tables.",
+ (ulong) err);
+
+ ut_ad(err == DB_OUT_OF_FILE_SPACE
+ || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+ row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ srv_file_per_table = srv_file_per_table_backup;
+
+ if (err == DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Foreign key constraint system tables created");
+ }
+
+ /* Note: The master thread has not been started at this point. */
+ /* Confirm and move to the non-LRU part of the table LRU list. */
+ sys_foreign_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
+ ut_a(sys_foreign_err == DB_SUCCESS);
+
+ sys_foreign_cols_err = dict_check_if_system_table_exists(
+ "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
+ ut_a(sys_foreign_cols_err == DB_SUCCESS);
+
+ return(err);
+}
+
+/****************************************************************//**
+Evaluate the given foreign key SQL statement.
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+dict_foreign_eval_sql(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* sql, /*!< in: SQL string to evaluate */
+ const char* name, /*!< in: table name (for diagnostics) */
+ const char* id, /*!< in: foreign key id */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t error;
+ FILE* ef = dict_foreign_err_file;
+
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error == DB_DUPLICATE_KEY) {
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in foreign key constraint creation for table ",
+ ef);
+ ut_print_name(ef, trx, TRUE, name);
+ fputs(".\nA foreign key constraint of name ", ef);
+ ut_print_name(ef, trx, TRUE, id);
+ fputs("\nalready exists."
+ " (Note that internally InnoDB adds 'databasename'\n"
+ "in front of the user-defined constraint name.)\n"
+ "Note that InnoDB's FOREIGN KEY system tables store\n"
+ "constraint names as case-insensitive, with the\n"
+ "MySQL standard latin1_swedish_ci collation. If you\n"
+ "create tables or databases whose names differ only in\n"
+ "the character case, then collisions in constraint\n"
+ "names can occur. Workaround: name your constraints\n"
+ "explicitly with unique names.\n",
+ ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Foreign key constraint creation failed:\n"
+ "InnoDB: internal error number %lu\n", (ulong) error);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ ut_print_timestamp(ef);
+ fputs(" Internal error in foreign key constraint creation"
+ " for table ", ef);
+ ut_print_name(ef, trx, TRUE, name);
+ fputs(".\n"
+ "See the MySQL .err log in the datadir"
+ " for more information.\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Add a single foreign key field definition to the data dictionary tables in
+the database.
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+dict_create_add_foreign_field_to_dictionary(
+/*========================================*/
+ ulint field_nr, /*!< in: field number */
+ const char* table_name, /*!< in: table name */
+ const dict_foreign_t* foreign, /*!< in: foreign */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_int4_literal(info, "pos", field_nr);
+
+ pars_info_add_str_literal(info, "for_col_name",
+ foreign->foreign_col_names[field_nr]);
+
+ pars_info_add_str_literal(info, "ref_col_name",
+ foreign->referenced_col_names[field_nr]);
+
+ return(dict_foreign_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN_COLS VALUES"
+ "(:id, :pos, :for_col_name, :ref_col_name);\n"
+ "END;\n",
+ table_name, foreign->id, trx));
+}
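+
+/* Illustration (a sketch with hypothetical names, not taken from a
+real schema): for a constraint with id "test/t1_ibfk_1" whose field 0
+maps column "a" to referenced column "x", the function above inserts
+the SYS_FOREIGN_COLS row ("test/t1_ibfk_1", 0, "a", "x"). The pair
+(ID, POS) forms the unique clustered key defined in
+dict_create_or_check_foreign_constraint_tables(). */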
+
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+{
+ dberr_t error;
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_str_literal(info, "for_name", name);
+
+ pars_info_add_str_literal(info, "ref_name",
+ foreign->referenced_table_name);
+
+ pars_info_add_int4_literal(info, "n_cols",
+ foreign->n_fields + (foreign->type << 24));
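+
+ /* Note on the packing above: N_COLS carries two values in one
+ word, the low bits holding the column count and the high byte the
+ type flags. E.g. a 2-column key with ON DELETE CASCADE (assuming
+ DICT_FOREIGN_ON_DELETE_CASCADE == 1) is stored as
+ 2 + (1 << 24) = 0x01000002; dict_load_foreigns splits the word
+ back apart when the constraint is read. */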
+
+ error = dict_foreign_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN VALUES"
+ "(:id, :for_name, :ref_name, :n_cols);\n"
+ "END;\n"
+ , name, foreign->id, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ error = dict_create_add_foreign_field_to_dictionary(
+ i, name, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+ }
+
+ return(error);
+}
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in] table table to which the foreign key objects in
+local_fk_set belong to
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+{
+ dict_foreign_t* foreign;
+ dberr_t error;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if (NULL == dict_table_get_low("SYS_FOREIGN")) {
+ fprintf(stderr,
+ "InnoDB: table SYS_FOREIGN not found"
+ " in internal data dictionary\n");
+
+ return(DB_ERROR);
+ }
+
+ for (dict_foreign_set::const_iterator it = local_fk_set.begin();
+ it != local_fk_set.end();
+ ++it) {
+
+ foreign = *it;
+ ut_ad(foreign->id != NULL);
+
+ error = dict_create_add_foreign_to_dictionary(table->name,
+ foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+ }
+
+ trx->op_info = "committing foreign key definitions";
+
+ trx_commit(trx);
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_tablespace(void)
+/*=====================================*/
+{
+ trx_t* trx;
+ my_bool srv_file_per_table_backup;
+ dberr_t err;
+ dberr_t sys_tablespaces_err;
+ dberr_t sys_datafiles_err;
+
+ ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+ /* Note: The master thread has not been started at this point. */
+
+ sys_tablespaces_err = dict_check_if_system_table_exists(
+ "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+ sys_datafiles_err = dict_check_if_system_table_exists(
+ "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+
+ if (sys_tablespaces_err == DB_SUCCESS
+ && sys_datafiles_err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ trx = trx_allocate_for_mysql();
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "creating tablespace and datafile sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Check which incomplete table definition to drop. */
+
+ if (sys_tablespaces_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_TABLESPACES table.");
+ row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE);
+ }
+
+ if (sys_datafiles_err == DB_CORRUPTION) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping incompletely created "
+ "SYS_DATAFILES table.");
+
+ row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Creating tablespace and datafile system tables.");
+
+ /* We always want SYSTEM tables to be created inside the system
+ tablespace. */
+ srv_file_per_table_backup = srv_file_per_table;
+ srv_file_per_table = 0;
+
+ err = que_eval_sql(
+ NULL,
+ "PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE SYS_TABLESPACES(\n"
+ " SPACE INT, NAME CHAR, FLAGS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE"
+ " ON SYS_TABLESPACES (SPACE);\n"
+ "CREATE TABLE SYS_DATAFILES(\n"
+ " SPACE INT, PATH CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE"
+ " ON SYS_DATAFILES (SPACE);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Creation of SYS_TABLESPACES and SYS_DATAFILES "
+ "has failed with error %lu. Tablespace is full. "
+ "Dropping incompletely created tables.",
+ (ulong) err);
+
+ ut_a(err == DB_OUT_OF_FILE_SPACE
+ || err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ row_drop_table_for_mysql("SYS_TABLESPACES", trx, TRUE);
+ row_drop_table_for_mysql("SYS_DATAFILES", trx, TRUE);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ srv_file_per_table = srv_file_per_table_backup;
+
+ if (err == DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Tablespace and datafile system tables created.");
+ }
+
+ /* Note: The master thread has not been started at this point. */
+ /* Confirm and move to the non-LRU part of the table LRU list. */
+
+ sys_tablespaces_err = dict_check_if_system_table_exists(
+ "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
+ ut_a(sys_tablespaces_err == DB_SUCCESS);
+
+ sys_datafiles_err = dict_check_if_system_table_exists(
+ "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
+ ut_a(sys_datafiles_err == DB_SUCCESS);
+
+ return(err);
+}
+
+/********************************************************************//**
+Add a single tablespace definition to the data dictionary tables in the
+database.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_tablespace_to_dictionary(
+/*=====================================*/
+ ulint space, /*!< in: tablespace id */
+ const char* name, /*!< in: tablespace name */
+ ulint flags, /*!< in: tablespace flags */
+ const char* path, /*!< in: tablespace path */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit) /*!< in: if true then commit the
+ transaction */
+{
+ dberr_t error;
+
+ pars_info_t* info = pars_info_create();
+
+ ut_a(space > TRX_SYS_SPACE);
+
+ pars_info_add_int4_literal(info, "space", space);
+
+ pars_info_add_str_literal(info, "name", name);
+
+ pars_info_add_int4_literal(info, "flags", flags);
+
+ pars_info_add_str_literal(info, "path", path);
+
+ error = que_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_TABLESPACES VALUES"
+ "(:space, :name, :flags);\n"
+ "INSERT INTO SYS_DATAFILES VALUES"
+ "(:space, :path);\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ if (commit) {
+ trx->op_info = "committing tablespace and datafile definition";
+ trx_commit(trx);
+ }
+
+ trx->op_info = "";
+
+ return(error);
+}
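+
+/* Usage sketch (hypothetical values, for illustration only):
+
+ err = dict_create_add_tablespace_to_dictionary(
+ space_id, "test/t1", fsp_flags, "/data/test/t1.ibd",
+ trx, true);
+
+This inserts one row into SYS_TABLESPACES and one into SYS_DATAFILES
+in a single procedure, and commits because commit == true. Note the
+assertion above: space must be greater than TRX_SYS_SPACE, i.e. the
+system tablespace itself is never registered this way. */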
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
new file mode 100644
index 00000000000..80453898a23
--- /dev/null
+++ b/storage/innobase/dict/dict0dict.cc
@@ -0,0 +1,6707 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.cc
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0dict.h"
+#include "fts0fts.h"
+#include "fil0fil.h"
+#include <algorithm>
+
+#ifdef UNIV_NONINL
+#include "dict0dict.ic"
+#include "dict0priv.ic"
+#endif
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+UNIV_INTERN dict_index_t* dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+UNIV_INTERN dict_index_t* dict_ind_compact;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Flag to control insert buffer debugging. */
+extern UNIV_INTERN uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/**********************************************************************
+Issue a warning that the row is too big. */
+void
+ib_warn_row_too_big(const dict_table_t* table);
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "dict0stats.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "os0once.h"
+#include "page0zip.h"
+#include "page0page.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "m_ctype.h" /* my_isspace() */
+#include "ha_prototypes.h" /* innobase_strcasecmp(), innobase_casedn_str() */
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "lock0lock.h"
+#include "dict0priv.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "row0merge.h"
+#include "row0log.h"
+#include "ut0ut.h" /* ut_format_name() */
+#include "m_string.h"
+#include "my_sys.h"
+#include "mysqld.h" /* system_charset_info */
+#include "strfunc.h" /* strconvert() */
+
+#include <ctype.h>
+
+/** the dictionary system */
+UNIV_INTERN dict_sys_t* dict_sys = NULL;
+
+/** @brief the data dictionary rw-latch protecting dict_sys
+
+table create, drop, etc. reserve this in X-mode; implicit or
+background operations such as purge, rollback and foreign key checks
+reserve this in S-mode; we cannot trust that MySQL protects implicit
+or background operations against a table drop, since MySQL does not
+know of them; therefore we need this; NOTE: a transaction which
+reserves this must keep book on the mode in
+trx_t::dict_operation_lock_mode */
+UNIV_INTERN rw_lock_t dict_operation_lock;
+
+/** Percentage of compression failures that are allowed in a single
+round */
+UNIV_INTERN ulong zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+UNIV_INTERN ulong zip_pad_max = 50;
+
+/* Keys to register rwlocks and mutexes with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t dict_operation_lock_key;
+UNIV_INTERN mysql_pfs_key_t index_tree_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t index_online_log_key;
+UNIV_INTERN mysql_pfs_key_t dict_table_stats_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t zip_pad_mutex_key;
+UNIV_INTERN mysql_pfs_key_t dict_sys_mutex_key;
+UNIV_INTERN mysql_pfs_key_t dict_foreign_err_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
+ hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data
+ dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char dict_ibfk[] = "_ibfk_";
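+
+/* Note: generated constraint ids thus have the form
+"databasename/tablename_ibfk_N" (assumption: the numeric suffix N is
+assigned elsewhere when the id is generated); the rename logic near
+the end of dict_table_rename_in_cache() below depends on this layout. */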
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: user representation of
+ a clustered index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: user representation of
+ a non-clustered index */
+/**********************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: user representation of an FTS index */
+/**********************************************************************//**
+Prints column data. */
+static
+void
+dict_col_print_low(
+/*===============*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_col_t* col); /*!< in: column */
+/**********************************************************************//**
+Prints index data. */
+static
+void
+dict_index_print_low(
+/*=================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Prints field data. */
+static
+void
+dict_field_print_low(
+/*=================*/
+ const dict_field_t* field); /*!< in: field */
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index, /*!< in, own: index */
+ ibool lru_evict); /*!< in: TRUE if page being evicted
+ to make room in the table LRU list */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+static
+void
+dict_table_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in, own: table */
+ ibool lru_evict); /*!< in: TRUE if evicting from LRU */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if validate OK */
+static
+ibool
+dict_lru_validate(void);
+/*===================*/
+/**********************************************************************//**
+Check if table is in the dictionary table LRU list.
+@return TRUE if table found */
+static
+ibool
+dict_lru_find_table(
+/*================*/
+ const dict_table_t* find_table); /*!< in: table to find */
+/**********************************************************************//**
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+ const dict_table_t* find_table); /*!< in: table to find */
+#endif /* UNIV_DEBUG */
+
+/* Stream for storing detailed information about the latest foreign key
+and unique key errors. Only created if !srv_read_only_mode */
+UNIV_INTERN FILE* dict_foreign_err_file = NULL;
+/* mutex protecting the foreign and unique error buffers */
+UNIV_INTERN ib_mutex_t dict_foreign_err_mutex;
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ innobase_casedn_str(a);
+}
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ for (; *name1 == *name2; name1++, name2++) {
+ if (*name1 == '/') {
+ return(TRUE);
+ }
+ ut_a(*name1); /* the names must contain '/' */
+ }
+ return(FALSE);
+}
+
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+UNIV_INTERN
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s = strchr(name, '/');
+ ut_a(s);
+
+ return(s + 1);
+}
+
+/********************************************************************//**
+Get the database name length in a table name.
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s;
+ s = strchr(name, '/');
+ ut_a(s);
+ return(s - name);
+}
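+
+/* Illustration of the three name helpers above, for name strings of
+the form dbname '/' tablename:
+dict_tables_have_same_db("test/child", "test/parent") == TRUE,
+dict_remove_db_name("test/child") points at "child", and
+dict_get_db_name_len("test/child") == 4. */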
+
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void)
+/*============================*/
+{
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void)
+/*===========================*/
+{
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/** Allocate and init a dict_table_t's stats latch.
+This function must not be called concurrently on the same table object.
+@param[in,out] table_void table whose stats latch to create */
+static
+void
+dict_table_stats_latch_alloc(
+ void* table_void)
+{
+ dict_table_t* table = static_cast<dict_table_t*>(table_void);
+
+ table->stats_latch = new(std::nothrow) rw_lock_t;
+
+ ut_a(table->stats_latch != NULL);
+
+ rw_lock_create(dict_table_stats_key, table->stats_latch,
+ SYNC_INDEX_TREE);
+}
+
+/** Deinit and free a dict_table_t's stats latch.
+This function must not be called concurrently on the same table object.
+@param[in,out] table table whose stats latch to free */
+static
+void
+dict_table_stats_latch_free(
+ dict_table_t* table)
+{
+ rw_lock_free(table->stats_latch);
+ delete table->stats_latch;
+}
+
+/** Create a dict_table_t's stats latch or delay it for lazy creation.
+This function is only called from either a single-threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to create
+@param[in] enabled if false then the latch is disabled
+and dict_table_stats_lock()/unlock() become a no-op on this table. */
+
+void
+dict_table_stats_latch_create(
+ dict_table_t* table,
+ bool enabled)
+{
+ if (!enabled) {
+ table->stats_latch = NULL;
+ table->stats_latch_created = os_once::DONE;
+ return;
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* We create this lazily the first time it is used. */
+ table->stats_latch = NULL;
+ table->stats_latch_created = os_once::NEVER_DONE;
+#else /* HAVE_ATOMIC_BUILTINS */
+
+ dict_table_stats_latch_alloc(table);
+
+ table->stats_latch_created = os_once::DONE;
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/** Destroy a dict_table_t's stats latch.
+This function is only called from either a single-threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to destroy */
+
+void
+dict_table_stats_latch_destroy(
+ dict_table_t* table)
+{
+ if (table->stats_latch_created == os_once::DONE
+ && table->stats_latch != NULL) {
+
+ dict_table_stats_latch_free(table);
+ }
+}
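+
+/* Summary of the latch lifecycle above (a sketch, assuming
+HAVE_ATOMIC_BUILTINS): dict_table_stats_latch_create() only marks the
+latch as NEVER_DONE without allocating; the first call to
+dict_table_stats_lock() below then runs dict_table_stats_latch_alloc()
+exactly once via os_once::do_or_wait_for_done(); finally,
+dict_table_stats_latch_destroy() frees the latch only if that
+allocation actually happened. */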
+
+/**********************************************************************//**
+Lock the appropriate latch to protect a given table's statistics. */
+UNIV_INTERN
+void
+dict_table_stats_lock(
+/*==================*/
+ dict_table_t* table, /*!< in: table */
+ ulint latch_mode) /*!< in: RW_S_LATCH or RW_X_LATCH */
+{
+ ut_ad(table != NULL);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_once::do_or_wait_for_done(
+ &table->stats_latch_created,
+ dict_table_stats_latch_alloc, table);
+#else /* HAVE_ATOMIC_BUILTINS */
+ ut_ad(table->stats_latch_created == os_once::DONE);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ if (table->stats_latch == NULL) {
+ /* This is a dummy table object that is private in the current
+ thread and is not shared between multiple threads, thus we
+ skip any locking. */
+ return;
+ }
+
+ switch (latch_mode) {
+ case RW_S_LATCH:
+ rw_lock_s_lock(table->stats_latch);
+ break;
+ case RW_X_LATCH:
+ rw_lock_x_lock(table->stats_latch);
+ break;
+ case RW_NO_LATCH:
+ /* fall through */
+ default:
+ ut_error;
+ }
+}
+
+/**********************************************************************//**
+Unlock the latch that has been locked by dict_table_stats_lock() */
+UNIV_INTERN
+void
+dict_table_stats_unlock(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ ulint latch_mode) /*!< in: RW_S_LATCH or
+ RW_X_LATCH */
+{
+ ut_ad(table != NULL);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (table->stats_latch == NULL) {
+ /* This is a dummy table object that is private in the current
+ thread and is not shared between multiple threads, thus we
+ skip any locking. */
+ return;
+ }
+
+ switch (latch_mode) {
+ case RW_S_LATCH:
+ rw_lock_s_unlock(table->stats_latch);
+ break;
+ case RW_X_LATCH:
+ rw_lock_x_unlock(table->stats_latch);
+ break;
+ case RW_NO_LATCH:
+ /* fall through */
+ default:
+ ut_error;
+ }
+}
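+
+/* Usage sketch: callers pair the two functions above with matching
+latch modes, e.g.
+
+ dict_table_stats_lock(table, RW_S_LATCH);
+ n_rows = table->stat_n_rows;
+ dict_table_stats_unlock(table, RW_S_LATCH);
+
+(stat_n_rows is just an illustrative statistics field; any field
+protected by the stats latch follows the same pattern). */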
+
+/**********************************************************************//**
+Try to drop any indexes after an aborted index creation.
+This can also be after a server kill during DROP INDEX. */
+static
+void
+dict_table_try_drop_aborted(
+/*========================*/
+ dict_table_t* table, /*!< in: table, or NULL if it
+ needs to be looked up again */
+ table_id_t table_id, /*!< in: table identifier */
+ ulint ref_count) /*!< in: expected table->n_ref_count */
+{
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "try to drop any indexes after an aborted index creation";
+ row_mysql_lock_data_dictionary(trx);
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ if (table == NULL) {
+ table = dict_table_open_on_id_low(
+ table_id, DICT_ERR_IGNORE_NONE);
+ } else {
+ ut_ad(table->id == table_id);
+ }
+
+ if (table && table->n_ref_count == ref_count && table->drop_aborted) {
+ /* Silence a debug assertion in row_merge_drop_indexes(). */
+ ut_d(table->n_ref_count++);
+ row_merge_drop_indexes(trx, table, TRUE);
+ ut_d(table->n_ref_count--);
+ ut_ad(table->n_ref_count == ref_count);
+ trx_commit_for_mysql(trx);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/**********************************************************************//**
+When opening a table, try to drop any indexes after an aborted index
+creation. Release the dict_sys->mutex. */
+static
+void
+dict_table_try_drop_aborted_and_mutex_exit(
+/*=======================================*/
+ dict_table_t* table, /*!< in: table (may be NULL) */
+ ibool try_drop) /*!< in: TRUE if we should try to
+ drop indexes whose online creation
+ was aborted */
+{
+ if (try_drop
+ && table != NULL
+ && table->drop_aborted
+ && table->n_ref_count == 1
+ && dict_table_get_first_index(table)) {
+
+ /* Attempt to drop the indexes whose online creation
+ was aborted. */
+ table_id_t table_id = table->id;
+
+ mutex_exit(&dict_sys->mutex);
+
+ dict_table_try_drop_aborted(table, table_id, 1);
+ } else {
+ mutex_exit(&dict_sys->mutex);
+ }
+}
+
+/********************************************************************//**
+Decrements the count of open handles to a table. */
+UNIV_INTERN
+void
+dict_table_close(
+/*=============*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ ibool try_drop) /*!< in: TRUE=try to drop any orphan
+ indexes after an aborted online
+ index creation */
+{
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_a(table->n_ref_count > 0);
+
+ --table->n_ref_count;
+
+ /* Force persistent stats re-read upon next open of the table
+ so that FLUSH TABLE can be used to forcibly fetch stats from disk
+ if they have been manually modified. We reset table->stat_initialized
+ only if table reference count is 0 because we do not want too frequent
+ stats re-reads (e.g. in other cases than FLUSH TABLE). */
+ if (strchr(table->name, '/') != NULL
+ && table->n_ref_count == 0
+ && dict_stats_is_persistent_enabled(table)) {
+
+ dict_stats_deinit(table);
+ }
+
+ MONITOR_DEC(MONITOR_TABLE_REFERENCE);
+
+ ut_ad(dict_lru_validate());
+
+#ifdef UNIV_DEBUG
+ if (table->can_be_evicted) {
+ ut_ad(dict_lru_find_table(table));
+ } else {
+ ut_ad(dict_non_lru_find_table(table));
+ }
+#endif /* UNIV_DEBUG */
+
+ if (!dict_locked) {
+ table_id_t table_id = table->id;
+ ibool drop_aborted;
+
+ drop_aborted = try_drop
+ && table->drop_aborted
+ && table->n_ref_count == 1
+ && dict_table_get_first_index(table);
+
+ mutex_exit(&dict_sys->mutex);
+
+ if (drop_aborted) {
+ dict_table_try_drop_aborted(NULL, table_id, 0);
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_nr) /*!< in: column number */
+{
+ ulint i;
+ const char* s;
+
+ ut_ad(table);
+ ut_ad(col_nr < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ s = table->col_names;
+ if (s) {
+ for (i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ mutex_enter(&table->autoinc_mutex);
+}
+
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: next value to assign to a row */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ table->autoinc = value;
+}
+
+/**********************************************************************//**
+Get all the FTS indexes on a table.
+@return number of FTS indexes */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ ib_vector_t* indexes) /*!< out: all FTS indexes on this
+ table */
+{
+ dict_index_t* index;
+
+ ut_a(ib_vector_size(indexes) == 0);
+
+ for (index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type == DICT_FTS) {
+ ib_vector_push(indexes, &index);
+ }
+ }
+
+ return(ib_vector_size(indexes));
+}
+
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ return(table->autoinc);
+}
+
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: value which was assigned to a row */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ if (value > table->autoinc) {
+
+ table->autoinc = value;
+ }
+}
+
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ mutex_exit(&table->autoinc_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+/*=================================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ibool inc_prefix) /*!< in: TRUE=consider
+ column prefixes too */
+{
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ col = dict_table_get_nth_col(index->table, n);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(col, index));
+ }
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col
+ && (inc_prefix || field->prefix_len == 0)) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
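+
+/* Illustration (hypothetical index): for a secondary index on
+(b(10), c), asking for the table column number of "c" with
+inc_prefix == FALSE returns position 1, while asking for "b" returns
+ULINT_UNDEFINED because only a 10-byte prefix of "b" is indexed;
+with inc_prefix == TRUE the same call for "b" returns position 0. */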
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+{
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ if (dict_index_is_clust(index)) {
+
+ return(TRUE);
+ }
+
+ col = dict_table_get_nth_col(index->table, n);
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+{
+ const dict_field_t* field;
+ const dict_field_t* field2;
+ ulint n_fields;
+ ulint pos;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ field2 = dict_index_get_nth_field(index2, n);
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (field->col == field2->col
+ && (field->prefix_len == 0
+ || (field->prefix_len >= field2->prefix_len
+ && field2->prefix_len != 0))) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ dict_table_op_t table_op) /*!< in: operation to perform */
+{
+ dict_table_t* table;
+
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ table = dict_table_open_on_id_low(
+ table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_NONE);
+
+ if (table != NULL) {
+
+ if (table->can_be_evicted) {
+ dict_move_to_mru(table);
+ }
+
+ ++table->n_ref_count;
+
+ MONITOR_INC(MONITOR_TABLE_REFERENCE);
+ }
+
+ if (!dict_locked) {
+ dict_table_try_drop_aborted_and_mutex_exit(
+ table, table_op == DICT_TABLE_OP_DROP_ORPHAN);
+ }
+
+ return(table);
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ return(dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+ n));
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(table);
+
+ col = dict_table_get_nth_col(table, n);
+
+ index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void)
+/*===========*/
+{
+ dict_sys = static_cast<dict_sys_t*>(mem_zalloc(sizeof(*dict_sys)));
+
+ mutex_create(dict_sys_mutex_key, &dict_sys->mutex, SYNC_DICT);
+
+ dict_sys->table_hash = hash_create(buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH
+ * UNIV_WORD_SIZE));
+ dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH
+ * UNIV_WORD_SIZE));
+ rw_lock_create(dict_operation_lock_key,
+ &dict_operation_lock, SYNC_DICT_OPERATION);
+
+ if (!srv_read_only_mode) {
+ dict_foreign_err_file = os_file_create_tmpfile();
+ ut_a(dict_foreign_err_file);
+
+ mutex_create(dict_foreign_err_mutex_key,
+ &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK);
+ }
+}
+
+/**********************************************************************//**
+Move to the most recently used segment of the LRU list. */
+UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+ dict_table_t* table) /*!< in: table to move to MRU */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(dict_lru_validate());
+ ut_ad(dict_lru_find_table(table));
+
+ ut_a(table->can_be_evicted);
+
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+ ut_ad(dict_lru_validate());
+}
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' module. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_name(
+/*====================*/
+ const char* table_name, /*!< in: table name */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ ibool try_drop, /*!< in: TRUE=try to drop any orphan
+ indexes after an aborted online
+ index creation */
+ dict_err_ignore_t
+ ignore_err) /*!< in: error to be ignored when
+ loading a table definition */
+{
+ dict_table_t* table;
+
+ if (!dict_locked) {
+ mutex_enter(&(dict_sys->mutex));
+ }
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table == NULL) {
+ table = dict_load_table(table_name, TRUE, ignore_err);
+ }
+
+ ut_ad(!table || table->cached);
+
+ if (table != NULL) {
+
+ /* If table is corrupted, return NULL */
+ if (ignore_err == DICT_ERR_IGNORE_NONE
+ && table->corrupted) {
+
+ /* Make life easy for drop table. */
+ if (table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(table);
+ }
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: table ");
+ ut_print_name(stderr, NULL, TRUE, table->name);
+ fprintf(stderr, " is corrupted. Please drop the"
+ " table and recreate it.\n");
+
+ return(NULL);
+ }
+
+ if (table->can_be_evicted) {
+ dict_move_to_mru(table);
+ }
+
+ ++table->n_ref_count;
+
+ MONITOR_INC(MONITOR_TABLE_REFERENCE);
+ }
+
+ ut_ad(dict_lru_validate());
+
+ if (!dict_locked) {
+ dict_table_try_drop_aborted_and_mutex_exit(table, try_drop);
+ }
+
+ return(table);
+}
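+
+/* Usage sketch of the open/close pairing (hypothetical table name):
+
+ dict_table_t* table = dict_table_open_on_name(
+ "test/t1", FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (table != NULL) {
+ ...
+ dict_table_close(table, FALSE, FALSE);
+ }
+
+The open call increments table->n_ref_count under dict_sys->mutex and
+the close call decrements it again. */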
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+{
+ ut_ad(table);
+ ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->cached);
+
+ /* NOTE: the system columns MUST be added in the following order
+ (so that they can be indexed by the numerical value of DATA_ROW_ID,
+ etc.) and as the last columns of the table memory object.
+ The clustered index will not always physically contain all
+ system columns. */
+
+ dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS,
+ DATA_ROW_ID | DATA_NOT_NULL,
+ DATA_ROW_ID_LEN);
+#if DATA_ROW_ID != 0
+#error "DATA_ROW_ID != 0"
+#endif
+ dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS,
+ DATA_TRX_ID | DATA_NOT_NULL,
+ DATA_TRX_ID_LEN);
+#if DATA_TRX_ID != 1
+#error "DATA_TRX_ID != 1"
+#endif
+ dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS,
+ DATA_ROLL_PTR | DATA_NOT_NULL,
+ DATA_ROLL_PTR_LEN);
+#if DATA_ROLL_PTR != 2
+#error "DATA_ROLL_PTR != 2"
+#endif
+
+ /* This check reminds that if a new system column is added to
+ the program, it should be dealt with here */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+}
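+
+/* Illustration: a user table t(a INT, b INT) ends up with the
+in-memory column array (a, b, DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR); the
+compile-time checks above pin DATA_ROW_ID, DATA_TRX_ID and
+DATA_ROLL_PTR to 0, 1 and 2, so that the system columns can be fetched
+by those constants, e.g. via dict_table_get_sys_col(). */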
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ ibool can_be_evicted, /*!< in: TRUE if can be evicted */
+ mem_heap_t* heap) /*!< in: temporary heap */
+{
+ ulint fold;
+ ulint id_fold;
+ ulint i;
+ ulint row_len;
+
+ ut_ad(dict_lru_validate());
+
+ /* The lower limit for what we consider a "big" row */
+#define BIG_ROW_SIZE 1024
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_table_add_system_columns(table, heap);
+
+ table->cached = TRUE;
+
+ fold = ut_fold_string(table->name);
+ id_fold = ut_fold_ull(table->id);
+
+ row_len = 0;
+ for (i = 0; i < table->n_def; i++) {
+ ulint col_len = dict_col_get_max_size(
+ dict_table_get_nth_col(table, i));
+
+ row_len += col_len;
+
+ /* If we have a single unbounded field, or several gigantic
+ fields, mark the maximum row size as BIG_ROW_SIZE. */
+ if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) {
+ row_len = BIG_ROW_SIZE;
+
+ break;
+ }
+ }
+
+ table->big_rows = row_len >= BIG_ROW_SIZE;
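+
+ /* Worked example (hypothetical column sizes): three columns with
+ maximum sizes 300 + 400 + 500 = 1200 >= BIG_ROW_SIZE mark the
+ table as big_rows; a single column whose maximum size alone
+ reaches BIG_ROW_SIZE triggers the same early exit via the
+ col_len check. */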
+
+ /* Look for a table with the same name: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ ut_strcmp(table2->name, table->name) == 0);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different name */
+ HASH_SEARCH_ALL(name_hash, dict_sys->table_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Look for a table with the same id: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2->id == table->id);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different id */
+ HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Add table to hash table of tables */
+ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+ table);
+
+ /* Add table to hash table of tables based on table id */
+ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold,
+ table);
+
+ table->can_be_evicted = can_be_evicted;
+
+ if (table->can_be_evicted) {
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+ } else {
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_non_LRU, table);
+ }
+
+ ut_ad(dict_lru_validate());
+
+ dict_sys->size += mem_heap_get_size(table->heap)
+ + strlen(table->name) + 1;
+}
+
+/**********************************************************************//**
+Test whether a table can be evicted from the LRU cache.
+@return TRUE if table can be evicted. */
+static
+ibool
+dict_table_can_be_evicted(
+/*======================*/
+ const dict_table_t* table) /*!< in: table to test */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(table->can_be_evicted);
+ ut_a(table->foreign_set.empty());
+ ut_a(table->referenced_set.empty());
+
+ if (table->n_ref_count == 0) {
+ dict_index_t* index;
+
+ /* The transaction commit and rollback are called from
+ outside the handler interface. This means that there is
+ a window where the table->n_ref_count can be zero but
+ the table instance is in "use". */
+
+ if (lock_table_has_locks(table)) {
+ return(FALSE);
+ }
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ btr_search_t* info = btr_search_get_info(index);
+
+ /* We are not allowed to free the in-memory index
+ struct dict_index_t until all entries in the adaptive
+ hash index that point to any of the pages belonging
+ to this b-tree index are dropped. This is so because
+ dropping of these entries requires access to the
+ dict_index_t struct. To avoid such a scenario we keep
+ a count of the number of such pages in the search_info
+ and only free the dict_index_t struct when this count
+ drops to zero.
+
+ See also: dict_index_remove_from_cache_low() */
+
+ if (btr_search_info_get_ref_count(info) > 0) {
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The evicted
+table must not be part of any FK relationship and must not currently
+be used in any user transaction. There is no guarantee that a table
+will be removed.
+@return number of tables evicted; if the number of tables in the
+dict_LRU is less than max_tables, nothing is done */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+ ulint max_tables, /*!< in: max tables allowed in cache */
+ ulint pct_check) /*!< in: max percent to check */
+{
+ ulint i;
+ ulint len;
+ dict_table_t* table;
+ ulint check_up_to;
+ ulint n_evicted = 0;
+
+ ut_a(pct_check > 0);
+ ut_a(pct_check <= 100);
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(dict_lru_validate());
+
+ i = len = UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+ if (len < max_tables) {
+ return(0);
+ }
+
+ check_up_to = len - ((len * pct_check) / 100);
+
+ /* Check for overflow */
+ ut_a(i == 0 || check_up_to <= i);
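+
+ /* Worked example (hypothetical numbers): with len == 200 LRU
+ entries and pct_check == 10, check_up_to = 200 - 20 = 180, so
+ the loop below inspects at most the 20 least recently used
+ tables. */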
+
+ /* Find a suitable candidate to evict from the cache. Don't scan the
+ entire LRU list. Only scan pct_check list entries. */
+
+ for (table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+ table != NULL
+ && i > check_up_to
+ && (len - n_evicted) > max_tables;
+ --i) {
+
+ dict_table_t* prev_table;
+
+ prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+ if (dict_table_can_be_evicted(table)) {
+
+ dict_table_remove_from_cache_low(table, TRUE);
+
+ ++n_evicted;
+ }
+
+ table = prev_table;
+ }
+
+ return(n_evicted);
+}
+
+/**********************************************************************//**
+Move a table to the non-LRU list from the LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+ dict_table_t* table) /*!< in: table to move from LRU to non-LRU */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(dict_lru_find_table(table));
+
+ ut_a(table->can_be_evicted);
+
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+ UT_LIST_ADD_LAST(table_LRU, dict_sys->table_non_LRU, table);
+
+ table->can_be_evicted = FALSE;
+}
+
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+ dict_table_t* table) /*!< in: table to move from non-LRU to LRU */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(dict_non_lru_find_table(table));
+
+ ut_a(!table->can_be_evicted);
+
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+
+ UT_LIST_ADD_LAST(table_LRU, dict_sys->table_LRU, table);
+
+ table->can_be_evicted = TRUE;
+}
+
+/**********************************************************************//**
+Looks for an index with the given id given a table instance.
+@return index or NULL */
+static
+dict_index_t*
+dict_table_find_index_on_id(
+/*========================*/
+ const dict_table_t* table, /*!< in: table instance */
+ index_id_t id) /*!< in: index id */
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (id == index->id) {
+ /* Found */
+
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found in cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ index_id_t id) /*!< in: index id */
+{
+ dict_table_t* table;
+
+ /* This can happen if the system tablespace is the wrong page size */
+ if (dict_sys == NULL) {
+ return(NULL);
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ dict_index_t* index = dict_table_find_index_on_id(table, id);
+
+ if (index != NULL) {
+ return(index);
+ }
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ dict_index_t* index = dict_table_find_index_on_id(table, id);
+
+ if (index != NULL) {
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/** Function object to remove a foreign key constraint from the
+referenced_set of the referenced table. The foreign key object is
+also removed from the dictionary cache. The foreign key constraint
+is not removed from the foreign_set of the table containing the
+constraint. */
+struct dict_foreign_remove_partial
+{
+ void operator()(dict_foreign_t* foreign) {
+ dict_table_t* table = foreign->referenced_table;
+ if (table != NULL) {
+ table->referenced_set.erase(foreign);
+ }
+ dict_foreign_free(foreign);
+ }
+};
+
+/**********************************************************************//**
+Renames a table object.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ ibool rename_also_foreigns)/*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+{
+ dict_foreign_t* foreign;
+ dict_index_t* index;
+ ulint fold;
+ char old_name[MAX_FULL_NAME_LEN + 1];
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* store the old/current name to an automatic variable */
+ if (strlen(table->name) + 1 <= sizeof(old_name)) {
+ memcpy(old_name, table->name, strlen(table->name) + 1);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: too long table name: '%s', "
+ "max length is %d\n", table->name,
+ MAX_FULL_NAME_LEN);
+ ut_error;
+ }
+
+ fold = ut_fold_string(new_name);
+
+ /* Look for a table with the same name: error if such exists */
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ (ut_strcmp(table2->name, new_name) == 0));
+ DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure",
+ if (table2 == NULL) {
+ table2 = (dict_table_t*) -1;
+ } );
+ if (table2) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot rename table '%s' to '%s' since the "
+ "dictionary cache already contains '%s'.",
+ old_name, new_name, new_name);
+ return(DB_ERROR);
+ }
+
+ /* If the table is stored in a single-table tablespace, rename the
+ .ibd file and rebuild the .isl file if needed. */
+
+ if (dict_table_is_discarded(table)) {
+ os_file_type_t type;
+ ibool exists;
+ char* filepath;
+
+ ut_ad(table->space != TRX_SYS_SPACE);
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+
+ dict_get_and_save_data_dir_path(table, true);
+ ut_a(table->data_dir_path);
+
+ filepath = os_file_make_remote_pathname(
+ table->data_dir_path, table->name, "ibd");
+ } else {
+ filepath = fil_make_ibd_name(table->name, false);
+ }
+
+ fil_delete_tablespace(table->space, BUF_REMOVE_ALL_NO_WRITE);
+
+ /* Delete any temp file hanging around. */
+ if (os_file_status(filepath, &exists, &type)
+ && exists
+ && !os_file_delete_if_exists(innodb_file_temp_key,
+ filepath)) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Delete of %s failed.", filepath);
+ }
+
+ mem_free(filepath);
+
+ } else if (table->space != TRX_SYS_SPACE) {
+ char* new_path = NULL;
+
+ if (table->dir_path_of_temp_table != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: trying to rename a"
+ " TEMPORARY TABLE ", stderr);
+ ut_print_name(stderr, NULL, TRUE, old_name);
+ fputs(" (", stderr);
+ ut_print_filename(stderr,
+ table->dir_path_of_temp_table);
+ fputs(" )\n", stderr);
+ return(DB_ERROR);
+
+ } else if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ char* old_path;
+
+ old_path = fil_space_get_first_path(table->space);
+
+ new_path = os_file_make_new_pathname(
+ old_path, new_name);
+
+ mem_free(old_path);
+
+ dberr_t err = fil_create_link_file(
+ new_name, new_path);
+
+ if (err != DB_SUCCESS) {
+ mem_free(new_path);
+ return(DB_TABLESPACE_EXISTS);
+ }
+ }
+
+ ibool success = fil_rename_tablespace(
+ old_name, table->space, new_name, new_path);
+
+ /* If the tablespace is remote, a new .isl file was created.
+ On success, delete the old one; otherwise delete the new one. */
+ if (new_path) {
+
+ mem_free(new_path);
+ fil_delete_link_file(success ? old_name : new_name);
+ }
+
+ if (!success) {
+ return(DB_ERROR);
+ }
+ }
+
+ /* Remove table from the hash tables of tables */
+ HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+ ut_fold_string(old_name), table);
+
+ if (strlen(new_name) > strlen(table->name)) {
+ /* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
+ memory fragmentation; we assume that repeated calls of
+ ut_realloc() with the same size do not cause fragmentation. */
+ ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN);
+
+ table->name = static_cast<char*>(
+ ut_realloc(table->name, MAX_FULL_NAME_LEN + 1));
+ }
+ memcpy(table->name, new_name, strlen(new_name) + 1);
+
+ /* Add table to hash table of tables */
+ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+ table);
+
+ dict_sys->size += strlen(new_name) - strlen(old_name);
+ ut_a(dict_sys->size > 0);
+
+ /* Update the table_name field in indexes */
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ index->table_name = table->name;
+ }
+
+ if (!rename_also_foreigns) {
+ /* In ALTER TABLE we think of the rename table operation
+ in the direction table -> temporary table (#sql...)
+ as dropping the table with the old name and creating
+ a new one with the new name. Thus we effectively drop the
+ constraints from the dictionary cache here. The foreign key
+ constraints will be inherited by the new table from the
+ system tables through a call of dict_load_foreigns. */
+
+ /* Remove the foreign constraints from the cache */
+ std::for_each(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_remove_partial());
+ table->foreign_set.clear();
+
+ /* Reset table field in referencing constraints */
+ for (dict_foreign_set::iterator it
+ = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+
+ }
+
+ /* Make the set of referencing constraints empty */
+ table->referenced_set.clear();
+
+ return(DB_SUCCESS);
+ }
+
+	/* Update the table name fields in foreign constraints, and also
+	update the constraint ids of new-format (>= 4.0.18) constraints.
+	Note that at this point we have already changed table->name to
+	the new name. */
+
+ dict_foreign_set fk_set;
+
+ for (;;) {
+
+ dict_foreign_set::iterator it
+ = table->foreign_set.begin();
+
+ if (it == table->foreign_set.end()) {
+ break;
+ }
+
+ foreign = *it;
+
+ if (foreign->referenced_table) {
+ foreign->referenced_table->referenced_set.erase(foreign);
+ }
+
+ if (ut_strlen(foreign->foreign_table_name)
+ < ut_strlen(table->name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->foreign_table_name = mem_heap_strdup(
+ foreign->heap, table->name);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+ } else {
+ strcpy(foreign->foreign_table_name, table->name);
+ dict_mem_foreign_table_name_lookup_set(foreign, FALSE);
+ }
+ if (strchr(foreign->id, '/')) {
+ /* This is a >= 4.0.18 format id */
+
+ ulint db_len;
+ char* old_id;
+ char old_name_cs_filename[MAX_TABLE_NAME_LEN+20];
+ uint errors = 0;
+
+ /* All table names are internally stored in charset
+ my_charset_filename (except the temp tables and the
+ partition identifier suffix in partition tables). The
+ foreign key constraint names are internally stored
+ in UTF-8 charset. The variable fkid here is used
+ to store foreign key constraint name in charset
+ my_charset_filename for comparison further below. */
+ char fkid[MAX_TABLE_NAME_LEN+20];
+ ibool on_tmp = FALSE;
+
+ /* The old table name in my_charset_filename is stored
+ in old_name_cs_filename */
+
+ strncpy(old_name_cs_filename, old_name,
+ MAX_TABLE_NAME_LEN);
+ if (strstr(old_name, TEMP_TABLE_PATH_PREFIX) == NULL) {
+
+ innobase_convert_to_system_charset(
+ strchr(old_name_cs_filename, '/') + 1,
+ strchr(old_name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+					/* There was an error converting
+					the old table name into UTF-8. This
+					probably means that the old table
+					name is actually in UTF-8. */
+ innobase_convert_to_filename_charset(
+ strchr(old_name_cs_filename,
+ '/') + 1,
+ strchr(old_name, '/') + 1,
+ MAX_TABLE_NAME_LEN);
+ } else {
+ /* Old name already in
+ my_charset_filename */
+ strncpy(old_name_cs_filename, old_name,
+ MAX_TABLE_NAME_LEN);
+ }
+ }
+
+ strncpy(fkid, foreign->id, MAX_TABLE_NAME_LEN);
+
+ if (strstr(fkid, TEMP_TABLE_PATH_PREFIX) == NULL) {
+ innobase_convert_to_filename_charset(
+ strchr(fkid, '/') + 1,
+ strchr(foreign->id, '/') + 1,
+ MAX_TABLE_NAME_LEN+20);
+ } else {
+ on_tmp = TRUE;
+ }
+
+ old_id = mem_strdup(foreign->id);
+
+ if (ut_strlen(fkid) > ut_strlen(old_name_cs_filename)
+ + ((sizeof dict_ibfk) - 1)
+ && !memcmp(fkid, old_name_cs_filename,
+ ut_strlen(old_name_cs_filename))
+ && !memcmp(fkid + ut_strlen(old_name_cs_filename),
+ dict_ibfk, (sizeof dict_ibfk) - 1)) {
+
+ /* This is a generated >= 4.0.18 format id */
+
+ char table_name[MAX_TABLE_NAME_LEN] = "";
+ uint errors = 0;
+
+ if (strlen(table->name) > strlen(old_name)) {
+ foreign->id = static_cast<char*>(
+ mem_heap_alloc(
+ foreign->heap,
+ strlen(table->name)
+ + strlen(old_id) + 1));
+ }
+
+ /* Convert the table name to UTF-8 */
+ strncpy(table_name, table->name,
+ MAX_TABLE_NAME_LEN);
+ innobase_convert_to_system_charset(
+ strchr(table_name, '/') + 1,
+ strchr(table->name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+					/* The table name could not be
+					converted from charset
+					my_charset_filename to UTF-8. This
+					means that the table name is already
+					in UTF-8 (#mysql50#). */
+ strncpy(table_name, table->name,
+ MAX_TABLE_NAME_LEN);
+ }
+
+ /* Replace the prefix 'databasename/tablename'
+ with the new names */
+ strcpy(foreign->id, table_name);
+ if (on_tmp) {
+ strcat(foreign->id,
+ old_id + ut_strlen(old_name));
+ } else {
+				sprintf(strchr(foreign->id, '/') + 1,
+					"%s%s",
+					strchr(table_name, '/') + 1,
+					strstr(old_id, "_ibfk_"));
+ }
+
+ } else {
+ /* This is a >= 4.0.18 format id where the user
+ gave the id name */
+ db_len = dict_get_db_name_len(table->name) + 1;
+
+ if (dict_get_db_name_len(table->name)
+ > dict_get_db_name_len(foreign->id)) {
+
+ foreign->id = static_cast<char*>(
+ mem_heap_alloc(
+ foreign->heap,
+ db_len + strlen(old_id) + 1));
+ }
+
+ /* Replace the database prefix in id with the
+ one from table->name */
+
+ ut_memcpy(foreign->id, table->name, db_len);
+
+ strcpy(foreign->id + db_len,
+ dict_remove_db_name(old_id));
+ }
+
+ mem_free(old_id);
+ }
+
+ table->foreign_set.erase(it);
+ fk_set.insert(foreign);
+
+ if (foreign->referenced_table) {
+ foreign->referenced_table->referenced_set.insert(foreign);
+ }
+ }
+
+ ut_a(table->foreign_set.empty());
+ table->foreign_set.swap(fk_set);
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (ut_strlen(foreign->referenced_table_name)
+ < ut_strlen(table->name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->referenced_table_name = mem_heap_strdup(
+ foreign->heap, table->name);
+
+ dict_mem_referenced_table_name_lookup_set(
+ foreign, TRUE);
+ } else {
+ /* Use the same buffer */
+ strcpy(foreign->referenced_table_name, table->name);
+
+ dict_mem_referenced_table_name_lookup_set(
+ foreign, FALSE);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
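+
+/* Usage sketch (hypothetical caller, shown for illustration only; like
+the other cache operations in this file, the function must be called
+under dict_sys->mutex):
+
+	mutex_enter(&dict_sys->mutex);
+	err = dict_table_rename_in_cache(table, new_name, TRUE);
+	mutex_exit(&dict_sys->mutex);
+
+Passing rename_also_foreigns = FALSE is the ALTER TABLE style of rename,
+where the foreign key constraints are reloaded from the system tables
+afterwards, as explained in the comment inside the function. */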
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ table_id_t new_id) /*!< in: new id to set */
+{
+ ut_ad(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Remove the table from the hash table of id's */
+
+ HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_ull(table->id), table);
+ table->id = new_id;
+
+ /* Add the table back to the hash table */
+ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_ull(table->id), table);
+}
+
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+static
+void
+dict_table_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in, own: table */
+	ibool		lru_evict)	/*!< in: TRUE if the table is being
+					evicted to make room in the table
+					LRU list */
+{
+ dict_foreign_t* foreign;
+ dict_index_t* index;
+ ulint size;
+
+ ut_ad(table);
+ ut_ad(dict_lru_validate());
+ ut_a(table->n_ref_count == 0);
+ ut_a(table->n_rec_locks == 0);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Remove the foreign constraints from the cache */
+ std::for_each(table->foreign_set.begin(), table->foreign_set.end(),
+ dict_foreign_remove_partial());
+ table->foreign_set.clear();
+
+ /* Reset table field in referencing constraints */
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+ }
+
+ /* Remove the indexes from the cache */
+
+ for (index = UT_LIST_GET_LAST(table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_LAST(table->indexes)) {
+
+ dict_index_remove_from_cache_low(table, index, lru_evict);
+ }
+
+ /* Remove table from the hash tables of tables */
+
+ HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+ ut_fold_string(table->name), table);
+
+ HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_ull(table->id), table);
+
+ /* Remove table from LRU or non-LRU list. */
+ if (table->can_be_evicted) {
+ ut_ad(dict_lru_find_table(table));
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+ } else {
+ ut_ad(dict_non_lru_find_table(table));
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_non_LRU, table);
+ }
+
+ ut_ad(dict_lru_validate());
+
+ if (lru_evict && table->drop_aborted) {
+ /* Do as dict_table_try_drop_aborted() does. */
+
+ trx_t* trx = trx_allocate_for_background();
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ /* Mimic row_mysql_lock_data_dictionary(). */
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ /* Silence a debug assertion in row_merge_drop_indexes(). */
+ ut_d(table->n_ref_count++);
+ row_merge_drop_indexes(trx, table, TRUE);
+ ut_d(table->n_ref_count--);
+ ut_ad(table->n_ref_count == 0);
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx_free_for_background(trx);
+ }
+
+ size = mem_heap_get_size(table->heap) + strlen(table->name) + 1;
+
+ ut_ad(dict_sys->size >= size);
+
+ dict_sys->size -= size;
+
+ dict_mem_table_free(table);
+}
+
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table) /*!< in, own: table */
+{
+ dict_table_remove_from_cache_low(table, FALSE);
+}
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+{
+	/* This check serves as a reminder: if a new system column is
+	added to the program, it must be dealt with here. */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+
+ static const char* reserved_names[] = {
+ "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
+ };
+
+ ulint i;
+
+ for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) {
+ if (innobase_strcasecmp(name, reserved_names[i]) == 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
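+
+/* Usage sketch (illustrative; my_error() and ER_WRONG_COLUMN_NAME are
+assumptions about a handler-layer caller, not taken from this file):
+
+	if (dict_col_name_is_reserved(field->field_name)) {
+		my_error(ER_WRONG_COLUMN_NAME, MYF(0), field->field_name);
+		return(HA_ERR_GENERIC);
+	}
+*/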
+
+#if 1 /* This function is not very accurate at determining
+ whether an UNDO record will be too big. See innodb_4k.test,
+ Bug 13336585, for a testcase that shows an index that can
+ be created but cannot be updated. */
+
+/****************************************************************//**
+If an undo log record for this table might not fit on a single page,
+return TRUE.
+@return TRUE if the undo log record could become too big */
+static
+ibool
+dict_index_too_big_for_undo(
+/*========================*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* new_index) /*!< in: index */
+{
+ /* Make sure that all column prefixes will fit in the undo log record
+ in trx_undo_page_report_modify() right after trx_undo_page_init(). */
+
+ ulint i;
+ const dict_index_t* clust_index
+ = dict_table_get_first_index(table);
+ ulint undo_page_len
+ = TRX_UNDO_PAGE_HDR - TRX_UNDO_PAGE_HDR_SIZE
+ + 2 /* next record pointer */
+ + 1 /* type_cmpl */
+ + 11 /* trx->undo_no */ + 11 /* table->id */
+ + 1 /* rec_get_info_bits() */
+ + 11 /* DB_TRX_ID */
+ + 11 /* DB_ROLL_PTR */
+ + 10 + FIL_PAGE_DATA_END /* trx_undo_left() */
+ + 2/* pointer to previous undo log record */;
+
+	/* An FTS index consists of auxiliary tables, so it is excluded
+	from the index row size check. */
+ if (new_index->type & DICT_FTS) {
+ return(false);
+ }
+
+ if (!clust_index) {
+ ut_a(dict_index_is_clust(new_index));
+ clust_index = new_index;
+ }
+
+ /* Add the size of the ordering columns in the
+ clustered index. */
+ for (i = 0; i < clust_index->n_uniq; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(clust_index, i);
+
+ /* Use the maximum output size of
+ mach_write_compressed(), although the encoded
+ length should always fit in 2 bytes. */
+ undo_page_len += 5 + dict_col_get_max_size(col);
+ }
+
+ /* Add the old values of the columns to be updated.
+	First, the count and the numbers of the columns.
+ These are written by mach_write_compressed() whose
+ maximum output length is 5 bytes. However, given that
+ the quantities are below REC_MAX_N_FIELDS (10 bits),
+ the maximum length is 2 bytes per item. */
+ undo_page_len += 2 * (dict_table_get_n_cols(table) + 1);
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(clust_index, i);
+ ulint max_size
+ = dict_col_get_max_size(col);
+ ulint fixed_size
+ = dict_col_get_fixed_size(col,
+ dict_table_is_comp(table));
+ ulint max_prefix
+ = col->max_prefix;
+
+ if (fixed_size) {
+ /* Fixed-size columns are stored locally. */
+ max_size = fixed_size;
+ } else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+ /* Short columns are stored locally. */
+ } else if (!col->ord_part
+ || (col->max_prefix
+ < (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table))) {
+ /* See if col->ord_part would be set
+ because of new_index. Also check if the new
+ index could have longer prefix on columns
+ that already had ord_part set */
+ ulint j;
+
+ for (j = 0; j < new_index->n_uniq; j++) {
+ if (dict_index_get_nth_col(
+ new_index, j) == col) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(
+ new_index, j);
+
+ if (field->prefix_len
+ > col->max_prefix) {
+ max_prefix =
+ field->prefix_len;
+ }
+
+ goto is_ord_part;
+ }
+ }
+
+ if (col->ord_part) {
+ goto is_ord_part;
+ }
+
+ /* This is not an ordering column in any index.
+ Thus, it can be stored completely externally. */
+ max_size = BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ ulint max_field_len;
+is_ord_part:
+ max_field_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+
+ /* This is an ordering column in some index.
+ A long enough prefix must be written to the
+ undo log. See trx_undo_page_fetch_ext(). */
+ max_size = ut_min(max_size, max_field_len);
+
+ /* We only store the needed prefix length in undo log */
+ if (max_prefix) {
+ ut_ad(dict_table_get_format(table)
+ >= UNIV_FORMAT_B);
+
+ max_size = ut_min(max_prefix, max_size);
+ }
+
+ max_size += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ undo_page_len += 5 + max_size;
+ }
+
+ return(undo_page_len >= UNIV_PAGE_SIZE);
+}
+#endif
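+
+/* Back-of-envelope sketch of the estimate above (assuming the default
+UNIV_PAGE_SIZE of 16384 and the UNIV_FORMAT_B column prefix limit of
+3072 bytes): each externally storable ordering column can contribute up
+to 5 + 3072 + BTR_EXTERN_FIELD_REF_SIZE (20) = 3097 bytes to
+undo_page_len, so on the order of five or six such columns already push
+the estimate past the page size and make the function return TRUE. */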
+
+/****************************************************************//**
+If a record of this index might not fit on a single B-tree page,
+return TRUE.
+@return TRUE if the index record could become too big */
+static
+ibool
+dict_index_too_big_for_tree(
+/*========================*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* new_index) /*!< in: index */
+{
+ ulint zip_size;
+ ulint comp;
+ ulint i;
+ /* maximum possible storage size of a record */
+ ulint rec_max_size;
+ /* maximum allowed size of a record on a leaf page */
+ ulint page_rec_max;
+ /* maximum allowed size of a node pointer record */
+ ulint page_ptr_max;
+
+	/* An FTS index consists of auxiliary tables, so it is excluded
+	from the index row size check. */
+ if (new_index->type & DICT_FTS) {
+ return(false);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_force_create_table",
+ return(FALSE););
+
+ comp = dict_table_is_comp(table);
+ zip_size = dict_table_zip_size(table);
+
+ if (zip_size && zip_size < UNIV_PAGE_SIZE) {
+ /* On a compressed page, two records must fit in the
+ uncompressed page modification log. On compressed
+ pages with zip_size == UNIV_PAGE_SIZE, this limit will
+ never be reached. */
+ ut_ad(comp);
+ /* The maximum allowed record size is the size of
+		an empty page, minus a byte for recording the heap
+ number in the page modification log. The maximum
+ allowed node pointer size is half that. */
+ page_rec_max = page_zip_empty_size(new_index->n_fields,
+ zip_size);
+ if (page_rec_max) {
+ page_rec_max--;
+ }
+ page_ptr_max = page_rec_max / 2;
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. */
+ rec_max_size = 2;
+ } else {
+ /* The maximum allowed record size is half a B-tree
+ page. No additional sparse page directory entry will
+ be generated for the first few user records. */
+ page_rec_max = page_get_free_space_of_empty(comp) / 2;
+ page_ptr_max = page_rec_max;
+ /* Each record has a header. */
+ rec_max_size = comp
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES;
+ }
+
+ if (comp) {
+ /* Include the "null" flags in the
+ maximum possible record size. */
+ rec_max_size += UT_BITS_IN_BYTES(new_index->n_nullable);
+ } else {
+ /* For each column, include a 2-byte offset and a
+ "null" flag. The 1-byte format is only used in short
+ records that do not contain externally stored columns.
+ Such records could never exceed the page limit, even
+ when using the 2-byte format. */
+ rec_max_size += 2 * new_index->n_fields;
+ }
+
+ /* Compute the maximum possible record size. */
+ for (i = 0; i < new_index->n_fields; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint field_max_size;
+ ulint field_ext_max_size;
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+ may be chosen for external storage.
+
+ Fixed-length columns, and all columns of secondary
+ index records are always stored inline. */
+
+ /* Determine the maximum length of the index field.
+ The field_ext_max_size should be computed as the worst
+ case in rec_get_converted_size_comp() for
+ REC_STATUS_ORDINARY records. */
+
+ field_max_size = dict_col_get_fixed_size(col, comp);
+ if (field_max_size) {
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || field->fixed_len == field->prefix_len);
+ /* Fixed lengths are not encoded
+ in ROW_FORMAT=COMPACT. */
+ field_ext_max_size = 0;
+ goto add_field_size;
+ }
+
+ field_max_size = dict_col_get_max_size(col);
+ field_ext_max_size = field_max_size < 256 ? 1 : 2;
+
+ if (field->prefix_len) {
+ if (field->prefix_len < field_max_size) {
+ field_max_size = field->prefix_len;
+ }
+ } else if (field_max_size > BTR_EXTERN_FIELD_REF_SIZE * 2
+ && dict_index_is_clust(new_index)) {
+
+ /* In the worst case, we have a locally stored
+ column of BTR_EXTERN_FIELD_REF_SIZE * 2 bytes.
+ The length can be stored in one byte. If the
+ column were stored externally, the lengths in
+ the clustered index page would be
+ BTR_EXTERN_FIELD_REF_SIZE and 2. */
+ field_max_size = BTR_EXTERN_FIELD_REF_SIZE * 2;
+ field_ext_max_size = 1;
+ }
+
+ if (comp) {
+ /* Add the extra size for ROW_FORMAT=COMPACT.
+ For ROW_FORMAT=REDUNDANT, these bytes were
+ added to rec_max_size before this loop. */
+ rec_max_size += field_ext_max_size;
+ }
+add_field_size:
+ rec_max_size += field_max_size;
+
+ /* Check the size limit on leaf pages. */
+ if (UNIV_UNLIKELY(rec_max_size >= page_rec_max)) {
+
+ return(TRUE);
+ }
+
+ /* Check the size limit on non-leaf pages. Records
+ stored in non-leaf B-tree pages consist of the unique
+ columns of the record (the key columns of the B-tree)
+ and a node pointer field. When we have processed the
+ unique columns, rec_max_size equals the size of the
+ node pointer record minus the node pointer column. */
+ if (i + 1 == dict_index_get_n_unique_in_tree(new_index)
+ && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
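+
+/* Worked example for the uncompressed case (assuming the default
+UNIV_PAGE_SIZE of 16384): page_get_free_space_of_empty() leaves roughly
+16252 bytes, so page_rec_max is about 8126 bytes. This is the familiar
+"Row size too large (> 8126)" limit: an index whose maximum possible
+record size reaches that bound makes this function return TRUE. */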
+
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+dberr_t
+dict_index_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table on which the index is */
+ dict_index_t* index, /*!< in, own: index; NOTE! The index memory
+ object is freed in this function! */
+ ulint page_no,/*!< in: root page number of the index */
+ ibool strict) /*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+ an B-tree page */
+{
+ dict_index_t* new_index;
+ ulint n_ord;
+ ulint i;
+
+ ut_ad(index);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(index->n_def == index->n_fields);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ ut_ad(mem_heap_validate(index->heap));
+ ut_a(!dict_index_is_clust(index)
+ || UT_LIST_GET_LEN(table->indexes) == 0);
+
+ if (!dict_index_find_cols(table, index)) {
+
+ dict_mem_index_free(index);
+ return(DB_CORRUPTION);
+ }
+
+ /* Build the cache internal representation of the index,
+ containing also the added system fields */
+
+ if (index->type == DICT_FTS) {
+ new_index = dict_index_build_internal_fts(table, index);
+ } else if (dict_index_is_clust(index)) {
+ new_index = dict_index_build_internal_clust(table, index);
+ } else {
+ new_index = dict_index_build_internal_non_clust(table, index);
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields in the cache internal representation */
+
+ new_index->n_fields = new_index->n_def;
+ new_index->trx_id = index->trx_id;
+
+ if (dict_index_too_big_for_tree(table, new_index)) {
+
+ if (strict) {
+too_big:
+ dict_mem_index_free(new_index);
+ dict_mem_index_free(index);
+ return(DB_TOO_BIG_RECORD);
+ } else if (current_thd != NULL) {
+ /* Avoid the warning to be printed
+ during recovery. */
+ ib_warn_row_too_big(table);
+ }
+ }
+
+ if (dict_index_is_univ(index)) {
+ n_ord = new_index->n_fields;
+ } else {
+ n_ord = new_index->n_uniq;
+ }
+
+#if 1 /* The following code predetermines whether to call
+ dict_index_too_big_for_undo(). This function is not
+ accurate. See innodb_4k.test, Bug 13336585, for a
+ testcase that shows an index that can be created but
+ cannot be updated. */
+
+ switch (dict_table_get_format(table)) {
+ case UNIV_FORMAT_A:
+ /* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
+ prefixes of externally stored columns locally within
+ the record. There are no special considerations for
+ the undo log record size. */
+ goto undo_size_ok;
+
+ case UNIV_FORMAT_B:
+ /* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
+ column prefix indexes require that prefixes of
+ externally stored columns are written to the undo log.
+ This may make the undo log record bigger than the
+ record on the B-tree page. The maximum size of an
+ undo log record is the page size. That must be
+ checked for below. */
+ break;
+
+#if UNIV_FORMAT_B != UNIV_FORMAT_MAX
+# error "UNIV_FORMAT_B != UNIV_FORMAT_MAX"
+#endif
+ }
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+ may be chosen for external storage. If the column appears
+ in an ordering column of an index, a longer prefix determined
+ by dict_max_field_len_store_undo() will be copied to the undo
+ log by trx_undo_page_report_modify() and
+ trx_undo_page_fetch_ext(). It suffices to check the
+ capacity of the undo log whenever new_index includes
+ a column prefix on a column that may be stored externally. */
+
+ if (field->prefix_len /* prefix index */
+ && (!col->ord_part /* not yet ordering column */
+ || field->prefix_len > col->max_prefix)
+ && !dict_col_get_fixed_size(col, TRUE) /* variable-length */
+ && dict_col_get_max_size(col)
+ > BTR_EXTERN_FIELD_REF_SIZE * 2 /* long enough */) {
+
+ if (dict_index_too_big_for_undo(table, new_index)) {
+ /* An undo log record might not fit in
+ a single page. Refuse to create this index. */
+
+ goto too_big;
+ }
+
+ break;
+ }
+ }
+
+undo_size_ok:
+#endif
+ /* Flag the ordering columns and also set column max_prefix */
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+
+ field->col->ord_part = 1;
+
+ if (field->prefix_len > field->col->max_prefix) {
+ field->col->max_prefix = field->prefix_len;
+ }
+ }
+
+ if (!dict_index_is_univ(new_index)) {
+
+ new_index->stat_n_diff_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_diff_key_vals)));
+
+ new_index->stat_n_sample_sizes =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_sample_sizes)));
+
+ new_index->stat_n_non_null_key_vals =
+ static_cast<ib_uint64_t*>(mem_heap_zalloc(
+ new_index->heap,
+ dict_index_get_n_unique(new_index)
+ * sizeof(*new_index->stat_n_non_null_key_vals)));
+ }
+
+ new_index->stat_index_size = 1;
+ new_index->stat_n_leaf_pages = 1;
+
+ /* Add the new index as the last index for the table */
+
+ UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
+ new_index->table = table;
+ new_index->table_name = table->name;
+ new_index->search_info = btr_search_info_create(new_index->heap);
+
+ new_index->page = page_no;
+ rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
+ dict_index_is_ibuf(index)
+ ? SYNC_IBUF_INDEX_TREE : SYNC_INDEX_TREE);
+
+ dict_sys->size += mem_heap_get_size(new_index->heap);
+
+ dict_mem_index_free(index);
+
+ return(DB_SUCCESS);
+}
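+
+/* Usage sketch (hypothetical caller; FIL_NULL stands in for a root page
+that has not been allocated yet). Note the ownership transfer: the index
+object passed in is always consumed, whether the call succeeds or fails:
+
+	err = dict_index_add_to_cache(table, index, FIL_NULL, strict);
+	if (err != DB_SUCCESS) {
+		... "index" has already been freed; do not touch it ...
+	}
+*/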
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+static
+void
+dict_index_remove_from_cache_low(
+/*=============================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index, /*!< in, own: index */
+	ibool		lru_evict)	/*!< in: TRUE if the index is being
+					evicted to make room in the table
+					LRU list */
+{
+ ulint size;
+ ulint retries = 0;
+ btr_search_t* info;
+
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* No need to acquire the dict_index_t::lock here because
+ there can't be any active operations on this index (or table). */
+
+ if (index->online_log) {
+ ut_ad(index->online_status == ONLINE_INDEX_CREATION);
+ row_log_free(index->online_log);
+ }
+
+	/* We always create search info, whether or not the adaptive
+	hash index is enabled. */
+ info = btr_search_get_info(index);
+ ut_ad(info);
+
+	/* We are not allowed to free the in-memory index struct
+	dict_index_t until all entries in the adaptive hash index
+	that point to any of the pages belonging to this B-tree index
+	are dropped. This is because dropping these entries requires
+	access to the dict_index_t struct. To avoid such a scenario we
+	keep a count of the number of such pages in the search_info and
+	only free the dict_index_t struct when this count drops to
+	zero. See also: dict_table_can_be_evicted() */
+
+ do {
+ ulint ref_count = btr_search_info_get_ref_count(info);
+
+ if (ref_count == 0) {
+ break;
+ }
+
+ /* Sleep for 10ms before trying again. */
+ os_thread_sleep(10000);
+ ++retries;
+
+ if (retries % 500 == 0) {
+ /* No luck after 5 seconds of wait. */
+ fprintf(stderr, "InnoDB: Error: Waited for"
+ " %lu secs for hash index"
+ " ref_count (%lu) to drop"
+ " to 0.\n"
+ "index: \"%s\""
+ " table: \"%s\"\n",
+ retries/100,
+ ref_count,
+ index->name,
+ table->name);
+ }
+
+ /* To avoid a hang here we commit suicide if the
+ ref_count doesn't drop to zero in 600 seconds. */
+ if (retries >= 60000) {
+ ut_error;
+ }
+ } while (srv_shutdown_state == SRV_SHUTDOWN_NONE || !lru_evict);
+
+ rw_lock_free(&index->lock);
+
+ /* Remove the index from the list of indexes of the table */
+ UT_LIST_REMOVE(indexes, table->indexes, index);
+
+ size = mem_heap_get_size(index->heap);
+
+ ut_ad(dict_sys->size >= size);
+
+ dict_sys->size -= size;
+
+ dict_mem_index_free(index);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+{
+ dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: index */
+{
+ ulint i;
+
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for (i = 0; i < index->n_fields; i++) {
+ ulint j;
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+
+ for (j = 0; j < table->n_cols; j++) {
+ if (!strcmp(dict_table_get_col_name(table, j),
+ field->name)) {
+ field->col = dict_table_get_nth_col(table, j);
+
+ goto found;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ /* It is an error not to find a matching column. */
+ fputs("InnoDB: Error: no matching column for ", stderr);
+ ut_print_name(stderr, NULL, FALSE, field->name);
+ fputs(" in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs("!\n", stderr);
+#endif /* UNIV_DEBUG */
+ return(FALSE);
+
+found:
+ ;
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Adds a column to index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len) /*!< in: column prefix length */
+{
+ dict_field_t* field;
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+ dict_mem_index_add_field(index, col_name, prefix_len);
+
+ field = dict_index_get_nth_field(index, index->n_def - 1);
+
+ field->col = col;
+ field->fixed_len = (unsigned int) dict_col_get_fixed_size(
+ col, dict_table_is_comp(table));
+
+ if (prefix_len && field->fixed_len > prefix_len) {
+ field->fixed_len = (unsigned int) prefix_len;
+ }
+
+ /* Long fixed-length fields that need external storage are treated as
+ variable-length fields, so that the extern flag can be embedded in
+ the length word. */
+
+ if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) {
+ field->fixed_len = 0;
+ }
+#if DICT_MAX_FIXED_COL_LEN != 768
+ /* The comparison limit above must be constant. If it were
+ changed, the disk format of some fixed-length columns would
+ change, which would be a disaster. */
+# error "DICT_MAX_FIXED_COL_LEN != 768"
+#endif
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ index->n_nullable++;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies fields contained in index2 to index1. */
+static
+void
+dict_index_copy(
+/*============*/
+ dict_index_t* index1, /*!< in: index to copy to */
+ dict_index_t* index2, /*!< in: index to copy from */
+ const dict_table_t* table, /*!< in: table */
+ ulint start, /*!< in: first position to copy */
+ ulint end) /*!< in: last position to copy */
+{
+ dict_field_t* field;
+ ulint i;
+
+ /* Copy fields contained in index2 */
+
+ for (i = start; i < end; i++) {
+
+ field = dict_index_get_nth_field(index2, i);
+ dict_index_add_col(index1, table, field->col,
+ field->prefix_len);
+ }
+}
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+{
+ ulint i;
+
+ if (dict_index_is_univ(index)) {
+ dtuple_set_types_binary(tuple, n_fields);
+
+ return;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* ifield;
+ dtype_t* dfield_type;
+
+ ifield = dict_index_get_nth_field(index, i);
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dict_col_copy_type(dict_field_get_col(ifield), dfield_type);
+ }
+}
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+{
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ dfield_set_null(dfield);
+ dict_col_copy_type(dict_table_get_nth_col(table, i), dtype);
+ }
+}
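+
+/* Usage sketch (illustrative), showing the calling sequence intended by
+the comment above:
+
+	dtuple_t*	row = dtuple_create(
+		heap, dict_table_get_n_cols(table));
+
+	dict_table_copy_types(row, table);
+
+After this, every field of "row" has its type set and its value is the
+SQL NULL value. */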
+
+/********************************************************************//**
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+	dict_table_t*	table,	/*!< in: table */
+	ulint		delay)	/*!< in: time in microseconds to wait between
+				checks of bg_threads. */
+{
+ fts_t* fts = table->fts;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&fts->bg_threads_mutex));
+#endif /* UNIV_SYNC_DEBUG */
+
+ while (fts->bg_threads > 0) {
+ mutex_exit(&fts->bg_threads_mutex);
+
+ os_thread_sleep(delay);
+
+ mutex_enter(&fts->bg_threads_mutex);
+ }
+}
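+
+/* Usage sketch (illustrative; per the note above, the caller must hold
+fts->bg_threads_mutex, which the loop releases while it sleeps):
+
+	mutex_enter(&table->fts->bg_threads_mutex);
+	dict_table_wait_for_bg_threads_to_exit(table, 250000);
+	mutex_exit(&table->fts->bg_threads_mutex);
+
+Here 250000 (250 ms) is an arbitrary illustrative polling delay. */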
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: user representation of
+ a clustered index */
+{
+ dict_index_t* new_index;
+ dict_field_t* field;
+ ulint trx_id_pos;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(table && index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Create a new index object with certainly enough fields */
+ new_index = dict_mem_index_create(table->name,
+ index->name, table->space,
+ index->type,
+ index->n_fields + table->n_cols);
+
+ /* Copy other relevant data from the old index struct to the new
+ struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy the fields of index */
+ dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+ if (dict_index_is_univ(index)) {
+ /* No fixed number of fields determines an entry uniquely */
+
+ new_index->n_uniq = REC_MAX_N_FIELDS;
+
+ } else if (dict_index_is_unique(index)) {
+ /* Only the fields defined so far are needed to identify
+ the index entry uniquely */
+
+ new_index->n_uniq = new_index->n_def;
+ } else {
+ /* Also the row id is needed to identify the entry */
+ new_index->n_uniq = 1 + new_index->n_def;
+ }
+
+ new_index->trx_id_offset = 0;
+
+ if (!dict_index_is_ibuf(index)) {
+ /* Add system columns, trx id first */
+
+ trx_id_pos = new_index->n_def;
+
+#if DATA_ROW_ID != 0
+# error "DATA_ROW_ID != 0"
+#endif
+#if DATA_TRX_ID != 1
+# error "DATA_TRX_ID != 1"
+#endif
+#if DATA_ROLL_PTR != 2
+# error "DATA_ROLL_PTR != 2"
+#endif
+
+ if (!dict_index_is_unique(index)) {
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(
+ table, DATA_ROW_ID),
+ 0);
+ trx_id_pos++;
+ }
+
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(table, DATA_TRX_ID),
+ 0);
+
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(table,
+ DATA_ROLL_PTR),
+ 0);
+
+ for (i = 0; i < trx_id_pos; i++) {
+
+ ulint fixed_size = dict_col_get_fixed_size(
+ dict_index_get_nth_col(new_index, i),
+ dict_table_is_comp(table));
+
+ if (fixed_size == 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ if (dict_index_get_nth_field(new_index, i)->prefix_len
+ > 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ /* Add fixed_size to new_index->trx_id_offset.
+ Because the latter is a bit-field, an overflow
+ can theoretically occur. Check for it. */
+ fixed_size += new_index->trx_id_offset;
+
+ new_index->trx_id_offset = fixed_size;
+
+ if (new_index->trx_id_offset != fixed_size) {
+ /* Overflow. Pretend that this is a
+ variable-length PRIMARY KEY. */
+ ut_ad(0);
+ new_index->trx_id_offset = 0;
+ break;
+ }
+ }
+
+ }
+
+ /* Remember the table columns already contained in new_index */
+ indexed = static_cast<ibool*>(
+ mem_zalloc(table->n_cols * sizeof *indexed));
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index non-system columns of table not yet included
+ there */
+ for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+
+ dict_col_t* col = dict_table_get_nth_col(table, i);
+ ut_ad(col->mtype != DATA_SYS);
+
+ if (!indexed[col->ind]) {
+ dict_index_add_col(new_index, table, col, 0);
+ }
+ }
+
+ mem_free(indexed);
+
+ ut_ad(dict_index_is_ibuf(index)
+ || (UT_LIST_GET_LEN(table->indexes) == 0));
+
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
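+
+/* Illustration of the layout built above (a standard example, not code
+from this file): for CREATE TABLE t(a INT PRIMARY KEY, b INT, c INT) the
+internal clustered index contains
+
+	(a, DB_TRX_ID, DB_ROLL_PTR, b, c)
+
+with n_uniq = 1. If t had no usable PRIMARY KEY, a generated DB_ROW_ID
+column would be prepended and n_uniq would count it. */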
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: user representation of
+ a non-clustered index */
+{
+ dict_field_t* field;
+ dict_index_t* new_index;
+ dict_index_t* clust_index;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(table && index);
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* The clustered index should be the first in the list of indexes */
+ clust_index = UT_LIST_GET_FIRST(table->indexes);
+
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+ ut_ad(!dict_index_is_univ(clust_index));
+
+ /* Create a new index */
+ new_index = dict_mem_index_create(
+ table->name, index->name, index->space, index->type,
+ index->n_fields + 1 + clust_index->n_uniq);
+
+ /* Copy other relevant data from the old index
+ struct to the new struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy fields from index to new_index */
+ dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+ /* Remember the table columns already contained in new_index */
+ indexed = static_cast<ibool*>(
+ mem_zalloc(table->n_cols * sizeof *indexed));
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index the columns necessary to determine the clustered
+ index entry uniquely */
+
+ for (i = 0; i < clust_index->n_uniq; i++) {
+
+ field = dict_index_get_nth_field(clust_index, i);
+
+ if (!indexed[field->col->ind]) {
+ dict_index_add_col(new_index, table, field->col,
+ field->prefix_len);
+ }
+ }
+
+ mem_free(indexed);
+
+ if (dict_index_is_unique(index)) {
+ new_index->n_uniq = index->n_fields;
+ } else {
+ new_index->n_uniq = new_index->n_def;
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields */
+
+ new_index->n_fields = new_index->n_def;
+
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
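+
+/* Illustration (continuing the example above): a non-unique secondary
+index created by CREATE INDEX bi ON t(b) is represented internally as
+
+	(b, a)
+
+i.e. the user columns followed by whatever clustered index key columns
+are still needed to identify the row; here n_uniq = n_def = 2. */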
+
+/***********************************************************************//**
+Builds the internal dictionary cache representation for an FTS index.
+@return own: the internal representation of the FTS index */
+static
+dict_index_t*
+dict_index_build_internal_fts(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: user representation of an FTS index */
+{
+ dict_index_t* new_index;
+
+ ut_ad(table && index);
+ ut_ad(index->type == DICT_FTS);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Create a new index */
+ new_index = dict_mem_index_create(
+ table->name, index->name, index->space, index->type,
+ index->n_fields);
+
+ /* Copy other relevant data from the old index struct to the new
+ struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy fields from index to new_index */
+ dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+ new_index->n_uniq = 0;
+ new_index->cached = TRUE;
+
+ if (table->fts->cache == NULL) {
+ table->fts->cache = fts_cache_create(table);
+ }
+
+ rw_lock_x_lock(&table->fts->cache->init_lock);
+ /* Notify the FTS cache about this index. */
+ fts_cache_index_cache_create(table, new_index);
+ rw_lock_x_unlock(&table->fts->cache->init_lock);
+
+ return(new_index);
+}
+/*====================== FOREIGN KEY PROCESSING ========================*/
+
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table) /*!< in: InnoDB table */
+{
+ return(!table->referenced_set.empty());
+}
+
+/*********************************************************************//**
+Check if the index is referenced by a foreign key; if so, return the
+foreign key struct, else return NULL.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+{
+ dict_foreign_t* foreign;
+
+ ut_ad(index != NULL);
+ ut_ad(table != NULL);
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->referenced_index == index) {
+
+ return(foreign);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key
+or if it is itself a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+{
+ dict_foreign_t* foreign;
+
+ ut_ad(index != NULL);
+ ut_ad(table != NULL);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == index) {
+
+ return(foreign);
+ }
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+UNIV_INTERN
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(foreign);
+
+ if (foreign->referenced_table != NULL) {
+ foreign->referenced_table->referenced_set.erase(foreign);
+ }
+
+ if (foreign->foreign_table != NULL) {
+ foreign->foreign_table->foreign_set.erase(foreign);
+ }
+
+ dict_foreign_free(foreign);
+}
+
+/**********************************************************************//**
+Looks for the foreign constraint from the foreign and referenced lists
+of a table.
+@return foreign constraint */
+static
+dict_foreign_t*
+dict_foreign_find(
+/*==============*/
+ dict_table_t* table, /*!< in: table object */
+ dict_foreign_t* foreign) /*!< in: foreign constraint */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ ut_ad(dict_foreign_set_validate(table->foreign_set));
+ ut_ad(dict_foreign_set_validate(table->referenced_set));
+
+ dict_foreign_set::iterator it = table->foreign_set.find(foreign);
+
+ if (it != table->foreign_set.end()) {
+ return(*it);
+ }
+
+ it = table->referenced_set.find(foreign);
+
+ if (it != table->referenced_set.end()) {
+ return(*it);
+ }
+
+ return(NULL);
+}
+
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order, which is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+{
+ dict_index_t* index;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (types_idx != index
+ && !(index->type & DICT_FTS)
+ && !index->to_be_dropped
+ && dict_foreign_qualify_index(
+ table, col_names, columns, n_cols,
+ index, types_idx,
+ check_charsets, check_null)) {
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report_low(
+/*==========================*/
+ FILE* file, /*!< in: output stream */
+ const char* name) /*!< in: table name */
+{
+ rewind(file);
+ ut_print_timestamp(file);
+ fprintf(file, " Error in foreign key constraint of table %s:\n",
+ name);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ dict_foreign_t* fk, /*!< in: foreign key constraint */
+ const char* msg) /*!< in: the error message */
+{
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(file, fk->foreign_table_name);
+ fputs(msg, file);
+ fputs(" Constraint:\n", file);
+ dict_print_info_on_foreign_key_in_create_format(file, NULL, fk, TRUE);
+ putc('\n', file);
+ if (fk->foreign_index) {
+ fputs("The index in the foreign key in table is ", file);
+ ut_print_name(file, NULL, FALSE, fk->foreign_index->name);
+ fputs("\n"
+ "See " REFMAN "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ file);
+ }
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in
+the cache.
+At least one of the foreign table and the referenced table must already
+be in the dictionary cache!
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign,
+ /*!< in, own: foreign key constraint */
+ const char** col_names,
+ /*!< in: column names, or NULL to use
+ foreign->foreign_table->col_names */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+{
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+ dict_foreign_t* for_in_cache = NULL;
+ dict_index_t* index;
+	ibool			added_to_referenced_list = FALSE;
+ FILE* ef = dict_foreign_err_file;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for_table = dict_table_check_if_in_cache_low(
+ foreign->foreign_table_name_lookup);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name_lookup);
+ ut_a(for_table || ref_table);
+
+ if (for_table) {
+ for_in_cache = dict_foreign_find(for_table, foreign);
+ }
+
+ if (!for_in_cache && ref_table) {
+ for_in_cache = dict_foreign_find(ref_table, foreign);
+ }
+
+ if (for_in_cache) {
+ /* Free the foreign object */
+ mem_heap_free(foreign->heap);
+ } else {
+ for_in_cache = foreign;
+ }
+
+ if (ref_table && !for_in_cache->referenced_table) {
+ index = dict_foreign_find_index(
+ ref_table, NULL,
+ for_in_cache->referenced_col_names,
+ for_in_cache->n_fields, for_in_cache->foreign_index,
+ check_charsets, false);
+
+ if (index == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in referenced table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "referenced table do not match"
+ " the ones in table.");
+
+ if (for_in_cache == foreign) {
+ mem_heap_free(foreign->heap);
+ }
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->referenced_table = ref_table;
+ for_in_cache->referenced_index = index;
+
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = ref_table->referenced_set.insert(for_in_cache);
+
+ ut_a(ret.second); /* second is true if the insertion
+ took place */
+ added_to_referenced_list = TRUE;
+ }
+
+ if (for_table && !for_in_cache->foreign_table) {
+ index = dict_foreign_find_index(
+ for_table, col_names,
+ for_in_cache->foreign_col_names,
+ for_in_cache->n_fields,
+ for_in_cache->referenced_index, check_charsets,
+ for_in_cache->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+
+ if (index == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in the table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "table do not match"
+ " the ones in the referenced table\n"
+ "or one of the ON ... SET NULL columns"
+ " is declared NOT NULL.");
+
+ if (for_in_cache == foreign) {
+ if (added_to_referenced_list) {
+ const dict_foreign_set::size_type n
+ = ref_table->referenced_set
+ .erase(for_in_cache);
+
+ ut_a(n == 1); /* the number of
+ elements removed must
+ be one */
+ }
+
+ mem_heap_free(foreign->heap);
+ }
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->foreign_table = for_table;
+ for_in_cache->foreign_index = index;
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = for_table->foreign_set.insert(for_in_cache);
+
+ ut_a(ret.second); /* second is true if the insertion
+ took place */
+ }
+
+ /* We need to move the table to the non-LRU end of the table LRU
+ list. Otherwise it will be evicted from the cache. */
+
+ if (ref_table != NULL && ref_table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(ref_table);
+ }
+
+ if (for_table != NULL && for_table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(for_table);
+ }
+
+ ut_ad(dict_lru_validate());
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Scans from the pointer onwards. Stops when it is at the start of a copy
+of 'string', where characters are compared case-insensitively, and only
+outside `` or "" quotes. Also stops at NUL.
+@return scanned up to this */
+static
+const char*
+dict_scan_to(
+/*=========*/
+ const char* ptr, /*!< in: scan from */
+ const char* string) /*!< in: look for this */
+{
+ char quote = '\0';
+ bool escape = false;
+
+ for (; *ptr; ptr++) {
+ if (*ptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or the keyword. */
+
+ /* If the quote character is escaped by a
+ backslash, ignore it. */
+ if (escape) {
+ escape = false;
+ } else {
+ quote = '\0';
+ }
+ } else if (quote) {
+ /* Within quotes: do nothing. */
+ if (escape) {
+ escape = false;
+ } else if (*ptr == '\\') {
+ escape = true;
+ }
+ } else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') {
+ /* Starting quote: remember the quote character. */
+ quote = *ptr;
+ } else {
+ /* Outside quotes: look for the keyword. */
+ ulint i;
+ for (i = 0; string[i]; i++) {
+ if (toupper((int)(unsigned char)(ptr[i]))
+ != toupper((int)(unsigned char)
+ (string[i]))) {
+ goto nomatch;
+ }
+ }
+ break;
+nomatch:
+ ;
+ }
+ }
+
+ return(ptr);
+}
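+
+/* Example (illustrative): when parsing a clause such as
+
+	FOREIGN KEY (`my REFERENCES col`) REFERENCES parent (id)
+
+dict_scan_to(ptr, "REFERENCES") returns a pointer to the second
+REFERENCES, because the first copy of the keyword is inside backquotes
+and is therefore skipped. */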
+
+/*********************************************************************//**
+Accepts a specified string. Comparisons are case-insensitive.
+@return if the string was accepted, the pointer moved past it; else
+ptr is returned */
+static
+const char*
+dict_accept(
+/*========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scan from this */
+ const char* string, /*!< in: accept only this string as the next
+ non-whitespace string */
+ ibool* success)/*!< out: TRUE if accepted */
+{
+ const char* old_ptr = ptr;
+ const char* old_ptr2;
+
+ *success = FALSE;
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ old_ptr2 = ptr;
+
+ ptr = dict_scan_to(ptr, string);
+
+ if (*ptr == '\0' || old_ptr2 != ptr) {
+ return(old_ptr);
+ }
+
+ *success = TRUE;
+
+ return(ptr + ut_strlen(string));
+}
+
+/*********************************************************************//**
+Scans an id. For the lexical definition of an 'id', see the code below.
+Strips backquotes or double quotes from around the id.
+@return scanned to */
+static
+const char*
+dict_scan_id(
+/*=========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ mem_heap_t* heap, /*!< in: heap where to allocate the id
+ (NULL=id will not be allocated, but it
+ will point to string near ptr) */
+ const char** id, /*!< out,own: the id; NULL if no id was
+ scannable */
+ ibool table_id,/*!< in: TRUE=convert the allocated id
+ as a table name; FALSE=convert to UTF-8 */
+ ibool accept_also_dot)
+				/*!< in: TRUE if a dot can also appear in a
+				non-quoted id; in a quoted id it can always
+				appear */
+{
+ char quote = '\0';
+ ulint len = 0;
+ const char* s;
+ char* str;
+ char* dst;
+
+ *id = NULL;
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ if (*ptr == '\0') {
+
+ return(ptr);
+ }
+
+ if (*ptr == '`' || *ptr == '"') {
+ quote = *ptr++;
+ }
+
+ s = ptr;
+
+ if (quote) {
+ for (;;) {
+ if (!*ptr) {
+ /* Syntax error */
+ return(ptr);
+ }
+ if (*ptr == quote) {
+ ptr++;
+ if (*ptr != quote) {
+ break;
+ }
+ }
+ ptr++;
+ len++;
+ }
+ } else {
+ while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')'
+ && (accept_also_dot || *ptr != '.')
+ && *ptr != ',' && *ptr != '\0') {
+
+ ptr++;
+ }
+
+ len = ptr - s;
+ }
+
+ if (UNIV_UNLIKELY(!heap)) {
+ /* no heap given: id will point to source string */
+ *id = s;
+ return(ptr);
+ }
+
+ if (quote) {
+ char* d;
+
+ str = d = static_cast<char*>(
+ mem_heap_alloc(heap, len + 1));
+
+ while (len--) {
+ if ((*d++ = *s++) == quote) {
+ s++;
+ }
+ }
+ *d++ = 0;
+ len = d - str;
+ ut_ad(*s == quote);
+ ut_ad(s + 1 == ptr);
+ } else {
+ str = mem_heap_strdupl(heap, s, len);
+ }
+
+ if (!table_id) {
+convert_id:
+ /* Convert the identifier from connection character set
+ to UTF-8. */
+ len = 3 * len + 1;
+ *id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
+
+ innobase_convert_from_id(cs, dst, str, len);
+ } else if (!strncmp(str, srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)) {
+ /* This is a pre-5.1 table name
+ containing chars other than [A-Za-z0-9].
+ Discard the prefix and use raw UTF-8 encoding. */
+ str += sizeof(srv_mysql50_table_name_prefix) - 1;
+ len -= sizeof(srv_mysql50_table_name_prefix) - 1;
+ goto convert_id;
+ } else {
+ /* Encode using filename-safe characters. */
+ len = 5 * len + 1;
+ *id = dst = static_cast<char*>(mem_heap_alloc(heap, len));
+
+ innobase_convert_from_table_id(cs, dst, str, len);
+ }
+
+ return(ptr);
+}
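+
+/* Example: within a quoted id, a doubled quote character denotes a
+single literal quote, so scanning the input `my``table` (with a heap
+given and table_id == FALSE) produces the id my`table, converted to
+UTF-8, and returns a pointer just past the closing backquote. */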
+
+/*********************************************************************//**
+Tries to scan a column name.
+@return scanned to */
+static
+const char*
+dict_scan_col(
+/*==========*/
+ struct charset_info_st* cs, /*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ ibool* success,/*!< out: TRUE if success */
+ dict_table_t* table, /*!< in: table in which the column is */
+ const dict_col_t** column, /*!< out: pointer to column if success */
+ mem_heap_t* heap, /*!< in: heap where to allocate */
+ const char** name) /*!< out,own: the column name;
+ NULL if no name was scannable */
+{
+ ulint i;
+
+ *success = FALSE;
+
+ ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE);
+
+ if (*name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+
+ if (table == NULL) {
+ *success = TRUE;
+ *column = NULL;
+ } else {
+ for (i = 0; i < dict_table_get_n_cols(table); i++) {
+
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ if (0 == innobase_strcasecmp(col_name, *name)) {
+ /* Found */
+
+ *success = TRUE;
+ *column = dict_table_get_nth_col(table, i);
+ strcpy((char*) *name, col_name);
+
+ break;
+ }
+ }
+ }
+
+ return(ptr);
+}
+
+
+/*********************************************************************//**
+Opens a table from its database and table name; this is currently used
+by the foreign key constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated
+from the heap memory passed in */
+UNIV_INTERN
+char*
+dict_get_referenced_table(
+/*======================*/
+ const char* name, /*!< in: foreign key table name */
+ const char* database_name, /*!< in: table db name */
+ ulint database_name_len, /*!< in: db name length */
+ const char* table_name, /*!< in: table name */
+ ulint table_name_len, /*!< in: table name length */
+ dict_table_t** table, /*!< out: table object or NULL */
+ mem_heap_t* heap) /*!< in/out: heap memory */
+{
+ char* ref;
+ const char* db_name;
+
+ if (!database_name) {
+ /* Use the database name of the foreign key table */
+
+ db_name = name;
+ database_name_len = dict_get_db_name_len(name);
+ } else {
+ db_name = database_name;
+ }
+
+ /* Copy database_name, '/', table_name, '\0' */
+ ref = static_cast<char*>(
+ mem_heap_alloc(heap, database_name_len + table_name_len + 2));
+
+ memcpy(ref, db_name, database_name_len);
+ ref[database_name_len] = '/';
+ memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+	/* lower_case_table_names values:
+	   0 = store and compare as given; case sensitive
+	   1 = store and compare in lower case; case insensitive
+	   2 = store as given, compare in lower case; case semi-sensitive */
+ if (innobase_get_lower_case_table_names() == 2) {
+ innobase_casedn_str(ref);
+ *table = dict_table_get_low(ref);
+ memcpy(ref, db_name, database_name_len);
+ ref[database_name_len] = '/';
+ memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+
+ } else {
+#ifndef __WIN__
+ if (innobase_get_lower_case_table_names() == 1) {
+ innobase_casedn_str(ref);
+ }
+#else
+ innobase_casedn_str(ref);
+#endif /* !__WIN__ */
+ *table = dict_table_get_low(ref);
+ }
+
+ return(ref);
+}
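+
+/* Example: with database_name == "test" and table_name == "t2" the
+returned string is "test/t2". When lower_case_table_names == 2, the
+dictionary lookup is done on a lower-cased copy of the name, but the
+name is restored to the given case before it is returned. */
+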
+/*********************************************************************//**
+Scans a table name from an SQL string.
+@return scanned to */
+static
+const char*
+dict_scan_table_name(
+/*=================*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ dict_table_t** table, /*!< out: table object or NULL */
+ const char* name, /*!< in: foreign key table name */
+ ibool* success,/*!< out: TRUE if ok name found */
+ mem_heap_t* heap, /*!< in: heap where to allocate the id */
+ const char** ref_name)/*!< out,own: the table name;
+ NULL if no name was scannable */
+{
+ const char* database_name = NULL;
+ ulint database_name_len = 0;
+ const char* table_name = NULL;
+ const char* scan_name;
+
+ *success = FALSE;
+ *table = NULL;
+
+ ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE);
+
+ if (scan_name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+
+ if (*ptr == '.') {
+ /* We scanned the database name; scan also the table name */
+
+ ptr++;
+
+ database_name = scan_name;
+ database_name_len = strlen(database_name);
+
+ ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE);
+
+ if (table_name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+ } else {
+ /* To be able to read table dumps made with InnoDB-4.0.17 or
+ earlier, we must allow the dot separator between the database
+ name and the table name also to appear within a quoted
+ identifier! InnoDB used to print a constraint as:
+ ... REFERENCES `databasename.tablename` ...
+ starting from 4.0.18 it is
+ ... REFERENCES `databasename`.`tablename` ... */
+ const char* s;
+
+ for (s = scan_name; *s; s++) {
+ if (*s == '.') {
+ database_name = scan_name;
+ database_name_len = s - scan_name;
+ scan_name = ++s;
+ break;/* to do: multiple dots? */
+ }
+ }
+
+ table_name = scan_name;
+ }
+
+ *ref_name = dict_get_referenced_table(
+ name, database_name, database_name_len,
+ table_name, strlen(table_name), table, heap);
+
+ *success = TRUE;
+ return(ptr);
+}
+
+/*********************************************************************//**
+Skips one id. The id is also allowed to contain '.'.
+@return scanned to */
+static
+const char*
+dict_skip_word(
+/*===========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ ibool* success)/*!< out: TRUE if success, FALSE if just spaces
+ left in string or a syntax error */
+{
+ const char* start;
+
+ *success = FALSE;
+
+ ptr = dict_scan_id(cs, ptr, NULL, &start, FALSE, TRUE);
+
+ if (start) {
+ *success = TRUE;
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Removes MySQL comments from an SQL string. A comment is either
+(a) '#' to the end of the line,
+(b) '--[space]' to the end of the line, or
+(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar
+C comment syntax).
+@return own: SQL string stripped of comments; the caller must free
+this with mem_free()! */
+static
+char*
+dict_strip_comments(
+/*================*/
+ const char* sql_string, /*!< in: SQL string */
+ size_t sql_length) /*!< in: length of sql_string */
+{
+ char* str;
+ const char* sptr;
+ const char* eptr = sql_string + sql_length;
+ char* ptr;
+ /* unclosed quote character (0 if none) */
+ char quote = 0;
+ bool escape = false;
+
+ DBUG_ENTER("dict_strip_comments");
+
+ DBUG_PRINT("dict_strip_comments", ("%s", sql_string));
+
+ str = static_cast<char*>(mem_alloc(sql_length + 1));
+
+ sptr = sql_string;
+ ptr = str;
+
+ for (;;) {
+scan_more:
+ if (sptr >= eptr || *sptr == '\0') {
+end_of_string:
+ *ptr = '\0';
+
+ ut_a(ptr <= str + sql_length);
+
+ DBUG_PRINT("dict_strip_comments", ("%s", str));
+ DBUG_RETURN(str);
+ }
+
+ if (*sptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or comments. */
+
+ /* If the quote character is escaped by a
+ backslash, ignore it. */
+ if (escape) {
+ escape = false;
+ } else {
+ quote = 0;
+ }
+ } else if (quote) {
+ /* Within quotes: do not look for
+ starting quotes or comments. */
+ if (escape) {
+ escape = false;
+ } else if (*sptr == '\\') {
+ escape = true;
+ }
+ } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
+ /* Starting quote: remember the quote character. */
+ quote = *sptr;
+ } else if (*sptr == '#'
+ || (sptr[0] == '-' && sptr[1] == '-'
+ && sptr[2] == ' ')) {
+ for (;;) {
+ if (++sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ /* In Unix a newline is 0x0A while in Windows
+ it is 0x0D followed by 0x0A */
+
+ switch (*sptr) {
+				case (char) 0x0A:
+ case (char) 0x0D:
+ case '\0':
+ goto scan_more;
+ }
+ }
+ } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') {
+ sptr += 2;
+ for (;;) {
+ if (sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ switch (*sptr) {
+ case '\0':
+ goto scan_more;
+ case '*':
+ if (sptr[1] == '/') {
+ sptr += 2;
+ goto scan_more;
+ }
+ }
+
+ sptr++;
+ }
+ }
+
+ *ptr = *sptr;
+
+ ptr++;
+ sptr++;
+ }
+}
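+
+/* Example: in the statement
+	CREATE TABLE t (a INT) # trailing comment
+everything from '#' up to (but not including) the newline is dropped,
+while a '#' inside a quoted string such as '#1' is preserved. Note
+that a '--' comment is recognized only when followed by a space. */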
+
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format ids, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+UNIV_INTERN
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table) /*!< in: table in the dictionary memory cache */
+{
+ dict_foreign_t* foreign;
+ char* endp;
+ ulint biggest_id = 0;
+ ulint id;
+ ulint len;
+
+ ut_a(table);
+
+ len = ut_strlen(table->name);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+ foreign = *it;
+
+ if (ut_strlen(foreign->id) > ((sizeof dict_ibfk) - 1) + len
+ && 0 == ut_memcmp(foreign->id, table->name, len)
+ && 0 == ut_memcmp(foreign->id + len,
+ dict_ibfk, (sizeof dict_ibfk) - 1)
+ && foreign->id[len + ((sizeof dict_ibfk) - 1)] != '0') {
+ /* It is of the >= 4.0.18 format */
+
+ id = strtoul(foreign->id + len
+ + ((sizeof dict_ibfk) - 1),
+ &endp, 10);
+ if (*endp == '\0') {
+ ut_a(id != biggest_id);
+
+ if (id > biggest_id) {
+ biggest_id = id;
+ }
+ }
+ }
+ }
+
+ return(biggest_id);
+}
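+
+/* Example: if table test/t1 has constraints with the ids
+"test/t1_ibfk_3" and "test/t1_ibfk_7", this function returns 7, so
+that the next automatically generated constraint becomes
+test/t1_ibfk_8. */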
+
+/*********************************************************************//**
+Reports a simple foreign key create clause syntax error. */
+static
+void
+dict_foreign_report_syntax_err(
+/*===========================*/
+ const char* name, /*!< in: table name */
+ const char* start_of_latest_foreign,
+ /*!< in: start of the foreign key clause
+ in the SQL string */
+ const char* ptr) /*!< in: place of the syntax error */
+{
+ ut_ad(!srv_read_only_mode);
+
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nSyntax error close to:\n%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied by indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+dict_create_foreign_constraints_low(
+/*================================*/
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap */
+ struct charset_info_st* cs,/*!< in: the character set of sql_string */
+ const char* sql_string,
+ /*!< in: CREATE TABLE or ALTER TABLE statement
+ where foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the database
+ name before it: test.table2; the default
+ database is the database of parameter name */
+ const char* name, /*!< in: table full name in the normalized form
+ database_name/table_name */
+ ibool reject_fks)
+ /*!< in: if TRUE, fail with error code
+ DB_CANNOT_ADD_CONSTRAINT if any foreign
+ keys are found. */
+{
+ dict_table_t* table;
+ dict_table_t* referenced_table;
+ dict_table_t* table_to_alter;
+ ulint highest_id_so_far = 0;
+ ulint number = 1;
+ dict_index_t* index;
+ dict_foreign_t* foreign;
+ const char* ptr = sql_string;
+ const char* start_of_latest_foreign = sql_string;
+ FILE* ef = dict_foreign_err_file;
+ const char* constraint_name;
+ ibool success;
+ dberr_t error;
+ const char* ptr1;
+ const char* ptr2;
+ ulint i;
+ ulint j;
+ ibool is_on_delete;
+ ulint n_on_deletes;
+ ulint n_on_updates;
+ const dict_col_t*columns[500];
+ const char* column_names[500];
+ const char* referenced_table_name;
+ dict_foreign_set local_fk_set;
+ dict_foreign_set_free local_fk_set_free(local_fk_set);
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_get_low(name);
+
+ if (table == NULL) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef,
+ "Cannot find the table in the internal"
+ " data dictionary of InnoDB.\n"
+ "Create table statement:\n%s\n", sql_string);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_ERROR);
+ }
+
+ /* First check if we are actually doing an ALTER TABLE, and in that
+ case look for the table being altered */
+
+ ptr = dict_accept(cs, ptr, "ALTER", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "TABLE", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ /* We are doing an ALTER TABLE: scan the table name we are altering */
+
+ ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name,
+ &success, heap, &referenced_table_name);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: could not find"
+ " the table being ALTERED in:\n%s\n",
+ sql_string);
+
+ return(DB_ERROR);
+ }
+
+	/* Starting from 4.0.18 and 4.1.2, we generate foreign key ids in the
+ format databasename/tablename_ibfk_[number], where [number] is local
+ to the table; look for the highest [number] for table_to_alter, so
+ that we can assign to new constraints higher numbers. */
+
+ /* If we are altering a temporary table, the table name after ALTER
+ TABLE does not correspond to the internal table name, and
+ table_to_alter is NULL. TODO: should we fix this somehow? */
+
+ if (table_to_alter == NULL) {
+ highest_id_so_far = 0;
+ } else {
+ highest_id_so_far = dict_table_get_highest_foreign_id(
+ table_to_alter);
+ }
+
+ number = highest_id_so_far + 1;
+ /* Scan for foreign key declarations in a loop */
+loop:
+ /* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */
+
+ ptr1 = dict_scan_to(ptr, "CONSTRAINT");
+ ptr2 = dict_scan_to(ptr, "FOREIGN");
+
+ constraint_name = NULL;
+
+ if (ptr1 < ptr2) {
+		/* The user may have specified a constraint name. Pick it so
+		that we can store 'databasename/constraintname' as the id
+		of the constraint in the system tables. */
+ ptr = ptr1;
+
+ ptr = dict_accept(cs, ptr, "CONSTRAINT", &success);
+
+ ut_a(success);
+
+ if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') {
+ goto loop;
+ }
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+		/* read the constraint name, unless we got "CONSTRAINT FOREIGN" */
+ if (ptr != ptr2) {
+ ptr = dict_scan_id(cs, ptr, heap,
+ &constraint_name, FALSE, FALSE);
+ }
+ } else {
+ ptr = ptr2;
+ }
+
+ if (*ptr == '\0') {
+ /* The proper way to reject foreign keys for temporary
+ tables would be to split the lexing and syntactical
+ analysis of foreign key clauses from the actual adding
+ of them, so that ha_innodb.cc could first parse the SQL
+ command, determine if there are any foreign keys, and
+ if so, immediately reject the command if the table is a
+ temporary one. For now, this kludge will work. */
+ if (reject_fks && !local_fk_set.empty()) {
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /**********************************************************/
+ /* The following call adds the foreign key constraints
+ to the data dictionary system tables on disk */
+
+ error = dict_create_add_foreigns_to_dictionary(
+ local_fk_set, table, trx);
+
+ if (error == DB_SUCCESS) {
+
+ table->foreign_set.insert(local_fk_set.begin(),
+ local_fk_set.end());
+ std::for_each(local_fk_set.begin(),
+ local_fk_set.end(),
+ dict_foreign_add_to_referenced_table());
+ local_fk_set.clear();
+ }
+ return(error);
+ }
+
+ start_of_latest_foreign = ptr;
+
+ ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+ if (!success) {
+ goto loop;
+ }
+
+ if (!my_isspace(cs, *ptr)) {
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "KEY", &success);
+
+ if (!success) {
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+		/* MySQL also allows an index id before the '('; we
+		skip it */
+ ptr = dict_skip_word(cs, ptr, &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+ /* We do not flag a syntax error here because in an
+ ALTER TABLE we may also have DROP FOREIGN KEY abc */
+
+ goto loop;
+ }
+ }
+
+ i = 0;
+
+ /* Scan the columns in the first list */
+col_loop1:
+ ut_a(i < (sizeof column_names) / sizeof *column_names);
+ ptr = dict_scan_col(cs, ptr, &success, table, columns + i,
+ heap, column_names + i);
+ if (!success) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve column name close to:\n%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ i++;
+
+ ptr = dict_accept(cs, ptr, ",", &success);
+
+ if (success) {
+ goto col_loop1;
+ }
+
+ ptr = dict_accept(cs, ptr, ")", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+	/* Try to find an index which contains the columns
+	as the first fields and in the right order. There is
+	no need to check that the column types match (types_idx
+	is passed as NULL), since the referenced table can be
+	NULL if foreign_key_checks is set to 0 */
+
+ index = dict_foreign_find_index(
+ table, NULL, column_names, i, NULL, TRUE, FALSE);
+
+ if (!index) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fputs("There is no index in table ", ef);
+ ut_print_name(ef, NULL, TRUE, name);
+ fprintf(ef, " where the columns appear\n"
+ "as the first columns. Constraint:\n%s\n"
+ "See " REFMAN "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CHILD_NO_INDEX);
+ }
+ ptr = dict_accept(cs, ptr, "REFERENCES", &success);
+
+ if (!success || !my_isspace(cs, *ptr)) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Let us create a constraint struct */
+
+ foreign = dict_mem_foreign_create();
+
+ if (constraint_name) {
+ ulint db_len;
+
+		/* Prepend 'databasename/' to the constraint name specified
+		by the user: we regard the constraint as belonging to the
+		same MySQL 'database' as the table itself. We store the name
+		in foreign->id. */
+
+ db_len = dict_get_db_name_len(table->name);
+
+ foreign->id = static_cast<char*>(mem_heap_alloc(
+ foreign->heap, db_len + strlen(constraint_name) + 2));
+
+ ut_memcpy(foreign->id, table->name, db_len);
+ foreign->id[db_len] = '/';
+ strcpy(foreign->id + db_len + 1, constraint_name);
+ }
+
+ if (foreign->id == NULL) {
+ error = dict_create_add_foreign_id(&number,
+ table->name, foreign);
+ if (error != DB_SUCCESS) {
+ dict_foreign_free(foreign);
+ return(error);
+ }
+ }
+
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = local_fk_set.insert(foreign);
+
+ if (!ret.second) {
+ /* A duplicate foreign key name has been found */
+ dict_foreign_free(foreign);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ foreign->foreign_table = table;
+ foreign->foreign_table_name = mem_heap_strdup(
+ foreign->heap, table->name);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ foreign->foreign_index = index;
+ foreign->n_fields = (unsigned int) i;
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ foreign->foreign_col_names[i] = mem_heap_strdup(
+ foreign->heap,
+ dict_table_get_col_name(table,
+ dict_col_get_no(columns[i])));
+ }
+
+ ptr = dict_scan_table_name(cs, ptr, &referenced_table, name,
+ &success, heap, &referenced_table_name);
+
+ /* Note that referenced_table can be NULL if the user has suppressed
+ checking of foreign key constraints! */
+
+ if (!success || (!referenced_table && trx->check_foreigns)) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve table name close to:\n"
+ "%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Scan the columns in the second list */
+ i = 0;
+
+col_loop2:
+ ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i,
+ heap, column_names + i);
+ i++;
+
+ if (!success) {
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve column name close to:\n"
+ "%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, ",", &success);
+
+ if (success) {
+ goto col_loop2;
+ }
+
+ ptr = dict_accept(cs, ptr, ")", &success);
+
+ if (!success || foreign->n_fields != i) {
+
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ n_on_deletes = 0;
+ n_on_updates = 0;
+
+scan_on_conditions:
+ /* Loop here as long as we can find ON ... conditions */
+
+ ptr = dict_accept(cs, ptr, "ON", &success);
+
+ if (!success) {
+
+ goto try_find_index;
+ }
+
+ ptr = dict_accept(cs, ptr, "DELETE", &success);
+
+ if (!success) {
+ ptr = dict_accept(cs, ptr, "UPDATE", &success);
+
+ if (!success) {
+
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ is_on_delete = FALSE;
+ n_on_updates++;
+ } else {
+ is_on_delete = TRUE;
+ n_on_deletes++;
+ }
+
+ ptr = dict_accept(cs, ptr, "RESTRICT", &success);
+
+ if (success) {
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "CASCADE", &success);
+
+ if (success) {
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+ }
+
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "NO", &success);
+
+ if (success) {
+ ptr = dict_accept(cs, ptr, "ACTION", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+ }
+
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "SET", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "NULL", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for (j = 0; j < foreign->n_fields; j++) {
+ if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype)
+ & DATA_NOT_NULL) {
+
+ /* It is not sensible to define SET NULL
+ if the column is not allowed to be NULL! */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "You have defined a SET NULL condition"
+ " though some of the\n"
+ "columns are defined as NOT NULL.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+ }
+
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+ }
+
+ goto scan_on_conditions;
+
+try_find_index:
+ if (n_on_deletes > 1 || n_on_updates > 1) {
+ /* It is an error to define more than 1 action */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "You have twice an ON DELETE clause"
+ " or twice an ON UPDATE clause.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Try to find an index which contains the columns as the first fields
+ and in the right order, and the types are the same as in
+ foreign->foreign_index */
+
+ if (referenced_table) {
+ index = dict_foreign_find_index(referenced_table, NULL,
+ column_names, i,
+ foreign->foreign_index,
+ TRUE, FALSE);
+ if (!index) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "Cannot find an index in the"
+ " referenced table where the\n"
+ "referenced columns appear as the"
+ " first columns, or column types\n"
+ "in the table and the referenced table"
+ " do not match for constraint.\n"
+ "Note that the internal storage type of"
+ " ENUM and SET changed in\n"
+ "tables created with >= InnoDB-4.1.12,"
+ " and such columns in old tables\n"
+ "cannot be referenced by such columns"
+ " in new tables.\n"
+ "See " REFMAN
+ "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_PARENT_NO_INDEX);
+ }
+ } else {
+ ut_a(trx->check_foreigns == FALSE);
+ index = NULL;
+ }
+
+ foreign->referenced_index = index;
+ foreign->referenced_table = referenced_table;
+
+ foreign->referenced_table_name = mem_heap_strdup(
+ foreign->heap, referenced_table_name);
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, i * sizeof(void*)));
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ foreign->referenced_col_names[i]
+ = mem_heap_strdup(foreign->heap, column_names[i]);
+ }
+
+ goto loop;
+}
+/*********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+ THD* thd, /*!< in: MySQL thread handle */
+ const char* str, /*!< in: string to scan for keyword */
+ const char* keyword) /*!< in: keyword to look for */
+{
+ struct charset_info_st* cs = innobase_get_charset(thd);
+ ibool success;
+
+ dict_accept(cs, str, keyword, &success);
+ return(success);
+}
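+
+/* Example: dict_str_starts_with_keyword(thd, "  alter table t1 ...",
+"ALTER") returns TRUE: leading whitespace is skipped and the
+comparison is case-insensitive. */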
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied by indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_foreign_constraints(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES
+ table2(c, d), table2 can be written
+ also with the database
+ name before it: test.table2; the
+					default database is the database of
+ parameter name */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+{
+ char* str;
+ dberr_t err;
+ mem_heap_t* heap;
+
+ ut_a(trx);
+ ut_a(trx->mysql_thd);
+
+ str = dict_strip_comments(sql_string, sql_length);
+ heap = mem_heap_create(10000);
+
+ err = dict_create_foreign_constraints_low(
+ trx, heap, innobase_get_charset(trx->mysql_thd), str, name,
+ reject_fks);
+
+ mem_heap_free(heap);
+ mem_free(str);
+
+ return(err);
+}
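+
+/* Example of a clause handled by the parser above (illustrative):
+
+	ALTER TABLE test.t1 ADD CONSTRAINT fk_1
+	FOREIGN KEY (a, b) REFERENCES test.t2 (c, d)
+	ON DELETE CASCADE ON UPDATE SET NULL
+
+Comments are stripped first; dict_create_foreign_constraints_low()
+then scans each CONSTRAINT/FOREIGN KEY clause in turn. */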
+
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+{
+ ibool success;
+ char* str;
+ size_t len;
+ const char* ptr;
+ const char* id;
+ struct charset_info_st* cs;
+
+ ut_a(trx);
+ ut_a(trx->mysql_thd);
+
+ cs = innobase_get_charset(trx->mysql_thd);
+
+ *n = 0;
+
+ *constraints_to_drop = static_cast<const char**>(
+ mem_heap_alloc(heap, 1000 * sizeof(char*)));
+
+ ptr = innobase_get_stmt(trx->mysql_thd, &len);
+
+ str = dict_strip_comments(ptr, len);
+
+ ptr = str;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+loop:
+ ptr = dict_scan_to(ptr, "DROP");
+
+ if (*ptr == '\0') {
+ mem_free(str);
+
+ return(DB_SUCCESS);
+ }
+
+ ptr = dict_accept(cs, ptr, "DROP", &success);
+
+ if (!my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+ if (!success || !my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "KEY", &success);
+
+ if (!success) {
+
+ goto syntax_error;
+ }
+
+ ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE);
+
+ if (id == NULL) {
+
+ goto syntax_error;
+ }
+
+ ut_a(*n < 1000);
+ (*constraints_to_drop)[*n] = id;
+ (*n)++;
+
+ if (std::find_if(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_matches_id(id))
+ == table->foreign_set.end()) {
+
+ if (!srv_read_only_mode) {
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in dropping of a foreign key "
+ "constraint of table ", ef);
+ ut_print_name(ef, NULL, TRUE, table->name);
+ fputs(",\nin SQL command\n", ef);
+ fputs(str, ef);
+ fputs("\nCannot find a constraint with the "
+ "given id ", ef);
+ ut_print_name(ef, NULL, FALSE, id);
+ fputs(".\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+ }
+
+ mem_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+ }
+
+ goto loop;
+
+syntax_error:
+ if (!srv_read_only_mode) {
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Syntax error in dropping of a"
+ " foreign key constraint of table ", ef);
+ ut_print_name(ef, NULL, TRUE, table->name);
+ fprintf(ef, ",\n"
+ "close to:\n%s\n in SQL command\n%s\n", ptr, str);
+ mutex_exit(&dict_foreign_err_mutex);
+ }
+
+ mem_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
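+
+/* Example: for the statement
+	ALTER TABLE t1 DROP FOREIGN KEY fk_1, DROP FOREIGN KEY fk_2
+this function sets *n = 2 and stores the ids "fk_1" and "fk_2" in
+constraints_to_drop, provided both constraints exist on the table. */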
+
+/*==================== END OF FOREIGN KEY PROCESSING ====================*/
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ return(dict_index_find_on_id_low(index_id));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+{
+ dict_index_t* index;
+
+ if (dict_sys == NULL) {
+ return(NULL);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+
+ index = dict_index_get_if_in_cache_low(index_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(index);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+{
+ ut_a(index);
+ ut_a(dtuple_get_n_fields_cmp(tuple)
+ <= dict_index_get_n_unique_in_tree(index));
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+ ulint n_unique;
+
+ if (dict_index_is_univ(index)) {
+		/* In a universal index tree, we take the whole record as
+		the node pointer if the record is on the leaf level;
+		on non-leaf levels we remove the last field, which
+		contains the page number of the child page */
+
+ ut_a(!dict_table_is_comp(index->table));
+ n_unique = rec_get_n_fields_old(rec);
+
+ if (level > 0) {
+ ut_a(n_unique > 1);
+ n_unique--;
+ }
+ } else {
+ n_unique = dict_index_get_n_unique_in_tree(index);
+ }
+
+ tuple = dtuple_create(heap, n_unique + 1);
+
+ /* When searching in the tree for the node pointer, we must not do
+ comparison on the last field, the page number field, as on upper
+ levels in the tree there may be identical node pointers with a
+ different page number; therefore, we set the n_fields_cmp to one
+ less: */
+
+ dtuple_set_n_fields_cmp(tuple, n_unique);
+
+ dict_index_copy_types(tuple, index, n_unique);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ field = dtuple_get_nth_field(tuple, n_unique);
+ dfield_set_data(field, buf, 4);
+
+ dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index, n_unique, heap);
+ dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+ | REC_STATUS_NODE_PTR);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
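+
+/* Example: the node pointer built above consists of the n_unique key
+fields copied from rec, followed by one 4-byte DATA_SYS_CHILD field
+holding page_no; because n_fields_cmp == n_unique, the page number
+field never takes part in comparisons. */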
+
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to
+ copy prefix */
+ ulint* n_fields,/*!< out: number of fields copied */
+ byte** buf, /*!< in/out: memory buffer for the
+ copied prefix, or NULL */
+ ulint* buf_size)/*!< in/out: buffer size */
+{
+ ulint n;
+
+ UNIV_PREFETCH_R(rec);
+
+ if (dict_index_is_univ(index)) {
+ ut_a(!dict_table_is_comp(index->table));
+ n = rec_get_n_fields_old(rec);
+ } else {
+ n = dict_index_get_n_unique_in_tree(index);
+ }
+
+ *n_fields = n;
+ return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size));
+}
+
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+ dict_index_t* index, /*!< in: index tree */
+ rec_t* rec, /*!< in: record for which to build data tuple */
+ ulint n_fields,/*!< in: number of data fields */
+ mem_heap_t* heap) /*!< in: memory heap where tuple created */
+{
+ dtuple_t* tuple;
+
+ ut_ad(dict_table_is_comp(index->table)
+ || n_fields <= rec_get_n_fields_old(rec));
+
+ tuple = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(tuple, index, n_fields);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index, n_fields, heap);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint sum = 0;
+ ulint i;
+ ulint comp = dict_table_is_comp(index->table);
+
+ if (comp) {
+ ulint nullable = 0;
+ sum = REC_N_NEW_EXTRA_BYTES;
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, i);
+ ulint size = dict_col_get_fixed_size(col, comp);
+ sum += size;
+ if (!size) {
+ size = col->len;
+ sum += size < 128 ? 1 : 2;
+ }
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ nullable++;
+ }
+ }
+
+ /* round the NULL flags up to full bytes */
+ sum += UT_BITS_IN_BYTES(nullable);
+
+ return(sum);
+ }
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ sum += dict_col_get_fixed_size(
+ dict_index_get_nth_col(index, i), comp);
+ }
+
+ if (sum > 127) {
+ sum += 2 * dict_index_get_n_fields(index);
+ } else {
+ sum += dict_index_get_n_fields(index);
+ }
+
+ sum += REC_N_OLD_EXTRA_BYTES;
+
+ return(sum);
+}
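+
+/* Worked example for the compact format, where REC_N_NEW_EXTRA_BYTES
+is 5: for an index over a fixed-size 4-byte NOT NULL column and a
+nullable variable-length column with a 300-byte maximum, the minimum
+record length is 5 (extra bytes) + 4 (fixed size) + 2 (length bytes,
+since 300 >= 128) + 1 (NULL flags rounded up to one byte) = 12 bytes;
+the variable-length data itself contributes nothing to the minimum. */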
+
+/**********************************************************************//**
+Prints info of a foreign key constraint. */
+static
+void
+dict_foreign_print_low(
+/*===================*/
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ ulint i;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ fprintf(stderr, " FOREIGN KEY CONSTRAINT %s: %s (",
+ foreign->id, foreign->foreign_table_name);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ fprintf(stderr, " %s", foreign->foreign_col_names[i]);
+ }
+
+ fprintf(stderr, " )\n"
+ " REFERENCES %s (",
+ foreign->referenced_table_name);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ fprintf(stderr, " %s", foreign->referenced_col_names[i]);
+ }
+
+ fputs(" )\n", stderr);
+}
+
+/**********************************************************************//**
+Prints a table's data. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ ulint i;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ if (!table->stat_initialized) {
+ dict_stats_update_transient(table);
+ }
+
+ fprintf(stderr,
+ "--------------------------------------\n"
+ "TABLE: name %s, id %llu, flags %lx, columns %lu,"
+ " indexes %lu, appr.rows " UINT64PF "\n"
+ " COLUMNS: ",
+ table->name,
+ (ullint) table->id,
+ (ulong) table->flags,
+ (ulong) table->n_cols,
+ (ulong) UT_LIST_GET_LEN(table->indexes),
+ table->stat_n_rows);
+
+ for (i = 0; i < (ulint) table->n_cols; i++) {
+ dict_col_print_low(table, dict_table_get_nth_col(table, i));
+ fputs("; ", stderr);
+ }
+
+ putc('\n', stderr);
+
+ index = UT_LIST_GET_FIRST(table->indexes);
+
+ while (index != NULL) {
+ dict_index_print_low(index);
+ index = UT_LIST_GET_NEXT(indexes, index);
+ }
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+
+ std::for_each(table->foreign_set.begin(),
+ table->foreign_set.end(),
+ dict_foreign_print_low);
+
+ std::for_each(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_print_low);
+}
+
+/**********************************************************************//**
+Prints a column's data. */
+static
+void
+dict_col_print_low(
+/*===============*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column */
+{
+ dtype_t type;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_col_copy_type(col, &type);
+ fprintf(stderr, "%s: ", dict_table_get_col_name(table,
+ dict_col_get_no(col)));
+
+ dtype_print(&type);
+}
+
+/**********************************************************************//**
+Prints an index's data. */
+static
+void
+dict_index_print_low(
+/*=================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ib_int64_t n_vals;
+ ulint i;
+
+ ut_a(index->table->stat_initialized);
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if (index->n_user_defined_cols > 0) {
+ n_vals = index->stat_n_diff_key_vals[
+ index->n_user_defined_cols - 1];
+ } else {
+ n_vals = index->stat_n_diff_key_vals[0];
+ }
+
+ fprintf(stderr,
+ " INDEX: name %s, id %llu, fields %lu/%lu,"
+ " uniq %lu, type %lu\n"
+ " root page %lu, appr.key vals %lu,"
+ " leaf pages %lu, size pages %lu\n"
+ " FIELDS: ",
+ index->name,
+ (ullint) index->id,
+ (ulong) index->n_user_defined_cols,
+ (ulong) index->n_fields,
+ (ulong) index->n_uniq,
+ (ulong) index->type,
+ (ulong) index->page,
+ (ulong) n_vals,
+ (ulong) index->stat_n_leaf_pages,
+ (ulong) index->stat_index_size);
+
+ for (i = 0; i < index->n_fields; i++) {
+ dict_field_print_low(dict_index_get_nth_field(index, i));
+ }
+
+ putc('\n', stderr);
+
+#ifdef UNIV_BTR_PRINT
+ btr_print_size(index);
+
+ btr_print_index(index, 7);
+#endif /* UNIV_BTR_PRINT */
+}
+
+/**********************************************************************//**
+Prints a field's data. */
+static
+void
+dict_field_print_low(
+/*=================*/
+ const dict_field_t* field) /*!< in: field */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ fprintf(stderr, " %s", field->name);
+
+ if (field->prefix_len != 0) {
+ fprintf(stderr, "(%lu)", (ulong) field->prefix_len);
+ }
+}
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline) /*!< in: whether to add a newline */
+{
+ const char* stripped_id;
+ ulint i;
+
+ if (strchr(foreign->id, '/')) {
+ /* Strip the preceding database name from the constraint id */
+ stripped_id = foreign->id + 1
+ + dict_get_db_name_len(foreign->id);
+ } else {
+ stripped_id = foreign->id;
+ }
+
+ putc(',', file);
+
+ if (add_newline) {
+ /* SHOW CREATE TABLE wants constraints each printed nicely
+ on its own line, while error messages want no newlines
+ inserted. */
+ fputs("\n ", file);
+ }
+
+ fputs(" CONSTRAINT ", file);
+ ut_print_name(file, trx, FALSE, stripped_id);
+ fputs(" FOREIGN KEY (", file);
+
+ for (i = 0;;) {
+ ut_print_name(file, trx, FALSE, foreign->foreign_col_names[i]);
+ if (++i < foreign->n_fields) {
+ fputs(", ", file);
+ } else {
+ break;
+ }
+ }
+
+ fputs(") REFERENCES ", file);
+
+ if (dict_tables_have_same_db(foreign->foreign_table_name_lookup,
+ foreign->referenced_table_name_lookup)) {
+ /* Do not print the database name of the referenced table */
+ ut_print_name(file, trx, TRUE,
+ dict_remove_db_name(
+ foreign->referenced_table_name));
+ } else {
+ ut_print_name(file, trx, TRUE,
+ foreign->referenced_table_name);
+ }
+
+ putc(' ', file);
+ putc('(', file);
+
+ for (i = 0;;) {
+ ut_print_name(file, trx, FALSE,
+ foreign->referenced_col_names[i]);
+ if (++i < foreign->n_fields) {
+ fputs(", ", file);
+ } else {
+ break;
+ }
+ }
+
+ putc(')', file);
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+ fputs(" ON DELETE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ fputs(" ON DELETE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ fputs(" ON DELETE NO ACTION", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ fputs(" ON UPDATE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ fputs(" ON UPDATE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ fputs(" ON UPDATE NO ACTION", file);
+ }
+}
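+
+/* Example output (illustrative): with add_newline == TRUE the above
+prints a leading comma and newline followed by something like
+	CONSTRAINT `fk_1` FOREIGN KEY (`a`) REFERENCES `t2` (`b`) ON DELETE CASCADE
+where the database name of `t2` is omitted because it is the same as
+that of the foreign table. */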
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table */
+{
+ dict_foreign_t* foreign;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (create_table_format) {
+ dict_print_info_on_foreign_key_in_create_format(
+ file, trx, foreign, TRUE);
+ } else {
+ ulint i;
+ fputs("; (", file);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ putc(' ', file);
+ }
+
+ ut_print_name(file, trx, FALSE,
+ foreign->foreign_col_names[i]);
+ }
+
+ fputs(") REFER ", file);
+ ut_print_name(file, trx, TRUE,
+ foreign->referenced_table_name);
+ putc('(', file);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ putc(' ', file);
+ }
+ ut_print_name(
+ file, trx, FALSE,
+ foreign->referenced_col_names[i]);
+ }
+
+ putc(')', file);
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) {
+ fputs(" ON DELETE CASCADE", file);
+ }
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ fputs(" ON DELETE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ fputs(" ON DELETE NO ACTION", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ fputs(" ON UPDATE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ fputs(" ON UPDATE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ fputs(" ON UPDATE NO ACTION", file);
+ }
+ }
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+ FILE* file, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to print */
+{
+ fputs("index ", file);
+ ut_print_name(file, trx, FALSE, index->name);
+ fputs(" of table ", file);
+ ut_print_name(file, trx, TRUE, index->table_name);
+}
+
+/**********************************************************************//**
+Finds a table in the dict_sys->table_LRU list with the specified space id.
+@return table if found, NULL if not */
+static
+dict_table_t*
+dict_find_table_by_space(
+/*=====================*/
+ ulint space_id) /*!< in: space ID */
+{
+ dict_table_t* table;
+ ulint num_item;
+ ulint count = 0;
+
+ ut_ad(space_id > 0);
+
+ if (dict_sys == NULL) {
+		/* This can happen during redo log processing. */
+ return(NULL);
+ }
+
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ num_item = UT_LIST_GET_LEN(dict_sys->table_LRU);
+
+	/* This function intentionally does not acquire the dictionary
+	mutex, as it is used by error handling code deep in the call
+	stack as a last resort to avoid killing the server, so it is
+	worth risking the consequences. */
+ while (table && count < num_item) {
+ if (table->space == space_id) {
+ return(table);
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ count++;
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Flags a table with the specified space_id as corrupted in the data
+dictionary cache.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+dict_set_corrupted_by_space(
+/*========================*/
+ ulint space_id) /*!< in: space ID */
+{
+ dict_table_t* table;
+
+ table = dict_find_table_by_space(space_id);
+
+ if (!table) {
+ return(FALSE);
+ }
+
+	/* Mark only the table->corrupted bit, since the caller
+	could be too deep in the stack to update SYS_INDEXES */
+ table->corrupted = TRUE;
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Flags an index as corrupted, both in the data dictionary cache
+and in the SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_set_corrupted(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx, /*!< in/out: transaction */
+ const char* ctx) /*!< in: context */
+{
+ mem_heap_t* heap;
+ mtr_t mtr;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ byte* buf;
+ char* table_name;
+ const char* status;
+ btr_cur_t cursor;
+ bool locked = RW_X_LATCH == trx->dict_operation_lock_mode;
+
+ if (!locked) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ ut_ad(index);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+ ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_except_dict());
+#endif
+
+ /* Mark the table as corrupted only if the clustered index
+ is corrupted */
+ if (dict_index_is_clust(index)) {
+ index->table->corrupted = TRUE;
+ }
+
+ if (index->type & DICT_CORRUPT) {
+ /* The index was already flagged corrupted. */
+ ut_ad(!dict_index_is_clust(index) || index->table->corrupted);
+ goto func_exit;
+ }
+
+ heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+ + sizeof(que_fork_t) + sizeof(upd_node_t)
+ + sizeof(upd_t) + 12));
+ mtr_start(&mtr);
+ index->type |= DICT_CORRUPT;
+
+ sys_index = UT_LIST_GET_FIRST(dict_sys->sys_indexes->indexes);
+
+ /* Find the index row in SYS_INDEXES */
+ tuple = dtuple_create(heap, 2);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->table->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dfield = dtuple_get_nth_field(tuple, 1);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->id);
+ dfield_set_data(dfield, buf, 8);
+
+ dict_index_copy_types(tuple, sys_index, 2);
+
+ btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_LE,
+ BTR_MODIFY_LEAF,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (cursor.low_match == dtuple_get_n_fields(tuple)) {
+ /* UPDATE SYS_INDEXES SET TYPE=index->type
+ WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */
+ ulint len;
+ byte* field = rec_get_nth_field_old(
+ btr_cur_get_rec(&cursor),
+ DICT_FLD__SYS_INDEXES__TYPE, &len);
+ if (len != 4) {
+ goto fail;
+ }
+ mlog_write_ulint(field, index->type, MLOG_4BYTES, &mtr);
+ status = "Flagged";
+ } else {
+fail:
+ status = "Unable to flag";
+ }
+
+ mtr_commit(&mtr);
+ mem_heap_empty(heap);
+ table_name = static_cast<char*>(mem_heap_alloc(heap, FN_REFLEN + 1));
+ *innobase_convert_name(
+ table_name, FN_REFLEN,
+ index->table_name, strlen(index->table_name),
+ NULL, TRUE) = 0;
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "%s corruption of %s in table %s in %s",
+ status, index->name, table_name, ctx);
+
+ mem_heap_free(heap);
+
+func_exit:
+ if (!locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+}
+
+/**********************************************************************//**
+Flags an index as corrupted in the data dictionary cache only. This
+is mostly used to mark an index as corrupted when the index's own
+dictionary information is corrupted, and we force such an index to be
+loaded for repair purposes */
+UNIV_INTERN
+void
+dict_set_corrupted_index_cache_only(
+/*================================*/
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(index);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(!dict_table_is_comp(dict_sys->sys_tables));
+ ut_ad(!dict_table_is_comp(dict_sys->sys_indexes));
+
+ /* Mark the table as corrupted only if the clustered index
+ is corrupted */
+ if (dict_index_is_clust(index)) {
+ dict_table_t* corrupt_table;
+
+ corrupt_table = table ? table : index->table;
+ ut_ad(!index->table || !table || index->table == table);
+
+ if (corrupt_table) {
+ corrupt_table->corrupted = TRUE;
+ }
+ }
+
+ index->type |= DICT_CORRUPT;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void)
+/*===============*/
+{
+ dict_table_t* table;
+
+ /* create dummy table and index for REDUNDANT infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0, 0);
+ dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8);
+
+ dict_ind_redundant = dict_mem_index_create("SYS_DUMMY1", "SYS_DUMMY1",
+ DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(dict_ind_redundant, table,
+ dict_table_get_nth_col(table, 0), 0);
+ dict_ind_redundant->table = table;
+
+ /* create dummy table and index for COMPACT infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY2",
+ DICT_HDR_SPACE, 1,
+ DICT_TF_COMPACT, 0);
+ dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8);
+ dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2",
+ DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(dict_ind_compact, table,
+ dict_table_get_nth_col(table, 0), 0);
+ dict_ind_compact->table = table;
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ dict_ind_redundant->cached = dict_ind_compact->cached = TRUE;
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Frees dict_ind_redundant and dict_ind_compact. */
+static
+void
+dict_ind_free(void)
+/*===============*/
+{
+ dict_table_t* table;
+
+ table = dict_ind_compact->table;
+ dict_mem_index_free(dict_ind_compact);
+ dict_ind_compact = NULL;
+ dict_mem_table_free(table);
+
+ table = dict_ind_redundant->table;
+ dict_mem_index_free(dict_ind_redundant);
+ dict_ind_redundant = NULL;
+ dict_mem_table_free(table);
+}
+
+/**********************************************************************//**
+Gets an index by name.
+@return index, NULL if it does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+{
+ dict_index_t* index;
+
+ /* If name is NULL, just return */
+ if (!name) {
+ return(NULL);
+ }
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (innobase_strcasecmp(index->name, name) == 0) {
+
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+UNIV_INTERN
+bool
+dict_foreign_replace_index(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const dict_index_t* index) /*!< in: index to be replaced */
+{
+ bool found = true;
+ dict_foreign_t* foreign;
+
+ ut_ad(index->to_be_dropped);
+ ut_ad(index->table == table);
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+ if (foreign->foreign_index == index) {
+ ut_ad(foreign->foreign_table == index->table);
+
+ dict_index_t* new_index = dict_foreign_find_index(
+ foreign->foreign_table, col_names,
+ foreign->foreign_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE, /*check_null=*/FALSE);
+ if (new_index) {
+ ut_ad(new_index->table == index->table);
+ ut_ad(!new_index->to_be_dropped);
+ } else {
+ found = false;
+ }
+
+ foreign->foreign_index = new_index;
+ }
+ }
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+ if (foreign->referenced_index == index) {
+ ut_ad(foreign->referenced_table == index->table);
+
+ dict_index_t* new_index = dict_foreign_find_index(
+ foreign->referenced_table, NULL,
+ foreign->referenced_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE, /*check_null=*/FALSE);
+ /* There must exist an alternative index,
+ since this must have been checked earlier. */
+ if (new_index) {
+ ut_ad(new_index->table == index->table);
+ ut_ad(!new_index->to_be_dropped);
+ } else {
+ found = false;
+ }
+
+ foreign->referenced_index = new_index;
+ }
+ }
+
+ return(found);
+}
+
+/**********************************************************************//**
+In case there is more than one index with the same name, returns the
+index with the minimum id.
+@return index, NULL if it does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*=====================================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+{
+ dict_index_t* index;
+ dict_index_t* min_index; /* Index with matching name and min(id) */
+
+ min_index = NULL;
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (ut_strcmp(index->name, name) == 0) {
+ if (!min_index || index->id < min_index->id) {
+
+ min_index = index;
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(min_index);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+{
+ /* Check for duplicates, ignoring indexes that are marked
+ as to be dropped */
+
+ const dict_index_t* index1;
+ const dict_index_t* index2;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* The primary index _must_ exist */
+ ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
+
+ index1 = UT_LIST_GET_FIRST(table->indexes);
+
+ do {
+ if (*index1->name == TEMP_INDEX_PREFIX) {
+ ut_a(!dict_index_is_clust(index1));
+
+ switch (check) {
+ case CHECK_ALL_COMPLETE:
+ ut_error;
+ case CHECK_ABORTED_OK:
+ switch (dict_index_get_online_status(index1)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ ut_error;
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ /* fall through */
+ case CHECK_PARTIAL_OK:
+ break;
+ }
+ }
+
+ for (index2 = UT_LIST_GET_NEXT(indexes, index1);
+ index2 != NULL;
+ index2 = UT_LIST_GET_NEXT(indexes, index2)) {
+ ut_ad(ut_strcmp(index1->name, index2->name));
+ }
+
+ index1 = UT_LIST_GET_NEXT(indexes, index1);
+ } while (index1);
+}
+#endif /* UNIV_DEBUG */
+
+/** Auxiliary macro used inside dict_table_schema_check(). */
+#define CREATE_TYPES_NAMES() \
+ dtype_sql_name((unsigned) req_schema->columns[i].mtype, \
+ (unsigned) req_schema->columns[i].prtype_mask, \
+ (unsigned) req_schema->columns[i].len, \
+ req_type, sizeof(req_type)); \
+ dtype_sql_name(table->cols[j].mtype, \
+ table->cols[j].prtype, \
+ table->cols[j].len, \
+ actual_type, sizeof(actual_type))
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ dict_table_schema_t* req_schema, /*!< in/out: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ char buf[MAX_FULL_NAME_LEN];
+ char req_type[64];
+ char actual_type[64];
+ dict_table_t* table;
+ ulint i;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ table = dict_table_get_low(req_schema->table_name);
+
+ if (table == NULL) {
+ /* no such table */
+
+ ut_snprintf(errstr, errstr_sz,
+ "Table %s not found.",
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)));
+
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ if (table->ibd_file_missing) {
+ /* missing tablespace */
+
+ ut_snprintf(errstr, errstr_sz,
+ "Tablespace for table %s is missing.",
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)));
+
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ if ((ulint) table->n_def - DATA_N_SYS_COLS != req_schema->n_cols) {
+ /* the table has a different number of columns than
+ required */
+
+ ut_snprintf(errstr, errstr_sz,
+ "%s has %d columns but should have %lu.",
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ table->n_def - DATA_N_SYS_COLS,
+ req_schema->n_cols);
+
+ return(DB_ERROR);
+ }
+
+ /* For each column from req_schema->columns[] search
+ whether it is present in table->cols[].
+ The following algorithm is O(n_cols^2), but is optimized to
+ be O(n_cols) if the columns are in the same order in both arrays. */
+
+ for (i = 0; i < req_schema->n_cols; i++) {
+ ulint j;
+
+ /* check if i'th column is the same in both arrays */
+ if (innobase_strcasecmp(req_schema->columns[i].name,
+ dict_table_get_col_name(table, i)) == 0) {
+
+ /* we found the column in table->cols[] quickly */
+ j = i;
+ } else {
+
+ /* columns in both arrays are not in the same order,
+ do a full scan of the second array */
+ for (j = 0; j < table->n_def; j++) {
+ const char* name;
+
+ name = dict_table_get_col_name(table, j);
+
+ if (innobase_strcasecmp(name,
+ req_schema->columns[i].name) == 0) {
+
+ /* found the column on j'th
+ position */
+ break;
+ }
+ }
+
+ if (j == table->n_def) {
+
+ ut_snprintf(errstr, errstr_sz,
+ "required column %s "
+ "not found in table %s.",
+ req_schema->columns[i].name,
+ ut_format_name(
+ req_schema->table_name,
+ TRUE, buf, sizeof(buf)));
+
+ return(DB_ERROR);
+ }
+ }
+
+ /* we found a column with the same name on j'th position,
+ compare column types and flags */
+
+ /* check length for exact match */
+ if (req_schema->columns[i].len != table->cols[j].len) {
+
+ CREATE_TYPES_NAMES();
+
+ ut_snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s "
+ "but should be %s (length mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+
+ /* check mtype for exact match */
+ if (req_schema->columns[i].mtype != table->cols[j].mtype) {
+
+ CREATE_TYPES_NAMES();
+
+ ut_snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s "
+ "but should be %s (type mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+
+ /* check whether required prtype mask is set */
+ if (req_schema->columns[i].prtype_mask != 0
+ && (table->cols[j].prtype
+ & req_schema->columns[i].prtype_mask)
+ != req_schema->columns[i].prtype_mask) {
+
+ CREATE_TYPES_NAMES();
+
+ ut_snprintf(errstr, errstr_sz,
+ "Column %s in table %s is %s "
+ "but should be %s (flags mismatch).",
+ req_schema->columns[i].name,
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ actual_type, req_type);
+
+ return(DB_ERROR);
+ }
+ }
+
+ if (req_schema->n_foreign != table->foreign_set.size()) {
+ ut_snprintf(
+ errstr, errstr_sz,
+ "Table %s has " ULINTPF " foreign key(s) pointing"
+ " to other tables, but it must have %lu.",
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ static_cast<ulint>(table->foreign_set.size()),
+ req_schema->n_foreign);
+ return(DB_ERROR);
+ }
+
+ if (req_schema->n_referenced != table->referenced_set.size()) {
+ ut_snprintf(
+ errstr, errstr_sz,
+ "There are " ULINTPF " foreign key(s) pointing to %s, "
+ "but there must be %lu.",
+ static_cast<ulint>(table->referenced_set.size()),
+ ut_format_name(req_schema->table_name,
+ TRUE, buf, sizeof(buf)),
+ req_schema->n_referenced);
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+/* @} */
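+
+/* A hedged usage sketch of dict_table_schema_check(). The column
+metadata type name dict_col_meta_t and the concrete values are
+illustrative assumptions, not definitions from this file; the field
+names are the ones read by the function above. The caller must hold
+dict_sys->mutex, as asserted at the top of the function.
+
+ dict_col_meta_t col = {"ID", DATA_BINARY, 0, 0};
+ dict_table_schema_t schema;
+ char errstr[512];
+
+ schema.table_name = "test/t1";
+ schema.n_cols = 1;
+ schema.columns = &col;
+ schema.n_foreign = 0;
+ schema.n_referenced = 0;
+
+ mutex_enter(&dict_sys->mutex);
+ dberr_t err = dict_table_schema_check(&schema, errstr, sizeof(errstr));
+ mutex_exit(&dict_sys->mutex);
+*/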
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+UNIV_INTERN
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: dbname_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+{
+ char db[MAX_DATABASE_NAME_LEN + 1];
+ ulint db_len;
+ uint errors;
+
+ db_len = dict_get_db_name_len(db_and_table);
+
+ ut_a(db_len <= sizeof(db));
+
+ memcpy(db, db_and_table, db_len);
+ db[db_len] = '\0';
+
+ strconvert(
+ &my_charset_filename, db, system_charset_info,
+ db_utf8, static_cast<uint>(db_utf8_size), &errors);
+
+ /* convert each # to @0023 in table name and store the result in buf */
+ const char* table = dict_remove_db_name(db_and_table);
+ const char* table_p;
+ char buf[MAX_TABLE_NAME_LEN * 5 + 1];
+ char* buf_p;
+ for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) {
+ if (table_p[0] != '#') {
+ buf_p[0] = table_p[0];
+ buf_p++;
+ } else {
+ buf_p[0] = '@';
+ buf_p[1] = '0';
+ buf_p[2] = '0';
+ buf_p[3] = '2';
+ buf_p[4] = '3';
+ buf_p += 5;
+ }
+ ut_a((size_t) (buf_p - buf) < sizeof(buf));
+ }
+ buf_p[0] = '\0';
+
+ errors = 0;
+ strconvert(
+ &my_charset_filename, buf, system_charset_info,
+ table_utf8, static_cast<uint>(table_utf8_size),
+ &errors);
+
+ if (errors != 0) {
+ ut_snprintf(table_utf8, table_utf8_size, "%s%s",
+ srv_mysql50_table_name_prefix, table);
+ }
+}
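+
+/* A worked example of the escaping above, using a hypothetical
+partitioned table name: for db_and_table == "test/t1#P#p0" the
+database part "test" is converted directly, while the table part is
+first rewritten in buf as "t1@0023P@0023p0" so that the '#'
+characters (which would otherwise not convert cleanly from
+my_charset_filename) round-trip, yielding "t1#P#p0" in UTF8. */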
+
+/**********************************************************************//**
+Closes the data dictionary module. */
+UNIV_INTERN
+void
+dict_close(void)
+/*============*/
+{
+ ulint i;
+
+ /* Free the hash elements. We don't remove them from the table
+ because we are going to destroy the table anyway. */
+ for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) {
+ dict_table_t* table;
+
+ table = static_cast<dict_table_t*>(
+ HASH_GET_FIRST(dict_sys->table_hash, i));
+
+ while (table) {
+ dict_table_t* prev_table = table;
+
+ table = static_cast<dict_table_t*>(
+ HASH_GET_NEXT(name_hash, prev_table));
+#ifdef UNIV_DEBUG
+ ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N);
+#endif
+ /* Acquire only because it's a pre-condition. */
+ mutex_enter(&dict_sys->mutex);
+
+ dict_table_remove_from_cache(prev_table);
+
+ mutex_exit(&dict_sys->mutex);
+ }
+ }
+
+ hash_table_free(dict_sys->table_hash);
+
+ /* The elements are the same instance as in dict_sys->table_hash,
+ therefore we don't delete the individual elements. */
+ hash_table_free(dict_sys->table_id_hash);
+
+ dict_ind_free();
+
+ mutex_free(&dict_sys->mutex);
+
+ rw_lock_free(&dict_operation_lock);
+ memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock));
+
+ if (!srv_read_only_mode) {
+ mutex_free(&dict_foreign_err_mutex);
+ }
+
+ mem_free(dict_sys);
+ dict_sys = NULL;
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate the dictionary table LRU list.
+@return TRUE if valid */
+static
+ibool
+dict_lru_validate(void)
+/*===================*/
+{
+ dict_table_t* table;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(table->can_be_evicted);
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(!table->can_be_evicted);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Check if a table exists in the dict table LRU list.
+@return TRUE if table found in LRU list */
+static
+ibool
+dict_lru_find_table(
+/*================*/
+ const dict_table_t* find_table) /*!< in: table to find */
+{
+ dict_table_t* table;
+
+ ut_ad(find_table != NULL);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(table->can_be_evicted);
+
+ if (table == find_table) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Check if a table exists in the dict table non-LRU list.
+@return TRUE if table found in non-LRU list */
+static
+ibool
+dict_non_lru_find_table(
+/*====================*/
+ const dict_table_t* find_table) /*!< in: table to find */
+{
+ dict_table_t* table;
+
+ ut_ad(find_table != NULL);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+ table != NULL;
+ table = UT_LIST_GET_NEXT(table_LRU, table)) {
+
+ ut_a(!table->can_be_evicted);
+
+ if (table == find_table) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Check an index to see whether its first fields are the columns in the
+array, in the same order, and that the index is not marked for deletion
+and is not the same as types_idx.
+@return true if the index qualifies, otherwise false */
+UNIV_INTERN
+bool
+dict_foreign_qualify_index(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* index, /*!< in: index to check */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+{
+ if (dict_index_get_n_fields(index) < n_cols) {
+ return(false);
+ }
+
+ for (ulint i = 0; i < n_cols; i++) {
+ dict_field_t* field;
+ const char* col_name;
+ ulint col_no;
+
+ field = dict_index_get_nth_field(index, i);
+ col_no = dict_col_get_no(field->col);
+
+ if (field->prefix_len != 0) {
+ /* We do not accept column prefix
+ indexes here */
+ return(false);
+ }
+
+ if (check_null
+ && (field->col->prtype & DATA_NOT_NULL)) {
+ return(false);
+ }
+
+ col_name = col_names
+ ? col_names[col_no]
+ : dict_table_get_col_name(table, col_no);
+
+ if (0 != innobase_strcasecmp(columns[i], col_name)) {
+ return(false);
+ }
+
+ if (types_idx && !cmp_cols_are_equal(
+ dict_index_get_nth_col(index, i),
+ dict_index_get_nth_col(types_idx, i),
+ check_charsets)) {
+ return(false);
+ }
+ }
+
+ return(true);
+}
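+
+/* Illustrative cases for the checks above (the column names are
+hypothetical): a foreign key on (a, b) is satisfied by an index on
+(a, b, c), because only the first n_cols fields must match; an index
+on (a(10), b) is rejected because of the column prefix; and when
+check_null is nonzero, the index is rejected if any of its first
+n_cols columns is declared NOT NULL. */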
+
+/*********************************************************************//**
+Update the state of the compression failure padding heuristics. This is
+called whenever a compression operation succeeds or fails.
+The caller must be holding info->mutex. */
+static
+void
+dict_index_zip_pad_update(
+/*======================*/
+ zip_pad_info_t* info, /*!< in/out: info to be updated */
+ ulint zip_threshold) /*!< in: zip threshold value */
+{
+ ulint total;
+ ulint fail_pct;
+
+ ut_ad(info);
+
+ total = info->success + info->failure;
+
+ ut_ad(total > 0);
+
+ if (zip_threshold == 0) {
+ /* User has just disabled the padding. */
+ return;
+ }
+
+ if (total < ZIP_PAD_ROUND_LEN) {
+ /* We are in the middle of a round. Do nothing. */
+ return;
+ }
+
+ /* We are at a 'round' boundary. Reset the values but first
+ calculate the fail rate for our heuristic. */
+ fail_pct = (info->failure * 100) / total;
+ info->failure = 0;
+ info->success = 0;
+
+ if (fail_pct > zip_threshold) {
+ /* Compression failures exceed the user-defined
+ threshold. Increase the pad size to reduce the chance
+ of further compression failures. */
+ ut_ad(info->pad % ZIP_PAD_INCR == 0);
+
+ /* Only increment if it won't take the padding
+ beyond the maximum pad size. */
+ if (info->pad + ZIP_PAD_INCR
+ < (UNIV_PAGE_SIZE * zip_pad_max) / 100) {
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* Use atomics even though we have the mutex.
+ This is to ensure that we are able to read
+ info->pad atomically where atomics are
+ supported. */
+ os_atomic_increment_ulint(&info->pad, ZIP_PAD_INCR);
+#else /* HAVE_ATOMIC_BUILTINS */
+ info->pad += ZIP_PAD_INCR;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ MONITOR_INC(MONITOR_PAD_INCREMENTS);
+ }
+
+ info->n_rounds = 0;
+
+ } else {
+ /* Failure rate was OK. Another successful round
+ completed. */
+ ++info->n_rounds;
+
+ /* If enough successful rounds are completed with the
+ compression failure rate under control, decrease the
+ padding. */
+ if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT
+ && info->pad > 0) {
+
+ ut_ad(info->pad % ZIP_PAD_INCR == 0);
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* Use atomics even though we have the mutex.
+ This is to ensure that we are able to read
+ info->pad atomically where atomics are
+ supported. */
+ os_atomic_decrement_ulint(&info->pad, ZIP_PAD_INCR);
+#else /* HAVE_ATOMIC_BUILTINS */
+ info->pad -= ZIP_PAD_INCR;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ info->n_rounds = 0;
+
+ MONITOR_INC(MONITOR_PAD_DECREMENTS);
+ }
+ }
+}
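+
+/* A worked round of the heuristic above, assuming ZIP_PAD_ROUND_LEN
+and ZIP_PAD_INCR are both 128 (their values are defined elsewhere):
+with info->success == 112 and info->failure == 16, total reaches 128
+and ends the round, and fail_pct == (16 * 100) / 128 == 12. With
+zip_threshold == 5 the failure rate exceeds the threshold, so
+info->pad grows by ZIP_PAD_INCR (bounded above by UNIV_PAGE_SIZE *
+zip_pad_max / 100); with zip_threshold == 25 the same round instead
+counts toward ZIP_PAD_SUCCESSFUL_ROUND_LIMIT. */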
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+{
+ ut_ad(index);
+
+ ulint zip_threshold = zip_failure_threshold_pct;
+ if (!zip_threshold) {
+ /* Disabled by user. */
+ return;
+ }
+
+ os_fast_mutex_lock(&index->zip_pad.mutex);
+ ++index->zip_pad.success;
+ dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+ os_fast_mutex_unlock(&index->zip_pad.mutex);
+}
+
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+{
+ ut_ad(index);
+
+ ulint zip_threshold = zip_failure_threshold_pct;
+ if (!zip_threshold) {
+ /* Disabled by user. */
+ return;
+ }
+
+ os_fast_mutex_lock(&index->zip_pad.mutex);
+ ++index->zip_pad.failure;
+ dict_index_zip_pad_update(&index->zip_pad, zip_threshold);
+ os_fast_mutex_unlock(&index->zip_pad.mutex);
+}
+
+
+/*********************************************************************//**
+Return the optimal page size, for which the page will likely compress.
+@return page size beyond which the page might not compress */
+UNIV_INTERN
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+{
+ ulint pad;
+ ulint min_sz;
+ ulint sz;
+
+ ut_ad(index);
+
+ if (!zip_failure_threshold_pct) {
+ /* Disabled by user. */
+ return(UNIV_PAGE_SIZE);
+ }
+
+ /* We use atomics to read index->zip_pad.pad. Here we use zero
+ as the increment, as we are not changing the value of 'pad'. On
+ platforms where atomics are not available we grab the mutex. */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ pad = os_atomic_increment_ulint(&index->zip_pad.pad, 0);
+#else /* HAVE_ATOMIC_BUILTINS */
+ os_fast_mutex_lock(&index->zip_pad.mutex);
+ pad = index->zip_pad.pad;
+ os_fast_mutex_unlock(&index->zip_pad.mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ ut_ad(pad < UNIV_PAGE_SIZE);
+ sz = UNIV_PAGE_SIZE - pad;
+
+ /* Min size allowed by user. */
+ ut_ad(zip_pad_max < 100);
+ min_sz = (UNIV_PAGE_SIZE * (100 - zip_pad_max)) / 100;
+
+ return(ut_max(sz, min_sz));
+}
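+
+/* A worked example of the computation above, assuming UNIV_PAGE_SIZE
+== 16384, a current pad of 2048 and zip_pad_max == 50: sz == 16384 -
+2048 == 14336 and min_sz == (16384 * (100 - 50)) / 100 == 8192, so
+the function returns 14336. However much the pad grows, the result
+never drops below the user-controlled min_sz floor. */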
+
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name. */
+UNIV_INTERN
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag) /*!< in: row format setting */
+{
+ switch (dict_tf_get_rec_format(table_flag)) {
+ case REC_FORMAT_REDUNDANT:
+ return("ROW_TYPE_REDUNDANT");
+ case REC_FORMAT_COMPACT:
+ return("ROW_TYPE_COMPACT");
+ case REC_FORMAT_COMPRESSED:
+ return("ROW_TYPE_COMPRESSED");
+ case REC_FORMAT_DYNAMIC:
+ return("ROW_TYPE_DYNAMIC");
+ }
+
+ ut_error;
+ return(0);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
new file mode 100644
index 00000000000..69211990bfa
--- /dev/null
+++ b/storage/innobase/dict/dict0load.cc
@@ -0,0 +1,3147 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0load.cc
+Loads database object definitions from the data dictionary tables into
+the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0load.h"
+#include "mysql_version.h"
+
+#ifdef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0stats.h"
+#include "rem0cmp.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "dict0crea.h"
+#include "dict0priv.h"
+#include "ha_prototypes.h" /* innobase_casedn_str() */
+#include "fts0priv.h"
+
+/** Following are the InnoDB system tables. The positions in
+this array are referenced by enum dict_system_table_id. */
+static const char* SYSTEM_TABLE_NAME[] = {
+ "SYS_TABLES",
+ "SYS_INDEXES",
+ "SYS_COLUMNS",
+ "SYS_FIELDS",
+ "SYS_FOREIGN",
+ "SYS_FOREIGN_COLS",
+ "SYS_TABLESPACES",
+ "SYS_DATAFILES"
+};
+
+/* If this flag is TRUE, then we will load the clustered index's (and
+the table's) metadata even if it is marked as "corrupted". */
+UNIV_INTERN my_bool srv_load_corrupted = FALSE;
+
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Compare the name of an index column.
+@return TRUE if the i'th column of index is 'name'. */
+static
+ibool
+name_of_col_is(
+/*===========*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: index field offset */
+ const char* name) /*!< in: name to compare to */
+{
+ ulint tmp = dict_col_get_no(dict_field_get_col(
+ dict_index_get_nth_field(
+ index, i)));
+
+ return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if does not exist; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name) /*!< in: database name which ends in '/' */
+{
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_ad(!dict_table_is_comp(sys_tables));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name, ut_strlen(name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ if (len < strlen(name)
+ || ut_memcmp(name, field, strlen(name)) != 0) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+
+ char* table_name = mem_strdupl((char*) field, len);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table_name);
+ }
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+}
+
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void)
+/*============*/
+{
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ /* Enlarge the fatal semaphore wait timeout during the InnoDB table
+ monitor printout */
+
+ os_increment_counter_by_amount(
+ server_mutex,
+ srv_fatal_semaphore_wait_threshold,
+ SRV_SEMAPHORE_WAIT_EXTENSION);
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&(dict_sys->mutex));
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+ while (rec) {
+ const char* err_msg;
+
+ err_msg = static_cast<const char*>(
+ dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table, DICT_TABLE_LOAD_FROM_CACHE,
+ &mtr));
+
+ if (!err_msg) {
+ dict_table_print(table);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n", err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&(dict_sys->mutex));
+ mem_heap_free(heap);
+
+ /* Restore the fatal semaphore wait timeout */
+ os_decrement_counter_by_amount(
+ server_mutex,
+ srv_fatal_semaphore_wait_threshold,
+ SRV_SEMAPHORE_WAIT_EXTENSION);
+}
+
+/********************************************************************//**
+This function gets the next system table record as it scans the table.
+@return the next record if found, NULL if end of scan */
+static
+const rec_t*
+dict_getnext_system_low(
+/*====================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor to the
+ record*/
+ mtr_t* mtr) /*!< in: the mini-transaction */
+{
+ rec_t* rec = NULL;
+
+ while (!rec || rec_get_deleted_flag(rec, 0)) {
+ btr_pcur_move_to_next_user_rec(pcur, mtr);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!btr_pcur_is_on_user_rec(pcur)) {
+ /* end of index */
+ btr_pcur_close(pcur);
+
+ return(NULL);
+ }
+ }
+
+ /* Get a record, let's save the position */
+ btr_pcur_store_position(pcur, mtr);
+
+ return(rec);
+}
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+UNIV_INTERN
+const rec_t*
+dict_startscan_system(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor to
+ the record */
+ mtr_t* mtr, /*!< in: the mini-transaction */
+ dict_system_id_t system_id) /*!< in: which system table to open */
+{
+ dict_table_t* system_table;
+ dict_index_t* clust_index;
+ const rec_t* rec;
+
+ ut_a(system_id < SYS_NUM_SYSTEM_TABLES);
+
+ system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id]);
+
+ clust_index = UT_LIST_GET_FIRST(system_table->indexes);
+
+ btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur,
+ true, 0, mtr);
+
+ rec = dict_getnext_system_low(pcur, mtr);
+
+ return(rec);
+}
+
+/********************************************************************//**
+This function gets the next system table record as it scans the table.
+@return the next record if found, NULL if end of scan */
+UNIV_INTERN
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr) /*!< in: the mini-transaction */
+{
+ const rec_t* rec;
+
+ /* Restore the position */
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ /* Get the next record */
+ rec = dict_getnext_system_low(pcur, mtr);
+
+ return(rec);
+}
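+
+/* The intended scan pattern for this pair of cursor functions, as
+used by dict_print() above: each record is consumed under its own
+mini-transaction, and the persistent cursor carries the position
+across the mtr commits.
+
+ mtr_start(&mtr);
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+ while (rec) {
+ ... process rec, committing the mtr ...
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+ mtr_commit(&mtr);
+*/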
+
+/********************************************************************//**
+This function processes one SYS_TABLES record and populates the dict_table_t
+struct for the table. Extracted out of dict_print() to be used by
+both monitor table output and information schema innodb_sys_tables output.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table, /*!< out: dict_table_t to fill */
+ dict_table_info_t status, /*!< in: status bit controls
+ options such as whether we shall
+ look for dict_table_t from cache
+ first */
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ will be committed */
+{
+ ulint len;
+ const char* field;
+ const char* err_msg = NULL;
+ char* table_name;
+
+ field = (const char*) rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+
+ /* Get the table name */
+ table_name = mem_heap_strdupl(heap, field, len);
+
+ /* If DICT_TABLE_LOAD_FROM_CACHE is set, first check
+ whether there is cached dict_table_t struct */
+ if (status & DICT_TABLE_LOAD_FROM_CACHE) {
+
+ /* Commit before loading the table again */
+ mtr_commit(mtr);
+
+ *table = dict_table_get_low(table_name);
+
+ if (!(*table)) {
+ err_msg = "Table not found in cache";
+ }
+ } else {
+ err_msg = dict_load_table_low(table_name, rec, table);
+ mtr_commit(mtr);
+ }
+
+ if (err_msg) {
+ return(err_msg);
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: index to be filled */
+ table_id_t* table_id) /*!< out: index table id */
+{
+ const char* err_msg;
+ byte* buf;
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+
+ /* Parse the record, and get "dict_index_t" struct filled */
+ err_msg = dict_load_index_low(buf, NULL,
+ heap, rec, FALSE, &index);
+
+ *table_id = mach_read_from_8(buf);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name) /*!< out: column name */
+{
+ const char* err_msg;
+
+ /* Parse the record, and get "dict_col_t" struct filled */
+ err_msg = dict_load_column_low(NULL, heap, column,
+ table_id, col_name, rec);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id) /*!< in: previous index id */
+{
+ byte* buf;
+ byte* last_index_id;
+ const char* err_msg;
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+
+ last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(last_index_id, last_id);
+
+ err_msg = dict_load_field_low(buf, NULL, sys_field,
+ pos, last_index_id, heap, rec);
+
+ *index_id = mach_read_from_8(buf);
+
+ return(err_msg);
+}
+
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign) /*!< out: dict_foreign_t struct
+ to be filled */
+{
+ ulint len;
+ const byte* field;
+ ulint n_fields_and_type;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_FOREIGN");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) {
+ return("wrong number of columns in SYS_FOREIGN record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__ID, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_FOREIGN");
+ }
+
+ /* This receives a dict_foreign_t* that points to a stack variable,
+ so mem_heap_free(foreign->heap) is not used as it is elsewhere.
+ Since the heap used here is freed elsewhere, foreign->heap
+ is not assigned. */
+ foreign->id = mem_heap_strdupl(heap, (const char*) field, len);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ /* The _lookup versions of the referenced and foreign table names
+ are not assigned since they are not used in this dict_foreign_t */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ foreign->foreign_table_name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ foreign->referenced_table_name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ n_fields_and_type = mach_read_from_4(field);
+
+ foreign->type = (unsigned int) (n_fields_and_type >> 24);
+ foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
+
+ return(NULL);
+}
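+
+/* The N_COLS field of SYS_FOREIGN packs the constraint type and the
+column count into one 32-bit value, decoded above as type ==
+n_fields_and_type >> 24 and n_fields == n_fields_and_type & 0x3FF.
+For example, the (hypothetical) stored value 0x04000002 decodes to
+type 0x04 with a two-column foreign key. */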
+
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos) /*!< out: column position */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_FOREIGN_COLS");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) {
+ return("wrong number of columns in SYS_FOREIGN_COLS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_FOREIGN_COLS");
+ }
+ *name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ *pos = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *for_col_name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *ref_col_name = mem_heap_strdupl(heap, (char*) field, len);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_TABLESPACES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tablespaces(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
+ ulint* space, /*!< out: space id */
+ const char** name, /*!< out: tablespace name */
+ ulint* flags) /*!< out: tablespace flags */
+{
+ ulint len;
+ const byte* field;
+
+ /* Initialize the output values */
+ *space = ULINT_UNDEFINED;
+ *name = NULL;
+ *flags = ULINT_UNDEFINED;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_TABLESPACES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) {
+ return("wrong number of columns in SYS_TABLESPACES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
+ if (len != DICT_FLD_LEN_SPACE) {
+err_len:
+ return("incorrect column length in SYS_TABLESPACES");
+ }
+ *space = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
+ if (len != DICT_FLD_LEN_FLAGS) {
+ goto err_len;
+ }
+ *flags = mach_read_from_4(field);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
+ ulint* space, /*!< out: space id */
+ const char** path) /*!< out: datafile paths */
+{
+ ulint len;
+ const byte* field;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_DATAFILES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) {
+ return("wrong number of columns in SYS_DATAFILES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
+ if (len != DICT_FLD_LEN_SPACE) {
+err_len:
+ return("incorrect column length in SYS_DATAFILES");
+ }
+ *space = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ *path = mem_heap_strdupl(heap, (char*) field, len);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS.
+@return ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */
+static
+ulint
+dict_sys_tables_get_flags(
+/*======================*/
+ const rec_t* rec) /*!< in: a record of SYS_TABLES */
+{
+ const byte* field;
+ ulint len;
+ ulint type;
+ ulint n_cols;
+
+ /* read the 4 byte flags from the TYPE field */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ ut_a(len == 4);
+ type = mach_read_from_4(field);
+
+ /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in
+ dict_table_t::flags the low order bit is used to determine if the
+ row format is Redundant or Compact when the format is Antelope.
+ Read the 4 byte N_COLS field and look at the high order bit. It
+ should be set for COMPACT and later. It should not be set for
+ REDUNDANT. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+ ut_a(len == 4);
+ n_cols = mach_read_from_4(field);
+
+ /* This validation function also combines the DICT_N_COLS_COMPACT
+ flag in n_cols into the type field to effectively make it a
+ dict_table_t::flags. */
+
+ if (ULINT_UNDEFINED == dict_sys_tables_type_validate(type, n_cols)) {
+ return(ULINT_UNDEFINED);
+ }
+
+ return(dict_sys_tables_type_to_tf(type, n_cols));
+}
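+
+/* Sketch of the decoding above, per the comments in the function:
+for a ROW_FORMAT=REDUNDANT table, SYS_TABLES.TYPE is 1 and the
+high-order DICT_N_COLS_COMPACT bit of N_COLS is clear, so the
+resulting dict_table_t::flags have the low-order bit 0; for a
+COMPACT table the same TYPE combines with the set high-order bit of
+N_COLS to produce flags with the low-order bit 1. */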
+
+/********************************************************************//**
+Gets the filepath for a space ID from SYS_DATAFILES and checks it against
+the contents of a link file. This function is called when there is no
+fil_node_t entry for this space ID, so both durable locations on disk
+must be checked and compared.
+We use a temporary heap here for the table lookup, but not for the path
+returned, which the caller must free.
+This function can return NULL if the space ID is not found in SYS_DATAFILES,
+in which case the caller will assume that the ibd file is in the normal datadir.
+@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for
+the given space ID. NULL if space ID is zero or not found. */
+UNIV_INTERN
+char*
+dict_get_first_path(
+/*================*/
+ ulint space, /*!< in: space id */
+ const char* name) /*!< in: tablespace name */
+{
+ mtr_t mtr;
+ dict_table_t* sys_datafiles;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ byte* buf;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ char* dict_filepath = NULL;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_datafiles = dict_table_get_low("SYS_DATAFILES");
+ sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes);
+ ut_ad(!dict_table_is_comp(sys_datafiles));
+ ut_ad(name_of_col_is(sys_datafiles, sys_index,
+ DICT_FLD__SYS_DATAFILES__SPACE, "SPACE"));
+ ut_ad(name_of_col_is(sys_datafiles, sys_index,
+ DICT_FLD__SYS_DATAFILES__PATH, "PATH"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(dfield, buf, 4);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* If the file-per-table tablespace was created with
+ an earlier version of InnoDB, then this record is not
+ in SYS_DATAFILES. But a link file still might exist. */
+
+ if (btr_pcur_is_on_user_rec(&pcur)) {
+ /* A record for this space ID was found. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
+ ut_a(len > 0 || len == UNIV_SQL_NULL);
+ ut_a(len < OS_FILE_MAX_PATH);
+ dict_filepath = mem_strdupl((char*) field, len);
+ ut_a(dict_filepath);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(dict_filepath);
+}
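+
+/* A minimal caller sketch: the returned path is allocated with
+mem_strdupl() and must be released by the caller, as noted above.
+
+ char* path = dict_get_first_path(space_id, name);
+ if (path != NULL) {
+ ... use path ...
+ mem_free(path);
+ }
+*/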
+
+/********************************************************************//**
+Update the record for space_id in SYS_TABLESPACES to this filepath.
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_update_filepath(
+/*=================*/
+ ulint space_id, /*!< in: space id */
+ const char* filepath) /*!< in: filepath */
+{
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "update filepath";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "space", space_id);
+ pars_info_add_str_literal(info, "path", filepath);
+
+ err = que_eval_sql(info,
+ "PROCEDURE UPDATE_FILEPATH () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :path\n"
+ " WHERE SPACE = :space;\n"
+ "END;\n", FALSE, trx);
+
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx_free_for_background(trx);
+
+ if (err == DB_SUCCESS) {
+ /* We just updated SYS_DATAFILES due to the contents
+ of a link file. Make a note that we did this. */
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The InnoDB data dictionary table SYS_DATAFILES "
+ "for tablespace ID %lu was updated to use file %s.",
+ (ulong) space_id, filepath);
+ } else {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Problem updating InnoDB data dictionary table "
+ "SYS_DATAFILES for tablespace ID %lu to file %s.",
+ (ulong) space_id, filepath);
+ }
+
+ return(err);
+}
+
+/********************************************************************//**
+Insert records into SYS_TABLESPACES and SYS_DATAFILES.
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_insert_tablespace_and_filepath(
+/*================================*/
+ ulint space, /*!< in: space id */
+ const char* name, /*!< in: tablespace name */
+ const char* filepath, /*!< in: filepath */
+ ulint fsp_flags) /*!< in: tablespace flags */
+{
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(filepath);
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "insert tablespace and filepath";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ /* A record for this space ID was not found in
+ SYS_DATAFILES. Assume the record is also missing in
+ SYS_TABLESPACES. Insert records into both. */
+ err = dict_create_add_tablespace_to_dictionary(
+ space, name, fsp_flags, filepath, trx, false);
+
+ trx_commit_for_mysql(trx);
+ trx->dict_operation_lock_mode = 0;
+ trx_free_for_background(trx);
+
+ return(err);
+}
+
+/********************************************************************//**
+This function looks at each table defined in SYS_TABLES. It checks the
+tablespace for any table with a space_id > 0. It looks up the tablespace
+in SYS_DATAFILES to ensure the correct path.
+
+During crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+ dict_check_t dict_check) /*!< in: how to check */
+{
+ dict_table_t* sys_tables;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ ulint max_space_id;
+ mtr_t mtr;
+
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&(dict_sys->mutex));
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_ad(!dict_table_is_comp(sys_tables));
+
+ max_space_id = mtr_read_ulint(dict_hdr_get(&mtr)
+ + DICT_HDR_MAX_SPACE_ID,
+ MLOG_4BYTES, &mtr);
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ btr_pcur_open_at_index_side(true, sys_index, BTR_SEARCH_LEAF, &pcur,
+ true, 0, &mtr);
+loop:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* end of index */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ /* We must make the tablespace cache aware of the biggest
+ known space id */
+
+ /* printf("Biggest space id in data dictionary %lu\n",
+ max_space_id); */
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ mutex_exit(&(dict_sys->mutex));
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ return;
+ }
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+ const byte* field;
+ ulint len;
+ ulint space_id;
+ ulint flags;
+ char* name;
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ name = mem_strdupl((char*) field, len);
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), name, FALSE);
+
+ flags = dict_sys_tables_get_flags(rec);
+ if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+ /* Read again the 4 bytes from rec. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ ut_ad(len == 4); /* this was checked earlier */
+ flags = mach_read_from_4(field);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Table '%s' in InnoDB data dictionary"
+ " has unknown type %lx", table_name, flags);
+ mem_free(name);
+ goto loop;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+ ut_a(len == 4);
+
+ space_id = mach_read_from_4(field);
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* For tables created with old versions of InnoDB,
+ SYS_TABLES.MIX_LEN may contain garbage. Such tables
+ would always be in ROW_FORMAT=REDUNDANT. Pretend that
+ all such tables are non-temporary. That is, do not
+ suppress error printouts about temporary or discarded
+ tablespaces not being found. */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+
+ bool is_temp = false;
+ bool discarded = false;
+ ib_uint32_t flags2 = static_cast<ib_uint32_t>(
+ mach_read_from_4(field));
+
+ /* Check that the tablespace (the .ibd file) really
+ exists; print a warning to the .err log if not.
+ Do not print warnings for temporary tables or for
+ tablespaces that have been discarded. */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+
+ /* MIX_LEN valid only for ROW_FORMAT > REDUNDANT. */
+ if (mach_read_from_4(field) & DICT_N_COLS_COMPACT) {
+
+ is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
+ discarded = !!(flags2 & DICT_TF2_DISCARDED);
+ }
+
+ if (space_id == 0) {
+ /* The system tablespace always exists. */
+ ut_ad(!discarded);
+ goto next_tablespace;
+ }
+
+ switch (dict_check) {
+ case DICT_CHECK_ALL_LOADED:
+ /* All tablespaces should have been found in
+ fil_load_single_table_tablespaces(). */
+ if (fil_space_for_table_exists_in_mem(
+ space_id, name, TRUE, !(is_temp || discarded),
+ false, NULL, 0)
+ && !(is_temp || discarded)) {
+ /* If the user changes the path of .ibd files in
+ *.isl files before crash recovery, this leads to an
+ inconsistency in the SYS_DATAFILES system table,
+ because the tables are loaded from the updated path
+ while SYS_DATAFILES still points to the old path.
+ Therefore, after crash recovery, update
+ SYS_DATAFILES with the updated path. */
+ ut_ad(space_id);
+ ut_ad(recv_needed_recovery);
+ char *dict_path = dict_get_first_path(space_id,
+ name);
+ char *remote_path = fil_read_link_file(name);
+ if (dict_path && remote_path) {
+ if (strcmp(dict_path, remote_path)) {
+ dict_update_filepath(space_id,
+ remote_path);
+ }
+ }
+ if (dict_path) {
+ mem_free(dict_path);
+ }
+ if (remote_path) {
+ mem_free(remote_path);
+ }
+ }
+ break;
+
+ case DICT_CHECK_SOME_LOADED:
+ /* Some tablespaces may have been opened in
+ trx_resurrect_table_locks(). */
+ if (fil_space_for_table_exists_in_mem(
+ space_id, name, FALSE, FALSE,
+ false, NULL, 0)) {
+ break;
+ }
+ /* fall through */
+ case DICT_CHECK_NONE_LOADED:
+ if (discarded) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "DISCARD flag set for table '%s',"
+ " ignored.",
+ table_name);
+ break;
+ }
+
+ /* It is a normal database startup: create the
+ space object and check that the .ibd file exists.
+ If the table uses a remote tablespace, look for the
+ space_id in SYS_DATAFILES to find the filepath */
+
+ /* Use the remote filepath if known. */
+ char* filepath = NULL;
+ if (DICT_TF_HAS_DATA_DIR(flags)) {
+ filepath = dict_get_first_path(
+ space_id, name);
+ }
+
+ /* We set the 2nd param (fix_dict = true)
+ here because we already have an x-lock on
+ dict_operation_lock and dict_sys->mutex. Besides,
+ this is at startup and we are now single threaded.
+ If the filepath is not known, it will need to
+ be discovered. */
+ dberr_t err = fil_open_single_table_tablespace(
+ false, !srv_read_only_mode,
+ space_id, dict_tf_to_fsp_flags(flags),
+ name, filepath);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tablespace open failed for '%s', "
+ "ignored.", table_name);
+ }
+
+ if (filepath) {
+ mem_free(filepath);
+ }
+
+ break;
+ }
+
+ if (space_id > max_space_id) {
+ max_space_id = space_id;
+ }
+
+next_tablespace:
+ mem_free(name);
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+ }
+
+ goto loop;
+}
+
+/********************************************************************//**
+Loads a table column definition from a SYS_COLUMNS record to
+dict_table_t.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_column_low(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table, could be NULL
+ if we just populate a dict_column_t
+ struct with information from
+ a SYS_COLUMNS record */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ dict_col_t* column, /*!< out: dict_column_t to fill,
+ or NULL if table != NULL */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ const rec_t* rec) /*!< in: SYS_COLUMNS record */
+{
+ char* name;
+ const byte* field;
+ ulint len;
+ ulint mtype;
+ ulint prtype;
+ ulint col_len;
+ ulint pos;
+
+ ut_ad(table || column);
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_COLUMNS");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) {
+ return("wrong number of columns in SYS_COLUMNS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_COLUMNS");
+ }
+
+ if (table_id) {
+ *table_id = mach_read_from_8(field);
+ } else if (table->id != mach_read_from_8(field)) {
+ return("SYS_COLUMNS.TABLE_ID mismatch");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__POS, &len);
+ if (len != 4) {
+
+ goto err_len;
+ }
+
+ pos = mach_read_from_4(field);
+
+ if (table && table->n_def != pos) {
+ return("SYS_COLUMNS.POS mismatch");
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ name = mem_heap_strdupl(heap, (const char*) field, len);
+
+ if (col_name) {
+ *col_name = name;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ mtype = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ prtype = mach_read_from_4(field);
+
+ if (dtype_get_charset_coll(prtype) == 0
+ && dtype_is_string_type(mtype)) {
+ /* The table was created with < 4.1.2. */
+
+ if (dtype_is_binary_string_type(mtype, prtype)) {
+ /* Use the binary collation for
+ string columns of binary type. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ DATA_MYSQL_BINARY_CHARSET_COLL);
+ } else {
+ /* Use the default charset for
+ other than binary columns. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ data_mysql_default_charset_coll);
+ }
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__LEN, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ col_len = mach_read_from_4(field);
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_COLUMNS__PREC, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ if (!column) {
+ dict_mem_table_add_col(table, heap, name, mtype,
+ prtype, col_len);
+ } else {
+ dict_mem_fill_column_struct(column, pos, mtype,
+ prtype, col_len);
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for table columns. */
+static
+void
+dict_load_columns(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in/out: memory heap
+ for temporary storage */
+{
+ dict_table_t* sys_columns;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_columns = dict_table_get_low("SYS_COLUMNS");
+ sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
+ ut_ad(!dict_table_is_comp(sys_columns));
+
+ ut_ad(name_of_col_is(sys_columns, sys_index,
+ DICT_FLD__SYS_COLUMNS__NAME, "NAME"));
+ ut_ad(name_of_col_is(sys_columns, sys_index,
+ DICT_FLD__SYS_COLUMNS__PREC, "PREC"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
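+	/* The cursor below is positioned with the table id in the
+	big-endian byte order written by mach_write_to_8() above; e.g.
+	table->id == 11 becomes 00 00 00 00 00 00 00 0B, which is the
+	format of the on-disk SYS_COLUMNS.TABLE_ID key. */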
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+ const char* err_msg;
+ const char* name = NULL;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ err_msg = dict_load_column_low(table, heap, NULL, NULL,
+ &name, rec);
+
+ if (err_msg) {
+ fprintf(stderr, "InnoDB: %s\n", err_msg);
+ ut_error;
+ }
+
+ /* Note: Currently we have one DOC_ID column that is
+ shared by all FTS indexes on a table. */
+ if (innobase_strcasecmp(name,
+ FTS_DOC_ID_COL_NAME) == 0) {
+ dict_col_t* col;
+			/* During normal loading of a table, the
+			FTS flag in table->flags2 is not set until
+			after the FTS indexes are loaded. So we
+			create the fts_t instance here if there isn't
+			one already created.
+
+			This case does not arise for table creation,
+			as the flag is set before the table is created. */
+ if (table->fts == NULL) {
+ table->fts = fts_create(table);
+ fts_optimize_add_table(table);
+ }
+
+ ut_a(table->fts->doc_col == ULINT_UNDEFINED);
+
+ col = dict_table_get_nth_col(table, i);
+
+ ut_ad(col->len == sizeof(doc_id_t));
+
+ if (col->prtype & DATA_FTS_DOC_ID) {
+ DICT_TF2_FLAG_SET(
+ table, DICT_TF2_FTS_HAS_DOC_ID);
+ DICT_TF2_FLAG_UNSET(
+ table, DICT_TF2_FTS_ADD_DOC_ID);
+ }
+
+ table->fts->doc_col = i;
+ }
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/** Error message for a delete-marked record in dict_load_field_low() */
+static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS";
+
+/********************************************************************//**
+Loads an index field definition from a SYS_FIELDS record to
+dict_index_t.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_field_low(
+/*================*/
+ byte* index_id, /*!< in/out: index id (8 bytes)
+ an "in" value if index != NULL
+ and "out" if index == NULL */
+ dict_index_t* index, /*!< in/out: index, could be NULL
+ if we just populate a dict_field_t
+ struct with information from
+ a SYS_FIELDS record */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ byte* last_index_id, /*!< in: last index id */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ const rec_t* rec) /*!< in: SYS_FIELDS record */
+{
+ const byte* field;
+ ulint len;
+ ulint pos_and_prefix_len;
+ ulint prefix_len;
+ ibool first_field;
+ ulint position;
+
+ /* Either index or sys_field is supplied, not both */
+ ut_a((!index) || (!sys_field));
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_field_del);
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) {
+ return("wrong number of columns in SYS_FIELDS record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_FIELDS");
+ }
+
+ if (!index) {
+ ut_a(last_index_id);
+ memcpy(index_id, (const char*) field, 8);
+ first_field = memcmp(index_id, last_index_id, 8);
+ } else {
+ first_field = (index->n_def == 0);
+ if (memcmp(field, index_id, 8)) {
+ return("SYS_FIELDS.INDEX_ID mismatch");
+ }
+ }
+
+ /* The next field stores the field position in the index and a
+ possible column prefix length if the index field does not
+ contain the whole column. The storage format is like this: if
+ there is at least one prefix field in the index, then the HIGH
+ 2 bytes contain the field number (index->n_def) and the low 2
+ bytes the prefix length for the field. Otherwise the field
+ number (index->n_def) is contained in the 2 LOW bytes. */
+
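+	/* For example (an illustrative sketch): in an index with at
+	least one prefix field, a field at position 3 with a 767-byte
+	prefix would be stored as (3 << 16) | 767 == 0x000302FF; the
+	branches below recover position == 3 and prefix_len == 767. */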
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__POS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ pos_and_prefix_len = mach_read_from_4(field);
+
+ if (index && UNIV_UNLIKELY
+ ((pos_and_prefix_len & 0xFFFFUL) != index->n_def
+ && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) {
+ return("SYS_FIELDS.POS mismatch");
+ }
+
+ if (first_field || pos_and_prefix_len > 0xFFFFUL) {
+ prefix_len = pos_and_prefix_len & 0xFFFFUL;
+ position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16;
+ } else {
+ prefix_len = 0;
+ position = pos_and_prefix_len & 0xFFFFUL;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ if (index) {
+ dict_mem_index_add_field(
+ index, mem_heap_strdupl(heap, (const char*) field, len),
+ prefix_len);
+ } else {
+ ut_a(sys_field);
+ ut_a(pos);
+
+ sys_field->name = mem_heap_strdupl(
+ heap, (const char*) field, len);
+ sys_field->prefix_len = prefix_len;
+ *pos = position;
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for index fields.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption */
+static
+ulint
+dict_load_fields(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index whose fields to load */
+ mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+{
+ dict_table_t* sys_fields;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+ dberr_t error;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_fields = dict_table_get_low("SYS_FIELDS");
+ sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
+ ut_ad(!dict_table_is_comp(sys_fields));
+ ut_ad(name_of_col_is(sys_fields, sys_index,
+ DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, index->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < index->n_fields; i++) {
+ const char* err_msg;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL,
+ heap, rec);
+
+ if (err_msg == dict_load_field_del) {
+ /* There could be delete marked records in
+ SYS_FIELDS because SYS_FIELDS.INDEX_ID can be
+ updated by ALTER TABLE ADD INDEX. */
+
+ goto next_rec;
+ } else if (err_msg) {
+ fprintf(stderr, "InnoDB: %s\n", err_msg);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ error = DB_SUCCESS;
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ return(error);
+}
+
+/** Error message for a delete-marked record in dict_load_index_low() */
+static const char* dict_load_index_del = "delete-marked record in SYS_INDEXES";
+/** Error message for table->id mismatch in dict_load_index_low() */
+static const char* dict_load_index_id_err = "SYS_INDEXES.TABLE_ID mismatch";
+
+/********************************************************************//**
+Loads an index definition from a SYS_INDEXES record to dict_index_t.
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_index_low(
+/*================*/
+ byte* table_id, /*!< in/out: table id (8 bytes),
+ an "in" value if allocate=TRUE
+ and "out" when allocate=FALSE */
+ const char* table_name, /*!< in: table name */
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_INDEXES record */
+ ibool allocate, /*!< in: TRUE=allocate *index,
+ FALSE=fill in a pre-allocated
+ *index */
+ dict_index_t** index) /*!< out,own: index, or NULL */
+{
+ const byte* field;
+ ulint len;
+ ulint name_len;
+ char* name_buf;
+ index_id_t id;
+ ulint n_fields;
+ ulint type;
+ ulint space;
+
+ if (allocate) {
+ /* If allocate=TRUE, no dict_index_t will
+ be supplied. Initialize "*index" to NULL */
+ *index = NULL;
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_index_del);
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_INDEXES) {
+ return("wrong number of columns in SYS_INDEXES record");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
+ if (len != 8) {
+err_len:
+ return("incorrect column length in SYS_INDEXES");
+ }
+
+ if (!allocate) {
+ /* We are reading a SYS_INDEXES record. Copy the table_id */
+ memcpy(table_id, (const char*) field, 8);
+ } else if (memcmp(field, table_id, 8)) {
+ /* Caller supplied table_id, verify it is the same
+ id as on the index record */
+ return(dict_load_index_id_err);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ id = mach_read_from_8(field);
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__NAME, &name_len);
+ if (name_len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ name_buf = mem_heap_strdupl(heap, (const char*) field,
+ name_len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ n_fields = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ type = mach_read_from_4(field);
+ if (type & (~0 << DICT_IT_BITS)) {
+ return("unknown SYS_INDEXES.TYPE bits");
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+ space = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
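+	/* Note that "field" is left pointing at PAGE_NO here; it is
+	read into (*index)->page only after the index object exists
+	below. */
+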
+ if (allocate) {
+ *index = dict_mem_index_create(table_name, name_buf,
+ space, type, n_fields);
+ } else {
+ ut_a(*index);
+
+ dict_mem_fill_index_struct(*index, NULL, NULL, name_buf,
+ space, type, n_fields);
+ }
+
+ (*index)->id = id;
+ (*index)->page = mach_read_from_4(field);
+ ut_ad((*index)->page);
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Loads definitions for table indexes. Adds them to the data dictionary
+cache.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
+table or DB_UNSUPPORTED if table has unknown index type */
+static __attribute__((nonnull))
+dberr_t
+dict_load_indexes(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap, /*!< in: memory heap for temporary storage */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored when
+ loading the index definition */
+{
+ dict_table_t* sys_indexes;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ byte* buf;
+ mtr_t mtr;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_indexes = dict_table_get_low("SYS_INDEXES");
+ sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
+ ut_ad(!dict_table_is_comp(sys_indexes));
+ ut_ad(name_of_col_is(sys_indexes, sys_index,
+ DICT_FLD__SYS_INDEXES__NAME, "NAME"));
+ ut_ad(name_of_col_is(sys_indexes, sys_index,
+ DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (;;) {
+ dict_index_t* index = NULL;
+ const char* err_msg;
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+
+			/* We should allow the table to be opened even
+			without indexes when DICT_ERR_IGNORE_CORRUPT is
+			set. DICT_ERR_IGNORE_CORRUPT is currently only
+			set for DROP TABLE. */
+ if (dict_table_get_first_index(table) == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Cannot load table %s "
+ "because it has no indexes in "
+ "InnoDB internal data dictionary.",
+ table->name);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+ && rec_get_n_fields_old(rec)
+ == DICT_NUM_FIELDS__SYS_INDEXES) {
+ const byte* field;
+ ulint len;
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__NAME, &len);
+
+ if (len != UNIV_SQL_NULL
+ && char(*field) == char(TEMP_INDEX_PREFIX)) {
+ /* Skip indexes whose name starts with
+ TEMP_INDEX_PREFIX, because they will
+ be dropped during crash recovery. */
+ goto next_rec;
+ }
+ }
+
+ err_msg = dict_load_index_low(buf, table->name, heap, rec,
+ TRUE, &index);
+ ut_ad((index == NULL && err_msg != NULL)
+ || (index != NULL && err_msg == NULL));
+
+ if (err_msg == dict_load_index_id_err) {
+ /* TABLE_ID mismatch means that we have
+ run out of index definitions for the table. */
+
+ if (dict_table_get_first_index(table) == NULL
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to load the "
+ "clustered index for table %s "
+ "because of the following error: %s. "
+ "Refusing to load the rest of the "
+ "indexes (if any) and the whole table "
+ "altogether.", table->name, err_msg);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ break;
+ } else if (err_msg == dict_load_index_del) {
+ /* Skip delete-marked records. */
+ goto next_rec;
+ } else if (err_msg) {
+ fprintf(stderr, "InnoDB: %s\n", err_msg);
+ if (ignore_err & DICT_ERR_IGNORE_CORRUPT) {
+ goto next_rec;
+ }
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ ut_ad(index);
+
+ /* Check whether the index is corrupted */
+ if (dict_index_is_corrupted(index)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs(" is corrupted\n", stderr);
+
+ if (!srv_load_corrupted
+ && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)
+ && dict_index_is_clust(index)) {
+ dict_mem_index_free(index);
+
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ } else {
+			/* We will load the index if
+			1) srv_load_corrupted is TRUE,
+			2) ignore_err is set with
+			DICT_ERR_IGNORE_CORRUPT, or
+			3) the corrupted index is a secondary
+			index. */
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: load corrupted index ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ putc('\n', stderr);
+ }
+ }
+
+ if (index->type & DICT_FTS
+ && !DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+ /* This should have been created by now. */
+ ut_a(table->fts != NULL);
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS);
+ }
+
+ /* We check for unsupported types first, so that the
+ subsequent checks are relevant for the supported types. */
+ if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE
+ | DICT_CORRUPT | DICT_FTS)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown type %lu of index %s of table %s",
+ (ulong) index->type, index->name, table->name);
+
+ error = DB_UNSUPPORTED;
+ dict_mem_index_free(index);
+ goto func_exit;
+ } else if (index->page == FIL_NULL
+ && !table->ibd_file_missing
+ && (!(index->type & DICT_FTS))) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to load index %s"
+ " for table %s\n"
+ "InnoDB: but the index tree has been freed!\n",
+ index->name, table->name);
+
+ if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) {
+			/* If the caller can tolerate this error,
+			we will continue to load the index and
+			let the caller deal with the error. However,
+			mark the index and the table as corrupted.
+			For this kind of metadata corruption it is
+			enough to set the flag in the dictionary
+			cache only, since it can always be set
+			again when the cache is reloaded. */
+ dict_set_corrupted_index_cache_only(
+ index, table);
+
+ fprintf(stderr,
+ "InnoDB: Index is corrupt but forcing"
+ " load into data dictionary\n");
+ } else {
+corrupted:
+ dict_mem_index_free(index);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+ } else if (!dict_index_is_clust(index)
+ && NULL == dict_table_get_first_index(table)) {
+
+ fputs("InnoDB: Error: trying to load index ",
+ stderr);
+ ut_print_name(stderr, NULL, FALSE, index->name);
+ fputs(" for table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, table->name);
+ fputs("\nInnoDB: but the first index"
+ " is not clustered!\n", stderr);
+
+ goto corrupted;
+ } else if (dict_is_sys_table(table->id)
+ && (dict_index_is_clust(index)
+ || ((table == dict_sys->sys_tables)
+ && !strcmp("ID_IND", index->name)))) {
+
+ /* The index was created in memory already at booting
+ of the database server */
+ dict_mem_index_free(index);
+ } else {
+ dict_load_fields(index, heap);
+
+ error = dict_index_add_to_cache(
+ table, index, index->page, FALSE);
+
+ /* The data dictionary tables should never contain
+ invalid index definitions. If we ignored this error
+ and simply did not load this index definition, the
+ .frm file would disagree with the index definitions
+ inside InnoDB. */
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+ goto func_exit;
+ }
+ }
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ /* If the table contains FTS indexes, populate table->fts->indexes */
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS)) {
+ /* table->fts->indexes should have been created. */
+ ut_a(table->fts->indexes != NULL);
+ dict_table_get_all_fts_indexes(table, table->fts->indexes);
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(error);
+}
+
+/********************************************************************//**
+Loads a table definition from a SYS_TABLES record to dict_table_t.
+Does not load any columns or indexes.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_table_low(
+/*================*/
+ const char* name, /*!< in: table name */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table) /*!< out,own: table, or NULL */
+{
+ const byte* field;
+ ulint len;
+ ulint space;
+ ulint n_cols;
+ ulint flags = 0;
+ ulint flags2;
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return("delete-marked record in SYS_TABLES");
+ }
+
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) {
+ return("wrong number of columns in SYS_TABLES record");
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ if (len == 0 || len == UNIV_SQL_NULL) {
+err_len:
+ return("incorrect column length in SYS_TABLES");
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+ if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len);
+ if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ n_cols = mach_read_from_4(field);
+
+ rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_ID, &len);
+ if (len != 8) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+	/* MIX_LEN may hold additional flags in post-Antelope file formats. */
+ flags2 = mach_read_from_4(field);
+
+	/* DICT_TF2_FTS will be set when the indexes are being loaded */
+ flags2 &= ~DICT_TF2_FTS;
+
+ rec_get_nth_field_offs_old(
+ rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len);
+ if (len != UNIV_SQL_NULL) {
+ goto err_len;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__SPACE, &len);
+ if (len != 4) {
+ goto err_len;
+ }
+
+ space = mach_read_from_4(field);
+
+ /* Check if the tablespace exists and has the right name */
+ flags = dict_sys_tables_get_flags(rec);
+
+ if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__TYPE, &len);
+ ut_ad(len == 4); /* this was checked earlier */
+ flags = mach_read_from_4(field);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown type %lx.\n",
+ (ulong) flags);
+ return("incorrect flags in SYS_TABLES");
+ }
+
+ /* The high-order bit of N_COLS is the "compact format" flag.
+ For tables in that format, MIX_LEN may hold additional flags. */
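+	/* For example (a sketch): a COMPACT table with 4 user columns
+	stores N_COLS as DICT_N_COLS_COMPACT | 4 == 0x80000004; the
+	creation call below masks the flag off to recover the 4. */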
+ if (n_cols & DICT_N_COLS_COMPACT) {
+ ut_ad(flags & DICT_TF_COMPACT);
+
+ if (flags2 & ~DICT_TF2_BIT_MASK) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown flags %lx.\n",
+ (ulong) flags2);
+
+ /* Clean it up and keep going */
+ flags2 &= DICT_TF2_BIT_MASK;
+ }
+ } else {
+ /* Do not trust the MIX_LEN field when the
+ row format is Redundant. */
+ flags2 = 0;
+ }
+
+	/* Create the in-memory table object. */
+ *table = dict_mem_table_create(
+ name, space, n_cols & ~DICT_N_COLS_COMPACT, flags, flags2);
+
+ field = rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__ID, &len);
+ ut_ad(len == 8); /* this was checked earlier */
+
+ (*table)->id = mach_read_from_8(field);
+
+ (*table)->ibd_file_missing = FALSE;
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Using the table->heap, copy the null-terminated filepath into
+table->data_dir_path and replace the 'databasename/tablename.ibd'
+portion with 'tablename'.
+This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path.
+Set the data directory path only if it has not yet been saved. */
+UNIV_INTERN
+void
+dict_save_data_dir_path(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ char* filepath) /*!< in: filepath of tablespace */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(DICT_TF_HAS_DATA_DIR(table->flags));
+
+ ut_a(!table->data_dir_path);
+ ut_a(filepath);
+
+ /* Be sure this filepath is not the default filepath. */
+ char* default_filepath = fil_make_ibd_name(table->name, false);
+ if (strcmp(filepath, default_filepath)) {
+ ulint pathlen = strlen(filepath);
+ ut_a(pathlen < OS_FILE_MAX_PATH);
+ ut_a(0 == strcmp(filepath + pathlen - 4, ".ibd"));
+
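+		/* E.g. (a sketch of the transformation described in
+		the function comment): a filepath of "/ssd1/db1/t1.ibd"
+		is saved with its "db1/t1.ibd" tail replaced by "t1",
+		giving "/ssd1/t1". */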
+ table->data_dir_path = mem_heap_strdup(table->heap, filepath);
+ os_file_make_data_dir_path(table->data_dir_path);
+ } else {
+ /* This does not change SYS_DATAFILES or SYS_TABLES
+ or FSP_FLAGS on the header page of the tablespace,
+ but it makes dict_table_t consistent */
+ table->flags &= ~DICT_TF_MASK_DATA_DIR;
+ }
+ mem_free(default_filepath);
+}
+
+/*****************************************************************//**
+Make sure the data_file_name is saved in dict_table_t if needed. Try to
+read it from the fil_system cache first, then from SYS_DATAFILES. */
+UNIV_INTERN
+void
+dict_get_and_save_data_dir_path(
+/*============================*/
+ dict_table_t* table, /*!< in/out: table */
+ bool dict_mutex_own) /*!< in: true if dict_sys->mutex
+ is owned already */
+{
+ if (DICT_TF_HAS_DATA_DIR(table->flags)
+ && (!table->data_dir_path)) {
+ char* path = fil_space_get_first_path(table->space);
+
+ if (!dict_mutex_own) {
+ dict_mutex_enter_for_mysql();
+ }
+ if (!path) {
+ path = dict_get_first_path(
+ table->space, table->name);
+ }
+
+ if (path) {
+ dict_save_data_dir_path(table, path);
+ mem_free(path);
+ }
+
+ if (!dict_mutex_own) {
+ dict_mutex_exit_for_mysql();
+ }
+ }
+}
+
+/********************************************************************//**
+Loads a table definition and all of its index definitions, as well as
+the cluster definition if the table is a member of a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table. Adds all these to the data
+dictionary cache.
+@return table, or NULL if it does not exist; if the table is stored in an
+.ibd file but the file does not exist, then we set the ibd_file_missing
+flag to TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+ const char* name, /*!< in: table name in the
+ databasename/tablename format */
+ ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored when loading
+ table and its indexes' definition */
+{
+ dberr_t err;
+ dict_table_t* table;
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ char* filepath = NULL;
+ const char* err_msg;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(32000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_ad(!dict_table_is_comp(sys_tables));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__ID, "ID"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__N_COLS, "N_COLS"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__TYPE, "TYPE"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN"));
+ ut_ad(name_of_col_is(sys_tables, sys_index,
+ DICT_FLD__SYS_TABLES__SPACE, "SPACE"));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name, ut_strlen(name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+err_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+
+ /* Check if the table name in record is the searched one */
+ if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) {
+
+ goto err_exit;
+ }
+
+ err_msg = dict_load_table_low(name, rec, &table);
+
+ if (err_msg) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n", err_msg);
+ goto err_exit;
+ }
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(table_name, sizeof(table_name), name, FALSE);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (table->space == 0) {
+ /* The system tablespace is always available. */
+ } else if (table->flags2 & DICT_TF2_DISCARDED) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Table '%s' tablespace is set as discarded.",
+ table_name);
+
+ table->ibd_file_missing = TRUE;
+
+ } else if (!fil_space_for_table_exists_in_mem(
+ table->space, name, FALSE, FALSE, true, heap,
+ table->id)) {
+
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY)) {
+ /* Do not bother to retry opening temporary tables. */
+ table->ibd_file_missing = TRUE;
+
+ } else {
+ if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Failed to find tablespace for "
+ "table '%s' in the cache. "
+ "Attempting to load the tablespace "
+ "with space id %lu.",
+ table_name, (ulong) table->space);
+ }
+
+ /* Use the remote filepath if needed. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ /* This needs to be added to the table
+ from SYS_DATAFILES */
+ dict_get_and_save_data_dir_path(table, true);
+
+ if (table->data_dir_path) {
+ filepath = os_file_make_remote_pathname(
+ table->data_dir_path,
+ table->name, "ibd");
+ }
+ }
+
+ /* Try to open the tablespace. We set the
+ 2nd param (fix_dict = false) here because we
+ do not have an x-lock on dict_operation_lock */
+ err = fil_open_single_table_tablespace(
+ true, false, table->space,
+ dict_tf_to_fsp_flags(table->flags),
+ name, filepath);
+
+ if (err != DB_SUCCESS) {
+ /* We failed to find a sensible
+ tablespace file */
+
+ table->ibd_file_missing = TRUE;
+ }
+ if (filepath) {
+ mem_free(filepath);
+ }
+ }
+ }
+
+ dict_load_columns(table, heap);
+
+ if (cached) {
+ dict_table_add_to_cache(table, TRUE, heap);
+ } else {
+ dict_table_add_system_columns(table, heap);
+ }
+
+ mem_heap_empty(heap);
+
+	/* If there is no tablespace for the table then we only need to
+	load the index definitions, so that the tablespace can be
+	IMPORTed later. When recovering table locks for resurrected
+	incomplete transactions, the tablespace should exist, because
+	DDL operations were not allowed while the table was locked
+	by a transaction. */
+ dict_err_ignore_t index_load_err =
+ !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
+ && table->ibd_file_missing
+ ? DICT_ERR_IGNORE_ALL
+ : ignore_err;
+ err = dict_load_indexes(table, heap, index_load_err);
+
+ if (err == DB_INDEX_CORRUPT) {
+		/* Refuse to load the table if its clustered
+		index is corrupted */
+ if (!srv_load_corrupted) {
+ fprintf(stderr, "InnoDB: Error: Load table ");
+ ut_print_name(stderr, NULL, TRUE, table->name);
+			fprintf(stderr, " failed, the table has a corrupted"
+				" clustered index. Turn on"
+ " 'innodb_force_load_corrupted'"
+ " to drop it\n");
+
+ dict_table_remove_from_cache(table);
+ table = NULL;
+ goto func_exit;
+ } else {
+ dict_index_t* clust_index;
+ clust_index = dict_table_get_first_index(table);
+
+ if (dict_index_is_corrupted(clust_index)) {
+ table->corrupted = TRUE;
+ }
+ }
+ }
+
+	/* Initialize the table's fk_max_recusive_level. Its value could
+	be changed when dict_load_foreigns() is called below. */
+ table->fk_max_recusive_level = 0;
+
+	/* If the force recovery flag is set, we open the table irrespective
+	of the error condition, since the user may want to dump data from the
+	clustered index. However, we load the foreign key information only if
+	all indexes were loaded. */
+	if (!cached || table->ibd_file_missing) {
+		/* Don't attempt to load the foreign key constraints. */
+ } else if (err == DB_SUCCESS) {
+ err = dict_load_foreigns(table->name, NULL, true, true,
+ ignore_err);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Load table '%s' failed, the table has missing "
+ "foreign key indexes. Turn off "
+ "'foreign_key_checks' and try again.",
+ table->name);
+
+ dict_table_remove_from_cache(table);
+ table = NULL;
+ } else {
+ table->fk_max_recusive_level = 0;
+ }
+ } else {
+ dict_index_t* index;
+
+ /* Make sure that at least the clustered index was loaded.
+ Otherwise refuse to load the table */
+ index = dict_table_get_first_index(table);
+
+ if (!srv_force_recovery
+ || !index
+ || !dict_index_is_clust(index)) {
+
+ dict_table_remove_from_cache(table);
+ table = NULL;
+
+ } else if (dict_index_is_corrupted(index)
+ && !table->ibd_file_missing) {
+
+			/* It is possible that we force the loading of a
+			corrupted clustered index if srv_load_corrupted
+			is set. Mark the table as corrupted in this case. */
+ table->corrupted = TRUE;
+ }
+ }
+
+func_exit:
+ mem_heap_free(heap);
+
+ ut_ad(!table
+ || ignore_err != DICT_ERR_IGNORE_NONE
+ || table->ibd_file_missing
+ || !table->corrupted);
+
+ if (table && table->fts) {
+ if (!(dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) {
+			/* table->fts could have been created in
+			dict_load_columns() when a user-defined FTS_DOC_ID
+			column is present but there is no FTS index */
+ fts_free(table);
+ } else {
+ fts_optimize_add_table(table);
+ }
+ }
+
+ ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table));
+
+ return(table);
+}
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ when loading the table */
+{
+ byte id_buf[8];
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_table_ids;
+ dict_table_t* sys_tables;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dict_table_t* table;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = NULL;
+
+ /* NOTE that the operation of this function is protected by
+ the dictionary mutex, and therefore no deadlocks can occur
+ with other dictionary operations. */
+
+ mtr_start(&mtr);
+ /*---------------------------------------------------*/
+ /* Get the secondary index based on ID for table SYS_TABLES */
+ sys_tables = dict_sys->sys_tables;
+ sys_table_ids = dict_table_get_next_index(
+ dict_table_get_first_index(sys_tables));
+ ut_ad(!dict_table_is_comp(sys_tables));
+ ut_ad(!dict_index_is_clust(sys_table_ids));
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ /* Write the table id in byte format to id_buf */
+ mach_write_to_8(id_buf, table_id);
+
+ dfield_set_data(dfield, id_buf, 8);
+ dict_index_copy_types(tuple, sys_table_ids, 1);
+
+ btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+check_rec:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ /*---------------------------------------------------*/
+ /* Now we have the record in the secondary index
+ containing the table ID and NAME */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLE_IDS__ID, &len);
+ ut_ad(len == 8);
+
+ /* Check if the table id in record is the one searched for */
+ if (table_id == mach_read_from_8(field)) {
+ if (rec_get_deleted_flag(rec, 0)) {
+ /* Until purge has completed, there
+ may be delete-marked duplicate records
+ for the same SYS_TABLES.ID.
+ Due to Bug #60049, some delete-marked
+ records may survive the purge forever. */
+ if (btr_pcur_move_to_next(&pcur, &mtr)) {
+
+ goto check_rec;
+ }
+ } else {
+ /* Now we get the table name from the record */
+ field = rec_get_nth_field_old(rec,
+ DICT_FLD__SYS_TABLE_IDS__NAME, &len);
+ /* Load the table definition to memory */
+ table = dict_load_table(
+ mem_heap_strdupl(
+ heap, (char*) field, len),
+ TRUE, ignore_err);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table);
+}
+
+/********************************************************************//**
+This function is called when the database is booted. Loads system table
+index definitions except for the clustered index which is added to the
+dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table) /*!< in: system table */
+{
+ mem_heap_t* heap;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(1000);
+
+ dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE);
+
+ mem_heap_free(heap);
+}
+
+/********************************************************************//**
+Loads foreign key constraint column names (also for the referenced table).
+Members that must be set (and valid) in foreign:
+foreign->heap
+foreign->n_fields
+foreign->id ('\0'-terminated)
+Members that will be created and set by this function:
+foreign->foreign_col_names[i]
+foreign->referenced_col_names[i]
+(for i=0..foreign->n_fields-1) */
+static
+void
+dict_load_foreign_cols(
+/*===================*/
+ dict_foreign_t* foreign)/*!< in/out: foreign constraint object */
+{
+ dict_table_t* sys_foreign_cols;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint i;
+ mtr_t mtr;
+ size_t id_len;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ id_len = strlen(foreign->id);
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ foreign->n_fields * sizeof(void*)));
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ foreign->n_fields * sizeof(void*)));
+
+ mtr_start(&mtr);
+
+ sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+
+ sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
+ ut_ad(!dict_table_is_comp(sys_foreign_cols));
+
+ tuple = dtuple_create(foreign->heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, foreign->id, id_len);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
+
+ if (len != id_len || ut_memcmp(foreign->id, field, len) != 0) {
+ const rec_t* pos;
+ ulint pos_len;
+ const rec_t* for_col_name;
+ ulint for_col_name_len;
+ const rec_t* ref_col_name;
+ ulint ref_col_name_len;
+
+ pos = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS,
+ &pos_len);
+
+ for_col_name = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME,
+ &for_col_name_len);
+
+ ref_col_name = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME,
+ &ref_col_name_len);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+				"Unable to load column names for foreign "
+ "key '%s' because it was not found in "
+ "InnoDB internal table SYS_FOREIGN_COLS. The "
+ "closest entry we found is: "
+ "(ID='%.*s', POS=%lu, FOR_COL_NAME='%.*s', "
+ "REF_COL_NAME='%.*s')",
+ foreign->id,
+ (int) len, field,
+ mach_read_from_4(pos),
+ (int) for_col_name_len, for_col_name,
+ (int) ref_col_name_len, ref_col_name);
+
+ ut_error;
+ }
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len);
+ ut_a(len == 4);
+ ut_a(i == mach_read_from_4(field));
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len);
+ foreign->foreign_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len);
+ foreign->referenced_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Loads a foreign key constraint to the dictionary cache.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull(1), warn_unused_result))
+dberr_t
+dict_load_foreign(
+/*==============*/
+ const char* id,
+ /*!< in: foreign constraint id, must be
+ '\0'-terminated */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use foreign->foreign_table->col_names */
+ bool check_recursive,
+ /*!< in: whether to record the foreign table
+ parent count to avoid unlimited recursive
+ load of chained foreign tables */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* sys_foreign;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap2;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint n_fields_and_type;
+ mtr_t mtr;
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+ size_t id_len;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ id_len = strlen(id);
+
+ heap2 = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+ sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
+ ut_ad(!dict_table_is_comp(sys_foreign));
+
+ tuple = dtuple_create(heap2, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, id, id_len);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+
+ fprintf(stderr,
+ "InnoDB: Error: cannot load foreign constraint "
+ "%s: could not find the relevant record in "
+ "SYS_FOREIGN\n", id);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ return(DB_ERROR);
+ }
+
+ field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
+
+ /* Check if the id in record is the searched one */
+ if (len != id_len || ut_memcmp(id, field, len) != 0) {
+
+ fprintf(stderr,
+ "InnoDB: Error: cannot load foreign constraint "
+ "%s: found %.*s instead in SYS_FOREIGN\n",
+ id, (int) len, field);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ return(DB_ERROR);
+ }
+
+ /* Read the table names and the number of columns associated
+ with the constraint */
+
+ mem_heap_free(heap2);
+
+ foreign = dict_mem_foreign_create();
+
+ n_fields_and_type = mach_read_from_4(
+ rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len));
+
+ ut_a(len == 4);
+
+	/* The type is stored in bits 24..29 of n_fields_and_type. */
+
+ foreign->type = (unsigned int) (n_fields_and_type >> 24);
+ foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
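+	/* For example (a sketch): a two-column constraint whose type
+	bits hold the value 1 would be stored as (1 << 24) | 2
+	== 0x01000002, decoding to type == 1 and n_fields == 2. */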
+
+ foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
+
+ foreign->foreign_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+ foreign->referenced_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_load_foreign_cols(foreign);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name_lookup);
+
+	/* We could wind up in deeply recursive calls if
+	we called dict_table_get_low() again here when there
+	is a chain of tables linked together with
+	foreign constraints. In such a chain, each table is
+	both a parent and a child of other tables, and
+	acts as a "link" in the chain.
+	To avoid that scenario, we check the number of
+	ancestors the current table has. If that
+	exceeds DICT_FK_MAX_RECURSIVE_LOAD, we stop loading
+	the child table.
+	Foreign constraints are loaded in a breadth-first fashion,
+	that is, the index on FOR_NAME is scanned first, and then the
+	index on REF_NAME. So foreign constraints in which the
+	current table is a child (foreign table) are loaded first,
+	and then those constraints where the current table is a
+	parent (referenced) table.
+	Thus we can check the parent (ref_table) table's
+	recursion depth (fk_max_recusive_level) to know how deep the
+	recursive call is. If the parent table (ref_table) is already
+	loaded, and its fk_max_recusive_level has reached
+	DICT_FK_MAX_RECURSIVE_LOAD, we stop the recursive loading
+	by skipping the loading of the child table. This does not affect
+	foreign constraint checks for DML, since the child table will be
+	loaded at that time for the constraint check. */
+ if (!ref_table
+ || ref_table->fk_max_recusive_level < DICT_FK_MAX_RECURSIVE_LOAD) {
+
+ /* If the foreign table is not yet in the dictionary cache, we
+ have to load it so that we are able to make type comparisons
+ in the next function call. */
+
+ for_table = dict_table_get_low(foreign->foreign_table_name_lookup);
+
+ if (for_table && ref_table && check_recursive) {
+			/* Record the longest chain of ancestors this
+			table has: if the parent has more ancestors
+			than this table, record the parent's count
+			plus 1 (for this parent). */
+ if (ref_table->fk_max_recusive_level
+ >= for_table->fk_max_recusive_level) {
+ for_table->fk_max_recusive_level =
+ ref_table->fk_max_recusive_level + 1;
+ }
+ }
+ }
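+
+	/* A sketch of the recursion guard above: when t1 is loaded and a
+	chain of child tables t2, t3, ... leads to it through foreign
+	keys, each child is loaded with a fk_max_recusive_level one above
+	its parent's, until DICT_FK_MAX_RECURSIVE_LOAD stops the descent. */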
+
+ /* Note that there may already be a foreign constraint object in
+ the dictionary cache for this constraint: then the following
+ call only sets the pointers in it to point to the appropriate table
+ and index objects and frees the newly created object foreign.
+ Adding to the cache should always succeed since we are not creating
+ a new foreign key constraint but loading one from the data
+ dictionary. */
+
+ return(dict_foreign_add_to_cache(foreign, col_names, check_charsets,
+ ignore_err));
+}
+
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ bool check_recursive,/*!< in: Whether to check
+ recursive load of tables
+ chained by FK */
+ bool check_charsets, /*!< in: whether to check
+ charset compatibility */
+ dict_err_ignore_t ignore_err) /*!< in: error to be ignored */
+{
+ ulint tuple_buf[(DTUPLE_EST_ALLOC(1) + sizeof(ulint) - 1)
+ / sizeof(ulint)];
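+	/* tuple_buf provides stack space for a one-field dtuple_t
+	(DTUPLE_EST_ALLOC(1) bytes, rounded up to whole ulints), so that
+	dtuple_create_from_mem() below needs no heap allocation. */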
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sec_index;
+ dict_table_t* sys_foreign;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dberr_t err;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+ if (sys_foreign == NULL) {
+ /* No foreign keys defined yet in this database */
+
+ fprintf(stderr,
+ "InnoDB: Error: no foreign key system tables"
+ " in the database\n");
+
+ return(DB_ERROR);
+ }
+
+ ut_ad(!dict_table_is_comp(sys_foreign));
+ mtr_start(&mtr);
+
+ /* Get the secondary index based on FOR_NAME from table
+ SYS_FOREIGN */
+
+ sec_index = dict_table_get_next_index(
+ dict_table_get_first_index(sys_foreign));
+ ut_ad(!dict_index_is_clust(sec_index));
+start_load:
+
+ tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, table_name, ut_strlen(table_name));
+ dict_index_copy_types(tuple, sec_index, 1);
+
+ btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* End of index */
+
+ goto load_next_index;
+ }
+
+ /* Now we have the record in the secondary index containing a table
+ name and a foreign constraint ID */
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len);
+
+ /* Check if the table name in the record is the one searched for; the
+ following call does the comparison in the latin1_swedish_ci
+ charset-collation, in a case-insensitive way. */
+
+ if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
+ dfield_get_type(dfield)->prtype,
+ static_cast<const byte*>(
+ dfield_get_data(dfield)),
+ dfield_get_len(dfield),
+ field, len)) {
+
+ goto load_next_index;
+ }
+
+	/* Since table names in SYS_FOREIGN are stored in a case-insensitive
+	order, we have to check that the table name also matches in a binary
+	string comparison. On Unix, MySQL allows table names that differ only
+	in character case. If lower_case_table_names=2, then what is stored
+	may not have the same case, but the previous comparison showed that
+	they match case-insensitively. */
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ goto next_rec;
+ }
+
+ if ((innobase_get_lower_case_table_names() != 2)
+ && (0 != ut_memcmp(field, table_name, len))) {
+ goto next_rec;
+ }
+
+ /* Now we get a foreign key constraint id */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len);
+
+ /* Copy the string because the page may be modified or evicted
+ after mtr_commit() below. */
+ char fk_id[MAX_TABLE_NAME_LEN + 1];
+
+ ut_a(len <= MAX_TABLE_NAME_LEN);
+ memcpy(fk_id, field, len);
+ fk_id[len] = '\0';
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Load the foreign constraint definition to the dictionary cache */
+
+ err = dict_load_foreign(fk_id, col_names,
+ check_recursive, check_charsets, ignore_err);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_close(&pcur);
+
+ return(err);
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+
+load_next_index:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ sec_index = dict_table_get_next_index(sec_index);
+
+ if (sec_index != NULL) {
+
+ mtr_start(&mtr);
+
+		/* Switch to scanning the index on REF_NAME;
+		fk_max_recusive_level has already been updated while
+		scanning the FOR_NAME index, so there is no need to
+		update it again. */
+		check_recursive = false;
+
+ goto start_load;
+ }
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
new file mode 100644
index 00000000000..9ef878bd870
--- /dev/null
+++ b/storage/innobase/dict/dict0mem.cc
@@ -0,0 +1,783 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.cc
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0mem.h"
+
+#ifdef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "fts0priv.h"
+#include "ut0crc32.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h" /* innobase_casedn_str(),
+ innobase_get_lower_case_table_names */
+# include "mysql_com.h" /* NAME_LEN */
+# include "lock0lock.h"
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+#endif /* UNIV_BLOB_DEBUG */
+#include <iostream>
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register autoinc_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t autoinc_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** An integer, randomly initialized at startup, used to make a temporary
+table name as unique as possible. */
+static ib_uint32_t dict_temp_file_num;
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index of
+ the table is placed */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags, /*!< in: table flags */
+ ulint flags2) /*!< in: table flags2 */
+{
+ dict_table_t* table;
+ mem_heap_t* heap;
+
+ ut_ad(name);
+ ut_a(dict_tf_is_valid(flags));
+ ut_a(!(flags2 & ~DICT_TF2_BIT_MASK));
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+
+ table = static_cast<dict_table_t*>(
+ mem_heap_zalloc(heap, sizeof(dict_table_t)));
+
+ table->heap = heap;
+
+ table->flags = (unsigned int) flags;
+ table->flags2 = (unsigned int) flags2;
+ table->name = static_cast<char*>(ut_malloc(strlen(name) + 1));
+ memcpy(table->name, name, strlen(name) + 1);
+ table->space = (unsigned int) space;
+ table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS);
+
+ table->cols = static_cast<dict_col_t*>(
+ mem_heap_alloc(heap,
+ (n_cols + DATA_N_SYS_COLS)
+ * sizeof(dict_col_t)));
+
+ ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
+
+	/* true means that the stats latch will be enabled -
+	dict_table_stats_lock() will not be a no-op. */
+ dict_table_stats_latch_create(table, true);
+
+#ifndef UNIV_HOTBACKUP
+ table->autoinc_lock = static_cast<ib_lock_t*>(
+ mem_heap_alloc(heap, lock_get_size()));
+
+ mutex_create(autoinc_mutex_key,
+ &table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
+
+ table->autoinc = 0;
+
+ /* The number of transactions that are either waiting on the
+ AUTOINC lock or have been granted the lock. */
+ table->n_waiting_or_granted_auto_inc_locks = 0;
+
+ /* If the table has an FTS index or we are in the process
+ of building one, create the table->fts */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ table->fts = fts_create(table);
+ table->fts->cache = fts_cache_create(table);
+ } else {
+ table->fts = NULL;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ new(&table->foreign_set) dict_foreign_set();
+ new(&table->referenced_set) dict_foreign_set();
+
+ return(table);
+}
+
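+/* A minimal usage sketch (illustrative only; the names and column types
+here are assumed, not taken from any actual caller):
+
+	dict_table_t*	t = dict_mem_table_create("test/t", 0, 2, 0, 0);
+	dict_mem_table_add_col(t, t->heap, "a", DATA_INT, 0, 4);
+	dict_mem_table_add_col(t, t->heap, "b", DATA_INT, 0, 4);
+	dict_mem_table_free(t);
+
+The object owns its heap: dict_mem_table_free() releases the heap and
+everything allocated from it. */
+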
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_d(table->cached = FALSE);
+
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ if (table->fts) {
+ if (table->cached) {
+ fts_optimize_remove_table(table);
+ }
+
+ fts_free(table);
+ }
+ }
+#ifndef UNIV_HOTBACKUP
+ mutex_free(&(table->autoinc_mutex));
+#endif /* !UNIV_HOTBACKUP */
+
+ dict_table_stats_latch_destroy(table);
+
+ table->foreign_set.~dict_foreign_set();
+ table->referenced_set.~dict_foreign_set();
+
+ ut_free(table->name);
+ mem_heap_free(table->heap);
+}
+
+/****************************************************************//**
+Append 'name' to 'col_names'. @see dict_table_t::col_names
+@return new column names array */
+static
+const char*
+dict_add_col_name(
+/*==============*/
+ const char* col_names, /*!< in: existing column names, or
+ NULL */
+ ulint cols, /*!< in: number of existing columns */
+ const char* name, /*!< in: new column name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint old_len;
+ ulint new_len;
+ ulint total_len;
+ char* res;
+
+ ut_ad(!cols == !col_names);
+
+ /* Find out length of existing array. */
+ if (col_names) {
+ const char* s = col_names;
+ ulint i;
+
+ for (i = 0; i < cols; i++) {
+ s += strlen(s) + 1;
+ }
+
+ old_len = s - col_names;
+ } else {
+ old_len = 0;
+ }
+
+ new_len = strlen(name) + 1;
+ total_len = old_len + new_len;
+
+ res = static_cast<char*>(mem_heap_alloc(heap, total_len));
+
+ if (old_len > 0) {
+ memcpy(res, col_names, old_len);
+ }
+
+ memcpy(res + old_len, name, new_len);
+
+ return(res);
+}
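+
+/* Editorial sketch (not part of the original source): a minimal,
+self-contained illustration of the packed NUL-separated name array
+maintained by dict_add_col_name() above. Appending copies the old
+old_len bytes plus the new name; the n-th name is found by skipping
+n NUL terminators. All names below are made up. */
+#if 0
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+/* Append 'name' to a packed array that already holds 'cols' names. */
+static char*
+add_name(char* names, size_t cols, const char* name)
+{
+	size_t	old_len = 0;
+
+	for (size_t i = 0; i < cols; i++) {
+		old_len += strlen(names + old_len) + 1;
+	}
+
+	size_t	new_len = strlen(name) + 1;
+	char*	res = static_cast<char*>(malloc(old_len + new_len));
+
+	if (old_len > 0) {
+		memcpy(res, names, old_len);
+	}
+
+	memcpy(res + old_len, name, new_len);
+	free(names);
+	return(res);
+}
+
+int
+main()
+{
+	const char*	cols[] = {"id", "name", "created"};
+	char*		names = NULL;
+
+	for (size_t i = 0; i < 3; i++) {
+		names = add_name(names, i, cols[i]);
+	}
+
+	/* walk the packed array: prints id, name, created */
+	const char*	s = names;
+	for (size_t i = 0; i < 3; i++) {
+		printf("%s\n", s);
+		s += strlen(s) + 1;
+	}
+
+	free(names);
+	return(0);
+}
+#endif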
+
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+{
+ dict_col_t* col;
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!heap == !name);
+
+ i = table->n_def++;
+
+ if (name) {
+ if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
+ heap = table->heap;
+ }
+ if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
+ /* All preceding column names are empty. */
+ char* s = static_cast<char*>(
+ mem_heap_zalloc(heap, table->n_def));
+
+ table->col_names = s;
+ }
+
+ table->col_names = dict_add_col_name(table->col_names,
+ i, name, heap);
+ }
+
+ col = dict_table_get_nth_col(table, i);
+
+ dict_mem_fill_column_struct(col, i, mtype, prtype, len);
+}
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+static __attribute__((nonnull))
+void
+dict_mem_table_col_rename_low(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ unsigned i, /*!< in: column offset corresponding to s */
+ const char* to, /*!< in: new column name */
+ const char* s) /*!< in: pointer to table->col_names */
+{
+ size_t from_len = strlen(s), to_len = strlen(to);
+
+ ut_ad(i < table->n_def);
+ ut_ad(from_len <= NAME_LEN);
+ ut_ad(to_len <= NAME_LEN);
+
+ char from[NAME_LEN];
+ strncpy(from, s, NAME_LEN);
+
+ if (from_len == to_len) {
+ /* The easy case: simply replace the column name in
+ table->col_names. */
+ strcpy(const_cast<char*>(s), to);
+ } else {
+ /* We need to adjust all affected index->field
+ pointers, as in dict_index_add_col(). First, copy
+ table->col_names. */
+ ulint prefix_len = s - table->col_names;
+
+ for (; i < table->n_def; i++) {
+ s += strlen(s) + 1;
+ }
+
+ ulint full_len = s - table->col_names;
+ char* col_names;
+
+ if (to_len > from_len) {
+ col_names = static_cast<char*>(
+ mem_heap_alloc(
+ table->heap,
+ full_len + to_len - from_len));
+
+ memcpy(col_names, table->col_names, prefix_len);
+ } else {
+ col_names = const_cast<char*>(table->col_names);
+ }
+
+ memcpy(col_names + prefix_len, to, to_len);
+ memmove(col_names + prefix_len + to_len,
+ table->col_names + (prefix_len + from_len),
+ full_len - (prefix_len + from_len));
+
+ /* Replace the field names in every index. */
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ ulint n_fields = dict_index_get_n_fields(index);
+
+ for (ulint i = 0; i < n_fields; i++) {
+ dict_field_t* field
+ = dict_index_get_nth_field(
+ index, i);
+ ulint name_ofs
+ = field->name - table->col_names;
+ if (name_ofs <= prefix_len) {
+ field->name = col_names + name_ofs;
+ } else {
+ ut_a(name_ofs < full_len);
+ field->name = col_names
+ + name_ofs + to_len - from_len;
+ }
+ }
+ }
+
+ table->col_names = col_names;
+ }
+
+ dict_foreign_t* foreign;
+
+ /* Replace the field names in every foreign key constraint. */
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == NULL) {
+			/* We may get here when foreign_key_checks is
+			set to 0, and we then try to rename a column
+			and modify the corresponding foreign key
+			constraint. The index may have been dropped;
+			we have to find an equivalent one. */
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ if (strcmp(foreign->foreign_col_names[f], from)
+ == 0) {
+
+ char** rc = const_cast<char**>(
+ foreign->foreign_col_names
+ + f);
+
+ if (to_len <= strlen(*rc)) {
+ memcpy(*rc, to, to_len + 1);
+ } else {
+ *rc = static_cast<char*>(
+ mem_heap_dup(
+ foreign->heap,
+ to,
+ to_len + 1));
+ }
+ }
+ }
+
+ dict_index_t* new_index = dict_foreign_find_index(
+ foreign->foreign_table, NULL,
+ foreign->foreign_col_names,
+ foreign->n_fields, NULL, true, false);
+ /* There must be an equivalent index in this case. */
+ ut_ad(new_index != NULL);
+
+ foreign->foreign_index = new_index;
+
+ } else {
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ /* These can point straight to
+ table->col_names, because the foreign key
+ constraints will be freed at the same time
+ when the table object is freed. */
+ foreign->foreign_col_names[f]
+ = dict_index_get_nth_field(
+ foreign->foreign_index,
+ f)->name;
+ }
+ }
+ }
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ ut_ad(foreign->referenced_index != NULL);
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+			/* foreign->referenced_col_names[] need to be
+			copies, because the constraint may become
+			orphaned when foreign_key_checks=0 and the
+			parent table is dropped. */
+
+ const char* col_name = dict_index_get_nth_field(
+ foreign->referenced_index, f)->name;
+
+ if (strcmp(foreign->referenced_col_names[f],
+ col_name)) {
+ char** rc = const_cast<char**>(
+ foreign->referenced_col_names + f);
+ size_t col_name_len_1 = strlen(col_name) + 1;
+
+ if (col_name_len_1 <= strlen(*rc) + 1) {
+ memcpy(*rc, col_name, col_name_len_1);
+ } else {
+ *rc = static_cast<char*>(
+ mem_heap_dup(
+ foreign->heap,
+ col_name,
+ col_name_len_1));
+ }
+ }
+ }
+ }
+}
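+
+/* Editorial sketch (not part of the original source): the pointer
+fixup performed above when a rename changes the name length. The
+packed array is rebuilt, and any pointer into it that referred to a
+name at or after the renamed one is shifted by to_len - from_len.
+The names used are made up. */
+#if 0
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+int
+main()
+{
+	/* packed names "id\0name\0created\0"; rename "name" -> "label_x" */
+	const char	old_names[] = "id\0name\0created";
+	const size_t	full_len = sizeof(old_names);	/* 16 bytes */
+	const size_t	prefix_len = 3;		/* offset of "name" */
+	const char*	to = "label_x";
+	const size_t	from_len = strlen("name");
+	const size_t	to_len = strlen(to);
+
+	/* a field that pointed at "created", kept as an offset */
+	const size_t	name_ofs = 8;
+
+	char*	names = static_cast<char*>(
+		malloc(full_len + to_len - from_len));
+
+	memcpy(names, old_names, prefix_len);
+	memcpy(names + prefix_len, to, to_len);
+	memcpy(names + prefix_len + to_len,
+	       old_names + prefix_len + from_len,
+	       full_len - (prefix_len + from_len));
+
+	/* shift pointers that referred to names after the renamed one */
+	const char*	field_name = names
+		+ (name_ofs <= prefix_len
+		   ? name_ofs
+		   : name_ofs + to_len - from_len);
+
+	printf("%s\n", field_name);	/* prints: created */
+	free(names);
+	return(0);
+}
+#endif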
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+UNIV_INTERN
+void
+dict_mem_table_col_rename(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ unsigned nth_col,/*!< in: column index */
+ const char* from, /*!< in: old column name */
+ const char* to) /*!< in: new column name */
+{
+ const char* s = table->col_names;
+
+ ut_ad(nth_col < table->n_def);
+
+ for (unsigned i = 0; i < nth_col; i++) {
+ size_t len = strlen(s);
+ ut_ad(len > 0);
+ s += len + 1;
+ }
+
+ /* This could fail if the data dictionaries are out of sync.
+ Proceed with the renaming anyway. */
+ ut_ad(!strcmp(from, s));
+
+ dict_mem_table_col_rename_low(table, nth_col, to, s);
+}
+
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+UNIV_INTERN
+void
+dict_mem_fill_column_struct(
+/*========================*/
+ dict_col_t* column, /*!< out: column struct to be
+ filled */
+ ulint col_pos, /*!< in: column position */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint col_len) /*!< in: column length */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint mbminlen;
+ ulint mbmaxlen;
+#endif /* !UNIV_HOTBACKUP */
+
+ column->ind = (unsigned int) col_pos;
+ column->ord_part = 0;
+ column->max_prefix = 0;
+ column->mtype = (unsigned int) mtype;
+ column->prtype = (unsigned int) prtype;
+ column->len = (unsigned int) col_len;
+#ifndef UNIV_HOTBACKUP
+ dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
+ dict_col_set_mbminmaxlen(column, mbminlen, mbmaxlen);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dict_index_t* index;
+ mem_heap_t* heap;
+
+ ut_ad(table_name && index_name);
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+
+ index = static_cast<dict_index_t*>(
+ mem_heap_zalloc(heap, sizeof(*index)));
+
+ dict_mem_fill_index_struct(index, heap, table_name, index_name,
+ space, type, n_fields);
+
+ os_fast_mutex_init(zip_pad_mutex_key, &index->zip_pad.mutex);
+
+ return(index);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void)
+/*=========================*/
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(100);
+
+ foreign = static_cast<dict_foreign_t*>(
+ mem_heap_zalloc(heap, sizeof(dict_foreign_t)));
+
+ foreign->heap = heap;
+
+ return(foreign);
+}
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from foreign->heap and set to lower case. */
+UNIV_INTERN
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc) /*!< in: is an alloc needed */
+{
+ if (innobase_get_lower_case_table_names() == 2) {
+ if (do_alloc) {
+ ulint len;
+
+ len = strlen(foreign->foreign_table_name) + 1;
+
+ foreign->foreign_table_name_lookup =
+ static_cast<char*>(
+ mem_heap_alloc(foreign->heap, len));
+ }
+ strcpy(foreign->foreign_table_name_lookup,
+ foreign->foreign_table_name);
+ innobase_casedn_str(foreign->foreign_table_name_lookup);
+ } else {
+ foreign->foreign_table_name_lookup
+ = foreign->foreign_table_name;
+ }
+}
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from foreign->heap and set to lower case. */
+UNIV_INTERN
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc) /*!< in: is an alloc needed */
+{
+ if (innobase_get_lower_case_table_names() == 2) {
+ if (do_alloc) {
+ ulint len;
+
+ len = strlen(foreign->referenced_table_name) + 1;
+
+ foreign->referenced_table_name_lookup =
+ static_cast<char*>(
+ mem_heap_alloc(foreign->heap, len));
+ }
+ strcpy(foreign->referenced_table_name_lookup,
+ foreign->referenced_table_name);
+ innobase_casedn_str(foreign->referenced_table_name_lookup);
+ } else {
+ foreign->referenced_table_name_lookup
+ = foreign->referenced_table_name;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len) /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+{
+ dict_field_t* field;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->n_def++;
+
+ field = dict_index_get_nth_field(index, index->n_def - 1);
+
+ field->name = name;
+ field->prefix_len = (unsigned int) prefix_len;
+}
+
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+#ifdef UNIV_BLOB_DEBUG
+ if (index->blobs) {
+ mutex_free(&index->blobs_mutex);
+ rbt_free(index->blobs);
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
+ os_fast_mutex_free(&index->zip_pad.mutex);
+
+ mem_heap_free(index->heap);
+}
+
+/** Create a temporary tablename like "#sql-ibtid-inc" where
+ tid = the Table ID
+ inc = a randomly initialized number that is incremented for each file
+The table ID is a 64 bit integer, can use up to 20 digits, and is
+initialized at bootstrap. The second number is 32 bits, can use up to 10
+digits, and is initialized at startup to a randomly distributed number.
+It is hoped that the combination of these two numbers will provide a
+reasonably unique temporary file name.
+@param[in] heap A memory heap
+@param[in] dbtab Table name in the form database/table name
+@param[in] id Table id
+@return A unique temporary tablename suitable for InnoDB use */
+UNIV_INTERN
+char*
+dict_mem_create_temporary_tablename(
+ mem_heap_t* heap,
+ const char* dbtab,
+ table_id_t id)
+{
+ size_t size;
+ char* name;
+ const char* dbend = strchr(dbtab, '/');
+ ut_ad(dbend);
+ size_t dblen = dbend - dbtab + 1;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* Increment a randomly initialized number for each temp file. */
+ os_atomic_increment_uint32(&dict_temp_file_num, 1);
+#else /* HAVE_ATOMIC_BUILTINS */
+ dict_temp_file_num++;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ size = tmp_file_prefix_length + 3 + 20 + 1 + 10 + dblen;
+ name = static_cast<char*>(mem_heap_alloc(heap, size));
+ memcpy(name, dbtab, dblen);
+ ut_snprintf(name + dblen, size - dblen,
+ TEMP_FILE_PREFIX_INNODB UINT64PF "-" UINT32PF,
+ id, dict_temp_file_num);
+
+ return(name);
+}
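+
+/* Editorial sketch (not part of the original source): the name layout
+produced above, assuming TEMP_FILE_PREFIX_INNODB expands to "#sql-ib".
+For dbtab = "test/t1", id = 42 and a counter value of 7 the result is
+"test/#sql-ib42-7". The inputs are made up. */
+#if 0
+#include <cstdio>
+#include <cstring>
+#include <stdint.h>
+
+int
+main()
+{
+	const char*	dbtab = "test/t1";
+	uint64_t	id = 42;
+	uint32_t	counter = 7;	/* stands in for dict_temp_file_num */
+
+	const char*	dbend = strchr(dbtab, '/');
+	size_t		dblen = dbend - dbtab + 1;
+	char		name[64];
+
+	memcpy(name, dbtab, dblen);	/* keep the "db/" prefix */
+	snprintf(name + dblen, sizeof(name) - dblen, "#sql-ib%llu-%lu",
+		 (unsigned long long) id, (unsigned long) counter);
+
+	puts(name);	/* prints: test/#sql-ib42-7 */
+	return(0);
+}
+#endif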
+
+/** Initialize dict memory variables */
+
+void
+dict_mem_init(void)
+{
+ /* Initialize a randomly distributed temporary file number */
+ ib_uint32_t now = static_cast<ib_uint32_t>(ut_time());
+
+ const byte* buf = reinterpret_cast<const byte*>(&now);
+ ut_ad(ut_crc32 != NULL);
+
+ dict_temp_file_num = ut_crc32(buf, sizeof(now));
+
+ DBUG_PRINT("dict_mem_init",
+ ("Starting Temporary file number is " UINT32PF,
+ dict_temp_file_num));
+}
+
+/** Validate the search order in the foreign key set.
+@param[in] fk_set the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_foreign_set& fk_set)
+{
+ dict_foreign_not_exists not_exists(fk_set);
+
+ dict_foreign_set::iterator it = std::find_if(
+ fk_set.begin(), fk_set.end(), not_exists);
+
+ if (it == fk_set.end()) {
+ return(true);
+ }
+
+ dict_foreign_t* foreign = *it;
+ std::cerr << "Foreign key lookup failed: " << *foreign;
+ std::cerr << fk_set;
+ ut_ad(0);
+ return(false);
+}
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in] table table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_table_t& table)
+{
+ return(dict_foreign_set_validate(table.foreign_set)
+ && dict_foreign_set_validate(table.referenced_set));
+}
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign)
+{
+ out << "[dict_foreign_t: id='" << foreign.id << "'";
+
+ if (foreign.foreign_table_name != NULL) {
+ out << ",for: '" << foreign.foreign_table_name << "'";
+ }
+
+ out << "]";
+ return(out);
+}
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set)
+{
+ out << "[dict_foreign_set:";
+ std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out));
+ out << "]" << std::endl;
+ return(out);
+}
+
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
new file mode 100644
index 00000000000..9cd909686ed
--- /dev/null
+++ b/storage/innobase/dict/dict0stats.cc
@@ -0,0 +1,4182 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+
+#include "univ.i"
+
+#include "btr0btr.h" /* btr_get_size() */
+#include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */
+#include "dict0dict.h" /* dict_table_get_first_index(), dict_fs2utf8() */
+#include "dict0mem.h" /* DICT_TABLE_MAGIC_N */
+#include "dict0stats.h"
+#include "data0type.h" /* dtype_t */
+#include "db0err.h" /* dberr_t */
+#include "page0page.h" /* page_align() */
+#include "pars0pars.h" /* pars_info_create() */
+#include "pars0types.h" /* pars_info_t */
+#include "que0que.h" /* que_eval_sql() */
+#include "rem0cmp.h" /* REC_MAX_N_FIELDS,cmp_rec_rec_with_match() */
+#include "row0sel.h" /* sel_node_t */
+#include "row0types.h" /* sel_node_t */
+#include "trx0trx.h" /* trx_create() */
+#include "trx0roll.h" /* trx_rollback_to_savepoint() */
+#include "ut0rnd.h" /* ut_rnd_interval() */
+#include "ut0ut.h" /* ut_format_name(), ut_time() */
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - N_SAMPLE_PAGES(index),
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: an N-prefix-boring record is a record on a non-leaf page that
+equals the next (to the right, crossing page boundaries, skipping the supremum
+and infimum) record on the same level when looking at the first n-prefix
+columns.
+The last (user) record on a level is not boring (it does not match the
+non-existent user record to the right). We call the records boring because all
+the records on the page below a boring record are equal to that boring record.
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and fully scan subsequent lower
+levels until a level that contains at least A*10 distinct records is found.
+Let's call this level LA.
+As an optimization the search is canceled if it has reached level 1 (we never
+descend to level 0, the leaf level) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or less in
+the exceptions described above), divide it into groups of equal records and
+pick A such groups. Then pick the last record from each group. For example,
+let the level be:
+
+index: 0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
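+
+/* Editorial sketch (not part of the original source): the estimate
+described above, N * R * N_DIFF_AVG_LEAF, computed from made-up sample
+values. See REF01 below for the real implementation. */
+#if 0
+#include <cstdio>
+#include <stdint.h>
+
+int
+main()
+{
+	const uint64_t	a = 20;		/* N_SAMPLE_PAGES(index) */
+	uint64_t	sum = 0;
+
+	/* P[i]: distinct keys found in each sampled leaf page */
+	for (uint64_t i = 0; i < a; i++) {
+		sum += 90 + i;		/* made-up per-page results */
+	}
+
+	double		n_diff_avg_leaf = (double) sum / (double) a;
+	uint64_t	n_diff_la = 1000;  /* distinct keys on level LA */
+	uint64_t	total_la = 5000;   /* total records on level LA */
+	double		r = (double) n_diff_la / (double) total_la;
+	uint64_t	n = 100000;	   /* leaf pages in the index */
+
+	/* estimated number of distinct keys on the leaf level */
+	printf("estimate: %.0f\n", (double) n * r * n_diff_avg_leaf);
+	return(0);
+}
+#endif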
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define TABLE_STATS_NAME_PRINT "mysql.innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+#define INDEX_STATS_NAME_PRINT "mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...) /* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index) \
+ static_cast<ib_uint64_t>( \
+ (index)->table->stats_sample_pages != 0 \
+ ? (index)->table->stats_sample_pages \
+ : srv_stats_persistent_sample_pages)
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch N_SAMPLE_PAGES(index) records
+from that level */
+#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10)
+
+/* A dynamic array where we store the boundaries of each distinct group
+of keys. For example if a btree level is:
+index: 0,1,2,3,4,5,6,7,8,9,10,11,12
+data: b,b,b,b,b,b,g,g,j,j,j, x, y
+then we would store 5,7,10,11,12 in the array. */
+typedef std::vector<ib_uint64_t> boundaries_t;
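+
+/* Editorial sketch (not part of the original source): picking sample
+groups from a boundaries_t array, as described in the sampling notes
+above (level 1,1,1,2,2,7,7,7,7,7,9 with A=2). The slicing scheme below
+is one straightforward way to pick A distinct groups, not necessarily
+the exact scheme used later in this file; rand() stands in for a
+proper random pick such as ut_rnd_interval(). */
+#if 0
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <stdint.h>
+
+int
+main()
+{
+	/* last record of each group of equal keys, counting from 0 */
+	std::vector<uint64_t>	boundaries;
+	boundaries.push_back(2);	/* last "1" */
+	boundaries.push_back(4);	/* last "2" */
+	boundaries.push_back(9);	/* last "7" */
+	boundaries.push_back(10);	/* the "9" */
+
+	const uint64_t	a = 2;		/* number of groups to sample */
+	uint64_t	n_groups = boundaries.size();
+
+	/* split the groups into A slices and pick one group per slice */
+	for (uint64_t i = 0; i < a; i++) {
+		uint64_t	lo = n_groups * i / a;
+		uint64_t	hi = n_groups * (i + 1) / a - 1;
+		uint64_t	pick = lo + rand() % (hi - lo + 1);
+
+		printf("dive below record %llu\n",
+		       (unsigned long long) boundaries[pick]);
+	}
+
+	return(0);
+}
+#endif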
+
+/* Comparator used to order indexes by index name in index_map_t below.
+@return true if index_name1 is smaller than index_name2. */
+struct index_cmp
+{
+ bool operator()(const char* index_name1, const char* index_name2) const {
+ return(strcmp(index_name1, index_name2) < 0);
+ }
+};
+
+typedef std::map<const char*, dict_index_t*, index_cmp> index_map_t;
+
+/*********************************************************************//**
+Checks whether an index should be ignored in stats manipulations:
+* stats fetch
+* stats recalc
+* stats save
+@return true if the index should be ignored */
+UNIV_INLINE
+bool
+dict_stats_should_ignore_index(
+/*===========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ return((index->type & DICT_FTS)
+ || dict_index_is_corrupted(index)
+ || index->to_be_dropped
+ || *index->name == TEMP_INDEX_PREFIX);
+}
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure.
+@return true if exists and all tables are ok */
+static
+bool
+dict_stats_persistent_storage_check(
+/*================================*/
+ bool caller_has_dict_sys_mutex) /*!< in: true if the caller
+ owns dict_sys->mutex */
+{
+ /* definition for the table TABLE_STATS_NAME */
+ dict_col_meta_t table_stats_columns[] = {
+ {"database_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"table_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"last_update", DATA_FIXBINARY,
+ DATA_NOT_NULL, 4},
+
+ {"n_rows", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"clustered_index_size", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"sum_of_other_index_sizes", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8}
+ };
+ dict_table_schema_t table_stats_schema = {
+ TABLE_STATS_NAME,
+ UT_ARR_SIZE(table_stats_columns),
+ table_stats_columns,
+ 0 /* n_foreign */,
+ 0 /* n_referenced */
+ };
+
+ /* definition for the table INDEX_STATS_NAME */
+ dict_col_meta_t index_stats_columns[] = {
+ {"database_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"table_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"index_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 192},
+
+ {"last_update", DATA_FIXBINARY,
+ DATA_NOT_NULL, 4},
+
+ {"stat_name", DATA_VARMYSQL,
+ DATA_NOT_NULL, 64*3},
+
+ {"stat_value", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+ {"sample_size", DATA_INT,
+ DATA_UNSIGNED, 8},
+
+ {"stat_description", DATA_VARMYSQL,
+ DATA_NOT_NULL, 1024*3}
+ };
+ dict_table_schema_t index_stats_schema = {
+ INDEX_STATS_NAME,
+ UT_ARR_SIZE(index_stats_columns),
+ index_stats_columns,
+ 0 /* n_foreign */,
+ 0 /* n_referenced */
+ };
+
+ char errstr[512];
+ dberr_t ret;
+
+ if (!caller_has_dict_sys_mutex) {
+ mutex_enter(&(dict_sys->mutex));
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* first check table_stats */
+ ret = dict_table_schema_check(&table_stats_schema, errstr,
+ sizeof(errstr));
+ if (ret == DB_SUCCESS) {
+ /* if it is ok, then check index_stats */
+ ret = dict_table_schema_check(&index_stats_schema, errstr,
+ sizeof(errstr));
+ }
+
+ if (!caller_has_dict_sys_mutex) {
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (ret != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: %s\n", errstr);
+ return(false);
+ }
+ /* else */
+
+ return(true);
+}
+
+/** Executes a given SQL statement using the InnoDB internal SQL parser.
+This function will free the pinfo object.
+@param[in,out]	pinfo	pinfo to pass to que_eval_sql(); it must already
+have any literals bound to it
+@param[in]	sql	SQL string to execute
+@param[in,out]	trx	if NULL, the function will allocate and
+free the trx object; if it is not NULL, it will be rolled back
+only in case of error, but not freed.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_exec_sql(
+ pars_info_t* pinfo,
+ const char* sql,
+ trx_t* trx)
+{
+ dberr_t err;
+ bool trx_started = false;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ if (!dict_stats_persistent_storage_check(true)) {
+ pars_info_free(pinfo);
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ if (trx == NULL) {
+ trx = trx_allocate_for_background();
+ trx_start_if_not_started(trx);
+ trx_started = true;
+ }
+
+ err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */
+
+ DBUG_EXECUTE_IF("stats_index_error",
+ if (!trx_started) {
+ err = DB_STATS_DO_NOT_EXIST;
+ trx->error_state = DB_STATS_DO_NOT_EXIST;
+ });
+
+ if (!trx_started && err == DB_SUCCESS) {
+ return(DB_SUCCESS);
+ }
+
+ if (err == DB_SUCCESS) {
+ trx_commit_for_mysql(trx);
+ } else {
+ trx->op_info = "rollback of internal trx on stats tables";
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->dict_operation_lock_mode = 0;
+ trx->op_info = "";
+ ut_a(trx->error_state == DB_SUCCESS);
+ }
+
+ if (trx_started) {
+ trx_free_for_background(trx);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Duplicate a table object and its indexes.
+This function creates a dummy dict_table_t object and initializes the
+following table and index members:
+dict_table_t::id (copied)
+dict_table_t::heap (newly created)
+dict_table_t::name (copied)
+dict_table_t::corrupted (copied)
+dict_table_t::indexes<> (newly created)
+dict_table_t::magic_n
+for each entry in dict_table_t::indexes, the following are initialized:
+(indexes that have DICT_FTS set in index->type are skipped)
+dict_index_t::id (copied)
+dict_index_t::name (copied)
+dict_index_t::table_name (points to the copied table name)
+dict_index_t::table (points to the above semi-initialized object)
+dict_index_t::type (copied)
+dict_index_t::to_be_dropped (copied)
+dict_index_t::online_status (copied)
+dict_index_t::n_uniq (copied)
+dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name)
+dict_index_t::indexes<> (newly created)
+dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized)
+dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized)
+dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized)
+dict_index_t::magic_n
+The returned object should be freed with dict_stats_table_clone_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_table_clone_create(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table whose stats to copy */
+{
+ size_t heap_size;
+ dict_index_t* index;
+
+ /* Estimate the size needed for the table and all of its indexes */
+
+ heap_size = 0;
+ heap_size += sizeof(dict_table_t);
+ heap_size += strlen(table->name) + 1;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_univ(index));
+
+ ulint n_uniq = dict_index_get_n_unique(index);
+
+ heap_size += sizeof(dict_index_t);
+ heap_size += strlen(index->name) + 1;
+ heap_size += n_uniq * sizeof(index->fields[0]);
+ for (ulint i = 0; i < n_uniq; i++) {
+ heap_size += strlen(index->fields[i].name) + 1;
+ }
+ heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]);
+ }
+
+ /* Allocate the memory and copy the members */
+
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(heap_size);
+
+ dict_table_t* t;
+
+ t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->id, sizeof(table->id));
+ t->id = table->id;
+
+ t->heap = heap;
+
+ UNIV_MEM_ASSERT_RW_ABORT(table->name, strlen(table->name) + 1);
+ t->name = (char*) mem_heap_strdup(heap, table->name);
+
+ t->corrupted = table->corrupted;
+
+ /* This private object "t" is not shared with other threads, so
+ we do not need the stats_latch (thus we pass false below). The
+ dict_table_stats_lock()/unlock() routines will do nothing. */
+ dict_table_stats_latch_create(t, false);
+
+ UT_LIST_INIT(t->indexes);
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_univ(index));
+
+ dict_index_t* idx;
+
+ idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&index->id, sizeof(index->id));
+ idx->id = index->id;
+
+ UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name) + 1);
+ idx->name = (char*) mem_heap_strdup(heap, index->name);
+
+ idx->table_name = t->name;
+
+ idx->table = t;
+
+ idx->type = index->type;
+
+ idx->to_be_dropped = 0;
+
+ idx->online_status = ONLINE_INDEX_COMPLETE;
+
+ idx->n_uniq = index->n_uniq;
+
+ idx->fields = (dict_field_t*) mem_heap_alloc(
+ heap, idx->n_uniq * sizeof(idx->fields[0]));
+
+ for (ulint i = 0; i < idx->n_uniq; i++) {
+ UNIV_MEM_ASSERT_RW_ABORT(index->fields[i].name, strlen(index->fields[i].name) + 1);
+ idx->fields[i].name = (char*) mem_heap_strdup(
+ heap, index->fields[i].name);
+ }
+
+ /* hook idx into t->indexes */
+ UT_LIST_ADD_LAST(indexes, t->indexes, idx);
+
+ idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
+
+ idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
+
+ idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
+ ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+ }
+
+ ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_table_clone_create(). */
+static
+void
+dict_stats_table_clone_free(
+/*========================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ dict_table_stats_latch_destroy(t);
+ mem_heap_free(t->heap);
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into an index's
+statistics members. The resulting stats correspond to an empty index.
+The caller must own index's table stats latch in X mode
+(dict_table_stats_lock(table, RW_X_LATCH)) */
+static
+void
+dict_stats_empty_index(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_univ(index));
+
+ ulint n_uniq = index->n_uniq;
+
+ for (ulint i = 0; i < n_uniq; i++) {
+ index->stat_n_diff_key_vals[i] = 0;
+ index->stat_n_sample_sizes[i] = 1;
+ index->stat_n_non_null_key_vals[i] = 0;
+ }
+
+ index->stat_index_size = 1;
+ index->stat_n_leaf_pages = 1;
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into a table and its indexes'
+statistics members. The resulting stats correspond to an empty table. */
+static
+void
+dict_stats_empty_table(
+/*===================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ /* Zero the stats members */
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ table->stat_n_rows = 0;
+ table->stat_clustered_index_size = 1;
+ /* 1 page for each index, not counting the clustered */
+ table->stat_sum_of_other_index_sizes
+ = UT_LIST_GET_LEN(table->indexes) - 1;
+ table->stat_modified_counter = 0;
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_univ(index));
+
+ dict_stats_empty_index(index);
+ }
+
+ table->stat_initialized = TRUE;
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+}
+
+/*********************************************************************//**
+Check whether index's stats are initialized (assert if they are not). */
+static
+void
+dict_stats_assert_initialized_index(
+/*================================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ UNIV_MEM_ASSERT_RW_ABORT(
+ index->stat_n_diff_key_vals,
+ index->n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+
+ UNIV_MEM_ASSERT_RW_ABORT(
+ index->stat_n_sample_sizes,
+ index->n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+
+ UNIV_MEM_ASSERT_RW_ABORT(
+ index->stat_n_non_null_key_vals,
+ index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0]));
+
+ UNIV_MEM_ASSERT_RW_ABORT(
+ &index->stat_index_size,
+ sizeof(index->stat_index_size));
+
+ UNIV_MEM_ASSERT_RW_ABORT(
+ &index->stat_n_leaf_pages,
+ sizeof(index->stat_n_leaf_pages));
+}
+
+/*********************************************************************//**
+Check whether table's stats are initialized (assert if they are not). */
+static
+void
+dict_stats_assert_initialized(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_a(table->stat_initialized);
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stats_last_recalc,
+ sizeof(table->stats_last_recalc));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stat_persistent,
+ sizeof(table->stat_persistent));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stats_auto_recalc,
+ sizeof(table->stats_auto_recalc));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stats_sample_pages,
+ sizeof(table->stats_sample_pages));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stat_n_rows,
+ sizeof(table->stat_n_rows));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stat_clustered_index_size,
+ sizeof(table->stat_clustered_index_size));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stat_sum_of_other_index_sizes,
+ sizeof(table->stat_sum_of_other_index_sizes));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stat_modified_counter,
+ sizeof(table->stat_modified_counter));
+
+ UNIV_MEM_ASSERT_RW_ABORT(&table->stats_bg_flag,
+ sizeof(table->stats_bg_flag));
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (!dict_stats_should_ignore_index(index)) {
+ dict_stats_assert_initialized_index(index);
+ }
+ }
+}
+
+#define INDEX_EQ(i1, i2) \
+ ((i1) != NULL \
+ && (i2) != NULL \
+ && (i1)->id == (i2)->id \
+ && strcmp((i1)->name, (i2)->name) == 0)
+
+/*********************************************************************//**
+Copy table and index statistics from one table to another. Extra indexes
+in src are ignored and extra indexes in dst are initialized to correspond
+to an empty index. */
+static
+void
+dict_stats_copy(
+/*============*/
+ dict_table_t* dst, /*!< in/out: destination table */
+ const dict_table_t* src) /*!< in: source table */
+{
+ dst->stats_last_recalc = src->stats_last_recalc;
+ dst->stat_n_rows = src->stat_n_rows;
+ dst->stat_clustered_index_size = src->stat_clustered_index_size;
+ dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes;
+ dst->stat_modified_counter = src->stat_modified_counter;
+
+ dict_index_t* dst_idx;
+ dict_index_t* src_idx;
+
+ for (dst_idx = dict_table_get_first_index(dst),
+ src_idx = dict_table_get_first_index(src);
+ dst_idx != NULL;
+ dst_idx = dict_table_get_next_index(dst_idx),
+ (src_idx != NULL
+ && (src_idx = dict_table_get_next_index(src_idx)))) {
+
+ if (dict_stats_should_ignore_index(dst_idx)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_univ(dst_idx));
+
+ if (!INDEX_EQ(src_idx, dst_idx)) {
+ for (src_idx = dict_table_get_first_index(src);
+ src_idx != NULL;
+ src_idx = dict_table_get_next_index(src_idx)) {
+
+ if (INDEX_EQ(src_idx, dst_idx)) {
+ break;
+ }
+ }
+ }
+
+ if (!INDEX_EQ(src_idx, dst_idx)) {
+ dict_stats_empty_index(dst_idx);
+ continue;
+ }
+
+ ulint n_copy_el;
+
+ if (dst_idx->n_uniq > src_idx->n_uniq) {
+ n_copy_el = src_idx->n_uniq;
+ /* Since src is smaller some elements in dst
+ will remain untouched by the following memmove(),
+ thus we init all of them here. */
+ dict_stats_empty_index(dst_idx);
+ } else {
+ n_copy_el = dst_idx->n_uniq;
+ }
+
+ memmove(dst_idx->stat_n_diff_key_vals,
+ src_idx->stat_n_diff_key_vals,
+ n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0]));
+
+ memmove(dst_idx->stat_n_sample_sizes,
+ src_idx->stat_n_sample_sizes,
+ n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0]));
+
+ memmove(dst_idx->stat_n_non_null_key_vals,
+ src_idx->stat_n_non_null_key_vals,
+ n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0]));
+
+ dst_idx->stat_index_size = src_idx->stat_index_size;
+
+ dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages;
+ }
+
+ dst->stat_initialized = TRUE;
+}
+
+/*********************************************************************//**
+Duplicate the stats of a table and its indexes.
+This function creates a dummy dict_table_t object and copies the input
+table's stats into it. The returned table object is not in the dictionary
+cache and cannot be accessed by any other threads. In addition to the
+members copied in dict_stats_table_clone_create() this function initializes
+the following:
+dict_table_t::stat_initialized
+dict_table_t::stat_persistent
+dict_table_t::stat_n_rows
+dict_table_t::stat_clustered_index_size
+dict_table_t::stat_sum_of_other_index_sizes
+dict_table_t::stat_modified_counter
+dict_index_t::stat_n_diff_key_vals[]
+dict_index_t::stat_n_sample_sizes[]
+dict_index_t::stat_n_non_null_key_vals[]
+dict_index_t::stat_index_size
+dict_index_t::stat_n_leaf_pages
+The returned object should be freed with dict_stats_snapshot_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_snapshot_create(
+/*=======================*/
+ dict_table_t* table) /*!< in: table whose stats to copy */
+{
+ mutex_enter(&dict_sys->mutex);
+
+ dict_table_stats_lock(table, RW_S_LATCH);
+
+ dict_stats_assert_initialized(table);
+
+ dict_table_t* t;
+
+ t = dict_stats_table_clone_create(table);
+
+ dict_stats_copy(t, table);
+
+ t->stat_persistent = table->stat_persistent;
+ t->stats_auto_recalc = table->stats_auto_recalc;
+ t->stats_sample_pages = table->stats_sample_pages;
+ t->stats_bg_flag = table->stats_bg_flag;
+
+ dict_table_stats_unlock(table, RW_S_LATCH);
+
+ mutex_exit(&dict_sys->mutex);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_snapshot_create(). */
+static
+void
+dict_stats_snapshot_free(
+/*=====================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ dict_stats_table_clone_free(t);
+}
+
+/*********************************************************************//**
+Calculates new estimates for index statistics. This function is
+relatively quick and is used to calculate transient statistics that
+are not saved on disk. This was the only way to calculate statistics
+before the Persistent Statistics feature was introduced. */
+static
+void
+dict_stats_update_transient_for_index(
+/*==================================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
+ || !dict_index_is_clust(index))) {
+		/* If we have set a high innodb_force_recovery
+		level, do not calculate statistics, as a badly
+		corrupted index can cause a crash in the process.
+		Initialize some bogus index cardinality
+		statistics, so that the data can be queried by
+		various means, including via secondary indexes. */
+ dict_stats_empty_index(index);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug && !dict_index_is_clust(index)) {
+ dict_stats_empty_index(index);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else {
+ mtr_t mtr;
+ ulint size;
+ mtr_start(&mtr);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+ if (size != ULINT_UNDEFINED) {
+ index->stat_index_size = size;
+
+ size = btr_get_size(
+ index, BTR_N_LEAF_PAGES, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ switch (size) {
+ case ULINT_UNDEFINED:
+ dict_stats_empty_index(index);
+ return;
+ case 0:
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ index->stat_n_leaf_pages = size;
+
+ btr_estimate_number_of_different_key_vals(index);
+ }
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced. */
+UNIV_INTERN
+void
+dict_stats_update_transient(
+/*========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ dict_index_t* index;
+ ulint sum_of_index_sizes = 0;
+
+ /* Find out the sizes of the indexes and how many different values
+ for the key they approximately have */
+
+ index = dict_table_get_first_index(table);
+
+ if (dict_table_is_discarded(table)) {
+ /* Nothing to do. */
+ dict_stats_empty_table(table);
+ return;
+ } else if (index == NULL) {
+ /* Table definition is corrupt */
+
+ char buf[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: table %s has no indexes. "
+ "Cannot calculate statistics.\n",
+ ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+ dict_stats_empty_table(table);
+ return;
+ }
+
+ for (; index != NULL; index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_univ(index));
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ dict_stats_empty_index(index);
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ dict_stats_update_transient_for_index(index);
+
+ sum_of_index_sizes += index->stat_index_size;
+ }
+
+ index = dict_table_get_first_index(table);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[
+ dict_index_get_n_unique(index) - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+ - index->stat_index_size;
+
+ table->stats_last_recalc = ut_time();
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+}
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = N_SAMPLE_PAGES(index)
+
+dict_stats_analyze_index()
+ for each n_prefix
+ search for good enough level:
+ dict_stats_analyze_index_level() // only called if level has <= N pages
+ // full scan of the level in one mtr
+ collect statistics about the given level
+ if we are not satisfied with the level, search next lower level
+ we have found a good enough level here
+ dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+ // full scan of the level in one mtr
+ dive below some records and analyze the leaf page there:
+ dict_stats_analyze_index_below_cur()
+@} */
+
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1]; record indexing starts from the leftmost
+record on the level and continues across page boundaries, counting from 0. */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level */
+ ib_uint64_t* n_diff, /*!< out: array for number of
+ distinct keys for all prefixes */
+ ib_uint64_t* total_recs, /*!< out: total number of records */
+ ib_uint64_t* total_pages, /*!< out: total number of pages */
+ boundaries_t* n_diff_boundaries,/*!< out: boundaries of the groups
+ of distinct keys */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n_uniq;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ const page_t* page;
+ const rec_t* rec;
+ const rec_t* prev_rec;
+ bool prev_rec_is_copied;
+ byte* prev_rec_buf = NULL;
+ ulint prev_rec_buf_size = 0;
+ ulint* rec_offsets;
+ ulint* prev_rec_offsets;
+ ulint i;
+
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu)\n", __func__,
+ index->table->name, index->name, level);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+
+ n_uniq = dict_index_get_n_unique(index);
+
+ /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+ memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+ /* Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ i = (REC_OFFS_HEADER_SIZE + 1 + 1) + index->n_fields;
+
+ heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+ rec_offsets = static_cast<ulint*>(
+ mem_heap_alloc(heap, i * sizeof *rec_offsets));
+ prev_rec_offsets = static_cast<ulint*>(
+ mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+ rec_offs_set_n_alloc(rec_offsets, i);
+ rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+ /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+ if (n_diff_boundaries != NULL) {
+ for (i = 0; i < n_uniq; i++) {
+ n_diff_boundaries[i].erase(
+ n_diff_boundaries[i].begin(),
+ n_diff_boundaries[i].end());
+ }
+ }
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ btr_pcur_open_at_index_side(
+ true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
+ &pcur, true, level, mtr);
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ page = btr_pcur_get_page(&pcur);
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+ ut_ad(btr_pcur_get_rec(&pcur)
+ == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+ /* check that we are indeed on the desired level */
+ ut_a(btr_page_get_level(page, mtr) == level);
+
+ /* there should not be any pages on the left */
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+ /* check whether the first record on the leftmost page is marked
+ as such, if we are on a non-leaf level */
+ ut_a((level == 0)
+ == !(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ btr_pcur_get_rec(&pcur), page_is_comp(page))));
+
+ prev_rec = NULL;
+ prev_rec_is_copied = false;
+
+ /* no records by default */
+ *total_recs = 0;
+
+ *total_pages = 0;
+
+ /* iterate over all user records on this level
+	and compare each pair of adjacent records, including the last
+	record on page X and the first on page X+1 */
+ for (;
+ btr_pcur_is_on_user_rec(&pcur);
+ btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+ bool rec_is_last_on_page;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* If rec and prev_rec are on different pages, then prev_rec
+ must have been copied, because we hold latch only on the page
+ where rec resides. */
+ if (prev_rec != NULL
+ && page_align(rec) != page_align(prev_rec)) {
+
+ ut_a(prev_rec_is_copied);
+ }
+
+ rec_is_last_on_page =
+ page_rec_is_supremum(page_rec_get_next_const(rec));
+
+ /* increment the pages counter at the end of each page */
+ if (rec_is_last_on_page) {
+
+ (*total_pages)++;
+ }
+
+		/* Skip delete-marked records on the leaf level. If we
+		did not skip them, then ANALYZE run quickly after DELETE
+		could count them or not (purge may have already wiped
+		them away), which would bring non-determinism. We skip
+		only leaf-level delete marks because delete marks on
+		the non-leaf levels do not make sense. */
+ if (level == 0 &&
+ rec_get_deleted_flag(
+ rec,
+ page_is_comp(btr_pcur_get_page(&pcur)))) {
+
+ if (rec_is_last_on_page
+ && !prev_rec_is_copied
+ && prev_rec != NULL) {
+ /* copy prev_rec */
+
+ prev_rec_offsets = rec_get_offsets(
+ prev_rec, index, prev_rec_offsets,
+ n_uniq, &heap);
+
+ prev_rec = rec_copy_prefix_to_buf(
+ prev_rec, index,
+ rec_offs_n_fields(prev_rec_offsets),
+ &prev_rec_buf, &prev_rec_buf_size);
+
+ prev_rec_is_copied = true;
+ }
+
+ continue;
+ }
+
+ rec_offsets = rec_get_offsets(
+ rec, index, rec_offsets, n_uniq, &heap);
+
+ (*total_recs)++;
+
+ if (prev_rec != NULL) {
+ prev_rec_offsets = rec_get_offsets(
+ prev_rec, index, prev_rec_offsets,
+ n_uniq, &heap);
+
+ cmp_rec_rec_with_match(rec,
+ prev_rec,
+ rec_offsets,
+ prev_rec_offsets,
+ index,
+ FALSE,
+ &matched_fields,
+ &matched_bytes);
+
+ for (i = matched_fields; i < n_uniq; i++) {
+
+ if (n_diff_boundaries != NULL) {
+ /* push the index of the previous
+ record, that is - the last one from
+ a group of equal keys */
+
+ ib_uint64_t idx;
+
+ /* the index of the current record
+ is total_recs - 1, the index of the
+ previous record is total_recs - 2;
+ we know that idx is not going to
+ become negative here because if we
+ are in this branch then there is a
+ previous record and thus
+ total_recs >= 2 */
+ idx = *total_recs - 2;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+
+ /* increment the number of different keys
+ for n_prefix=i+1 (e.g. if i=0 then we increment
+ for n_prefix=1 which is stored in n_diff[0]) */
+ n_diff[i]++;
+ }
+ } else {
+ /* this is the first non-delete marked record */
+ for (i = 0; i < n_uniq; i++) {
+ n_diff[i] = 1;
+ }
+ }
+
+ if (rec_is_last_on_page) {
+ /* end of a page has been reached */
+
+ /* we need to copy the record instead of assigning
+ like prev_rec = rec; because when we traverse the
+ records on this level at some point we will jump from
+ one page to the next and then rec and prev_rec will
+ be on different pages and
+ btr_pcur_move_to_next_user_rec() will release the
+ latch on the page that prev_rec is on */
+ prev_rec = rec_copy_prefix_to_buf(
+ rec, index, rec_offs_n_fields(rec_offsets),
+ &prev_rec_buf, &prev_rec_buf_size);
+ prev_rec_is_copied = true;
+
+ } else {
+ /* still on the same page, the next call to
+ btr_pcur_move_to_next_user_rec() will not jump
+ on the next page, we can simply assign pointers
+ instead of copying the records like above */
+
+ prev_rec = rec;
+ prev_rec_is_copied = false;
+ }
+ }
+
+	/* If *total_pages is left untouched then either the above loop was
+	not entered at all and the whole tree consists of a single empty
+	page, or the loop was entered but this is level 0, which contains
+	one page whose records are all delete-marked. */
+ if (*total_pages == 0) {
+
+ ut_ad(level == 0);
+ ut_ad(*total_recs == 0);
+
+ *total_pages = 1;
+ }
+
+ /* if there are records on this level and boundaries
+ should be saved */
+ if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+ /* remember the index of the last record on the level as the
+ last one from the last group of equal keys; this holds for
+ all possible prefixes */
+ for (i = 0; i < n_uniq; i++) {
+ ib_uint64_t idx;
+
+ idx = *total_recs - 1;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+ }
+
+ /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+ for i=0..n_uniq-1 */
+
+#ifdef UNIV_STATS_DEBUG
+ for (i = 0; i < n_uniq; i++) {
+
+ DEBUG_PRINTF(" %s(): total recs: " UINT64PF
+ ", total pages: " UINT64PF
+ ", n_diff[%lu]: " UINT64PF "\n",
+ __func__, *total_recs,
+ *total_pages,
+ i, n_diff[i]);
+
+#if 0
+ if (n_diff_boundaries != NULL) {
+ ib_uint64_t j;
+
+ DEBUG_PRINTF(" %s(): boundaries[%lu]: ",
+ __func__, i);
+
+ for (j = 0; j < n_diff[i]; j++) {
+ ib_uint64_t idx;
+
+ idx = n_diff_boundaries[i][j];
+
+ DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+ j, idx);
+ }
+ DEBUG_PRINTF("\n");
+ }
+#endif
+ }
+#endif /* UNIV_STATS_DEBUG */
+
+ /* Release the latch on the last page, because that is not done by
+ btr_pcur_close(). This function works also for non-leaf pages. */
+ btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr);
+
+ btr_pcur_close(&pcur);
+
+ if (prev_rec_buf != NULL) {
+
+ mem_free(prev_rec_buf);
+ }
+
+ mem_heap_free(heap);
+}
+
+/* aux enum for controlling the behavior of dict_stats_scan_page() @{ */
+enum page_scan_method_t {
+ COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED,/* scan all records on
+ the given page and count the number of
+ distinct ones, also ignore delete marked
+ records */
+ QUIT_ON_FIRST_NON_BORING/* quit when the first record that differs
+ from its right neighbor is found */
+};
+/* @} */
+
+/** Scan a page, reading records from left to right and counting the number
+of distinct records (looking only at the first n_prefix
+columns) and the number of external pages pointed to by records on this page.
+If scan_method is QUIT_ON_FIRST_NON_BORING then the function
+will return as soon as it finds a record that does not match its neighbor
+to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the
+returned n_diff can either be 0 (empty page), 1 (the whole page has all keys
+equal) or 2 (the function found a non-boring record and returned).
+@param[out] out_rec record, or NULL
+@param[out] offsets1 rec_get_offsets() working space (must
+be big enough)
+@param[out] offsets2 rec_get_offsets() working space (must
+be big enough)
+@param[in] index index of the page
+@param[in] page the page to scan
+@param[in] n_prefix look at the first n_prefix columns
+@param[in] scan_method scan to the end of the page or not
+@param[out] n_diff number of distinct records encountered
+@param[out] n_external_pages if this is non-NULL then it will be set
+to the number of externally stored pages which were encountered
+@return offsets1 or offsets2 (the offsets of *out_rec),
+or NULL if the page is empty and does not contain user records. */
+UNIV_INLINE
+ulint*
+dict_stats_scan_page(
+ const rec_t** out_rec,
+ ulint* offsets1,
+ ulint* offsets2,
+ dict_index_t* index,
+ const page_t* page,
+ ulint n_prefix,
+ page_scan_method_t scan_method,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
+{
+ ulint* offsets_rec = offsets1;
+ ulint* offsets_next_rec = offsets2;
+ const rec_t* rec;
+ const rec_t* next_rec;
+ /* A dummy heap, to be passed to rec_get_offsets().
+ Because offsets1,offsets2 should be big enough,
+ this memory heap should never be used. */
+ mem_heap_t* heap = NULL;
+ const rec_t* (*get_next)(const rec_t*);
+
+ if (scan_method == COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED) {
+ get_next = page_rec_get_next_non_del_marked;
+ } else {
+ get_next = page_rec_get_next_const;
+ }
+
+ const bool should_count_external_pages = n_external_pages != NULL;
+
+ if (should_count_external_pages) {
+ *n_external_pages = 0;
+ }
+
+ rec = get_next(page_get_infimum_rec(page));
+
+ if (page_rec_is_supremum(rec)) {
+ /* the page is empty or contains only delete-marked records */
+ *n_diff = 0;
+ *out_rec = NULL;
+ return(NULL);
+ }
+
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ ULINT_UNDEFINED, &heap);
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(rec);
+
+ *n_diff = 1;
+
+ while (!page_rec_is_supremum(next_rec)) {
+
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ ULINT_UNDEFINED,
+ &heap);
+
+ /* check whether rec != next_rec when looking at
+ the first n_prefix fields */
+ cmp_rec_rec_with_match(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, FALSE, &matched_fields,
+ &matched_bytes);
+
+ if (matched_fields < n_prefix) {
+ /* rec != next_rec, => rec is non-boring */
+
+ (*n_diff)++;
+
+ if (scan_method == QUIT_ON_FIRST_NON_BORING) {
+ goto func_exit;
+ }
+ }
+
+ rec = next_rec;
+ {
+ /* Assign offsets_rec = offsets_next_rec
+ so that offsets_rec matches with rec which
+ was just assigned rec = next_rec above.
+ Also need to point offsets_next_rec to the
+ place where offsets_rec was pointing before
+ because we have just 2 placeholders where
+ data is actually stored:
+ offsets_onstack1 and offsets_onstack2 and we
+ are using them in circular fashion
+ (offsets[_next]_rec are just pointers to
+ those placeholders). */
+ ulint* offsets_tmp;
+ offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(next_rec);
+ }
+
+func_exit:
+ /* offsets1,offsets2 should have been big enough */
+ ut_a(heap == NULL);
+ *out_rec = rec;
+ return(offsets_rec);
+}
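+
+/* Illustrative behavior (hypothetical page contents): for user records
+(1,1), (1,2), (2,1), (2,2) and n_prefix=1, a full scan with
+COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED sets *n_diff=2, because only the
+first column is compared. With QUIT_ON_FIRST_NON_BORING the function also
+returns with *n_diff=2, but it stops at (2,1) without visiting the
+remaining records. */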
+
+/** Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. Also calculate the number of external pages pointed to by records
+on the leaf page. The results are returned via the n_diff and
+n_external_pages output parameters.
+@param[in] cur cursor
+@param[in] n_prefix look at the first n_prefix columns
+when comparing records
+@param[out] n_diff number of distinct records
+@param[out] n_external_pages number of external pages
+@param[in,out] mtr mini-transaction */
+static
+void
+dict_stats_analyze_index_below_cur(
+ const btr_cur_t* cur,
+ ulint n_prefix,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages,
+ mtr_t* mtr)
+{
+ dict_index_t* index;
+ ulint space;
+ ulint zip_size;
+ buf_block_t* block;
+ ulint page_no;
+ const page_t* page;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ ulint* offsets1;
+ ulint* offsets2;
+ ulint* offsets_rec;
+ ulint size;
+
+ index = btr_cur_get_index(cur);
+
+ /* Allocate offsets for the record and the node pointer, for
+ node pointer records. In a secondary index, the node pointer
+ record will consist of all index fields followed by a child
+ page number.
+ Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+ heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+ offsets1 = static_cast<ulint*>(mem_heap_alloc(
+ heap, size * sizeof *offsets1));
+
+ offsets2 = static_cast<ulint*>(mem_heap_alloc(
+ heap, size * sizeof *offsets2));
+
+ rec_offs_set_n_alloc(offsets1, size);
+ rec_offs_set_n_alloc(offsets2, size);
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ rec = btr_cur_get_rec(cur);
+
+ offsets_rec = rec_get_offsets(rec, index, offsets1,
+ ULINT_UNDEFINED, &heap);
+
+ page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+
+ /* assume no external pages by default - in case we quit from this
+ function without analyzing any leaf pages */
+ *n_external_pages = 0;
+
+ /* descend to the leaf level on the B-tree */
+ for (;;) {
+
+ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+ NULL /* no guessed block */,
+ BUF_GET, __FILE__, __LINE__, mtr);
+
+ page = buf_block_get_frame(block);
+
+ if (btr_page_get_level(page, mtr) == 0) {
+ /* leaf level */
+ break;
+ }
+ /* else */
+
+ /* search for the first non-boring record on the page */
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ QUIT_ON_FIRST_NON_BORING, n_diff, NULL);
+
+ /* pages on level > 0 are not allowed to be empty */
+ ut_a(offsets_rec != NULL);
+ /* if page is not empty (offsets_rec != NULL) then n_diff must
+ be > 0, otherwise there is a bug in dict_stats_scan_page() */
+ ut_a(*n_diff > 0);
+
+ if (*n_diff == 1) {
+ /* page has all keys equal and the end of the page
+ was reached by dict_stats_scan_page(), no need to
+ descend to the leaf level */
+ mem_heap_free(heap);
+ /* can't get an estimate for n_external_pages here
+ because we do not dive to the leaf level, assume no
+ external pages (*n_external_pages was assigned to 0
+ above). */
+ return;
+ }
+ /* else */
+
+ /* when we instruct dict_stats_scan_page() to quit on the
+ first non-boring record it finds, then the returned n_diff
+ can either be 0 (empty page), 1 (page has all keys equal) or
+ 2 (non-boring record was found) */
+ ut_a(*n_diff == 2);
+
+ /* we have a non-boring record in rec, descend below it */
+
+ page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+ }
+
+ /* make sure we got a leaf page as a result from the above loop */
+ ut_ad(btr_page_get_level(page, mtr) == 0);
+
+	/* scan the leaf page and find the number of distinct keys,
+	when looking only at the first n_prefix columns; also estimate
+	the number of externally stored pages pointed to by records on
+	this page */
+
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ COUNT_ALL_NON_BORING_AND_SKIP_DEL_MARKED, n_diff,
+ n_external_pages);
+
+#if 0
+ DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+ __func__, page_no, n_diff);
+#endif
+
+ mem_heap_free(heap);
+}
+
+/** Input data that is used to calculate dict_index_t::stat_n_diff_key_vals[]
+for each n-columns prefix (n from 1 to n_uniq). */
+struct n_diff_data_t {
+	/** Index of the level on which the descent through the btree
+	stopped. level 0 is the leaf level. This is >= 1 because we
+	avoid scanning the leaf level, since it may contain too many
+	pages; scanning it would amount to a full scan, which we could
+	simply do instead of picking random records higher in the tree
+	and diving below them. At the start of the analysis we may decide
+	to do a full scan of the leaf level, but then this structure is
+	not used in that code path. */
+ ulint level;
+
+	/** Number of records on the level where the descent through the btree
+ stopped. When we scan the btree from the root, we stop at some mid
+ level, choose some records from it and dive below them towards a leaf
+ page to analyze. */
+ ib_uint64_t n_recs_on_level;
+
+ /** Number of different key values that were found on the mid level. */
+ ib_uint64_t n_diff_on_level;
+
+ /** Number of leaf pages that are analyzed. This is also the same as
+ the number of records that we pick from the mid level and dive below
+ them. */
+ ib_uint64_t n_leaf_pages_to_analyze;
+
+ /** Cumulative sum of the number of different key values that were
+ found on all analyzed pages. */
+ ib_uint64_t n_diff_all_analyzed_pages;
+
+ /** Cumulative sum of the number of external pages (stored outside of
+ the btree but in the same file segment). */
+ ib_uint64_t n_external_pages_sum;
+};
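+
+/* Illustrative member values (hypothetical numbers): a descent that
+stopped at level=2 with n_recs_on_level=500, of which
+n_diff_on_level=200 are distinct, and n_leaf_pages_to_analyze=30 dives
+that found n_diff_all_analyzed_pages=90 distinct keys in total. These
+values feed the estimate computed in dict_stats_index_set_n_diff()
+below. */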
+
+/** Estimate the number of different key values in an index when looking at
+the first n_prefix columns. For a given level in an index select
+n_diff_data->n_leaf_pages_to_analyze records from that level and dive below
+them to the corresponding leaf pages, then scan those leaf pages and save the
+sampling results in n_diff_data->n_diff_all_analyzed_pages.
+@param[in] index index
+@param[in] n_prefix look at first 'n_prefix' columns when
+comparing records
+@param[in] boundaries a vector that contains
+n_diff_data->n_diff_on_level integers each of which represents the index (on
+level 'level', counting from left/smallest to right/biggest from 0) of the
+last record from each group of distinct keys
+@param[in,out] n_diff_data n_diff_all_analyzed_pages and
+n_external_pages_sum in this structure will be set by this function. The
+members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the
+caller in advance - they are used by some calculations inside this function
+@param[in,out] mtr mini-transaction */
+static
+void
+dict_stats_analyze_index_for_n_prefix(
+ dict_index_t* index,
+ ulint n_prefix,
+ const boundaries_t* boundaries,
+ n_diff_data_t* n_diff_data,
+ mtr_t* mtr)
+{
+ btr_pcur_t pcur;
+ const page_t* page;
+ ib_uint64_t rec_idx;
+ ib_uint64_t i;
+
+#if 0
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu, "
+ "n_diff_on_level=" UINT64PF ")\n",
+ __func__, index->table->name, index->name, level,
+ n_prefix, n_diff_data->n_diff_on_level);
+#endif
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_S_LOCK));
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ btr_pcur_open_at_index_side(
+ true, index, BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED,
+ &pcur, true, n_diff_data->level, mtr);
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ page = btr_pcur_get_page(&pcur);
+
+ const rec_t* first_rec = btr_pcur_get_rec(&pcur);
+
+	/* We shouldn't be scanning the leaf level. The caller of this function
+	should have stopped the descent at level 1 or higher. */
+ ut_ad(n_diff_data->level > 0);
+ ut_ad(!page_is_leaf(page));
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page)));
+
+ /* check that we are indeed on the desired level */
+ ut_a(btr_page_get_level(page, mtr) == n_diff_data->level);
+
+ /* there should not be any pages on the left */
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+ /* check whether the first record on the leftmost page is marked
+ as such; we are on a non-leaf level */
+ ut_a(rec_get_info_bits(first_rec, page_is_comp(page))
+ & REC_INFO_MIN_REC_FLAG);
+
+ const ib_uint64_t last_idx_on_level = boundaries->at(
+ static_cast<unsigned>(n_diff_data->n_diff_on_level - 1));
+
+ rec_idx = 0;
+
+ n_diff_data->n_diff_all_analyzed_pages = 0;
+ n_diff_data->n_external_pages_sum = 0;
+
+ for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) {
+ /* there are n_diff_on_level elements
+ in 'boundaries' and we divide those elements
+ into n_leaf_pages_to_analyze segments, for example:
+
+ let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then:
+ segment i=0: [0, 24]
+ segment i=1: [25, 49]
+ segment i=2: [50, 74]
+ segment i=3: [75, 99] or
+
+ let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then:
+ segment i=0: [0, 0] or
+
+ let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then:
+ segment i=0: [0, 0]
+ segment i=1: [1, 1] or
+
+ let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then:
+ segment i=0: [0, 0]
+ segment i=1: [1, 2]
+ segment i=2: [3, 4]
+ segment i=3: [5, 6]
+ segment i=4: [7, 8]
+ segment i=5: [9, 10]
+ segment i=6: [11, 12]
+
+ then we select a random record from each segment and dive
+ below it */
+ const ib_uint64_t n_diff = n_diff_data->n_diff_on_level;
+ const ib_uint64_t n_pick
+ = n_diff_data->n_leaf_pages_to_analyze;
+
+ const ib_uint64_t left = n_diff * i / n_pick;
+ const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1;
+
+ ut_a(left <= right);
+ ut_a(right <= last_idx_on_level);
+
+ /* we do not pass (left, right) because we do not want to ask
+ ut_rnd_interval() to work with too big numbers since
+ ib_uint64_t could be bigger than ulint */
+ const ulint rnd = ut_rnd_interval(
+ 0, static_cast<ulint>(right - left));
+
+ const ib_uint64_t dive_below_idx
+ = boundaries->at(static_cast<unsigned>(left + rnd));
+
+#if 0
+ DEBUG_PRINTF(" %s(): dive below record with index="
+ UINT64PF "\n", __func__, dive_below_idx);
+#endif
+
+ /* seek to the record with index dive_below_idx */
+ while (rec_idx < dive_below_idx
+ && btr_pcur_is_on_user_rec(&pcur)) {
+
+ btr_pcur_move_to_next_user_rec(&pcur, mtr);
+ rec_idx++;
+ }
+
+		/* if the level ended before the record we are
+		searching for, the B-tree has changed in the
+		meantime; quit our sampling and use whatever stats
+		we have collected so far */
+ if (rec_idx < dive_below_idx) {
+
+ ut_ad(!btr_pcur_is_on_user_rec(&pcur));
+ break;
+ }
+
+ /* it could be that the tree has changed in such a way that
+ the record under dive_below_idx is the supremum record, in
+ this case rec_idx == dive_below_idx and pcur is positioned
+ on the supremum, we do not want to dive below it */
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ ut_a(rec_idx == dive_below_idx);
+
+ ib_uint64_t n_diff_on_leaf_page;
+ ib_uint64_t n_external_pages;
+
+ dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur),
+ n_prefix,
+ &n_diff_on_leaf_page,
+ &n_external_pages,
+ mtr);
+
+ /* We adjust n_diff_on_leaf_page here to avoid counting
+ one record twice - once as the last on some page and once
+ as the first on another page. Consider the following example:
+ Leaf level:
+ page: (2,2,2,2,3,3)
+ ... many pages like (3,3,3,3,3,3) ...
+ page: (3,3,3,3,5,5)
+ ... many pages like (5,5,5,5,5,5) ...
+ page: (5,5,5,5,8,8)
+ page: (8,8,8,8,9,9)
+		our algorithm would (correctly) get an estimate that there are
+ 2 distinct records per page (average). Having 4 pages below
+ non-boring records, it would (wrongly) estimate the number
+ of distinct records to 8. */
+ if (n_diff_on_leaf_page > 0) {
+ n_diff_on_leaf_page--;
+ }
+
+ n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page;
+
+ n_diff_data->n_external_pages_sum += n_external_pages;
+ }
+
+ btr_pcur_close(&pcur);
+}
+
+/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[].
+@param[in] n_diff_data input data to use to derive the results
+@param[in,out] index index whose stat_n_diff_key_vals[] to set */
+UNIV_INLINE
+void
+dict_stats_index_set_n_diff(
+ const n_diff_data_t* n_diff_data,
+ dict_index_t* index)
+{
+ for (ulint n_prefix = dict_index_get_n_unique(index);
+ n_prefix >= 1;
+ n_prefix--) {
+ /* n_diff_all_analyzed_pages can be 0 here if
+ all the leaf pages sampled contained only
+ delete-marked records. In this case we should assign
+ 0 to index->stat_n_diff_key_vals[n_prefix - 1], which
+ the formula below does. */
+
+ const n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ ut_ad(data->n_leaf_pages_to_analyze > 0);
+ ut_ad(data->n_recs_on_level > 0);
+
+ ulint n_ordinary_leaf_pages;
+
+ if (data->level == 1) {
+ /* If we know the number of records on level 1, then
+ this number is the same as the number of pages on
+ level 0 (leaf). */
+ n_ordinary_leaf_pages = data->n_recs_on_level;
+ } else {
+ /* If we analyzed D ordinary leaf pages and found E
+ external pages in total linked from those D ordinary
+ leaf pages, then this means that the ratio
+ ordinary/external is D/E. Then the ratio ordinary/total
+ is D / (D + E). Knowing that the total number of pages
+ is T (including ordinary and external) then we estimate
+ that the total number of ordinary leaf pages is
+ T * D / (D + E). */
+ n_ordinary_leaf_pages
+ = index->stat_n_leaf_pages
+ * data->n_leaf_pages_to_analyze
+ / (data->n_leaf_pages_to_analyze
+ + data->n_external_pages_sum);
+ }
+
+ /* See REF01 for an explanation of the algorithm */
+ index->stat_n_diff_key_vals[n_prefix - 1]
+ = n_ordinary_leaf_pages
+
+ * data->n_diff_on_level
+ / data->n_recs_on_level
+
+ * data->n_diff_all_analyzed_pages
+ / data->n_leaf_pages_to_analyze;
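+
+		/* Worked example with hypothetical numbers: if
+		stat_n_leaf_pages=1000, n_leaf_pages_to_analyze=30 and
+		n_external_pages_sum=10, then n_ordinary_leaf_pages is
+		estimated as 1000 * 30 / (30 + 10) = 750. With
+		n_diff_on_level=200, n_recs_on_level=500 and
+		n_diff_all_analyzed_pages=90 the formula above yields
+		750 * 200 / 500 * 90 / 30 = 900 distinct keys. */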
+
+ index->stat_n_sample_sizes[n_prefix - 1]
+ = data->n_leaf_pages_to_analyze;
+
+ DEBUG_PRINTF(" %s(): n_diff=" UINT64PF " for n_prefix=%lu"
+ " (%lu"
+ " * " UINT64PF " / " UINT64PF
+ " * " UINT64PF " / " UINT64PF ")\n",
+ __func__,
+ index->stat_n_diff_key_vals[n_prefix - 1],
+ n_prefix,
+ index->stat_n_leaf_pages,
+ data->n_diff_on_level,
+ data->n_recs_on_level,
+ data->n_diff_all_analyzed_pages,
+ data->n_leaf_pages_to_analyze);
+ }
+}
+
+/*********************************************************************//**
+Calculates new statistics for a given index and saves them to the index
+members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and
+stat_n_leaf_pages. This function could be slow. */
+static
+void
+dict_stats_analyze_index(
+/*=====================*/
+ dict_index_t* index) /*!< in/out: index to analyze */
+{
+ ulint root_level;
+ ulint level;
+ bool level_is_analyzed;
+ ulint n_uniq;
+ ulint n_prefix;
+ ib_uint64_t total_recs;
+ ib_uint64_t total_pages;
+ mtr_t mtr;
+ ulint size;
+ DBUG_ENTER("dict_stats_analyze_index");
+
+ DBUG_PRINT("info", ("index: %s, online status: %d", index->name,
+ dict_index_get_online_status(index)));
+
+ DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name);
+
+ dict_stats_empty_index(index);
+
+ mtr_start(&mtr);
+
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+
+ if (size != ULINT_UNDEFINED) {
+ index->stat_index_size = size;
+ size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+ }
+
+ /* Release the X locks on the root page taken by btr_get_size() */
+ mtr_commit(&mtr);
+
+ switch (size) {
+ case ULINT_UNDEFINED:
+ dict_stats_assert_initialized_index(index);
+ DBUG_VOID_RETURN;
+ case 0:
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ index->stat_n_leaf_pages = size;
+
+ mtr_start(&mtr);
+
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ root_level = btr_height_get(index, &mtr);
+
+ n_uniq = dict_index_get_n_unique(index);
+
+	/* If the tree has just one level (and one page), or if the user
+	has requested to sample too many pages, then do a full scan.
+
+ For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index)
+ will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf
+ pages will be sampled. If that number is bigger than the total
+ number of leaf pages then do full scan of the leaf level instead
+ since it will be faster and will give better results. */
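+
+	/* For example (hypothetical numbers): with N_SAMPLE_PAGES(index)=20
+	and n_uniq=3, a full scan is chosen whenever the index has fewer
+	than 60 leaf pages. */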
+
+ if (root_level == 0
+ || N_SAMPLE_PAGES(index) * n_uniq > index->stat_n_leaf_pages) {
+
+ if (root_level == 0) {
+ DEBUG_PRINTF(" %s(): just one page, "
+ "doing full scan\n", __func__);
+ } else {
+ DEBUG_PRINTF(" %s(): too many pages requested for "
+ "sampling, doing full scan\n", __func__);
+ }
+
+ /* do full scan of level 0; save results directly
+ into the index */
+
+ dict_stats_analyze_index_level(index,
+ 0 /* leaf level */,
+ index->stat_n_diff_key_vals,
+ &total_recs,
+ &total_pages,
+ NULL /* boundaries not needed */,
+ &mtr);
+
+ for (ulint i = 0; i < n_uniq; i++) {
+ index->stat_n_sample_sizes[i] = total_pages;
+ }
+
+ mtr_commit(&mtr);
+
+ dict_stats_assert_initialized_index(index);
+ DBUG_VOID_RETURN;
+ }
+
+ /* For each level that is being scanned in the btree, this contains the
+ number of different key values for all possible n-column prefixes. */
+ ib_uint64_t* n_diff_on_level = new ib_uint64_t[n_uniq];
+
+ /* For each level that is being scanned in the btree, this contains the
+ index of the last record from each group of equal records (when
+ comparing only the first n columns, n=1..n_uniq). */
+ boundaries_t* n_diff_boundaries = new boundaries_t[n_uniq];
+
+ /* For each n-column prefix this array contains the input data that is
+ used to calculate dict_index_t::stat_n_diff_key_vals[]. */
+ n_diff_data_t* n_diff_data = new n_diff_data_t[n_uniq];
+
+ /* total_recs is also used to estimate the number of pages on one
+ level below, so at the start we have 1 page (the root) */
+ total_recs = 1;
+
+ /* Here we use the following optimization:
+ If we find that level L is the first one (searching from the
+ root) that contains at least D distinct keys when looking at
+ the first n_prefix columns, then:
+ if we look at the first n_prefix-1 columns then the first
+ level that contains D distinct keys will be either L or a
+ lower one.
+ So if we find that the first level containing D distinct
+ keys (on n_prefix columns) is L, we continue from L when
+ searching for D distinct keys on n_prefix-1 columns. */
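+
+	/* Hypothetical illustration: if for n_prefix=2 the first level from
+	the root with >= N_DIFF_REQUIRED(index) distinct keys is level 3,
+	then for n_prefix=1 the search can resume from level 3 instead of
+	restarting from the root, because dropping a column can only
+	decrease the number of distinct keys on any given level. */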
+ level = root_level;
+ level_is_analyzed = false;
+
+ for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
+
+ DEBUG_PRINTF(" %s(): searching level with >=%llu "
+ "distinct records, n_prefix=%lu\n",
+ __func__, N_DIFF_REQUIRED(index), n_prefix);
+
+ /* Commit the mtr to release the tree S lock to allow
+ other threads to do some work too. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ if (root_level != btr_height_get(index, &mtr)) {
+ /* Just quit if the tree has changed beyond
+ recognition here. The old stats from previous
+ runs will remain in the values that we have
+ not calculated yet. Initially when the index
+ object is created the stats members are given
+ some sensible values so leaving them untouched
+ here even the first time will not cause us to
+ read uninitialized memory later. */
+ break;
+ }
+
+ /* check whether we should pick the current level;
+ we pick level 1 even if it does not have enough
+ distinct records because we do not want to scan the
+ leaf level because it may contain too many records */
+ if (level_is_analyzed
+ && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index)
+ || level == 1)) {
+
+ goto found_level;
+ }
+
+ /* search for a level that contains enough distinct records */
+
+ if (level_is_analyzed && level > 1) {
+
+ /* if this does not hold we should be on
+ "found_level" instead of here */
+ ut_ad(n_diff_on_level[n_prefix - 1]
+ < N_DIFF_REQUIRED(index));
+
+ level--;
+ level_is_analyzed = false;
+ }
+
+ /* descend into the tree, searching for "good enough" level */
+ for (;;) {
+
+ /* make sure we do not scan the leaf level
+ accidentally, it may contain too many pages */
+ ut_ad(level > 0);
+
+ /* scanning the same level twice is an optimization
+ bug */
+ ut_ad(!level_is_analyzed);
+
+ /* Do not scan if this would read too many pages.
+ Here we use the following fact:
+ the number of pages on level L equals the number
+ of records on level L+1, thus we deduce that the
+ following call would scan total_recs pages, because
+ total_recs is left from the previous iteration when
+ we scanned one level upper or we have not scanned any
+ levels yet in which case total_recs is 1. */
+ if (total_recs > N_SAMPLE_PAGES(index)) {
+
+				/* if the above condition is true then we are
+ not at the root level since on the root
+ level total_recs == 1 (set before we
+ enter the n-prefix loop) and cannot
+ be > N_SAMPLE_PAGES(index) */
+ ut_a(level != root_level);
+
+ /* step one level back and be satisfied with
+ whatever it contains */
+ level++;
+ level_is_analyzed = true;
+
+ break;
+ }
+
+ dict_stats_analyze_index_level(index,
+ level,
+ n_diff_on_level,
+ &total_recs,
+ &total_pages,
+ n_diff_boundaries,
+ &mtr);
+
+ level_is_analyzed = true;
+
+ if (level == 1
+ || n_diff_on_level[n_prefix - 1]
+ >= N_DIFF_REQUIRED(index)) {
+ /* we have reached the last level we could scan
+ or we found a good level with many distinct
+ records */
+ break;
+ }
+
+ level--;
+ level_is_analyzed = false;
+ }
+found_level:
+
+ DEBUG_PRINTF(" %s(): found level %lu that has " UINT64PF
+ " distinct records for n_prefix=%lu\n",
+ __func__, level, n_diff_on_level[n_prefix - 1],
+ n_prefix);
+ /* here we are either on level 1 or the level that we are on
+ contains >= N_DIFF_REQUIRED distinct keys or we did not scan
+ deeper levels because they would contain too many pages */
+
+ ut_ad(level > 0);
+
+ ut_ad(level_is_analyzed);
+
+ /* if any of these is 0 then there is exactly one page in the
+		B-tree and it is empty and we should have done a full scan and
+ should not be here */
+ ut_ad(total_recs > 0);
+ ut_ad(n_diff_on_level[n_prefix - 1] > 0);
+
+ ut_ad(N_SAMPLE_PAGES(index) > 0);
+
+ n_diff_data_t* data = &n_diff_data[n_prefix - 1];
+
+ data->level = level;
+
+ data->n_recs_on_level = total_recs;
+
+ data->n_diff_on_level = n_diff_on_level[n_prefix - 1];
+
+ data->n_leaf_pages_to_analyze = std::min(
+ N_SAMPLE_PAGES(index),
+ n_diff_on_level[n_prefix - 1]);
+
+ /* pick some records from this level and dive below them for
+ the given n_prefix */
+
+ dict_stats_analyze_index_for_n_prefix(
+ index, n_prefix, &n_diff_boundaries[n_prefix - 1],
+ data, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ delete[] n_diff_boundaries;
+
+ delete[] n_diff_on_level;
+
+	/* n_prefix == 0 means that the above loop did not end prematurely
+	due to the tree being changed, and so n_diff_data[] is fully set up. */
+ if (n_prefix == 0) {
+ dict_stats_index_set_n_diff(n_diff_data, index);
+ }
+
+ delete[] n_diff_data;
+
+ dict_stats_assert_initialized_index(index);
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively slow and is used to calculate persistent statistics that
+will be saved on disk.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_update_persistent(
+/*=========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ dict_index_t* index;
+
+ DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name);
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ /* analyze the clustered index first */
+
+ index = dict_table_get_first_index(table);
+
+ if (index == NULL
+ || dict_index_is_corrupted(index)
+ || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) {
+
+ /* Table definition is corrupt */
+ dict_table_stats_unlock(table, RW_X_LATCH);
+ dict_stats_empty_table(table);
+
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(!dict_index_is_univ(index));
+
+ dict_stats_analyze_index(index);
+
+ ulint n_unique = dict_index_get_n_unique(index);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ /* analyze other indexes from the table, if any */
+
+ table->stat_sum_of_other_index_sizes = 0;
+
+ for (index = dict_table_get_next_index(index);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_univ(index));
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ dict_stats_empty_index(index);
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) {
+ dict_stats_analyze_index(index);
+ }
+
+ table->stat_sum_of_other_index_sizes
+ += index->stat_index_size;
+ }
+
+ table->stats_last_recalc = ut_time();
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+
+ dict_stats_assert_initialized(table);
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+
+ return(DB_SUCCESS);
+}
+
+#include "mysql_com.h"
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in] index index to be updated
+@param[in] last_update timestamp of the stat
+@param[in] stat_name name of the stat
+@param[in] stat_value value of the stat
+@param[in] sample_size n pages sampled or NULL
+@param[in] stat_description description of the stat
+@param[in,out] trx if NULL, the function will allocate and
+free the trx object itself; if not NULL, the trx will be
+rolled back only on error, but not freed.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_save_index_stat(
+ dict_index_t* index,
+ lint last_update,
+ const char* stat_name,
+ ib_uint64_t stat_value,
+ ib_uint64_t* sample_size,
+ const char* stat_description,
+ trx_t* trx)
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ dict_fs2utf8(index->table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+ UNIV_MEM_ASSERT_RW_ABORT(index->name, strlen(index->name));
+ pars_info_add_str_literal(pinfo, "index_name", index->name);
+ UNIV_MEM_ASSERT_RW_ABORT(&last_update, 4);
+ pars_info_add_int4_literal(pinfo, "last_update", last_update);
+ UNIV_MEM_ASSERT_RW_ABORT(stat_name, strlen(stat_name));
+ pars_info_add_str_literal(pinfo, "stat_name", stat_name);
+ UNIV_MEM_ASSERT_RW_ABORT(&stat_value, 8);
+ pars_info_add_ull_literal(pinfo, "stat_value", stat_value);
+ if (sample_size != NULL) {
+ UNIV_MEM_ASSERT_RW_ABORT(sample_size, 8);
+ pars_info_add_ull_literal(pinfo, "sample_size", *sample_size);
+ } else {
+ pars_info_add_literal(pinfo, "sample_size", NULL,
+ UNIV_SQL_NULL, DATA_FIXBINARY, 0);
+ }
+ UNIV_MEM_ASSERT_RW_ABORT(stat_description, strlen(stat_description));
+ pars_info_add_str_literal(pinfo, "stat_description",
+ stat_description);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE INDEX_STATS_SAVE () IS\n"
+ "BEGIN\n"
+
+ "DELETE FROM \"" INDEX_STATS_NAME "\"\n"
+ "WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name AND\n"
+ "index_name = :index_name AND\n"
+ "stat_name = :stat_name;\n"
+
+ "INSERT INTO \"" INDEX_STATS_NAME "\"\n"
+ "VALUES\n"
+ "(\n"
+ ":database_name,\n"
+ ":table_name,\n"
+ ":index_name,\n"
+ ":last_update,\n"
+ ":stat_name,\n"
+ ":stat_value,\n"
+ ":sample_size,\n"
+ ":stat_description\n"
+ ");\n"
+ "END;", trx);
+
+ if (ret != DB_SUCCESS) {
+ char buf_table[MAX_FULL_NAME_LEN];
+ char buf_index[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save index statistics for table "
+ "%s, index %s, stat name \"%s\": %s\n",
+ ut_format_name(index->table->name, TRUE,
+ buf_table, sizeof(buf_table)),
+ ut_format_name(index->name, FALSE,
+ buf_index, sizeof(buf_index)),
+ stat_name, ut_strerr(ret));
+ }
+
+ return(ret);
+}
+
+/** Save the table's statistics into the persistent statistics storage.
+@param[in] table_orig table whose stats to save
+@param[in] only_for_index if this is non-NULL, then stats for indexes
+that are not equal to it will not be saved, if NULL, then all
+indexes' stats are saved
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_save(
+/*============*/
+ dict_table_t* table_orig,
+ const index_id_t* only_for_index)
+{
+ pars_info_t* pinfo;
+ lint now;
+ dberr_t ret;
+ dict_table_t* table;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ table = dict_stats_snapshot_create(table_orig);
+
+ dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+
+	/* MySQL's timestamp is 4 bytes, so we use
+	pars_info_add_int4_literal(), which takes a lint argument; hence
+	"now" is a lint */
+ now = (lint) ut_time();
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+ pars_info_add_int4_literal(pinfo, "last_update", now);
+ pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows);
+ pars_info_add_ull_literal(pinfo, "clustered_index_size",
+ table->stat_clustered_index_size);
+ pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes",
+ table->stat_sum_of_other_index_sizes);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE TABLE_STATS_SAVE () IS\n"
+ "BEGIN\n"
+
+ "DELETE FROM \"" TABLE_STATS_NAME "\"\n"
+ "WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+
+ "INSERT INTO \"" TABLE_STATS_NAME "\"\n"
+ "VALUES\n"
+ "(\n"
+ ":database_name,\n"
+ ":table_name,\n"
+ ":last_update,\n"
+ ":n_rows,\n"
+ ":clustered_index_size,\n"
+ ":sum_of_other_index_sizes\n"
+ ");\n"
+ "END;", NULL);
+
+ if (ret != DB_SUCCESS) {
+ char buf[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot save table statistics for table "
+ "%s: %s\n",
+ ut_format_name(table->name, TRUE, buf, sizeof(buf)),
+ ut_strerr(ret));
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ dict_stats_snapshot_free(table);
+
+ return(ret);
+ }
+
+ trx_t* trx = trx_allocate_for_background();
+ trx_start_if_not_started(trx);
+
+ dict_index_t* index;
+ index_map_t indexes;
+
+	/* Below we do all the modifications in innodb_index_stats in a single
+	transaction for performance reasons. Modifying more than one row in a
+	single transaction may deadlock with other transactions if they
+	lock the rows in a different order. Such a transaction could be,
+	for example, the one that runs
+	DELETE FROM innodb_index_stats WHERE database_name = '...'
+	AND table_name = '...';
+	when we DROP a table, which affects more than one row. To prevent
+	deadlocks we always lock the rows in the same order - the order of
+	the PK, which is (database_name, table_name, index_name,
+	stat_name). This is why below we sort the indexes by name and then,
+	for each index, do the mods ordered by stat_name. */
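+
+	/* Hypothetical deadlock illustration: if trx A updated the rows for
+	(db, t, idx_a, ...) and then (db, t, idx_b, ...) while trx B deleted
+	the same rows in the opposite order, each could end up waiting for a
+	row lock held by the other. Locking in PK order makes such lock
+	cycles impossible. */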
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ indexes[index->name] = index;
+ }
+
+ index_map_t::const_iterator it;
+
+ for (it = indexes.begin(); it != indexes.end(); ++it) {
+
+ index = it->second;
+
+ if (only_for_index != NULL && index->id != *only_for_index) {
+ continue;
+ }
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_univ(index));
+
+ for (ulint i = 0; i < index->n_uniq; i++) {
+
+ char stat_name[16];
+ char stat_description[1024];
+ ulint j;
+
+ ut_snprintf(stat_name, sizeof(stat_name),
+ "n_diff_pfx%02lu", i + 1);
+
+			/* craft a string that contains the column names */
+ ut_snprintf(stat_description,
+ sizeof(stat_description),
+ "%s", index->fields[0].name);
+ for (j = 1; j <= i; j++) {
+ size_t len;
+
+ len = strlen(stat_description);
+
+ ut_snprintf(stat_description + len,
+ sizeof(stat_description) - len,
+ ",%s", index->fields[j].name);
+ }
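+
+			/* For example (hypothetical index): for an index on
+			columns (c1, c2, c3) and i=2, stat_name becomes
+			"n_diff_pfx03" and stat_description "c1,c2,c3". */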
+
+ ret = dict_stats_save_index_stat(
+ index, now, stat_name,
+ index->stat_n_diff_key_vals[i],
+ &index->stat_n_sample_sizes[i],
+ stat_description, trx);
+
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+ }
+
+ ret = dict_stats_save_index_stat(index, now, "n_leaf_pages",
+ index->stat_n_leaf_pages,
+ NULL,
+ "Number of leaf pages "
+ "in the index", trx);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+
+ ret = dict_stats_save_index_stat(index, now, "size",
+ index->stat_index_size,
+ NULL,
+ "Number of pages "
+ "in the index", trx);
+ if (ret != DB_SUCCESS) {
+ goto end;
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+end:
+ trx_free_for_background(trx);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ dict_stats_snapshot_free(table);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Called for the row that is selected by
+SELECT ... FROM mysql.innodb_table_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to it.
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_table_stats_step(
+/*==============================*/
+ void* node_void, /*!< in: select node */
+ void* table_void) /*!< out: table */
+{
+ sel_node_t* node = (sel_node_t*) node_void;
+ dict_table_t* table = (dict_table_t*) table_void;
+ que_common_t* cnode;
+ int i;
+
+ /* this should loop exactly 3 times - for
+ n_rows,clustered_index_size,sum_of_other_index_sizes */
+ for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+ cnode != NULL;
+ cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+ i++) {
+
+ const byte* data;
+ dfield_t* dfield = que_node_get_val(cnode);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ switch (i) {
+ case 0: /* mysql.innodb_table_stats.n_rows */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_n_rows = mach_read_from_8(data);
+
+ break;
+
+ case 1: /* mysql.innodb_table_stats.clustered_index_size */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_clustered_index_size
+ = (ulint) mach_read_from_8(data);
+
+ break;
+
+ case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ table->stat_sum_of_other_index_sizes
+ = (ulint) mach_read_from_8(data);
+
+ break;
+
+ default:
+
+ /* someone changed SELECT
+ n_rows,clustered_index_size,sum_of_other_index_sizes
+ to select more columns from innodb_table_stats without
+ adjusting here */
+ ut_error;
+ }
+ }
+
+	/* if i < 3 this means someone changed the
+	SELECT n_rows,clustered_index_size,sum_of_other_index_sizes
+	to select fewer columns from innodb_table_stats without adjusting here;
+	if i > 3 we would have ut_error'ed earlier */
+ ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/);
+
+ /* XXX this is not used but returning non-NULL is necessary */
+ return(TRUE);
+}
+
+/** Aux struct used to pass a table and a boolean to
+dict_stats_fetch_index_stats_step(). */
+struct index_fetch_t {
+ dict_table_t* table; /*!< table whose indexes are to be modified */
+	bool	stats_were_modified; /*!< will be set to true if at
+				least one index's stats were modified */
+};
+
+/*********************************************************************//**
+Called for the rows that are selected by
+SELECT ... FROM mysql.innodb_index_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to its indexes.
+Suppose a table has N indexes and index i has U_i unique columns for
+i=1..N; then mysql.innodb_index_stats will have SUM(U_i), i=1..N, rows for
+that table. So this function will be called SUM(U_i) times, which is of
+magnitude N*AVG(U_i). In each call it searches for the currently fetched
+index in table->indexes linearly, since this list is not sorted. Thus,
+overall, fetching all indexes' stats from mysql.innodb_index_stats is
+O(N^2), where N is the number of indexes.
+This could be improved by sorting table->indexes in a temporary area just
+once and then searching in that sorted list, which would lower the
+complexity to O(N*log(N)). We assume a table will not have more than 100
+indexes, so we go with the simpler O(N^2) algorithm.
+@return non-NULL dummy */
+static
+ibool
+dict_stats_fetch_index_stats_step(
+/*==============================*/
+ void* node_void, /*!< in: select node */
+ void* arg_void) /*!< out: table + a flag that tells if we
+ modified anything */
+{
+ sel_node_t* node = (sel_node_t*) node_void;
+ index_fetch_t* arg = (index_fetch_t*) arg_void;
+ dict_table_t* table = arg->table;
+ dict_index_t* index = NULL;
+ que_common_t* cnode;
+ const char* stat_name = NULL;
+ ulint stat_name_len = ULINT_UNDEFINED;
+ ib_uint64_t stat_value = UINT64_UNDEFINED;
+ ib_uint64_t sample_size = UINT64_UNDEFINED;
+ int i;
+
+ /* this should loop exactly 4 times - for the columns that
+ were selected: index_name,stat_name,stat_value,sample_size */
+ for (cnode = static_cast<que_common_t*>(node->select_list), i = 0;
+ cnode != NULL;
+ cnode = static_cast<que_common_t*>(que_node_get_next(cnode)),
+ i++) {
+
+ const byte* data;
+ dfield_t* dfield = que_node_get_val(cnode);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ switch (i) {
+ case 0: /* mysql.innodb_index_stats.index_name */
+
+ ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+ /* search for index in table's indexes whose name
+ matches data; the fetched index name is in data,
+ has no terminating '\0' and has length len */
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (strlen(index->name) == len
+ && memcmp(index->name, data, len) == 0) {
+ /* the corresponding index was found */
+ break;
+ }
+ }
+
+ /* if index is NULL here this means that
+ mysql.innodb_index_stats contains more rows than the
+ number of indexes in the table; this is ok, we just
+ return ignoring those extra rows; in other words
+ dict_stats_fetch_index_stats_step() has been called
+ for a row from index_stats with unknown index_name
+ column */
+ if (index == NULL) {
+
+ return(TRUE);
+ }
+
+ break;
+
+ case 1: /* mysql.innodb_index_stats.stat_name */
+
+ ut_a(dtype_get_mtype(type) == DATA_VARMYSQL);
+
+ ut_a(index != NULL);
+
+ stat_name = (const char*) data;
+ stat_name_len = len;
+
+ break;
+
+ case 2: /* mysql.innodb_index_stats.stat_value */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+
+ stat_value = mach_read_from_8(data);
+
+ break;
+
+ case 3: /* mysql.innodb_index_stats.sample_size */
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == 8 || len == UNIV_SQL_NULL);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+ ut_a(stat_value != UINT64_UNDEFINED);
+
+ if (len == UNIV_SQL_NULL) {
+ break;
+ }
+ /* else */
+
+ sample_size = mach_read_from_8(data);
+
+ break;
+
+ default:
+
+ /* someone changed
+ SELECT index_name,stat_name,stat_value,sample_size
+ to select more columns from innodb_index_stats without
+ adjusting here */
+ ut_error;
+ }
+ }
+
+	/* if i < 4 this means someone changed the
+	SELECT index_name,stat_name,stat_value,sample_size
+	to select fewer columns from innodb_index_stats without adjusting here;
+	if i > 4 we would have ut_error'ed earlier */
+ ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */);
+
+ ut_a(index != NULL);
+ ut_a(stat_name != NULL);
+ ut_a(stat_name_len != ULINT_UNDEFINED);
+ ut_a(stat_value != UINT64_UNDEFINED);
+ /* sample_size could be UINT64_UNDEFINED here, if it is NULL */
+
+#define PFX "n_diff_pfx"
+#define PFX_LEN 10
+
+ if (stat_name_len == 4 /* strlen("size") */
+ && strncasecmp("size", stat_name, stat_name_len) == 0) {
+ index->stat_index_size = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */
+ && strncasecmp("n_leaf_pages", stat_name, stat_name_len)
+ == 0) {
+ index->stat_n_leaf_pages = (ulint) stat_value;
+ arg->stats_were_modified = true;
+ } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */
+ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) {
+
+ const char* num_ptr;
+ unsigned long n_pfx;
+
+ /* point num_ptr into "1" from "n_diff_pfx12..." */
+ num_ptr = stat_name + PFX_LEN;
+
+ /* stat_name should have exactly 2 chars appended to PFX
+ and they should be digits */
+ if (stat_name_len != PFX_LEN + 2
+ || num_ptr[0] < '0' || num_ptr[0] > '9'
+ || num_ptr[1] < '0' || num_ptr[1] > '9') {
+
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Ignoring strange row from "
+ "%s WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s' AND "
+ "index_name = '%s' AND "
+ "stat_name = '%.*s'; because stat_name "
+ "is malformed\n",
+ INDEX_STATS_NAME_PRINT,
+ db_utf8,
+ table_utf8,
+ index->name,
+ (int) stat_name_len,
+ stat_name);
+ return(TRUE);
+ }
+ /* else */
+
+ /* extract 12 from "n_diff_pfx12..." into n_pfx
+ note that stat_name does not have a terminating '\0' */
+ n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0');
+
+ ulint n_uniq = index->n_uniq;
+
+ if (n_pfx == 0 || n_pfx > n_uniq) {
+
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Ignoring strange row from "
+ "%s WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s' AND "
+ "index_name = '%s' AND "
+ "stat_name = '%.*s'; because stat_name is "
+ "out of range, the index has %lu unique "
+ "columns\n",
+ INDEX_STATS_NAME_PRINT,
+ db_utf8,
+ table_utf8,
+ index->name,
+ (int) stat_name_len,
+ stat_name,
+ n_uniq);
+ return(TRUE);
+ }
+ /* else */
+
+ index->stat_n_diff_key_vals[n_pfx - 1] = stat_value;
+
+ if (sample_size != UINT64_UNDEFINED) {
+ index->stat_n_sample_sizes[n_pfx - 1] = sample_size;
+ } else {
+ /* hmm, strange... the user must have UPDATEd the
+ table manually and SET sample_size = NULL */
+ index->stat_n_sample_sizes[n_pfx - 1] = 0;
+ }
+
+ index->stat_n_non_null_key_vals[n_pfx - 1] = 0;
+
+ arg->stats_were_modified = true;
+ } else {
+ /* silently ignore rows with unknown stat_name, the
+ user may have developed her own stats */
+ }
+
+ /* XXX this is not used but returning non-NULL is necessary */
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Read table's statistics from the persistent statistics storage.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+dict_stats_fetch_from_ps(
+/*=====================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ index_fetch_t index_fetch_arg;
+ trx_t* trx;
+ pars_info_t* pinfo;
+ dberr_t ret;
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ /* Initialize all stats to dummy values before fetching because if
+ the persistent storage contains incomplete stats (e.g. missing stats
+ for some index) then we would end up with (partially) uninitialized
+ stats. */
+ dict_stats_empty_table(table);
+
+ trx = trx_allocate_for_background();
+
+ /* Use 'read-uncommitted' so that the SELECTs we execute
+ do not get blocked in case some user has locked the rows we
+ are SELECTing */
+
+ trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+
+ trx_start_if_not_started(trx);
+
+ dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+
+ pars_info_bind_function(pinfo,
+ "fetch_table_stats_step",
+ dict_stats_fetch_table_stats_step,
+ table);
+
+ index_fetch_arg.table = table;
+ index_fetch_arg.stats_were_modified = false;
+ pars_info_bind_function(pinfo,
+ "fetch_index_stats_step",
+ dict_stats_fetch_index_stats_step,
+ &index_fetch_arg);
+
+ ret = que_eval_sql(pinfo,
+ "PROCEDURE FETCH_STATS () IS\n"
+ "found INT;\n"
+ "DECLARE FUNCTION fetch_table_stats_step;\n"
+ "DECLARE FUNCTION fetch_index_stats_step;\n"
+ "DECLARE CURSOR table_stats_cur IS\n"
+ " SELECT\n"
+ /* if you change the selected fields, be
+ sure to adjust
+ dict_stats_fetch_table_stats_step() */
+ " n_rows,\n"
+ " clustered_index_size,\n"
+ " sum_of_other_index_sizes\n"
+ " FROM \"" TABLE_STATS_NAME "\"\n"
+ " WHERE\n"
+ " database_name = :database_name AND\n"
+ " table_name = :table_name;\n"
+ "DECLARE CURSOR index_stats_cur IS\n"
+ " SELECT\n"
+ /* if you change the selected fields, be
+ sure to adjust
+ dict_stats_fetch_index_stats_step() */
+ " index_name,\n"
+ " stat_name,\n"
+ " stat_value,\n"
+ " sample_size\n"
+ " FROM \"" INDEX_STATS_NAME "\"\n"
+ " WHERE\n"
+ " database_name = :database_name AND\n"
+ " table_name = :table_name;\n"
+
+ "BEGIN\n"
+
+ "OPEN table_stats_cur;\n"
+ "FETCH table_stats_cur INTO\n"
+ " fetch_table_stats_step();\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " CLOSE table_stats_cur;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "CLOSE table_stats_cur;\n"
+
+ "OPEN index_stats_cur;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_stats_cur INTO\n"
+ " fetch_index_stats_step();\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_stats_cur;\n"
+
+ "END;",
+ TRUE, trx);
+ /* pinfo is freed by que_eval_sql() */
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_background(trx);
+
+ if (!index_fetch_arg.stats_were_modified) {
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+UNIV_INTERN
+void
+dict_stats_update_for_index(
+/*========================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ DBUG_ENTER("dict_stats_update_for_index");
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ if (dict_stats_is_persistent_enabled(index->table)) {
+
+ if (dict_stats_persistent_storage_check(false)) {
+ dict_table_stats_lock(index->table, RW_X_LATCH);
+ dict_stats_analyze_index(index);
+ dict_table_stats_unlock(index->table, RW_X_LATCH);
+ dict_stats_save(index->table, &index->id);
+ DBUG_VOID_RETURN;
+ }
+ /* else */
+
+ /* Fall back to transient stats since the persistent
+ storage is not present or is corrupted */
+ char buf_table[MAX_FULL_NAME_LEN];
+ char buf_index[MAX_FULL_NAME_LEN];
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Recalculation of persistent statistics "
+ "requested for table %s index %s but the required "
+ "persistent statistics storage is not present or is "
+ "corrupted. Using transient stats instead.\n",
+ ut_format_name(index->table->name, TRUE,
+ buf_table, sizeof(buf_table)),
+ ut_format_name(index->name, FALSE,
+ buf_index, sizeof(buf_index)));
+ }
+
+ dict_table_stats_lock(index->table, RW_X_LATCH);
+ dict_stats_update_transient_for_index(index);
+ dict_table_stats_unlock(index->table, RW_X_LATCH);
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_update(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_stats_upd_option_t stats_upd_option)
+ /*!< in: whether to (re) calc
+ the stats or to fetch them from
+ the persistent statistics
+ storage */
+{
+ char buf[MAX_FULL_NAME_LEN];
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: cannot calculate statistics for table %s "
+ "because the .ibd file is missing. For help, please "
+ "refer to " REFMAN "innodb-troubleshooting.html\n",
+ ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+ dict_stats_empty_table(table);
+ return(DB_TABLESPACE_DELETED);
+ } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ /* If we have set a high innodb_force_recovery level, do
+ not calculate statistics, as a badly corrupted index can
+ cause a crash in it. */
+ dict_stats_empty_table(table);
+ return(DB_SUCCESS);
+ }
+
+ switch (stats_upd_option) {
+ case DICT_STATS_RECALC_PERSISTENT:
+
+ if (srv_read_only_mode) {
+ goto transient;
+ }
+
+ /* Persistent recalculation requested, called from
+ 1) ANALYZE TABLE, or
+ 2) the auto recalculation background thread, or
+ 3) open table if stats do not exist on disk and auto recalc
+ is enabled */
+
+ /* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+ persistent stats enabled */
+ ut_a(strchr(table->name, '/') != NULL);
+
+ /* check if the persistent statistics storage exists
+ before calling the potentially slow function
+ dict_stats_update_persistent(); that is a
+ prerequisite for dict_stats_save() succeeding */
+ if (dict_stats_persistent_storage_check(false)) {
+
+ dberr_t err;
+
+ err = dict_stats_update_persistent(table);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ err = dict_stats_save(table, NULL);
+
+ return(err);
+ }
+
+ /* Fall back to transient stats since the persistent
+ storage is not present or is corrupted */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Recalculation of persistent statistics "
+ "requested for table %s but the required persistent "
+ "statistics storage is not present or is corrupted. "
+ "Using transient stats instead.\n",
+ ut_format_name(table->name, TRUE, buf, sizeof(buf)));
+
+ goto transient;
+
+ case DICT_STATS_RECALC_TRANSIENT:
+
+ goto transient;
+
+ case DICT_STATS_EMPTY_TABLE:
+
+ dict_stats_empty_table(table);
+
+ /* If table is using persistent stats,
+ then save the stats on disk */
+
+ if (dict_stats_is_persistent_enabled(table)) {
+
+ if (dict_stats_persistent_storage_check(false)) {
+
+ return(dict_stats_save(table, NULL));
+ }
+
+ return(DB_STATS_DO_NOT_EXIST);
+ }
+
+ return(DB_SUCCESS);
+
+ case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY:
+
+ /* fetch requested, either fetch from persistent statistics
+ storage or use the old method */
+
+ if (table->stat_initialized) {
+ return(DB_SUCCESS);
+ }
+
+ /* InnoDB internal tables (e.g. SYS_TABLES) cannot have
+ persistent stats enabled */
+ ut_a(strchr(table->name, '/') != NULL);
+
+ if (!dict_stats_persistent_storage_check(false)) {
+ /* persistent statistics storage does not exist
+ or is corrupted, calculate the transient stats */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Fetch of persistent "
+ "statistics requested for table %s but the "
+ "required system tables %s and %s are not "
+ "present or have unexpected structure. "
+ "Using transient stats instead.\n",
+ ut_format_name(table->name, TRUE,
+ buf, sizeof(buf)),
+ TABLE_STATS_NAME_PRINT,
+ INDEX_STATS_NAME_PRINT);
+
+ goto transient;
+ }
+
+ dict_table_t* t;
+
+ /* Create a dummy table object with the same name and
+ indexes, suitable for fetching the stats into it. */
+ t = dict_stats_table_clone_create(table);
+
+ dberr_t err = dict_stats_fetch_from_ps(t);
+
+ t->stats_last_recalc = table->stats_last_recalc;
+ t->stat_modified_counter = 0;
+
+ switch (err) {
+ case DB_SUCCESS:
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ /* Initialize all stats to dummy values before
+ copying because dict_stats_table_clone_create() does
+ skip corrupted indexes so our dummy object 't' may
+ have less indexes than the real object 'table'. */
+ dict_stats_empty_table(table);
+
+ dict_stats_copy(table, t);
+
+ dict_stats_assert_initialized(table);
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+
+ dict_stats_table_clone_free(t);
+
+ return(DB_SUCCESS);
+ case DB_STATS_DO_NOT_EXIST:
+
+ dict_stats_table_clone_free(t);
+
+ if (srv_read_only_mode) {
+ goto transient;
+ }
+
+ if (dict_stats_auto_recalc_is_enabled(table)) {
+ return(dict_stats_update(
+ table,
+ DICT_STATS_RECALC_PERSISTENT));
+ }
+
+ ut_format_name(table->name, TRUE, buf, sizeof(buf));
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Trying to use table %s which has "
+ "persistent statistics enabled, but auto "
+ "recalculation turned off and the statistics "
+ "do not exist in %s and %s. Please either run "
+ "\"ANALYZE TABLE %s;\" manually or enable the "
+ "auto recalculation with "
+ "\"ALTER TABLE %s STATS_AUTO_RECALC=1;\". "
+ "InnoDB will now use transient statistics for "
+ "%s.\n",
+ buf, TABLE_STATS_NAME, INDEX_STATS_NAME, buf,
+ buf, buf);
+
+ goto transient;
+ default:
+
+ dict_stats_table_clone_free(t);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error fetching persistent statistics "
+ "for table %s from %s and %s: %s. "
+ "Using transient stats method instead.\n",
+ ut_format_name(table->name, TRUE, buf,
+ sizeof(buf)),
+ TABLE_STATS_NAME,
+ INDEX_STATS_NAME,
+ ut_strerr(err));
+
+ goto transient;
+ }
+ /* no "default:" in order to produce a compilation warning
+ about unhandled enumeration value */
+ }
+
+transient:
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ dict_stats_update_transient(table);
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Removes the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+This function creates its own trx and commits it.
+A note from Marko why we cannot edit user and sys_* tables in one trx:
+marko: The problem is that ibuf merges should be disabled while we are
+rolling back dict transactions.
+marko: If ibuf merges are not disabled, we need to scan the *.ibd files.
+But we shouldn't open *.ibd files before we have rolled back dict
+transactions and opened the SYS_* records for the *.ibd files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+ const char* db_and_table,/*!< in: db and table, e.g. 'db/table' */
+ const char* iname, /*!< in: index name */
+ char* errstr, /*!< out: error message if != DB_SUCCESS
+ is returned */
+ ulint errstr_sz)/*!< in: size of the errstr buffer */
+{
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ /* skip indexes whose table names do not contain a database name
+ e.g. if we are dropping an index from SYS_TABLES */
+ if (strchr(db_and_table, '/') == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+
+ pars_info_add_str_literal(pinfo, "table_name", table_utf8);
+
+ pars_info_add_str_literal(pinfo, "index_name", iname);
+
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DROP_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name AND\n"
+ "index_name = :index_name;\n"
+ "END;\n", NULL);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ ut_snprintf(errstr, errstr_sz,
+ "Unable to delete statistics for index %s "
+ "from %s%s: %s. They can be deleted later using "
+ "DELETE FROM %s WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s' AND "
+ "index_name = '%s';",
+ iname,
+ INDEX_STATS_NAME_PRINT,
+ (ret == DB_LOCK_WAIT_TIMEOUT
+ ? " because the rows are locked"
+ : ""),
+ ut_strerr(ret),
+ INDEX_STATS_NAME_PRINT,
+ db_utf8,
+ table_utf8,
+ iname);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n", errstr);
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+DELETE FROM mysql.innodb_table_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_table_stats(
+/*===============================*/
+ const char* database_name, /*!< in: database name, e.g. 'db' */
+ const char* table_name) /*!< in: table name, e.g. 'table' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", database_name);
+ pars_info_add_str_literal(pinfo, "table_name", table_name);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+DELETE FROM mysql.innodb_index_stats
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_delete_from_index_stats(
+/*===============================*/
+ const char* database_name, /*!< in: database name, e.g. 'db' */
+ const char* table_name) /*!< in: table name, e.g. 'table' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "database_name", database_name);
+ pars_info_add_str_literal(pinfo, "table_name", table_name);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent statistics storage if it exists and if there is data stored for
+the table. This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+ const char* db_and_table, /*!< in: db and table, e.g. 'db/table' */
+ char* errstr, /*!< out: error message
+ if != DB_SUCCESS is returned */
+ ulint errstr_sz) /*!< in: size of errstr buffer */
+{
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* skip tables that do not contain a database name
+ e.g. if we are dropping SYS_TABLES */
+ if (strchr(db_and_table, '/') == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* skip innodb_table_stats and innodb_index_stats themselves */
+ if (strcmp(db_and_table, TABLE_STATS_NAME) == 0
+ || strcmp(db_and_table, INDEX_STATS_NAME) == 0) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ ret = dict_stats_delete_from_table_stats(db_utf8, table_utf8);
+
+ if (ret == DB_SUCCESS) {
+ ret = dict_stats_delete_from_index_stats(db_utf8, table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+
+ ut_snprintf(errstr, errstr_sz,
+ "Unable to delete statistics for table %s.%s: %s. "
+ "They can be deleted later using "
+
+ "DELETE FROM %s WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s'; "
+
+ "DELETE FROM %s WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s';",
+
+ db_utf8, table_utf8,
+ ut_strerr(ret),
+
+ INDEX_STATS_NAME_PRINT,
+ db_utf8, table_utf8,
+
+ TABLE_STATS_NAME_PRINT,
+ db_utf8, table_utf8);
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+UPDATE mysql.innodb_table_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_in_table_stats(
+/*=============================*/
+ const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+ const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+ const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+ const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
+ pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE RENAME_IN_TABLE_STATS () IS\n"
+ "BEGIN\n"
+ "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+ "database_name = :new_dbname_utf8,\n"
+ "table_name = :new_tablename_utf8\n"
+ "WHERE\n"
+ "database_name = :old_dbname_utf8 AND\n"
+ "table_name = :old_tablename_utf8;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Executes
+UPDATE mysql.innodb_index_stats SET
+database_name = '...', table_name = '...'
+WHERE database_name = '...' AND table_name = '...';
+Creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INLINE
+dberr_t
+dict_stats_rename_in_index_stats(
+/*=============================*/
+ const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
+ const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
+ const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
+ const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+{
+ pars_info_t* pinfo;
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
+ pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
+ pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
+
+ ret = dict_stats_exec_sql(
+ pinfo,
+ "PROCEDURE RENAME_IN_INDEX_STATS () IS\n"
+ "BEGIN\n"
+ "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+ "database_name = :new_dbname_utf8,\n"
+ "table_name = :new_tablename_utf8\n"
+ "WHERE\n"
+ "database_name = :old_dbname_utf8 AND\n"
+ "table_name = :old_tablename_utf8;\n"
+ "END;\n", NULL);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+ const char* old_name, /*!< in: old name, e.g. 'db/table' */
+ const char* new_name, /*!< in: new name, e.g. 'db/table' */
+ char* errstr, /*!< out: error string if != DB_SUCCESS
+ is returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ char old_db_utf8[MAX_DB_UTF8_LEN];
+ char new_db_utf8[MAX_DB_UTF8_LEN];
+ char old_table_utf8[MAX_TABLE_UTF8_LEN];
+ char new_table_utf8[MAX_TABLE_UTF8_LEN];
+ dberr_t ret;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ /* skip innodb_table_stats and innodb_index_stats themselves */
+ if (strcmp(old_name, TABLE_STATS_NAME) == 0
+ || strcmp(old_name, INDEX_STATS_NAME) == 0
+ || strcmp(new_name, TABLE_STATS_NAME) == 0
+ || strcmp(new_name, INDEX_STATS_NAME) == 0) {
+
+ return(DB_SUCCESS);
+ }
+
+ dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8),
+ old_table_utf8, sizeof(old_table_utf8));
+
+ dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8),
+ new_table_utf8, sizeof(new_table_utf8));
+
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+
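+ /* retry a few times below: transient failures such as a
+ lock wait timeout or a deadlock may resolve once the
+ conflicting transaction has completed */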
+ ulint n_attempts = 0;
+ do {
+ n_attempts++;
+
+ ret = dict_stats_rename_in_table_stats(
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8);
+
+ if (ret == DB_DUPLICATE_KEY) {
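+ /* a row with the new name already exists in
+ innodb_table_stats (e.g. left over from an earlier
+ table with that name); delete it and retry */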
+ dict_stats_delete_from_table_stats(
+ new_db_utf8, new_table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ os_thread_sleep(200000 /* 0.2 sec */);
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+ }
+ } while ((ret == DB_DEADLOCK
+ || ret == DB_DUPLICATE_KEY
+ || ret == DB_LOCK_WAIT_TIMEOUT)
+ && n_attempts < 5);
+
+ if (ret != DB_SUCCESS) {
+ ut_snprintf(errstr, errstr_sz,
+ "Unable to rename statistics from "
+ "%s.%s to %s.%s in %s: %s. "
+ "They can be renamed later using "
+
+ "UPDATE %s SET "
+ "database_name = '%s', "
+ "table_name = '%s' "
+ "WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s';",
+
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8,
+ TABLE_STATS_NAME_PRINT,
+ ut_strerr(ret),
+
+ TABLE_STATS_NAME_PRINT,
+ new_db_utf8, new_table_utf8,
+ old_db_utf8, old_table_utf8);
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ return(ret);
+ }
+ /* else */
+
+ n_attempts = 0;
+ do {
+ n_attempts++;
+
+ ret = dict_stats_rename_in_index_stats(
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8);
+
+ if (ret == DB_DUPLICATE_KEY) {
+ dict_stats_delete_from_index_stats(
+ new_db_utf8, new_table_utf8);
+ }
+
+ if (ret == DB_STATS_DO_NOT_EXIST) {
+ ret = DB_SUCCESS;
+ }
+
+ if (ret != DB_SUCCESS) {
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+ os_thread_sleep(200000 /* 0.2 sec */);
+ rw_lock_x_lock(&dict_operation_lock);
+ mutex_enter(&dict_sys->mutex);
+ }
+ } while ((ret == DB_DEADLOCK
+ || ret == DB_DUPLICATE_KEY
+ || ret == DB_LOCK_WAIT_TIMEOUT)
+ && n_attempts < 5);
+
+ mutex_exit(&dict_sys->mutex);
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ if (ret != DB_SUCCESS) {
+ ut_snprintf(errstr, errstr_sz,
+ "Unable to rename statistics from "
+ "%s.%s to %s.%s in %s: %s. "
+ "They can be renamed later using "
+
+ "UPDATE %s SET "
+ "database_name = '%s', "
+ "table_name = '%s' "
+ "WHERE "
+ "database_name = '%s' AND "
+ "table_name = '%s';",
+
+ old_db_utf8, old_table_utf8,
+ new_db_utf8, new_table_utf8,
+ INDEX_STATS_NAME_PRINT,
+ ut_strerr(ret),
+
+ INDEX_STATS_NAME_PRINT,
+ new_db_utf8, new_table_utf8,
+ old_db_utf8, old_table_utf8);
+ }
+
+ return(ret);
+}
+
+/* tests @{ */
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* The following unit tests exercise some of the functions in this file
+individually; such testing cannot be performed by the mysql-test framework
+via SQL. */
+
+/* test_dict_table_schema_check() @{ */
+void
+test_dict_table_schema_check()
+{
+ /*
+ CREATE TABLE tcheck (
+ c01 VARCHAR(123),
+ c02 INT,
+ c03 INT NOT NULL,
+ c04 INT UNSIGNED,
+ c05 BIGINT,
+ c06 BIGINT UNSIGNED NOT NULL,
+ c07 TIMESTAMP
+ ) ENGINE=INNODB;
+ */
+ /* definition for the table 'test/tcheck' */
+ dict_col_meta_t columns[] = {
+ {"c01", DATA_VARCHAR, 0, 123},
+ {"c02", DATA_INT, 0, 4},
+ {"c03", DATA_INT, DATA_NOT_NULL, 4},
+ {"c04", DATA_INT, DATA_UNSIGNED, 4},
+ {"c05", DATA_INT, 0, 8},
+ {"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"c07", DATA_INT, 0, 4},
+ {"c_extra", DATA_INT, 0, 4}
+ };
+ dict_table_schema_t schema = {
+ "test/tcheck",
+ 0 /* will be set individually for each test below */,
+ columns
+ };
+ char errstr[512];
+
+ ut_snprintf(errstr, sizeof(errstr), "Table not found");
+
+ /* prevent any data dictionary modifications while we are checking
+ the tables' structure */
+
+ mutex_enter(&(dict_sys->mutex));
+
+ /* check that a valid table is reported as valid */
+ schema.n_cols = 7;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ == DB_SUCCESS) {
+ printf("OK: test.tcheck ok\n");
+ } else {
+ printf("ERROR: %s\n", errstr);
+ printf("ERROR: test.tcheck not present or corrupted\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+ /* check columns with wrong length */
+ schema.columns[1].len = 8;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck.c02 has different length and is "
+ "reported as corrupted\n");
+ } else {
+ printf("OK: test.tcheck.c02 has different length but is "
+ "reported as ok\n");
+ goto test_dict_table_schema_check_end;
+ }
+ schema.columns[1].len = 4;
+
+ /* request that c02 be NOT NULL although it actually does not
+ have this flag set */
+ schema.columns[1].prtype_mask |= DATA_NOT_NULL;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck.c02 does not have NOT NULL while "
+ "it should and is reported as corrupted\n");
+ } else {
+ printf("ERROR: test.tcheck.c02 does not have NOT NULL while "
+ "it should and is not reported as corrupted\n");
+ goto test_dict_table_schema_check_end;
+ }
+ schema.columns[1].prtype_mask &= ~DATA_NOT_NULL;
+
+ /* check a table that contains some extra columns */
+ schema.n_cols = 6;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ == DB_SUCCESS) {
+ printf("ERROR: test.tcheck has more columns but is not "
+ "reported as corrupted\n");
+ goto test_dict_table_schema_check_end;
+ } else {
+ printf("OK: test.tcheck has more columns and is "
+ "reported as corrupted\n");
+ }
+
+ /* check a table that has some columns missing */
+ schema.n_cols = 8;
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck has missing columns and is "
+ "reported as corrupted\n");
+ } else {
+ printf("ERROR: test.tcheck has missing columns but is "
+ "reported as ok\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+ /* check non-existent table */
+ schema.table_name = "test/tcheck_nonexistent";
+ if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ printf("OK: test.tcheck_nonexistent is not present\n");
+ } else {
+ printf("ERROR: test.tcheck_nonexistent is present!?\n");
+ goto test_dict_table_schema_check_end;
+ }
+
+test_dict_table_schema_check_end:
+
+ mutex_exit(&(dict_sys->mutex));
+}
+/* @} */
+
+/* save/fetch aux macros @{ */
+#define TEST_DATABASE_NAME "foobardb"
+#define TEST_TABLE_NAME "test_dict_stats"
+
+#define TEST_N_ROWS 111
+#define TEST_CLUSTERED_INDEX_SIZE 222
+#define TEST_SUM_OF_OTHER_INDEX_SIZES 333
+
+#define TEST_IDX1_NAME "tidx1"
+#define TEST_IDX1_COL1_NAME "tidx1_col1"
+#define TEST_IDX1_INDEX_SIZE 123
+#define TEST_IDX1_N_LEAF_PAGES 234
+#define TEST_IDX1_N_DIFF1 50
+#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500
+
+#define TEST_IDX2_NAME "tidx2"
+#define TEST_IDX2_COL1_NAME "tidx2_col1"
+#define TEST_IDX2_COL2_NAME "tidx2_col2"
+#define TEST_IDX2_COL3_NAME "tidx2_col3"
+#define TEST_IDX2_COL4_NAME "tidx2_col4"
+#define TEST_IDX2_INDEX_SIZE 321
+#define TEST_IDX2_N_LEAF_PAGES 432
+#define TEST_IDX2_N_DIFF1 60
+#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600
+#define TEST_IDX2_N_DIFF2 61
+#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610
+#define TEST_IDX2_N_DIFF3 62
+#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620
+#define TEST_IDX2_N_DIFF4 63
+#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630
+/* @} */
+
+/* test_dict_stats_save() @{ */
+void
+test_dict_stats_save()
+{
+ dict_table_t table;
+ dict_index_t index1;
+ dict_field_t index1_fields[1];
+ ib_uint64_t index1_stat_n_diff_key_vals[1];
+ ib_uint64_t index1_stat_n_sample_sizes[1];
+ dict_index_t index2;
+ dict_field_t index2_fields[4];
+ ib_uint64_t index2_stat_n_diff_key_vals[4];
+ ib_uint64_t index2_stat_n_sample_sizes[4];
+ dberr_t ret;
+
+ /* craft a dummy dict_table_t */
+ table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+ table.stat_n_rows = TEST_N_ROWS;
+ table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE;
+ table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES;
+ UT_LIST_INIT(table.indexes);
+ UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+ UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+ ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+ ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+
+ index1.name = TEST_IDX1_NAME;
+ index1.table = &table;
+ index1.cached = 1;
+ index1.n_uniq = 1;
+ index1.fields = index1_fields;
+ index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+ index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+ index1.stat_index_size = TEST_IDX1_INDEX_SIZE;
+ index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES;
+ index1_fields[0].name = TEST_IDX1_COL1_NAME;
+ index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1;
+ index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE;
+
+ ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+ index2.name = TEST_IDX2_NAME;
+ index2.table = &table;
+ index2.cached = 1;
+ index2.n_uniq = 4;
+ index2.fields = index2_fields;
+ index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+ index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+ index2.stat_index_size = TEST_IDX2_INDEX_SIZE;
+ index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES;
+ index2_fields[0].name = TEST_IDX2_COL1_NAME;
+ index2_fields[1].name = TEST_IDX2_COL2_NAME;
+ index2_fields[2].name = TEST_IDX2_COL3_NAME;
+ index2_fields[3].name = TEST_IDX2_COL4_NAME;
+ index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1;
+ index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2;
+ index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3;
+ index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4;
+ index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE;
+ index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE;
+
+ ret = dict_stats_save(&table, NULL);
+
+ ut_a(ret == DB_SUCCESS);
+
+ printf("\nOK: stats saved successfully, now go ahead and read "
+ "what's inside %s and %s:\n\n",
+ TABLE_STATS_NAME_PRINT,
+ INDEX_STATS_NAME_PRINT);
+
+ printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "n_rows = %d AND\n"
+ "clustered_index_size = %d AND\n"
+ "sum_of_other_index_sizes = %d;\n"
+ "\n",
+ TABLE_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_N_ROWS,
+ TEST_CLUSTERED_INDEX_SIZE,
+ TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+ printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "index_name = '%s' AND\n"
+ "(\n"
+ " (stat_name = 'size' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s')\n"
+ ");\n"
+ "\n",
+ INDEX_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_IDX1_NAME,
+ TEST_IDX1_INDEX_SIZE,
+ TEST_IDX1_N_LEAF_PAGES,
+ TEST_IDX1_N_DIFF1,
+ TEST_IDX1_N_DIFF1_SAMPLE_SIZE,
+ TEST_IDX1_COL1_NAME);
+
+ printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n"
+ "FROM %s\n"
+ "WHERE\n"
+ "database_name = '%s' AND\n"
+ "table_name = '%s' AND\n"
+ "index_name = '%s' AND\n"
+ "(\n"
+ " (stat_name = 'size' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_leaf_pages' AND stat_value = %d AND"
+ " sample_size IS NULL) OR\n"
+ " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s') OR\n"
+ " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s') OR\n"
+ " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n"
+ " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND"
+ " sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n"
+ ");\n"
+ "\n",
+ INDEX_STATS_NAME_PRINT,
+ TEST_DATABASE_NAME,
+ TEST_TABLE_NAME,
+ TEST_IDX2_NAME,
+ TEST_IDX2_INDEX_SIZE,
+ TEST_IDX2_N_LEAF_PAGES,
+ TEST_IDX2_N_DIFF1,
+ TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME,
+ TEST_IDX2_N_DIFF2,
+ TEST_IDX2_N_DIFF2_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME,
+ TEST_IDX2_N_DIFF3,
+ TEST_IDX2_N_DIFF3_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+ TEST_IDX2_N_DIFF4,
+ TEST_IDX2_N_DIFF4_SAMPLE_SIZE,
+ TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+ TEST_IDX2_COL4_NAME);
+}
+/* @} */
+
+/* test_dict_stats_fetch_from_ps() @{ */
+void
+test_dict_stats_fetch_from_ps()
+{
+ dict_table_t table;
+ dict_index_t index1;
+ ib_uint64_t index1_stat_n_diff_key_vals[1];
+ ib_uint64_t index1_stat_n_sample_sizes[1];
+ dict_index_t index2;
+ ib_uint64_t index2_stat_n_diff_key_vals[4];
+ ib_uint64_t index2_stat_n_sample_sizes[4];
+ dberr_t ret;
+
+ /* craft a dummy dict_table_t */
+ table.name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME);
+ UT_LIST_INIT(table.indexes);
+ UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+ UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+ ut_d(table.magic_n = DICT_TABLE_MAGIC_N);
+
+ index1.name = TEST_IDX1_NAME;
+ ut_d(index1.magic_n = DICT_INDEX_MAGIC_N);
+ index1.cached = 1;
+ index1.n_uniq = 1;
+ index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+ index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+
+ index2.name = TEST_IDX2_NAME;
+ ut_d(index2.magic_n = DICT_INDEX_MAGIC_N);
+ index2.cached = 1;
+ index2.n_uniq = 4;
+ index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+ index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+
+ ret = dict_stats_fetch_from_ps(&table);
+
+ ut_a(ret == DB_SUCCESS);
+
+ ut_a(table.stat_n_rows == TEST_N_ROWS);
+ ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
+ ut_a(table.stat_sum_of_other_index_sizes
+ == TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+ ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE);
+ ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
+ ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1);
+ ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);
+
+ ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE);
+ ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
+ ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1);
+ ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2);
+ ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3);
+ ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
+ ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4);
+ ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);
+
+ printf("OK: fetch successful\n");
+}
+/* @} */
+
+/* test_dict_stats_all() @{ */
+void
+test_dict_stats_all()
+{
+ test_dict_table_schema_check();
+
+ test_dict_stats_save();
+
+ test_dict_stats_fetch_from_ps();
+}
+/* @} */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+/* @} */
+
+#endif /* UNIV_HOTBACKUP */
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
new file mode 100644
index 00000000000..9e1f75a13a9
--- /dev/null
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -0,0 +1,367 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats_bg.cc
+Code used for background table and index stats gathering.
+
+Created Apr 25, 2012 Vasil Dimov
+*******************************************************/
+
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+
+#ifdef UNIV_NONINL
+# include "dict0stats_bg.ic"
+#endif
+
+#include <vector>
+
+/** Minimum time interval between stats recalc for a given table */
+#define MIN_RECALC_INTERVAL 10 /* seconds */
+
+#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/** Event to wake up the stats thread */
+UNIV_INTERN os_event_t dict_stats_event = NULL;
+
+/** This mutex protects the "recalc_pool" variable. */
+static ib_mutex_t recalc_pool_mutex;
+#ifdef HAVE_PSI_INTERFACE
+static mysql_pfs_key_t recalc_pool_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+
+/** The number of tables that can be added to "recalc_pool" before
+it is enlarged */
+static const ulint RECALC_POOL_INITIAL_SLOTS = 128;
+
+/** The list of tables whose stats are to be automatically
+recalculated, stored as an STL vector of table ids */
+typedef std::vector<table_id_t> recalc_pool_t;
+static recalc_pool_t recalc_pool;
+
+typedef recalc_pool_t::iterator recalc_pool_iterator_t;
+
+/*****************************************************************//**
+Initialize the recalc pool, called once during thread initialization. */
+static
+void
+dict_stats_recalc_pool_init()
+/*=========================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS);
+}
+
+/*****************************************************************//**
+Free the resources occupied by the recalc pool, called once during
+thread de-initialization. */
+static
+void
+dict_stats_recalc_pool_deinit()
+/*===========================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ recalc_pool.clear();
+}
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_add(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table to add */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&recalc_pool_mutex);
+
+ /* quit if already in the list */
+ for (recalc_pool_iterator_t iter = recalc_pool.begin();
+ iter != recalc_pool.end();
+ ++iter) {
+
+ if (*iter == table->id) {
+ mutex_exit(&recalc_pool_mutex);
+ return;
+ }
+ }
+
+ recalc_pool.push_back(table->id);
+
+ mutex_exit(&recalc_pool_mutex);
+
+ os_event_set(dict_stats_event);
+}
+
+/*****************************************************************//**
+Get a table from the auto recalc pool. The returned table id is removed
+from the pool.
+@return true if the pool was non-empty and "id" was set, false otherwise */
+static
+bool
+dict_stats_recalc_pool_get(
+/*=======================*/
+ table_id_t* id) /*!< out: table id, or unmodified if list is
+ empty */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&recalc_pool_mutex);
+
+ if (recalc_pool.empty()) {
+ mutex_exit(&recalc_pool_mutex);
+ return(false);
+ }
+
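+ /* pop the oldest (first) entry from the pool */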
+ *id = recalc_pool[0];
+
+ recalc_pool.erase(recalc_pool.begin());
+
+ mutex_exit(&recalc_pool_mutex);
+
+ return(true);
+}
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table to remove */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ mutex_enter(&recalc_pool_mutex);
+
+ ut_ad(table->id > 0);
+
+ for (recalc_pool_iterator_t iter = recalc_pool.begin();
+ iter != recalc_pool.end();
+ ++iter) {
+
+ if (*iter == table->id) {
+ /* erase() invalidates the iterator */
+ recalc_pool.erase(iter);
+ break;
+ }
+ }
+
+ mutex_exit(&recalc_pool_mutex);
+}
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary(); this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys->mutex. */
+UNIV_INTERN
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction to use for
+ unlocking/locking the data dict */
+{
+ while (!dict_stats_stop_bg(table)) {
+ DICT_STATS_BG_YIELD(trx);
+ }
+}
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread()
+Must be called before dict_stats_thread() is started. */
+UNIV_INTERN
+void
+dict_stats_thread_init()
+/*====================*/
+{
+ ut_a(!srv_read_only_mode);
+
+ dict_stats_event = os_event_create();
+
+ /* The recalc_pool_mutex is acquired from:
+ 1) the background stats gathering thread before any other latch
+ and released without latching anything else in between (thus
+ any level would do here)
+ 2) from row_update_statistics_if_needed()
+ and released without latching anything else in between. We know
+ that dict_sys->mutex (SYNC_DICT) is not acquired when
+ row_update_statistics_if_needed() is called and it may be acquired
+ inside that function (thus a level <=SYNC_DICT would do).
+ 3) from row_drop_table_for_mysql() after dict_sys->mutex (SYNC_DICT)
+ and dict_operation_lock (SYNC_DICT_OPERATION) have been locked
+ (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do)
+ So we choose SYNC_STATS_AUTO_RECALC to be just below SYNC_DICT. */
+ mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex,
+ SYNC_STATS_AUTO_RECALC);
+
+ dict_stats_recalc_pool_init();
+}
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_thread_init(), must be called
+after dict_stats_thread() has exited. */
+UNIV_INTERN
+void
+dict_stats_thread_deinit()
+/*======================*/
+{
+ ut_a(!srv_read_only_mode);
+ ut_ad(!srv_dict_stats_thread_active);
+
+ dict_stats_recalc_pool_deinit();
+
+ mutex_free(&recalc_pool_mutex);
+ memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex));
+
+ os_event_free(dict_stats_event);
+ dict_stats_event = NULL;
+}
+
+/*****************************************************************//**
+Get the first table that has been added for auto recalc and update its
+stats, unless they were recalculated too recently. */
+static
+void
+dict_stats_process_entry_from_recalc_pool()
+/*=======================================*/
+{
+ table_id_t table_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first table from the auto recalc pool */
+ if (!dict_stats_recalc_pool_get(&table_id)) {
+ /* no tables for auto recalc */
+ return;
+ }
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+
+ table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+ if (table == NULL) {
+ /* table does not exist, must have been DROPped
+ after its id was enqueued */
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+
+ /* Check whether table is corrupted */
+ if (table->corrupted) {
+ dict_table_close(table, TRUE, FALSE);
+ mutex_exit(&dict_sys->mutex);
+ return;
+ }
+
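+ /* mark the table as busy, so that DDL code can wait for
+ the recalc to finish via
+ dict_stats_wait_bg_to_stop_using_table() */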
+ table->stats_bg_flag = BG_STAT_IN_PROGRESS;
+
+ mutex_exit(&dict_sys->mutex);
+
+ /* ut_time() could be expensive. The current function
+ is called once every time a table has been changed more than 10%,
+ and on a system with lots of small tables this could become hot.
+ If that turns out to be a problem, the check below could be
+ replaced with something else, though a time interval is the
+ natural approach. */
+
+ if (ut_difftime(ut_time(), table->stats_last_recalc)
+ < MIN_RECALC_INTERVAL) {
+
+ /* Stats were (re)calculated not long ago. To avoid
+ too frequent stats updates we put back the table on
+ the auto recalc list and do nothing. */
+
+ dict_stats_recalc_pool_add(table);
+
+ } else {
+
+ dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+ }
+
+ mutex_enter(&dict_sys->mutex);
+
+ table->stats_bg_flag = BG_STAT_NONE;
+
+ dict_table_close(table, TRUE, FALSE);
+
+ mutex_exit(&dict_sys->mutex);
+}
+
+/*****************************************************************//**
+This is the thread for background stats gathering. It pops tables from
+the auto recalc list and processes them, recalculating their statistics
+when due.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(dict_stats_thread)(
+/*==============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ ut_a(!srv_read_only_mode);
+
+ srv_dict_stats_thread_active = TRUE;
+
+ while (!SHUTTING_DOWN()) {
+
+ /* Wake up periodically even if not signaled. This is
+ because we may lose an event - if the below call to
+ dict_stats_process_entry_from_recalc_pool() puts the entry back
+ in the list, the os_event_set() will be lost by the subsequent
+ os_event_reset(). */
+ os_event_wait_time(
+ dict_stats_event, MIN_RECALC_INTERVAL * 1000000);
+
+ if (SHUTTING_DOWN()) {
+ break;
+ }
+
+ dict_stats_process_entry_from_recalc_pool();
+
+ os_event_reset(dict_stats_event);
+ }
+
+ srv_dict_stats_thread_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit instead of return(). */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/innobase/dyn/dyn0dyn.cc b/storage/innobase/dyn/dyn0dyn.cc
new file mode 100644
index 00000000000..3ef5297a7c9
--- /dev/null
+++ b/storage/innobase/dyn/dyn0dyn.cc
@@ -0,0 +1,66 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dyn/dyn0dyn.cc
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dyn0dyn.h"
+#ifdef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ dyn_array_t* arr) /*!< in/out: dyn array */
+{
+ mem_heap_t* heap;
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+ UT_LIST_INIT(arr->base);
+ UT_LIST_ADD_FIRST(list, arr->base, arr);
+
+ arr->heap = mem_heap_create(sizeof(dyn_block_t));
+ }
+
+ block = dyn_array_get_last_block(arr);
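+ /* mark the current last block as full, so that subsequent
+ data is appended to the newly created block */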
+ block->used = block->used | DYN_BLOCK_FULL_FLAG;
+
+ heap = arr->heap;
+
+ block = static_cast<dyn_block_t*>(
+ mem_heap_alloc(heap, sizeof(dyn_block_t)));
+
+ block->used = 0;
+
+ UT_LIST_ADD_LAST(list, arr->base, block);
+
+ return(block);
+}
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
new file mode 100644
index 00000000000..ccc54781102
--- /dev/null
+++ b/storage/innobase/eval/eval0eval.cc
@@ -0,0 +1,950 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.cc
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+
+#ifdef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#include "data0data.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+
+/** The RND function seed */
+static ulint eval_rnd = 128367121;
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf() */
+
+static byte eval_dummy;
+
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+ /* out: like node associated with the node */
+ que_node_t* node) /* in: symbol node */
+{
+ return(((sym_node_t*) node)->like_node);
+}
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data && data != &eval_dummy) {
+ mem_free(data);
+ }
+
+ if (size == 0) {
+ data = &eval_dummy;
+ } else {
+ data = static_cast<byte*>(mem_alloc(size));
+ }
+
+ que_node_set_val_buf_size(node, size);
+
+ dfield_set_data(dfield, data, size);
+
+ return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (que_node_get_val_buf_size(node) > 0) {
+ ut_a(data);
+
+ mem_free(data);
+ }
+}
+
+/*********************************************************************
+Evaluates a LIKE comparison node.
+@return the result of the comparison */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+ que_node_t* arg1, /*!< in: left operand */
+ que_node_t* arg2) /*!< in: right operand */
+{
+ ib_like_t op;
+ int res;
+ que_node_t* arg3;
+ que_node_t* arg4;
+ dfield_t* dfield;
+ dtype_t* dtype;
+ ibool val = TRUE;
+
+ arg3 = que_node_get_like_node(arg2);
+
+ /* Get the comparison type operator */
+ ut_a(arg3);
+
+ dfield = que_node_get_val(arg3);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_INT);
+ op = static_cast<ib_like_t>(mach_read_from_4(static_cast<const unsigned char*>(dfield_get_data(dfield))));
+
+ switch (op) {
+ case IB_LIKE_PREFIX:
+
+ arg4 = que_node_get_next(arg3);
+ res = cmp_dfield_dfield_like_prefix(
+ que_node_get_val(arg1),
+ que_node_get_val(arg4));
+ break;
+
+ case IB_LIKE_SUFFIX:
+
+ arg4 = que_node_get_next(arg3);
+ res = cmp_dfield_dfield_like_suffix(
+ que_node_get_val(arg1),
+ que_node_get_val(arg4));
+ break;
+
+ case IB_LIKE_SUBSTR:
+
+ arg4 = que_node_get_next(arg3);
+ res = cmp_dfield_dfield_like_substr(
+ que_node_get_val(arg1),
+ que_node_get_val(arg4));
+ break;
+
+ case IB_LIKE_EXACT:
+ res = cmp_dfield_dfield(
+ que_node_get_val(arg1),
+ que_node_get_val(arg2));
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (res != 0) {
+ val = FALSE;
+ }
+
+ return(val);
+}
+
+/*********************************************************************
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node) /*!< in: comparison node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ int res;
+ int func;
+ ibool val = TRUE;
+
+ ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+ arg1 = cmp_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ func = cmp_node->func;
+
+ if (func == PARS_LIKE_TOKEN_EXACT
+ || func == PARS_LIKE_TOKEN_PREFIX
+ || func == PARS_LIKE_TOKEN_SUFFIX
+ || func == PARS_LIKE_TOKEN_SUBSTR) {
+
+ val = eval_cmp_like(arg1, arg2);
+ } else {
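+ /* cmp_dfield_dfield() returns -1, 0 or 1 when the first
+ argument is less than, equal to or greater than the
+ second */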
+ res = cmp_dfield_dfield(
+ que_node_get_val(arg1), que_node_get_val(arg2));
+
+ if (func == '=') {
+ if (res != 0) {
+ val = FALSE;
+ }
+ } else if (func == '<') {
+ if (res != -1) {
+ val = FALSE;
+ }
+ } else if (func == PARS_LE_TOKEN) {
+ if (res == 1) {
+ val = FALSE;
+ }
+ } else if (func == PARS_NE_TOKEN) {
+ if (res == 0) {
+ val = FALSE;
+ }
+ } else if (func == PARS_GE_TOKEN) {
+ if (res == -1) {
+ val = FALSE;
+ }
+ } else {
+ ut_ad(func == '>');
+
+ if (res != 1) {
+ val = FALSE;
+ }
+ }
+ }
+
+ eval_node_set_ibool_val(cmp_node, val);
+
+ return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+ func_node_t* logical_node) /*!< in: logical operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ ibool val1;
+ ibool val2 = 0; /* remove warning */
+ ibool val = 0; /* remove warning */
+ int func;
+
+ ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+ arg1 = logical_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+ val1 = eval_node_get_ibool_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_ibool_val(arg2);
+ }
+
+ func = logical_node->func;
+
+ if (func == PARS_AND_TOKEN) {
+ val = val1 & val2;
+ } else if (func == PARS_OR_TOKEN) {
+ val = val1 | val2;
+ } else if (func == PARS_NOT_TOKEN) {
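+ /* logical NOT: maps TRUE (1) to FALSE (0) and vice
+ versa */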
+ val = TRUE - val1;
+ } else {
+ ut_error;
+ }
+
+ eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node. */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+ func_node_t* arith_node) /*!< in: arithmetic operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ lint val1;
+ lint val2 = 0; /* remove warning */
+ lint val;
+ int func;
+
+ ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+ arg1 = arith_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+ val1 = eval_node_get_int_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_int_val(arg2);
+ }
+
+ func = arith_node->func;
+
+ if (func == '+') {
+ val = val1 + val2;
+ } else if ((func == '-') && arg2) {
+ val = val1 - val2;
+ } else if (func == '-') {
+ val = -val1;
+ } else if (func == '*') {
+ val = val1 * val2;
+ } else {
+ ut_ad(func == '/');
+ val = val1 / val2;
+ }
+
+ eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+ func_node_t* node) /*!< in: aggregate operation node */
+{
+ que_node_t* arg;
+ lint val;
+ lint arg_val;
+ int func;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ val = eval_node_get_int_val(node);
+
+ func = node->func;
+
+ if (func == PARS_COUNT_TOKEN) {
+
+ val = val + 1;
+ } else {
+ ut_ad(func == PARS_SUM_TOKEN);
+
+ arg = node->args;
+ arg_val = eval_node_get_int_val(arg);
+
+ val = val + arg_val;
+ }
+
+ eval_node_set_int_val(node, val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node where the function is not relevant
+in benchmarks. */
+static
+void
+eval_predefined_2(
+/*==============*/
+ func_node_t* func_node) /*!< in: predefined function node */
+{
+ que_node_t* arg;
+ que_node_t* arg1;
+ que_node_t* arg2 = 0; /* remove warning (??? bug ???) */
+ lint int_val;
+ byte* data;
+ ulint len1;
+ ulint len2;
+ int func;
+ ulint i;
+
+ ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+ arg1 = func_node->args;
+
+ if (arg1) {
+ arg2 = que_node_get_next(arg1);
+ }
+
+ func = func_node->func;
+
+ if (func == PARS_PRINTF_TOKEN) {
+
+ arg = arg1;
+
+ while (arg) {
+ dfield_print(que_node_get_val(arg));
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ } else if (func == PARS_ASSERT_TOKEN) {
+
+ if (!eval_node_get_ibool_val(arg1)) {
+ fputs("SQL assertion fails in a stored procedure!\n",
+ stderr);
+ }
+
+ ut_a(eval_node_get_ibool_val(arg1));
+
+ /* This function, or more precisely, a debug procedure,
+ returns no value */
+
+ } else if (func == PARS_RND_TOKEN) {
+
+ len1 = (ulint) eval_node_get_int_val(arg1);
+ len2 = (ulint) eval_node_get_int_val(arg2);
+
+ ut_ad(len2 >= len1);
+
+ if (len2 > len1) {
+ int_val = (lint) (len1
+ + (eval_rnd % (len2 - len1 + 1)));
+ } else {
+ int_val = (lint) len1;
+ }
+
+ eval_rnd = ut_rnd_gen_next_ulint(eval_rnd);
+
+ eval_node_set_int_val(func_node, int_val);
+
+ } else if (func == PARS_RND_STR_TOKEN) {
+
+ len1 = (ulint) eval_node_get_int_val(arg1);
+
+ data = eval_node_ensure_val_buf(func_node, len1);
+
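+ /* fill the buffer with a pseudo-random string over the
+ alphabet 'a'..'c' (97 is ASCII 'a') */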
+ for (i = 0; i < len1; i++) {
+ data[i] = (byte)(97 + (eval_rnd % 3));
+
+ eval_rnd = ut_rnd_gen_next_ulint(eval_rnd);
+ }
+ } else {
+ ut_error;
+ }
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ sym_node_t* cursor;
+ sel_node_t* sel_node;
+ ibool ibool_val;
+
+ ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+ cursor = static_cast<sym_node_t*>(func_node->args);
+
+ ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+ if (cursor->token_type == SYM_LIT) {
+
+ ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)),
+ "SQL", 3) == 0);
+
+ sel_node = cursor->sym_table->query_graph->last_sel_node;
+ } else {
+ sel_node = cursor->alias->cursor_def;
+ }
+
+ if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+ ibool_val = TRUE;
+ } else {
+ ibool_val = FALSE;
+ }
+
+ eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ que_node_t* arg3;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len1;
+ ulint len2;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+ arg3 = que_node_get_next(arg2);
+
+ str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+ len1 = (ulint) eval_node_get_int_val(arg2);
+ len2 = (ulint) eval_node_get_int_val(arg3);
+
+ dfield = que_node_get_val(func_node);
+
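+ /* note that no copy is made: the result points directly
+ into the value buffer of arg1 */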
+ dfield_set_data(dfield, str1 + len1, len2);
+}
+
+/*****************************************************************//**
+Evaluates a replstr-procedure node. */
+static
+void
+eval_replstr(
+/*=========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ que_node_t* arg3;
+ que_node_t* arg4;
+ byte* str1;
+ byte* str2;
+ ulint len1;
+ ulint len2;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(que_node_get_type(arg1) == QUE_NODE_SYMBOL);
+
+ arg3 = que_node_get_next(arg2);
+ arg4 = que_node_get_next(arg3);
+
+ str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+ str2 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg2)));
+
+ len1 = (ulint) eval_node_get_int_val(arg3);
+ len2 = (ulint) eval_node_get_int_val(arg4);
+
+ if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2)
+ || (dfield_get_len(que_node_get_val(arg2)) < len2)) {
+
+ ut_error;
+ }
+
+ ut_memcpy(str1 + len1, str2, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node. */
+static
+void
+eval_instr(
+/*=======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield1;
+ dfield_t* dfield2;
+ lint int_val;
+ byte* str1;
+ byte* str2;
+ byte match_char;
+ ulint len1;
+ ulint len2;
+ ulint i;
+ ulint j;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ dfield1 = que_node_get_val(arg1);
+ dfield2 = que_node_get_val(arg2);
+
+ str1 = static_cast<byte*>(dfield_get_data(dfield1));
+ str2 = static_cast<byte*>(dfield_get_data(dfield2));
+
+ len1 = dfield_get_len(dfield1);
+ len2 = dfield_get_len(dfield2);
+
+ if (len2 == 0) {
+ ut_error;
+ }
+
+ match_char = str2[0];
+
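+ /* naive substring search: at each occurrence of the first
+ character of str2 in str1, compare the remaining characters;
+ the result is the 1-based position of the first full match,
+ or 0 if there is none */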
+ for (i = 0; i < len1; i++) {
+ /* In this outer loop, the number of matched characters is 0 */
+
+ if (str1[i] == match_char) {
+
+ if (i + len2 > len1) {
+
+ break;
+ }
+
+ for (j = 1;; j++) {
+ /* We have already matched j characters */
+
+ if (j == len2) {
+ int_val = i + 1;
+
+ goto match_found;
+ }
+
+ if (str1[i + j] != str2[j]) {
+
+ break;
+ }
+ }
+ }
+ }
+
+ int_val = 0;
+
+match_found:
+ eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+UNIV_INLINE
+void
+eval_binary_to_number(
+/*==================*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ dfield_t* dfield;
+ byte* str1;
+ byte* str2;
+ ulint len1;
+ ulint int_val;
+
+ arg1 = func_node->args;
+
+ dfield = que_node_get_val(arg1);
+
+ str1 = static_cast<byte*>(dfield_get_data(dfield));
+ len1 = dfield_get_len(dfield);
+
+ if (len1 > 4) {
+ ut_error;
+ }
+
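+ /* right-align the value within 4 zero bytes, i.e. pad it
+ on the left to a full 4-byte integer */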
+ if (len1 == 4) {
+ str2 = str1;
+ } else {
+ int_val = 0;
+ str2 = (byte*) &int_val;
+
+ ut_memcpy(str2 + (4 - len1), str1, len1);
+ }
+
+ eval_node_copy_and_alloc_val(func_node, str2, 4);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+static
+void
+eval_concat(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ ulint len1;
+
+ arg = func_node->args;
+ len = 0;
+
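+ /* first pass: compute the total length of the result */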
+ while (arg) {
+ len1 = dfield_get_len(que_node_get_val(arg));
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+
+ data = eval_node_ensure_val_buf(func_node, len);
+
+ arg = func_node->args;
+ len = 0;
+
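+ /* second pass: append each argument to the result buffer */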
+ while (arg) {
+ dfield = que_node_get_val(arg);
+ len1 = dfield_get_len(dfield);
+
+ ut_memcpy(data + len, dfield_get_data(dfield), len1);
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes, and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len;
+ ulint len1;
+
+ arg1 = func_node->args;
+
+ str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+ if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+ len = dfield_get_len(que_node_get_val(arg1));
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1, len);
+
+ return;
+ }
+
+ arg2 = que_node_get_next(arg1);
+
+ len1 = (ulint) eval_node_get_int_val(arg2);
+
+ if (len1 > 4) {
+
+ ut_error;
+ }
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1 + (4 - len1), len1);
+}
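+
+/* Worked example for the DATA_INT branch of eval_to_binary() above:
+an integer node value occupies 4 bytes written by mach_write_to_4(),
+most significant byte first; for len1 == 2 the result points at the
+last two bytes of that buffer, i.e. the two least significant bytes. */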
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+UNIV_INLINE
+void
+eval_predefined(
+/*============*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ lint int_val;
+ byte* data;
+ int func;
+
+ func = func_node->func;
+
+ arg1 = func_node->args;
+
+ if (func == PARS_LENGTH_TOKEN) {
+
+ int_val = (lint) dfield_get_len(que_node_get_val(arg1));
+
+ } else if (func == PARS_TO_CHAR_TOKEN) {
+
+ /* Convert number to character string as a
+ signed decimal integer. */
+
+ ulint uint_val;
+ int int_len;
+
+ int_val = eval_node_get_int_val(arg1);
+
+ /* Determine the length of the string. */
+
+ if (int_val == 0) {
+ int_len = 1; /* the number 0 occupies 1 byte */
+ } else {
+ int_len = 0;
+ if (int_val < 0) {
+ uint_val = ((ulint) -int_val - 1) + 1;
+ int_len++; /* reserve space for minus sign */
+ } else {
+ uint_val = (ulint) int_val;
+ }
+ for (; uint_val > 0; int_len++) {
+ uint_val /= 10;
+ }
+ }
+
+ /* allocate the string */
+ data = eval_node_ensure_val_buf(func_node, int_len + 1);
+
+ /* add terminating NUL character */
+ data[int_len] = 0;
+
+ /* convert the number */
+
+ if (int_val == 0) {
+ data[0] = '0';
+ } else {
+ int tmp;
+ if (int_val < 0) {
+ data[0] = '-'; /* preceding minus sign */
+ uint_val = ((ulint) -int_val - 1) + 1;
+ } else {
+ uint_val = (ulint) int_val;
+ }
+ for (tmp = int_len; uint_val > 0; uint_val /= 10) {
+ data[--tmp] = (byte)
+ ('0' + (byte)(uint_val % 10));
+ }
+ }
+
+ dfield_set_len(que_node_get_val(func_node), int_len);
+
+ return;
+
+ } else if (func == PARS_TO_NUMBER_TOKEN) {
+
+ int_val = atoi((char*)
+ dfield_get_data(que_node_get_val(arg1)));
+
+ } else if (func == PARS_SYSDATE_TOKEN) {
+ int_val = (lint) ut_time();
+ } else {
+ eval_predefined_2(func_node);
+
+ return;
+ }
+
+ eval_node_set_int_val(func_node, int_val);
+}
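+
+/* Worked example for the PARS_TO_CHAR_TOKEN branch above: for
+int_val == -123 the length pass yields int_len == 4 (one byte for the
+minus sign plus three digits, as 123 -> 12 -> 1 -> 0), the buffer then
+holds "-123" followed by a terminating NUL, and the value length is
+set to 4, excluding the NUL. */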
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ ulint fclass;
+ ulint func;
+
+ ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+ fclass = func_node->fclass;
+ func = func_node->func;
+
+ arg = func_node->args;
+
+ /* Evaluate first the argument list */
+ while (arg) {
+ eval_exp(arg);
+
+ /* The functions are not defined for SQL null argument
+ values, except for eval_cmp and notfound */
+
+ if (dfield_is_null(que_node_get_val(arg))
+ && (fclass != PARS_FUNC_CMP)
+ && (func != PARS_NOTFOUND_TOKEN)
+ && (func != PARS_PRINTF_TOKEN)) {
+ ut_error;
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ switch (fclass) {
+ case PARS_FUNC_CMP:
+ eval_cmp(func_node);
+ return;
+ case PARS_FUNC_ARITH:
+ eval_arith(func_node);
+ return;
+ case PARS_FUNC_AGGREGATE:
+ eval_aggregate(func_node);
+ return;
+ case PARS_FUNC_PREDEFINED:
+ switch (func) {
+ case PARS_NOTFOUND_TOKEN:
+ eval_notfound(func_node);
+ return;
+ case PARS_SUBSTR_TOKEN:
+ eval_substr(func_node);
+ return;
+ case PARS_REPLSTR_TOKEN:
+ eval_replstr(func_node);
+ return;
+ case PARS_INSTR_TOKEN:
+ eval_instr(func_node);
+ return;
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ eval_binary_to_number(func_node);
+ return;
+ case PARS_CONCAT_TOKEN:
+ eval_concat(func_node);
+ return;
+ case PARS_TO_BINARY_TOKEN:
+ eval_to_binary(func_node);
+ return;
+ default:
+ eval_predefined(func_node);
+ return;
+ }
+ case PARS_FUNC_LOGICAL:
+ eval_logical(func_node);
+ return;
+ }
+
+ ut_error;
+}
diff --git a/storage/innobase/eval/eval0proc.cc b/storage/innobase/eval/eval0proc.cc
new file mode 100644
index 00000000000..e6f3a32cd48
--- /dev/null
+++ b/storage/innobase/eval/eval0proc.cc
@@ -0,0 +1,296 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0proc.cc
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "eval0proc.h"
+
+#ifdef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ ut_ad(thr);
+
+ node = static_cast<if_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+
+ } else if (node->else_part) {
+ thr->run_node = node->else_part;
+
+ } else if (node->elsif_list) {
+ elsif_node = node->elsif_list;
+
+ for (;;) {
+ eval_exp(elsif_node->cond);
+
+ if (eval_node_get_ibool_val(
+ elsif_node->cond)) {
+
+ /* The condition evaluated to TRUE:
+ start execution from the first
+ statement in the statement list */
+
+ thr->run_node = elsif_node->stat_list;
+
+ break;
+ }
+
+ elsif_node = static_cast<elsif_node_t*>(
+ que_node_get_next(elsif_node));
+
+ if (elsif_node == NULL) {
+ thr->run_node = NULL;
+
+ break;
+ }
+ }
+ } else {
+ thr->run_node = NULL;
+ }
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
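+
+/* The dispatch in if_step() above executes statements of this shape
+(illustration only; cond, stat_list, elsif_list and else_part are the
+node fields used above):
+
+	IF cond THEN stat_list
+	ELSIF cond2 THEN stat_list2	-- elsif_list, walked until some
+					-- condition evaluates to TRUE
+	ELSE else_stat_list		-- else_part
+	END IF
+
+When no branch is taken, control returns to the parent node. */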
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ while_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<while_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+ ut_ad((thr->prev_node == que_node_get_parent(node))
+ || (que_node_get_next(thr->prev_node) == NULL));
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ assign_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<assign_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+ /* Evaluate the value to assign */
+
+ eval_exp(node->val);
+
+ eval_node_copy_val(node->var->alias, node->val);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ for_node_t* node;
+ que_node_t* parent;
+ lint loop_var_value;
+
+ ut_ad(thr);
+
+ node = static_cast<for_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+ parent = que_node_get_parent(node);
+
+ if (thr->prev_node != parent) {
+
+ /* Move to the next statement */
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ if (thr->run_node != NULL) {
+
+ return(thr);
+ }
+
+ /* Increment the value of loop_var */
+
+ loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+ } else {
+ /* Initialize the loop */
+
+ eval_exp(node->loop_start_limit);
+ eval_exp(node->loop_end_limit);
+
+ loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+
+ node->loop_end_value
+ = (int) eval_node_get_int_val(node->loop_end_limit);
+ }
+
+ /* Check if we should do another loop */
+
+ if (loop_var_value > node->loop_end_value) {
+
+ /* Enough loops done */
+
+ thr->run_node = parent;
+ } else {
+ eval_node_set_int_val(node->loop_var, loop_var_value);
+
+ thr->run_node = node->stat_list;
+ }
+
+ return(thr);
+}
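+
+/* The two entry paths of for_step() above (initialization when coming
+from the parent node, increment when an iteration of the statement
+list completes) together implement this inclusive loop, sketched in
+plain C (the end limit is evaluated once, at loop entry):
+
+	loop_end_value = eval(loop_end_limit);
+	for (loop_var = eval(loop_start_limit);
+	     loop_var <= loop_end_value;
+	     loop_var++) {
+		execute stat_list;
+	}
+*/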
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ exit_node_t* node;
+ que_node_t* loop_node;
+
+ ut_ad(thr);
+
+ node = static_cast<exit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+ /* Loops exit by setting thr->run_node as the loop node's parent, so
+ find our containing loop node and get its parent. */
+
+ loop_node = que_node_get_containing_loop_node(node);
+
+ /* If someone uses an EXIT statement outside of a loop, this will
+ trigger. */
+ ut_a(loop_node);
+
+ thr->run_node = que_node_get_parent(loop_node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ return_node_t* node;
+ que_node_t* parent;
+
+ ut_ad(thr);
+
+ node = static_cast<return_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+ parent = node;
+
+ while (que_node_get_type(parent) != QUE_NODE_PROC) {
+
+ parent = que_node_get_parent(parent);
+ }
+
+ ut_a(parent);
+
+ thr->run_node = que_node_get_parent(parent);
+
+ return(thr);
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
new file mode 100644
index 00000000000..9755ef2ee68
--- /dev/null
+++ b/storage/innobase/fil/fil0fil.cc
@@ -0,0 +1,6547 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.cc
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+
+#include <debug_sync.h>
+#include <my_dbug.h>
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "row0mysql.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "ibuf0ibuf.h"
+# include "sync0sync.h"
+# include "os0sync.h"
+#else /* !UNIV_HOTBACKUP */
+# include "srv0srv.h"
+static ulint srv_data_read, srv_data_written;
+#endif /* !UNIV_HOTBACKUP */
+
+/*
+ IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+ =============================================
+
+The tablespace cache is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to that of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and mysqlbackup it is not the default
+directory, and we must set the base file path explicitly */
+UNIV_INTERN const char* fil_path_to_mysql_datadir = ".";
+
+/** The number of fsyncs done to the log */
+UNIV_INTERN ulint fil_n_log_flushes = 0;
+
+/** Number of pending redo log flushes */
+UNIV_INTERN ulint fil_n_pending_log_flushes = 0;
+/** Number of pending tablespace flushes */
+UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0;
+
+/** Number of files currently open */
+UNIV_INTERN ulint fil_n_file_opened = 0;
+
+/** The null file address */
+UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0};
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register fil_system_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t fil_system_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register file space latch with performance schema */
+UNIV_INTERN mysql_pfs_key_t fil_space_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+/** File node of a tablespace or the log data space */
+struct fil_node_t {
+ fil_space_t* space; /*!< backpointer to the space where this node
+ belongs */
+ char* name; /*!< path to the file */
+ ibool open; /*!< TRUE if file open */
+ os_file_t handle; /*!< OS handle to the file, if file open */
+ os_event_t sync_event;/*!< Condition event to group and
+ serialize calls to fsync */
+ ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw
+ device or a raw disk partition */
+ ulint size; /*!< size of the file in database pages, 0 if
+ not known yet; the possible last incomplete
+ megabyte may be ignored if space == 0 */
+ ulint n_pending;
+ /*!< count of pending i/o's on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ulint n_pending_flushes;
+ /*!< count of pending flushes on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ibool being_extended;
+ /*!< TRUE if the node is currently
+ being extended. */
+ ib_int64_t modification_counter;/*!< when we write to the file we
+ increment this by one */
+ ib_int64_t flush_counter;/*!< up to what
+ modification_counter value we have
+ flushed the modifications to disk */
+ UT_LIST_NODE_T(fil_node_t) chain;
+ /*!< link field for the file chain */
+ UT_LIST_NODE_T(fil_node_t) LRU;
+ /*!< link field for the LRU list */
+ ulint magic_n;/*!< FIL_NODE_MAGIC_N */
+};
+
+/** Value of fil_node_t::magic_n */
+#define FIL_NODE_MAGIC_N 89389
+
+/** Tablespace or log data space: let us call them by a common name space */
+struct fil_space_t {
+ char* name; /*!< space name = the path to the first file in
+ it */
+ ulint id; /*!< space id */
+ ib_int64_t tablespace_version;
+ /*!< in DISCARD/IMPORT this timestamp
+ is used to check if we should ignore
+ an insert buffer merge request for a
+ page because it actually was for the
+ previous incarnation of the space */
+ ibool mark; /*!< this is set to TRUE at database startup if
+ the space corresponds to a table in the InnoDB
+ data dictionary; so we can print a warning of
+ orphaned tablespaces */
+ ibool stop_ios;/*!< TRUE if we want to rename the
+ .ibd file of tablespace and want to
+ stop temporarily posting of new i/o
+ requests on the file */
+ ibool stop_new_ops;
+ /*!< we set this TRUE when we start
+ deleting a single-table tablespace.
+ When this is set following new ops
+ are not allowed:
+ * read IO request
+ * ibuf merge
+ * file flush
+ Note that we can still possibly have
+ new write operations because we don't
+ check this flag when doing flush
+ batches. */
+ ulint purpose;/*!< FIL_TABLESPACE, FIL_LOG, or
+ FIL_ARCH_LOG */
+ UT_LIST_BASE_NODE_T(fil_node_t) chain;
+ /*!< base node for the file chain */
+ ulint size; /*!< space size in pages; 0 if a single-table
+ tablespace whose size we do not know yet;
+ last incomplete megabytes in data files may be
+ ignored if space == 0 */
+ ulint flags; /*!< tablespace flags; see
+ fsp_flags_is_valid(),
+ fsp_flags_get_zip_size() */
+ ulint n_reserved_extents;
+ /*!< number of reserved free extents for
+ ongoing operations like B-tree page split */
+ ulint n_pending_flushes; /*!< this is positive when flushing
+ the tablespace to disk; dropping of the
+ tablespace is forbidden if this is positive */
+ ulint n_pending_ops;/*!< this is positive when we
+ have pending operations against this
+ tablespace. The pending operations can
+ be ibuf merges or lock validation code
+ trying to read a block.
+ Dropping of the tablespace is forbidden
+ if this is positive */
+ hash_node_t hash; /*!< hash chain node */
+ hash_node_t name_hash;/*!< hash chain the name_hash table */
+#ifndef UNIV_HOTBACKUP
+ rw_lock_t latch; /*!< latch protecting the file space storage
+ allocation */
+#endif /* !UNIV_HOTBACKUP */
+ UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
+ /*!< list of spaces with at least one unflushed
+ file we have written to */
+ bool is_in_unflushed_spaces;
+ /*!< true if this space is currently in
+ unflushed_spaces */
+ UT_LIST_NODE_T(fil_space_t) space_list;
+ /*!< list of all spaces */
+ ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
+};
+
+/** Value of fil_space_t::magic_n */
+#define FIL_SPACE_MAGIC_N 89472
+
+/** The tablespace memory cache; also the totality of logs (the log
+data space) is stored here; below we talk about tablespaces, but also
+the ib_logfiles form a 'space' and it is handled here */
+struct fil_system_t {
+#ifndef UNIV_HOTBACKUP
+ ib_mutex_t mutex; /*!< The mutex protecting the cache */
+#endif /* !UNIV_HOTBACKUP */
+ hash_table_t* spaces; /*!< The hash table of spaces in the
+ system; they are hashed on the space
+ id */
+ hash_table_t* name_hash; /*!< hash table based on the space
+ name */
+ UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+ /*!< base node for the LRU list of the
+ most recently used open files with no
+ pending i/o's; if we start an i/o on
+ the file, we first remove it from this
+ list, and return it to the start of
+ the list when the i/o ends;
+ log files and the system tablespace are
+ not put to this list: they are opened
+ after the startup, and kept open until
+ shutdown */
+ UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
+ /*!< base node for the list of those
+ tablespaces whose files contain
+ unflushed writes; those spaces have
+ at least one file node where
+ modification_counter > flush_counter */
+ ulint n_open; /*!< number of files currently open */
+ ulint max_n_open; /*!< n_open is not allowed to exceed
+ this */
+ ib_int64_t modification_counter;/*!< when we write to a file we
+ increment this by one */
+ ulint max_assigned_id;/*!< maximum space id in the existing
+ tables, or assigned during the time
+ mysqld has been up; at an InnoDB
+ startup we scan the data dictionary
+ and set here the maximum of the
+ space id's of the tables there */
+ ib_int64_t tablespace_version;
+ /*!< a counter which is incremented for
+ every space object memory creation;
+ every space mem object gets a
+ 'timestamp' from this; in DISCARD/
+ IMPORT this is used to check if we
+ should ignore an insert buffer merge
+ request */
+ UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ /*!< list of all file spaces */
+ ibool space_id_reuse_warned;
+					/*!< TRUE if fil_space_create()
+ has issued a warning about
+ potential space_id reuse */
+};
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+static fil_system_t* fil_system = NULL;
+
+/** Determine if (i) is a user tablespace id or not. */
+# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open)
+
+/** Determine if user has explicitly disabled fsync(). */
+#ifndef __WIN__
+# define fil_buffering_disabled(s) \
+ ((s)->purpose == FIL_TABLESPACE \
+ && srv_unix_file_flush_method \
+ == SRV_UNIX_O_DIRECT_NO_FSYNC)
+#else /* __WIN__ */
+# define fil_buffering_disabled(s) (0)
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
+/** Try fil_validate() every this many times */
+# define FIL_VALIDATE_SKIP 17
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache some of the time.
+@return TRUE if ok or the check was skipped */
+static
+ibool
+fil_validate_skip(void)
+/*===================*/
+{
+ /** The fil_validate() call skip counter. Use a signed type
+ because of the race condition below. */
+ static int fil_validate_count = FIL_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly fil_validate() check
+ in debug builds. */
+ if (--fil_validate_count > 0) {
+ return(TRUE);
+ }
+
+ fil_validate_count = FIL_VALIDATE_SKIP;
+ return(fil_validate());
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Determines if a file node belongs to the least-recently-used list.
+@return TRUE if the file belongs to the fil_system->LRU list. */
+UNIV_INLINE
+ibool
+fil_space_belongs_in_lru(
+/*=====================*/
+ const fil_space_t* space) /*!< in: file space */
+{
+ return(space->purpose == FIL_TABLESPACE
+ && fil_is_user_tablespace_id(space->id));
+}
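+
+/* A minimal model (illustration only; the node_pin_for_io and
+node_complete_io names are made up) of the pending-i/o bookkeeping
+described in the overview comment above: a node is taken off the LRU
+list while i/o is pending on it, and returned to the head of the list
+once the count of pending operations drops back to zero. The real
+implementation is in fil_node_prepare_for_io() and
+fil_node_complete_io() below. */
+#if 0
+static
+void
+node_pin_for_io(fil_node_t* node, fil_system_t* system)
+{
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
+		/* The node is in the LRU list, remove it */
+		UT_LIST_REMOVE(LRU, system->LRU, node);
+	}
+
+	node->n_pending++;
+}
+
+static
+void
+node_complete_io(fil_node_t* node, fil_system_t* system)
+{
+	ut_a(node->n_pending > 0);
+
+	node->n_pending--;
+
+	if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
+		/* Put the node back to the LRU list */
+		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+	}
+}
+#endif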
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex.
+@return false if the file can't be opened, otherwise true */
+static
+bool
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space); /*!< in: space */
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ ulint type); /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files.
+@return TRUE on success */
+static
+ibool
+fil_space_free(
+/*===========*/
+ ulint id, /* in: space id */
+ ibool x_latched); /* in: TRUE if caller has space->latch
+ in X mode */
+/********************************************************************//**
+Reads data from a space to a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+dberr_t
+fil_read(
+/*=====*/
+ bool sync, /*!< in: true if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /*!< in: how many bytes to read; this must not
+ cross a file boundary; in aio this must be a
+ block size multiple */
+ void* buf, /*!< in/out: buffer where to store data read;
+ in aio this must be appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
+ byte_offset, len, buf, message));
+}
+
+/********************************************************************//**
+Writes data to a space from a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+dberr_t
+fil_write(
+/*======*/
+ bool sync, /*!< in: true if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /*!< in: how many bytes to write; this must
+ not cross a file boundary; in aio this must
+ be a block size multiple */
+ void* buf, /*!< in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ ut_ad(!srv_read_only_mode);
+
+ return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
+ byte_offset, len, buf, message));
+}
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ HASH_SEARCH(hash, fil_system->spaces, id,
+ fil_space_t*, space,
+ ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+ space->id == id);
+
+ return(space);
+}
+
+/*******************************************************************//**
+Returns the table space by a given name, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_name(
+/*==================*/
+ const char* name) /*!< in: space name */
+{
+ fil_space_t* space;
+ ulint fold;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ fold = ut_fold_string(name);
+
+ HASH_SEARCH(name_hash, fil_system->name_hash, fold,
+ fil_space_t*, space,
+ ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+ !strcmp(name, space->name));
+
+ return(space);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ib_int64_t version = -1;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space) {
+ version = space->tablespace_version;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(version);
+}
+
+/*******************************************************************//**
+Returns the latch of a file space.
+@return latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ ulint id, /*!< in: space id */
+ ulint* flags) /*!< out: tablespace flags */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ if (flags) {
+ *flags = space->flags;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(&(space->latch));
+}
+
+/*******************************************************************//**
+Returns the type of a file space.
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(space->purpose);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Checks if all the file nodes in a space are flushed. The caller must hold
+the fil_system mutex.
+@return true if all are flushed */
+static
+bool
+fil_space_is_flushed(
+/*=================*/
+ fil_space_t* space) /*!< in: space */
+{
+ fil_node_t* node;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node) {
+ if (node->modification_counter > node->flush_counter) {
+
+ ut_ad(!fil_buffering_disabled(space));
+ return(false);
+ }
+
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+
+ return(true);
+}
+
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed.
+@return pointer to the file name, or NULL on error */
+UNIV_INTERN
+char*
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw) /*!< in: TRUE if a raw device or
+ a raw disk partition */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+
+ ut_a(fil_system);
+ ut_a(name);
+
+ mutex_enter(&fil_system->mutex);
+
+ node = static_cast<fil_node_t*>(mem_zalloc(sizeof(fil_node_t)));
+
+ node->name = mem_strdup(name);
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->sync_event = os_event_create();
+ node->is_raw_disk = is_raw;
+ node->size = size;
+ node->magic_n = FIL_NODE_MAGIC_N;
+
+ space = fil_space_get_by_id(id);
+
+ if (!space) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Could not find tablespace %lu for\n"
+ "InnoDB: file ", (ulong) id);
+ ut_print_filename(stderr, name);
+ fputs(" in the tablespace memory cache.\n", stderr);
+ mem_free(node->name);
+
+ mem_free(node);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(NULL);
+ }
+
+ space->size += size;
+
+ node->space = space;
+
+ UT_LIST_ADD_LAST(chain, space->chain, node);
+
+ if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
+
+ fil_system->max_assigned_id = id;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(node->name);
+}
+
+/********************************************************************//**
+Opens a file of a node of a tablespace. The caller must own the fil_system
+mutex.
+@return false if the file can't be opened, otherwise true */
+static
+bool
+fil_node_open_file(
+/*===============*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space */
+{
+ os_offset_t size_bytes;
+ ibool ret;
+ ibool success;
+ byte* buf2;
+ byte* page;
+ ulint space_id;
+ ulint flags;
+ ulint page_size;
+
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->n_pending == 0);
+ ut_a(node->open == FALSE);
+
+ if (node->size == 0) {
+ /* It must be a single-table tablespace and we do not know the
+ size of the file yet. First we open the file in the normal
+ mode, no async I/O here, for simplicity. Then do some checks,
+ and close the file again.
+ NOTE that we could not use the simple file read function
+ os_file_read() in Windows to read from a file opened for
+ async I/O! */
+
+ node->handle = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, node->name, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &success);
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+
+			ib_logf(IB_LOG_LEVEL_WARN, "InnoDB: Error: cannot "
+				"open %s.\n"
+				"InnoDB: Have you deleted .ibd "
+				"files under a running mysqld server?\n",
+				node->name);
+
+ return(false);
+ }
+
+ size_bytes = os_file_get_size(node->handle);
+ ut_a(size_bytes != (os_offset_t) -1);
+#ifdef UNIV_HOTBACKUP
+ if (space->id == 0) {
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ os_file_close(node->handle);
+ goto add_size;
+ }
+#endif /* UNIV_HOTBACKUP */
+ ut_a(space->purpose != FIL_LOG);
+ ut_a(fil_is_user_tablespace_id(space->id));
+
+ if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: the size of single-table"
+ " tablespace file %s\n"
+ "InnoDB: is only " UINT64PF ","
+ " should be at least %lu!\n",
+ node->name,
+ size_bytes,
+ (ulong) (FIL_IBD_FILE_INITIAL_SIZE
+ * UNIV_PAGE_SIZE));
+
+ ut_a(0);
+ }
+
+ /* Read the first page of the tablespace */
+
+ buf2 = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+ /* Align the memory for file i/o if we might have O_DIRECT
+ set */
+ page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
+
+ success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
+ space_id = fsp_header_get_space_id(page);
+ flags = fsp_header_get_flags(page);
+ page_size = fsp_flags_get_page_size(flags);
+
+ ut_free(buf2);
+
+ /* Close the file now that we have read the space id from it */
+
+ os_file_close(node->handle);
+
+ if (UNIV_UNLIKELY(space_id != space->id)) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace id is %lu"
+ " in the data dictionary\n"
+ "InnoDB: but in file %s it is %lu!\n",
+ space->id, node->name, space_id);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
+ || space_id == 0)) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace id %lu"
+ " in file %s is not sensible\n",
+ (ulong) space_id, node->name);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags)
+ != page_size)) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace file %s"
+ " has page size 0x%lx\n"
+ "InnoDB: but the data dictionary"
+ " expects page size 0x%lx!\n",
+				node->name, page_size,
+ fsp_flags_get_page_size(space->flags));
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(space->flags != flags)) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags are 0x%lx"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file %s are 0x%lx!\n",
+ space->flags, node->name, flags);
+
+ ut_error;
+ }
+
+ if (size_bytes >= 1024 * 1024) {
+ /* Truncate the size to whole megabytes. */
+ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+ }
+
+ if (!fsp_flags_is_compressed(flags)) {
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ } else {
+ node->size = (ulint)
+ (size_bytes
+ / fsp_flags_get_zip_size(flags));
+ }
+
+#ifdef UNIV_HOTBACKUP
+add_size:
+#endif /* UNIV_HOTBACKUP */
+ space->size += node->size;
+ }
+
+ /* printf("Opening file %s\n", node->name); */
+
+ /* Open the file for reading and writing, in Windows normally in the
+ unbuffered async I/O mode, though global variables may make
+	os_file_create() fall back to the normal file I/O mode. */
+
+ if (space->purpose == FIL_LOG) {
+ node->handle = os_file_create(innodb_file_log_key,
+ node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE,
+ &ret);
+ } else if (node->is_raw_disk) {
+ node->handle = os_file_create(innodb_file_data_key,
+ node->name,
+ OS_FILE_OPEN_RAW,
+ OS_FILE_AIO, OS_DATA_FILE,
+ &ret);
+ } else {
+ node->handle = os_file_create(innodb_file_data_key,
+ node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE,
+ &ret);
+ }
+
+ ut_a(ret);
+
+ node->open = TRUE;
+
+ system->n_open++;
+ fil_n_file_opened++;
+
+ if (fil_space_belongs_in_lru(space)) {
+
+ /* Put the node to the LRU list */
+ UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+ }
+
+ return(true);
+}
+
+/**********************************************************************//**
+Closes a file. */
+static
+void
+fil_node_close_file(
+/*================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system) /*!< in: tablespace memory cache */
+{
+ ibool ret;
+
+ ut_ad(node && system);
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->open);
+ ut_a(node->n_pending == 0);
+ ut_a(node->n_pending_flushes == 0);
+ ut_a(!node->being_extended);
+#ifndef UNIV_HOTBACKUP
+ ut_a(node->modification_counter == node->flush_counter
+ || srv_fast_shutdown == 2);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret = os_file_close(node->handle);
+ ut_a(ret);
+
+ /* printf("Closing file %s\n", node->name); */
+
+ node->open = FALSE;
+ ut_a(system->n_open > 0);
+ system->n_open--;
+ fil_n_file_opened--;
+
+ if (fil_space_belongs_in_lru(node->space)) {
+
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ /* The node is in the LRU list, remove it */
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+ }
+}
+
+/********************************************************************//**
+Tries to close a file in the LRU list. The caller must hold the fil_sys
+mutex.
+@return TRUE on success, FALSE if the caller should retry later; since
+i/o's generally complete in < 100 ms, and as InnoDB writes at most 128
+pages from the buffer pool in a batch and then immediately flushes the
+files, there is a good chance that on the next call we find a suitable
+node in the LRU list */
+static
+ibool
+fil_try_to_close_file_in_LRU(
+/*=========================*/
+ ibool print_info) /*!< in: if TRUE, prints information why it
+ cannot close a file */
+{
+ fil_node_t* node;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ if (print_info) {
+ fprintf(stderr,
+ "InnoDB: fil_sys open file LRU len %lu\n",
+ (ulong) UT_LIST_GET_LEN(fil_system->LRU));
+ }
+
+ for (node = UT_LIST_GET_LAST(fil_system->LRU);
+ node != NULL;
+ node = UT_LIST_GET_PREV(LRU, node)) {
+
+ if (node->modification_counter == node->flush_counter
+ && node->n_pending_flushes == 0
+ && !node->being_extended) {
+
+ fil_node_close_file(node, fil_system);
+
+ return(TRUE);
+ }
+
+ if (!print_info) {
+ continue;
+ }
+
+ if (node->n_pending_flushes > 0) {
+ fputs("InnoDB: cannot close file ", stderr);
+ ut_print_filename(stderr, node->name);
+ fprintf(stderr, ", because n_pending_flushes %lu\n",
+ (ulong) node->n_pending_flushes);
+ }
+
+ if (node->modification_counter != node->flush_counter) {
+ fputs("InnoDB: cannot close file ", stderr);
+ ut_print_filename(stderr, node->name);
+ fprintf(stderr,
+ ", because mod_count %ld != fl_count %ld\n",
+ (long) node->modification_counter,
+ (long) node->flush_counter);
+
+ }
+
+ if (node->being_extended) {
+ fputs("InnoDB: cannot close file ", stderr);
+ ut_print_filename(stderr, node->name);
+ fprintf(stderr, ", because it is being extended\n");
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Reserves the fil_system mutex and tries to make sure we can open at least one
+file while holding it. This should be called before calling
+fil_node_prepare_for_io(), because that function may need to open a file. */
+static
+void
+fil_mutex_enter_and_prepare_for_io(
+/*===============================*/
+ ulint space_id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ibool success;
+ ibool print_info = FALSE;
+ ulint count = 0;
+ ulint count2 = 0;
+
+retry:
+ mutex_enter(&fil_system->mutex);
+
+ if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+ /* We keep log files and system tablespace files always open;
+ this is important in preventing deadlocks in this module, as
+ a page read completion often performs another read from the
+ insert buffer. The insert buffer is in tablespace 0, and we
+ cannot end up waiting in this function. */
+
+ return;
+ }
+
+ space = fil_space_get_by_id(space_id);
+
+ if (space != NULL && space->stop_ios) {
+		/* We are going to rename a file and want to stop new i/o's
+ for a while */
+
+ if (count2 > 20000) {
+ fputs("InnoDB: Warning: tablespace ", stderr);
+ ut_print_filename(stderr, space->name);
+ fprintf(stderr,
+ " has i/o ops stopped for a long time %lu\n",
+ (ulong) count2);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+
+ /* Wake the i/o-handler threads to make sure pending
+ i/o's are performed */
+ os_aio_simulated_wake_handler_threads();
+
+ /* The sleep here is just to give IO helper threads a
+		bit of time to do some work. It is not required that
+		all IO related to the tablespace being renamed be
+		flushed here, as we do fil_flush() in
+		fil_rename_tablespace() as well. */
+ os_thread_sleep(20000);
+
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Flush tablespaces so that we can close modified
+ files in the LRU list */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ os_thread_sleep(20000);
+
+ count2++;
+
+ goto retry;
+ }
+
+ if (fil_system->n_open < fil_system->max_n_open) {
+
+ return;
+ }
+
+ /* If the file is already open, no need to do anything; if the space
+ does not exist, we handle the situation in the function which called
+ this function */
+
+ if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+
+ return;
+ }
+
+ if (count > 1) {
+ print_info = TRUE;
+ }
+
+ /* Too many files are open, try to close some */
+close_more:
+ success = fil_try_to_close_file_in_LRU(print_info);
+
+ if (success && fil_system->n_open >= fil_system->max_n_open) {
+
+ goto close_more;
+ }
+
+ if (fil_system->n_open < fil_system->max_n_open) {
+ /* Ok */
+
+ return;
+ }
+
+ if (count >= 2) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: too many (%lu) files stay open"
+ " while the maximum\n"
+ "InnoDB: allowed value would be %lu.\n"
+ "InnoDB: You may need to raise the value of"
+ " innodb_open_files in\n"
+ "InnoDB: my.cnf.\n",
+ (ulong) fil_system->n_open,
+ (ulong) fil_system->max_n_open);
+
+ return;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Wake the i/o-handler threads to make sure pending i/o's are
+ performed */
+ os_aio_simulated_wake_handler_threads();
+
+ os_thread_sleep(20000);
+#endif /* !UNIV_HOTBACKUP */
+ /* Flush tablespaces so that we can close modified files in the LRU
+ list */
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ count++;
+
+ goto retry;
+}
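+
+/* Summary of the retry protocol of fil_mutex_enter_and_prepare_for_io()
+above (illustration only):
+
+1. log files and system tablespace files are kept open at all times, so
+for those spaces we return at once;
+2. if the space has i/o stopped for a pending rename, we release the
+mutex, let the i/o handler threads and flushing catch up, and retry;
+3. if too many files are open, we close unmodified files from the LRU
+list, flush tablespaces if that is not enough, warn after repeated
+attempts, and retry. */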
+
+/*******************************************************************//**
+Frees a file node object from a tablespace memory cache. */
+static
+void
+fil_node_free(
+/*==========*/
+ fil_node_t* node, /*!< in, own: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space where the file node is chained */
+{
+ ut_ad(node && system && space);
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+ ut_a(node->n_pending == 0);
+ ut_a(!node->being_extended);
+
+ if (node->open) {
+ /* We fool the assertion in fil_node_close_file() to think
+ there are no unflushed modifications in the file */
+
+ node->modification_counter = node->flush_counter;
+ os_event_set(node->sync_event);
+
+ if (fil_buffering_disabled(space)) {
+
+ ut_ad(!space->is_in_unflushed_spaces);
+ ut_ad(fil_space_is_flushed(space));
+
+ } else if (space->is_in_unflushed_spaces
+ && fil_space_is_flushed(space)) {
+
+ space->is_in_unflushed_spaces = false;
+
+ UT_LIST_REMOVE(unflushed_spaces,
+ system->unflushed_spaces,
+ space);
+ }
+
+ fil_node_close_file(node, system);
+ }
+
+ space->size -= node->size;
+
+ UT_LIST_REMOVE(chain, space->chain, node);
+
+ os_event_free(node->sync_event);
+ mem_free(node->name);
+ mem_free(node);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len) /*!< in: truncate by this much; it is an error
+ if this does not equal to the combined size of
+ some initial files in the space */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ while (trunc_len > 0) {
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+ trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+ fil_node_free(node, fil_system, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/*******************************************************************//**
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ ulint purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+ fil_space_t* space;
+
+ DBUG_EXECUTE_IF("fil_space_create_failure", return(false););
+
+ ut_a(fil_system);
+ ut_a(fsp_flags_is_valid(flags));
+
+ /* Look for a matching tablespace and if found free it. */
+ do {
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_name(name);
+
+ if (space != 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Tablespace '%s' exists in the cache "
+ "with id %lu != %lu",
+ name, (ulong) space->id, (ulong) id);
+
+ if (id == 0 || purpose != FIL_TABLESPACE) {
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Freeing existing tablespace '%s' entry "
+ "from the cache with id %lu",
+ name, (ulong) id);
+
+ ibool success = fil_space_free(space->id, FALSE);
+ ut_a(success);
+
+ mutex_exit(&fil_system->mutex);
+ }
+
+ } while (space != 0);
+
+ space = fil_space_get_by_id(id);
+
+ if (space != 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Trying to add tablespace '%s' with id %lu "
+ "to the tablespace memory cache, but tablespace '%s' "
+ "with id %lu already exists in the cache!",
+ name, (ulong) id, space->name, (ulong) space->id);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ space = static_cast<fil_space_t*>(mem_zalloc(sizeof(*space)));
+
+ space->name = mem_strdup(name);
+ space->id = id;
+
+ fil_system->tablespace_version++;
+ space->tablespace_version = fil_system->tablespace_version;
+ space->mark = FALSE;
+
+ if (purpose == FIL_TABLESPACE && !recv_recovery_on
+ && id > fil_system->max_assigned_id) {
+
+ if (!fil_system->space_id_reuse_warned) {
+ fil_system->space_id_reuse_warned = TRUE;
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Allocated tablespace %lu, old maximum "
+ "was %lu",
+ (ulong) id,
+ (ulong) fil_system->max_assigned_id);
+ }
+
+ fil_system->max_assigned_id = id;
+ }
+
+ space->purpose = purpose;
+ space->flags = flags;
+
+ space->magic_n = FIL_SPACE_MAGIC_N;
+
+ rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
+
+ HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
+
+ HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(name), space);
+ space->is_in_unflushed_spaces = false;
+
+ UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+}
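+
+/* Illustrative usage sketch (not called from anywhere; space_id, flags
+and size_in_pages are made-up names): registering a single-table
+tablespace in the cache and appending its single data file to the
+chain. */
+#if 0
+	if (fil_space_create("test/t1", space_id, flags, FIL_TABLESPACE)) {
+		/* size in database pages; FALSE: not a raw device */
+		fil_node_create("test/t1.ibd", size_in_pages,
+				space_id, FALSE);
+	}
+#endif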
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id) /*!< in/out: space id */
+{
+ ulint id;
+ ibool success;
+
+ mutex_enter(&fil_system->mutex);
+
+ id = *space_id;
+
+ if (id < fil_system->max_assigned_id) {
+ id = fil_system->max_assigned_id;
+ }
+
+ id++;
+
+ if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: Warning: you are running out of new"
+ " single-table tablespace id's.\n"
+ "InnoDB: Current counter is %lu and it"
+ " must not exceed %lu!\n"
+ "InnoDB: To reset the counter to zero"
+ " you have to dump all your tables and\n"
+ "InnoDB: recreate the whole InnoDB installation.\n",
+ (ulong) id,
+ (ulong) SRV_LOG_SPACE_FIRST_ID);
+ }
+
+ success = (id < SRV_LOG_SPACE_FIRST_ID);
+
+ if (success) {
+ *space_id = fil_system->max_assigned_id = id;
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: You have run out of single-table"
+ " tablespace id's!\n"
+ "InnoDB: Current counter is %lu.\n"
+ "InnoDB: To reset the counter to zero you"
+ " have to dump all your tables and\n"
+ "InnoDB: recreate the whole InnoDB installation.\n",
+ (ulong) id);
+ *space_id = ULINT_UNDEFINED;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(success);
+}
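+
+/* Illustrative usage sketch: pass in the current id (or 0) and, on
+success, read back the newly assigned one. */
+#if 0
+	ulint	new_id = 0;
+
+	if (fil_assign_new_space_id(&new_id)) {
+		/* new_id now holds an unused tablespace id below
+		SRV_LOG_SPACE_FIRST_ID */
+	}
+#endif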
+
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files.
+@return TRUE if success */
+static
+ibool
+fil_space_free(
+/*===========*/
+ /* out: TRUE if success */
+ ulint id, /* in: space id */
+ ibool x_latched) /* in: TRUE if caller has space->latch
+ in X mode */
+{
+ fil_space_t* space;
+ fil_space_t* fnamespace;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ space = fil_space_get_by_id(id);
+
+ if (!space) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to remove tablespace %lu"
+ " from the cache but\n"
+ "InnoDB: it is not there.\n", (ulong) id);
+
+ return(FALSE);
+ }
+
+ HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
+
+ fnamespace = fil_space_get_by_name(space->name);
+ ut_a(fnamespace);
+ ut_a(space == fnamespace);
+
+ HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(space->name), space);
+
+ if (space->is_in_unflushed_spaces) {
+
+ ut_ad(!fil_buffering_disabled(space));
+ space->is_in_unflushed_spaces = false;
+
+ UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces,
+ space);
+ }
+
+ UT_LIST_REMOVE(space_list, fil_system->space_list, space);
+
+ ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+ ut_a(0 == space->n_pending_flushes);
+
+ for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain);
+ fil_node != NULL;
+ fil_node = UT_LIST_GET_FIRST(space->chain)) {
+
+ fil_node_free(fil_node, fil_system, space);
+ }
+
+ ut_a(0 == UT_LIST_GET_LEN(space->chain));
+
+ if (x_latched) {
+ rw_lock_x_unlock(&space->latch);
+ }
+
+ rw_lock_free(&(space->latch));
+
+ mem_free(space->name);
+ mem_free(space);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Returns a pointer to the fil_space_t that is in the memory cache
+associated with a space id. The caller must lock fil_system->mutex.
+@return fil_space_t pointer, NULL if space not found */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_space(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+
+ ut_ad(fil_system);
+
+ space = fil_space_get_by_id(id);
+ if (space == NULL) {
+ return(NULL);
+ }
+
+ if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
+ ut_a(id != 0);
+
+ mutex_exit(&fil_system->mutex);
+
+ /* It is possible that the space gets evicted at this point
+ before the fil_mutex_enter_and_prepare_for_io() acquires
+ the fil_system->mutex. Check for this after completing the
+ call to fil_mutex_enter_and_prepare_for_io(). */
+ fil_mutex_enter_and_prepare_for_io(id);
+
+ /* We are still holding the fil_system->mutex. Check if
+ the space is still in memory cache. */
+ space = fil_space_get_by_id(id);
+ if (space == NULL) {
+ return(NULL);
+ }
+
+ /* The following code must change when InnoDB supports
+ multiple datafiles per tablespace. */
+ ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ /* It must be a single-table tablespace and we have not opened
+ the file yet; the following calls will open it and update the
+ size fields */
+
+ if (!fil_node_prepare_for_io(node, fil_system, space)) {
+ /* The single-table tablespace can't be opened,
+ because the ibd file is missing. */
+ return(NULL);
+ }
+ fil_node_complete_io(node, fil_system, OS_FILE_READ);
+ }
+
+ return(space);
+}
+
+/*******************************************************************//**
+Returns the path from the first fil_node_t found for the given space ID.
+The caller is responsible for freeing the memory allocated here for the
+value returned.
+@return own: A copy of fil_node_t::name, NULL if the space ID is zero
+or not found. */
+UNIV_INTERN
+char*
+fil_space_get_first_path(
+/*=====================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ char* path;
+
+ ut_ad(fil_system);
+ ut_a(id);
+
+ fil_mutex_enter_and_prepare_for_io(id);
+
+ space = fil_space_get_space(id);
+
+ if (space == NULL) {
+ mutex_exit(&fil_system->mutex);
+
+ return(NULL);
+ }
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ path = mem_strdup(node->name);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(path);
+}
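+
+/* Editorial sketch, not part of the original change: how a caller is
+expected to consume the path returned above; the space id 10 is purely
+illustrative. Disabled with #if 0 so it is never compiled. */
+#if 0
+static void
+fil_first_path_usage_sketch(void)
+{
+	char*	path = fil_space_get_first_path(10);
+
+	if (path != NULL) {
+		fprintf(stderr, "InnoDB: first datafile: %s\n", path);
+		mem_free(path);	/* the caller owns the copy */
+	}
+}
+#endif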
+
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ulint size;
+
+ ut_ad(fil_system);
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_space(id);
+
+ size = space ? space->size : 0;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(size);
+}
+
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ulint flags;
+
+ ut_ad(fil_system);
+
+ if (!id) {
+ return(0);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_space(id);
+
+ if (space == NULL) {
+ mutex_exit(&fil_system->mutex);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ flags = space->flags;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(flags);
+}
+
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return compressed page size, 0 if the space is not compressed,
+ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(fsp_flags_get_zip_size(flags));
+ }
+
+ return(flags);
+}
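+
+/* Editorial sketch, not part of the original change: the three possible
+outcomes of fil_space_get_zip_size(); disabled with #if 0. */
+#if 0
+static void
+fil_zip_size_usage_sketch(ulint id)
+{
+	ulint	zip_size = fil_space_get_zip_size(id);
+
+	if (zip_size == ULINT_UNDEFINED) {
+		/* The tablespace is not in the memory cache. */
+	} else if (zip_size == 0) {
+		/* Uncompressed: pages are UNIV_PAGE_SIZE bytes. */
+	} else {
+		/* Compressed: pages occupy zip_size bytes on disk. */
+	}
+}
+#endif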
+
+/*******************************************************************//**
+Checks if the pair (space, page_no) refers to an existing page in the
+file of a tablespace. The tablespace must be cached in the memory cache.
+@return TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ if (fil_space_get_size(id) > page_no) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+ ulint hash_size, /*!< in: hash table size */
+ ulint max_n_open) /*!< in: max number of open files */
+{
+ ut_a(fil_system == NULL);
+
+ ut_a(hash_size > 0);
+ ut_a(max_n_open > 0);
+
+ fil_system = static_cast<fil_system_t*>(
+ mem_zalloc(sizeof(fil_system_t)));
+
+ mutex_create(fil_system_mutex_key,
+ &fil_system->mutex, SYNC_ANY_LATCH);
+
+ fil_system->spaces = hash_create(hash_size);
+ fil_system->name_hash = hash_create(hash_size);
+
+ UT_LIST_INIT(fil_system->LRU);
+
+ fil_system->max_n_open = max_n_open;
+}
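+
+/* Editorial sketch, not part of the original change: a plausible startup
+call; both argument values are illustrative and not the server defaults.
+Disabled with #if 0. */
+#if 0
+static void
+fil_init_usage_sketch(void)
+{
+	/* Must run exactly once, before any other fil_* function;
+	fil_init() asserts that fil_system is still NULL. */
+	fil_init(5000 /* hash_size */, 300 /* max_n_open */);
+}
+#endif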
+
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void)
+/*==========================================*/
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+ space != NULL;
+ space = UT_LIST_GET_NEXT(space_list, space)) {
+
+ fil_node_t* node;
+
+ if (fil_space_belongs_in_lru(space)) {
+
+ continue;
+ }
+
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+ if (!node->open) {
+ if (!fil_node_open_file(node, fil_system,
+ space)) {
+					/* This function is called
+					during server startup. If any
+					log or system tablespace file
+					is missing, the server cannot
+					start successfully, so we
+					assert here. */
+ ut_a(0);
+ }
+ }
+
+ if (fil_system->max_n_open < 10 + fil_system->n_open) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: you must"
+ " raise the value of"
+ " innodb_open_files in\n"
+ "InnoDB: my.cnf! Remember that"
+ " InnoDB keeps all log files"
+ " and all system\n"
+ "InnoDB: tablespace files open"
+ " for the whole time mysqld is"
+ " running, and\n"
+ "InnoDB: needs to open also"
+ " some .ibd files if the"
+ " file-per-table storage\n"
+ "InnoDB: model is used."
+ " Current open files %lu,"
+ " max allowed"
+ " open files %lu.\n",
+ (ulong) fil_system->n_open,
+ (ulong) fil_system->max_n_open);
+ }
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void)
+/*=====================*/
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space != NULL) {
+ fil_node_t* node;
+ fil_space_t* prev_space = space;
+
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+ if (node->open) {
+ fil_node_close_file(node, fil_system);
+ }
+ }
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+
+ fil_space_free(prev_space->id, FALSE);
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
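+
+/* Editorial note on the loop above: the next list pointer is saved
+before fil_space_free() runs, because freeing prev_space also unlinks
+it from fil_system->space_list; advancing the cursor afterwards would
+read freed memory. The same idiom recurs in fil_close_log_files()
+below. */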
+
+/*******************************************************************//**
+Closes the redo log files. There must not be any pending i/o's or
+unflushed modifications in the files. */
+UNIV_INTERN
+void
+fil_close_log_files(
+/*================*/
+ bool free) /*!< in: whether to free the memory object */
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space != NULL) {
+ fil_node_t* node;
+ fil_space_t* prev_space = space;
+
+ if (space->purpose != FIL_LOG) {
+ space = UT_LIST_GET_NEXT(space_list, space);
+ continue;
+ }
+
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+ if (node->open) {
+ fil_node_close_file(node, fil_system);
+ }
+ }
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+
+ if (free) {
+ fil_space_free(prev_space->id, FALSE);
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id) /*!< in: maximum known id */
+{
+ if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: max tablespace id"
+ " is too high, %lu\n", (ulong) max_id);
+ ut_error;
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ if (fil_system->max_assigned_id < max_id) {
+
+ fil_system->max_assigned_id = max_id;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page header
+of the first page of a data file of the system tablespace (space 0),
+which is uncompressed. */
+static __attribute__((warn_unused_result))
+dberr_t
+fil_write_lsn_and_arch_no_to_file(
+/*==============================*/
+ ulint space, /*!< in: space to write to */
+ ulint sum_of_sizes, /*!< in: combined size of previous files
+ in space, in database pages */
+ lsn_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no __attribute__((unused)))
+ /*!< in: archived log number to write */
+{
+ byte* buf1;
+ byte* buf;
+ dberr_t err;
+
+ buf1 = static_cast<byte*>(mem_alloc(2 * UNIV_PAGE_SIZE));
+ buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
+
+ err = fil_read(TRUE, space, 0, sum_of_sizes, 0,
+ UNIV_PAGE_SIZE, buf, NULL);
+ if (err == DB_SUCCESS) {
+ mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+
+ err = fil_write(TRUE, space, 0, sum_of_sizes, 0,
+ UNIV_PAGE_SIZE, buf, NULL);
+ }
+
+ mem_free(buf1);
+
+ return(err);
+}
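+
+/* Editorial note on the buffer handling above: mem_alloc() makes no
+alignment promise, so twice UNIV_PAGE_SIZE bytes are allocated and
+ut_align() rounds the pointer up to the next UNIV_PAGE_SIZE boundary;
+the aligned window is then guaranteed to still hold one full page,
+which is what raw-device and O_DIRECT style i/o require. For example,
+with 16 KiB pages and buf1 == 0x7f0000003000, buf becomes
+0x7f0000004000, leaving 28 KiB of the 32 KiB allocation usable. */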
+
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ lsn_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no) /*!< in: latest archived log file number */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ dberr_t err;
+
+ mutex_enter(&fil_system->mutex);
+
+ for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+ space != NULL;
+ space = UT_LIST_GET_NEXT(space_list, space)) {
+
+ /* We only write the lsn to all existing data files which have
+ been open during the lifetime of the mysqld process; they are
+ represented by the space objects in the tablespace memory
+ cache. Note that all data files in the system tablespace 0
+ and the UNDO log tablespaces (if separate) are always open. */
+
+ if (space->purpose == FIL_TABLESPACE
+ && !fil_is_user_tablespace_id(space->id)) {
+ ulint sum_of_sizes = 0;
+
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+ mutex_exit(&fil_system->mutex);
+
+ err = fil_write_lsn_and_arch_no_to_file(
+ space->id, sum_of_sizes, lsn,
+ arch_log_no);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ sum_of_sizes += node->size;
+ }
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Checks the consistency of the first data page of a tablespace
+at database startup.
+@retval NULL on success, or if innodb_force_recovery is set
+@return pointer to an error message string */
+static __attribute__((warn_unused_result))
+const char*
+fil_check_first_page(
+/*=================*/
+ const page_t* page) /*!< in: data page */
+{
+ ulint space_id;
+ ulint flags;
+
+ if (srv_force_recovery >= SRV_FORCE_IGNORE_CORRUPT) {
+ return(NULL);
+ }
+
+ space_id = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page);
+ flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+
+ if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) {
+ return("innodb-page-size mismatch");
+ }
+
+ if (!space_id && !flags) {
+ ulint nonzero_bytes = UNIV_PAGE_SIZE;
+ const byte* b = page;
+
+ while (!*b && --nonzero_bytes) {
+ b++;
+ }
+
+ if (!nonzero_bytes) {
+ return("space header page consists of zero bytes");
+ }
+ }
+
+ if (buf_page_is_corrupted(
+ false, page, fsp_flags_get_zip_size(flags))) {
+ return("checksum mismatch");
+ }
+
+ if (page_get_space_id(page) == space_id
+ && page_get_page_no(page) == 0) {
+ return(NULL);
+ }
+
+ return("inconsistent data in space header");
+}
+
+/*******************************************************************//**
+Reads the flushed lsn, arch no, space_id and tablespace flag fields from
+the first page of a data file at database startup.
+@retval NULL on success, or if innodb_force_recovery is set
+@return pointer to an error message string */
+UNIV_INTERN
+const char*
+fil_read_first_page(
+/*================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+ ulint* flags, /*!< out: tablespace flags */
+ ulint* space_id, /*!< out: tablespace ID */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no, /*!< out: min of archived
+ log numbers in data files */
+ ulint* max_arch_log_no, /*!< out: max of archived
+ log numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t* min_flushed_lsn, /*!< out: min of flushed
+ lsn values in data files */
+ lsn_t* max_flushed_lsn) /*!< out: max of flushed
+ lsn values in data files */
+{
+ byte* buf;
+ byte* page;
+ lsn_t flushed_lsn;
+ const char* check_msg = NULL;
+
+ buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+
+ /* Align the memory for a possible read from a raw device */
+
+ page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+ os_file_read(data_file, page, 0, UNIV_PAGE_SIZE);
+
+ /* The FSP_HEADER on page 0 is only valid for the first file
+ in a tablespace. So if this is not the first datafile, leave
+ *flags and *space_id as they were read from the first file and
+ do not validate the first page. */
+ if (!one_read_already) {
+ *flags = fsp_header_get_flags(page);
+ *space_id = fsp_header_get_space_id(page);
+
+ check_msg = fil_check_first_page(page);
+ }
+
+ flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN);
+
+ ut_free(buf);
+
+ if (check_msg) {
+ return(check_msg);
+ }
+
+ if (!one_read_already) {
+ *min_flushed_lsn = flushed_lsn;
+ *max_flushed_lsn = flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+ *min_arch_log_no = arch_log_no;
+ *max_arch_log_no = arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+ return(NULL);
+ }
+
+ if (*min_flushed_lsn > flushed_lsn) {
+ *min_flushed_lsn = flushed_lsn;
+ }
+ if (*max_flushed_lsn < flushed_lsn) {
+ *max_flushed_lsn = flushed_lsn;
+ }
+#ifdef UNIV_LOG_ARCHIVE
+ if (*min_arch_log_no > arch_log_no) {
+ *min_arch_log_no = arch_log_no;
+ }
+ if (*max_arch_log_no < arch_log_no) {
+ *max_arch_log_no = arch_log_no;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ return(NULL);
+}
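+
+/* Editorial sketch, not part of the original change: accumulating the
+min/max flushed lsn over two system tablespace files. The file handles
+are hypothetical, the returned error strings should be checked by real
+callers, and the UNIV_LOG_ARCHIVE parameters are omitted. Disabled
+with #if 0. */
+#if 0
+static void
+fil_read_first_page_usage_sketch(os_file_t file0, os_file_t file1)
+{
+	ulint	flags;
+	ulint	space_id;
+	lsn_t	min_lsn;
+	lsn_t	max_lsn;
+
+	/* First file: one_read_already == FALSE seeds min/max. */
+	fil_read_first_page(file0, FALSE, &flags, &space_id,
+			    &min_lsn, &max_lsn);
+
+	/* Later files: one_read_already == TRUE only widens the range
+	and skips the first-page validation. */
+	fil_read_first_page(file1, TRUE, &flags, &space_id,
+			    &min_lsn, &max_lsn);
+}
+#endif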
+
+/*================ SINGLE-TABLE TABLESPACES ==========================*/
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Increments the count of pending operations, if the space is not being deleted.
+@return TRUE if being deleted, and operation should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ops(
+/*================*/
+ ulint id, /*!< in: space id */
+	ibool	print_err)	/*!< in: whether to print an error message */
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ if (print_err) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to do an operation on a"
+ " dropped tablespace %lu\n",
+ (ulong) id);
+ }
+ }
+
+ if (space == NULL || space->stop_new_ops) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ space->n_pending_ops++;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Decrements the count of pending operations. */
+UNIV_INTERN
+void
+fil_decr_pending_ops(
+/*=================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ fprintf(stderr,
+ "InnoDB: Error: decrementing pending operation"
+ " of a dropped tablespace %lu\n",
+ (ulong) id);
+ }
+
+ if (space != NULL) {
+ space->n_pending_ops--;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
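+
+/* Editorial sketch, not part of the original change: the intended
+pairing of fil_inc_pending_ops() and fil_decr_pending_ops() around
+work on a tablespace that might be dropped concurrently. Disabled
+with #if 0. */
+#if 0
+static void
+fil_pending_ops_usage_sketch(ulint id)
+{
+	if (fil_inc_pending_ops(id, TRUE)) {
+		/* The tablespace is missing or being deleted: skip. */
+		return;
+	}
+
+	/* ... perform the change buffer merge or similar work ... */
+
+	fil_decr_pending_ops(id);
+}
+#endif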
+
+/********************************************************//**
+Creates the database directory for a table if it does not exist yet. */
+static
+void
+fil_create_directory_for_tablename(
+/*===============================*/
+ const char* name) /*!< in: name in the standard
+ 'databasename/tablename' format */
+{
+ const char* namend;
+ char* path;
+ ulint len;
+
+ len = strlen(fil_path_to_mysql_datadir);
+ namend = strchr(name, '/');
+ ut_a(namend);
+ path = static_cast<char*>(mem_alloc(len + (namend - name) + 2));
+
+ memcpy(path, fil_path_to_mysql_datadir, len);
+ path[len] = '/';
+ memcpy(path + len + 1, name, namend - name);
+ path[len + (namend - name) + 1] = 0;
+
+ srv_normalize_path_for_win(path);
+
+ ut_a(os_file_create_directory(path, FALSE));
+ mem_free(path);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes a log record about an .ibd file create/rename/delete. */
+static
+void
+fil_op_write_log(
+/*=============*/
+ ulint type, /*!< in: MLOG_FILE_CREATE,
+ MLOG_FILE_CREATE2,
+ MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id, /*!< in: space id */
+ ulint log_flags, /*!< in: redo log flags (stored
+ in the page number field) */
+ ulint flags, /*!< in: compressed page size
+ and file format
+ if type==MLOG_FILE_CREATE2, or 0 */
+ const char* name, /*!< in: table name in the familiar
+ 'databasename/tablename' format, or
+ the file path in the case of
+ MLOG_FILE_DELETE */
+ const char* new_name, /*!< in: if type is MLOG_FILE_RENAME,
+ the new table name in the
+ 'databasename/tablename' format */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+ ulint len;
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 1);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_for_file_op(
+ type, space_id, log_flags, log_ptr, mtr);
+ if (type == MLOG_FILE_CREATE2) {
+ mach_write_to_4(log_ptr, flags);
+ log_ptr += 4;
+ }
+ /* Let us store the strings as null-terminated for easier readability
+ and handling */
+
+ len = strlen(name) + 1;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, (byte*) name, len);
+
+ if (type == MLOG_FILE_RENAME) {
+ len = strlen(new_name) + 1;
+ log_ptr = mlog_open(mtr, 2 + len);
+ ut_a(log_ptr);
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, (byte*) new_name, len);
+ }
+}
+#endif
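+
+/* Editorial note on the record layout written above, after the header
+produced by mlog_write_initial_log_record_for_file_op():
+
+	4 bytes		flags		only if type == MLOG_FILE_CREATE2
+	2 bytes		len		strlen(name) + 1
+	len bytes	name		null-terminated
+	2 bytes		new_len		only if type == MLOG_FILE_RENAME
+	new_len bytes	new_name	null-terminated, rename only
+
+fil_op_log_parse_or_replay() below consumes exactly this layout. */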
+
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to
+the datadir that we should use in replaying the file operations.
+
+InnoDB recovery does not replay these fully since it always sets the space id
+to zero. But mysqlbackup does replay them. TODO: If remote tablespaces are
+used, mysqlbackup will only create tables in the default directory since
+MLOG_FILE_CREATE and MLOG_FILE_CREATE2 only know the tablename, not the path.
+
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+			not fit completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags) /*!< in: redo log flags
+ (stored in the page number parameter) */
+{
+ ulint name_len;
+ ulint new_name_len;
+ const char* name;
+ const char* new_name = NULL;
+ ulint flags = 0;
+
+ if (type == MLOG_FILE_CREATE2) {
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_4(ptr);
+ ptr += 4;
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ name_len = mach_read_from_2(ptr);
+
+ ptr += 2;
+
+ if (end_ptr < ptr + name_len) {
+
+ return(NULL);
+ }
+
+ name = (const char*) ptr;
+
+ ptr += name_len;
+
+ if (type == MLOG_FILE_RENAME) {
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ new_name_len = mach_read_from_2(ptr);
+
+ ptr += 2;
+
+ if (end_ptr < ptr + new_name_len) {
+
+ return(NULL);
+ }
+
+ new_name = (const char*) ptr;
+
+ ptr += new_name_len;
+ }
+
+ /* We managed to parse a full log record body */
+ /*
+ printf("Parsed log rec of type %lu space %lu\n"
+ "name %s\n", type, space_id, name);
+
+ if (type == MLOG_FILE_RENAME) {
+ printf("new name %s\n", new_name);
+ }
+ */
+ if (!space_id) {
+ return(ptr);
+ }
+
+ /* Let us try to perform the file operation, if sensible. Note that
+ mysqlbackup has at this stage already read in all space id info to the
+ fil0fil.cc data structures.
+
+ NOTE that our algorithm is not guaranteed to work correctly if there
+ were renames of tables during the backup. See mysqlbackup code for more
+ on the problem. */
+
+ switch (type) {
+ case MLOG_FILE_DELETE:
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ dberr_t err = fil_delete_tablespace(
+ space_id, BUF_REMOVE_FLUSH_NO_WRITE);
+ ut_a(err == DB_SUCCESS);
+ }
+
+ break;
+
+ case MLOG_FILE_RENAME:
+ /* In order to replay the rename, the following must hold:
+ * The new name is not already used.
+ * A tablespace is open in memory with the old name.
+		* The space ID for that tablespace matches this log entry.
+ This will prevent unintended renames during recovery. */
+
+ if (fil_get_space_id_for_table(new_name) == ULINT_UNDEFINED
+ && space_id == fil_get_space_id_for_table(name)) {
+ /* Create the database directory for the new name, if
+ it does not exist yet */
+ fil_create_directory_for_tablename(new_name);
+
+ if (!fil_rename_tablespace(name, space_id,
+ new_name, NULL)) {
+ ut_error;
+ }
+ }
+
+ break;
+
+ case MLOG_FILE_CREATE:
+ case MLOG_FILE_CREATE2:
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ /* Do nothing */
+ } else if (fil_get_space_id_for_table(name)
+ != ULINT_UNDEFINED) {
+ /* Do nothing */
+ } else if (log_flags & MLOG_FILE_FLAG_TEMP) {
+ /* Temporary table, do nothing */
+ } else {
+ const char* path = NULL;
+
+ /* Create the database directory for name, if it does
+ not exist yet */
+ fil_create_directory_for_tablename(name);
+
+ if (fil_create_new_single_table_tablespace(
+ space_id, name, path, flags,
+ DICT_TF2_USE_TABLESPACE,
+ FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+ ut_error;
+ }
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Allocates a file name for an EXPORT/IMPORT .cfg config file. The
+string must be freed by the caller with mem_free().
+@return own: file name */
+static
+char*
+fil_make_cfg_name(
+/*==============*/
+ const char* filepath) /*!< in: .ibd file name */
+{
+ char* cfg_name;
+
+	/* Create the .cfg file path by replacing the .ibd suffix
+	with .cfg. */
+
+ ut_ad(strlen(filepath) > 4);
+
+ cfg_name = mem_strdup(filepath);
+ ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg");
+ return(cfg_name);
+}
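+
+/* Editorial note, with an illustrative path: fil_make_cfg_name()
+overwrites the final "ibd" of the copied string in place, so
+"./test/t1.ibd" becomes "./test/t1.cfg"; the ut_ad() above guards
+against inputs shorter than the ".ibd" suffix. */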
+
+/*******************************************************************//**
+Check for change buffer merges.
+@return 0 if no merges else count + 1. */
+static
+ulint
+fil_ibuf_check_pending_ops(
+/*=======================*/
+ fil_space_t* space, /*!< in/out: Tablespace to check */
+ ulint count) /*!< in: number of attempts so far */
+{
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ if (space != 0 && space->n_pending_ops != 0) {
+
+ if (count > 5000) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Trying to close/delete tablespace "
+ "'%s' but there are %lu pending change "
+ "buffer merges on it.",
+ space->name,
+ (ulong) space->n_pending_ops);
+ }
+
+ return(count + 1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Check for pending IO.
+@return 0 if no pending else count + 1. */
+static
+ulint
+fil_check_pending_io(
+/*=================*/
+ fil_space_t* space, /*!< in/out: Tablespace to check */
+ fil_node_t** node, /*!< out: Node in space list */
+ ulint count) /*!< in: number of attempts so far */
+{
+ ut_ad(mutex_own(&fil_system->mutex));
+ ut_a(space->n_pending_ops == 0);
+
+ /* The following code must change when InnoDB supports
+ multiple datafiles per tablespace. */
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+
+ *node = UT_LIST_GET_FIRST(space->chain);
+
+ if (space->n_pending_flushes > 0 || (*node)->n_pending > 0) {
+
+ ut_a(!(*node)->being_extended);
+
+ if (count > 1000) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Trying to close/delete tablespace '%s' "
+ "but there are %lu flushes "
+				"and %lu pending i/o's on it.",
+ space->name,
+ (ulong) space->n_pending_flushes,
+ (ulong) (*node)->n_pending);
+ }
+
+ return(count + 1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Check pending operations on a tablespace.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+fil_check_pending_operations(
+/*=========================*/
+ ulint id, /*!< in: space id */
+ fil_space_t** space, /*!< out: tablespace instance in memory */
+ char** path) /*!< out/own: tablespace path */
+{
+ ulint count = 0;
+
+ ut_a(id != TRX_SYS_SPACE);
+ ut_ad(space);
+
+ *space = 0;
+
+ mutex_enter(&fil_system->mutex);
+ fil_space_t* sp = fil_space_get_by_id(id);
+ if (sp) {
+ sp->stop_new_ops = TRUE;
+ }
+ mutex_exit(&fil_system->mutex);
+
+ /* Check for pending change buffer merges. */
+
+ do {
+ mutex_enter(&fil_system->mutex);
+
+ sp = fil_space_get_by_id(id);
+
+ count = fil_ibuf_check_pending_ops(sp, count);
+
+ mutex_exit(&fil_system->mutex);
+
+ if (count > 0) {
+ os_thread_sleep(20000);
+ }
+
+ } while (count > 0);
+
+ /* Check for pending IO. */
+
+ *path = 0;
+
+ do {
+ mutex_enter(&fil_system->mutex);
+
+ sp = fil_space_get_by_id(id);
+
+ if (sp == NULL) {
+ mutex_exit(&fil_system->mutex);
+ return(DB_TABLESPACE_NOT_FOUND);
+ }
+
+ fil_node_t* node;
+
+ count = fil_check_pending_io(sp, &node, count);
+
+ if (count == 0) {
+ *path = mem_strdup(node->name);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ if (count > 0) {
+ os_thread_sleep(20000);
+ }
+
+ } while (count > 0);
+
+ ut_ad(sp);
+
+ *space = sp;
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Frees all pages used by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+ trx_t* trx, /*!< in/out: Transaction covering the close */
+ ulint id) /*!< in: space id */
+{
+ char* path = 0;
+ fil_space_t* space = 0;
+
+ ut_a(id != TRX_SYS_SPACE);
+
+ dberr_t err = fil_check_pending_operations(id, &space, &path);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ut_a(space);
+ ut_a(path != 0);
+
+ rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+ /* Invalidate in the buffer pool all pages belonging to the
+ tablespace. Since we have set space->stop_new_ops = TRUE, readahead
+ or ibuf merge can no longer read more pages of this tablespace to the
+ buffer pool. Thus we can clean the tablespace out of the buffer pool
+ completely and permanently. The flag stop_new_ops also prevents
+ fil_flush() from being applied to this tablespace. */
+
+ buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
+#endif
+ mutex_enter(&fil_system->mutex);
+
+ /* If the free is successful, the X lock will be released before
+ the space memory data structure is freed. */
+
+ if (!fil_space_free(id, TRUE)) {
+ rw_lock_x_unlock(&space->latch);
+ err = DB_TABLESPACE_NOT_FOUND;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+	/* Also delete any generated .cfg file; otherwise, when we drop
+	the database, removing the directory would fail. */
+
+ char* cfg_name = fil_make_cfg_name(path);
+
+ os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+ mem_free(path);
+ mem_free(cfg_name);
+
+ return(err);
+}
+
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove) /*!< in: specify the action to take
+ on the tables pages in the buffer
+ pool */
+{
+ char* path = 0;
+ fil_space_t* space = 0;
+
+ ut_a(id != TRX_SYS_SPACE);
+
+ dberr_t err = fil_check_pending_operations(id, &space, &path);
+
+ if (err != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot delete tablespace %lu because it is not "
+ "found in the tablespace memory cache.",
+ (ulong) id);
+
+ return(err);
+ }
+
+ ut_a(space);
+ ut_a(path != 0);
+
+ /* Important: We rely on the data dictionary mutex to ensure
+ that a race is not possible here. It should serialize the tablespace
+ drop/free. We acquire an X latch only to avoid a race condition
+ when accessing the tablespace instance via:
+
+ fsp_get_available_space_in_free_extents().
+
+ There our main motivation is to reduce the contention on the
+ dictionary mutex. */
+
+ rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+ /* IMPORTANT: Because we have set space::stop_new_ops there
+ can't be any new ibuf merges, reads or flushes. We are here
+ because node::n_pending was zero above. However, it is still
+ possible to have pending read and write requests:
+
+ A read request can happen because the reader thread has
+ gone through the ::stop_new_ops check in buf_page_init_for_read()
+ before the flag was set and has not yet incremented ::n_pending
+ when we checked it above.
+
+ A write request can be issued any time because we don't check
+ the ::stop_new_ops flag when queueing a block for write.
+
+ We deal with pending write requests in the following function
+ where we'd minimally evict all dirty pages belonging to this
+	space from the flush_list. Note that if a block is IO-fixed
+	we'll wait for the IO to complete.
+
+	We deal with potential read requests by checking the
+	::stop_new_ops flag in fil_io(). */
+
+ buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
+
+#endif /* !UNIV_HOTBACKUP */
+
+ /* If it is a delete then also delete any generated files, otherwise
+ when we drop the database the remove directory will fail. */
+ {
+ char* cfg_name = fil_make_cfg_name(path);
+ os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+ mem_free(cfg_name);
+ }
+
+ /* Delete the link file pointing to the ibd file we are deleting. */
+ if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
+ fil_delete_link_file(space->name);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ /* Double check the sanity of pending ops after reacquiring
+ the fil_system::mutex. */
+ if (fil_space_get_by_id(id)) {
+ ut_a(space->n_pending_ops == 0);
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+ ut_a(node->n_pending == 0);
+ }
+
+ if (!fil_space_free(id, TRUE)) {
+ err = DB_TABLESPACE_NOT_FOUND;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ if (err != DB_SUCCESS) {
+ rw_lock_x_unlock(&space->latch);
+ } else if (!os_file_delete(innodb_file_data_key, path)
+ && !os_file_delete_if_exists(innodb_file_data_key, path)) {
+
+ /* Note: This is because we have removed the
+ tablespace instance from the cache. */
+
+ err = DB_IO_ERROR;
+ }
+
+ if (err == DB_SUCCESS) {
+#ifndef UNIV_HOTBACKUP
+ /* Write a log record about the deletion of the .ibd
+ file, so that mysqlbackup can replay it in the
+ --apply-log phase. We use a dummy mtr and the familiar
+ log write mechanism. */
+ mtr_t mtr;
+
+ /* When replaying the operation in mysqlbackup, do not try
+ to write any log record */
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
+ mtr_commit(&mtr);
+#endif
+ err = DB_SUCCESS;
+ }
+
+ mem_free(path);
+
+ return(err);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace is being deleted.
+@return TRUE if being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_is_being_deleted(
+/*============================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ibool is_being_deleted;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space != NULL);
+
+ is_being_deleted = space->stop_new_ops;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(is_being_deleted);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+
+ 1. We do not drop the table from the data dictionary;
+
+ 2. We remove all insert buffer entries for the tablespace immediately;
+ in DROP TABLE they are only removed gradually in the background;
+
+ 3. Free all the pages in use by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_discard_tablespace(
+/*===================*/
+ ulint id) /*!< in: space id */
+{
+ dberr_t err;
+
+ switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) {
+ case DB_SUCCESS:
+ break;
+
+ case DB_IO_ERROR:
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "While deleting tablespace %lu in DISCARD TABLESPACE."
+ " File rename/delete failed: %s",
+ (ulong) id, ut_strerr(err));
+ break;
+
+ case DB_TABLESPACE_NOT_FOUND:
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Cannot delete tablespace %lu in DISCARD "
+ "TABLESPACE. %s",
+ (ulong) id, ut_strerr(err));
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /* Remove all insert buffer entries for the tablespace */
+
+ ibuf_delete_for_discarded_space(id);
+
+ return(err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Renames the memory cache structures of a single-table tablespace.
+@return TRUE if success */
+static
+ibool
+fil_rename_tablespace_in_mem(
+/*=========================*/
+ fil_space_t* space, /*!< in: tablespace memory object */
+ fil_node_t* node, /*!< in: file node of that tablespace */
+ const char* new_name, /*!< in: new name */
+ const char* new_path) /*!< in: new file path */
+{
+ fil_space_t* space2;
+ const char* old_name = space->name;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ space2 = fil_space_get_by_name(old_name);
+ if (space != space2) {
+ fputs("InnoDB: Error: cannot find ", stderr);
+ ut_print_filename(stderr, old_name);
+ fputs(" in tablespace memory cache\n", stderr);
+
+ return(FALSE);
+ }
+
+ space2 = fil_space_get_by_name(new_name);
+ if (space2 != NULL) {
+ fputs("InnoDB: Error: ", stderr);
+ ut_print_filename(stderr, new_name);
+ fputs(" is already in tablespace memory cache\n", stderr);
+
+ return(FALSE);
+ }
+
+ HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(space->name), space);
+ mem_free(space->name);
+ mem_free(node->name);
+
+ space->name = mem_strdup(new_name);
+ node->name = mem_strdup(new_path);
+
+ HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(new_name), space);
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Allocates a file name for a single-table tablespace. The string must be freed
+by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_ibd_name(
+/*==============*/
+ const char* name, /*!< in: table name or a dir path */
+ bool is_full_path) /*!< in: TRUE if it is a dir path */
+{
+ char* filename;
+ ulint namelen = strlen(name);
+ ulint dirlen = strlen(fil_path_to_mysql_datadir);
+ ulint pathlen = dirlen + namelen + sizeof "/.ibd";
+
+ filename = static_cast<char*>(mem_alloc(pathlen));
+
+ if (is_full_path) {
+ memcpy(filename, name, namelen);
+ memcpy(filename + namelen, ".ibd", sizeof ".ibd");
+ } else {
+ ut_snprintf(filename, pathlen, "%s/%s.ibd",
+ fil_path_to_mysql_datadir, name);
+
+ }
+
+ srv_normalize_path_for_win(filename);
+
+ return(filename);
+}
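+
+/* Editorial note, with illustrative paths: assuming
+fil_path_to_mysql_datadir is ".", fil_make_ibd_name("test/t1", false)
+yields "./test/t1.ibd", while fil_make_ibd_name("/ssd/test/t1", true)
+treats its argument as a full path and yields "/ssd/test/t1.ibd". */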
+
+/*******************************************************************//**
+Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link).
+The string must be freed by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_isl_name(
+/*==============*/
+ const char* name) /*!< in: table name */
+{
+ char* filename;
+ ulint namelen = strlen(name);
+ ulint dirlen = strlen(fil_path_to_mysql_datadir);
+ ulint pathlen = dirlen + namelen + sizeof "/.isl";
+
+ filename = static_cast<char*>(mem_alloc(pathlen));
+
+ ut_snprintf(filename, pathlen, "%s/%s.isl",
+ fil_path_to_mysql_datadir, name);
+
+ srv_normalize_path_for_win(filename);
+
+ return(filename);
+}
+
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+ const char* old_name_in, /*!< in: old table name in the
+ standard databasename/tablename
+ format of InnoDB, or NULL if we
+ do the rename based on the space
+ id only */
+ ulint id, /*!< in: space id */
+ const char* new_name, /*!< in: new table name in the
+ standard databasename/tablename
+ format of InnoDB */
+ const char* new_path_in) /*!< in: new full datafile path
+ if the tablespace is remotely
+ located, or NULL if it is located
+ in the normal data directory. */
+{
+ ibool success;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint count = 0;
+ char* new_path;
+ char* old_name;
+ char* old_path;
+ const char* not_given = "(name not specified)";
+
+ ut_a(id != 0);
+
+retry:
+ count++;
+
+ if (!(count % 1000)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: problems renaming ", stderr);
+ ut_print_filename(stderr,
+ old_name_in ? old_name_in : not_given);
+ fputs(" to ", stderr);
+ ut_print_filename(stderr, new_name);
+ fprintf(stderr, ", %lu iterations\n", (ulong) count);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; );
+
+ if (space == NULL) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot find space id %lu in the tablespace "
+ "memory cache, though the table '%s' in a "
+ "rename operation should have that id.",
+ (ulong) id, old_name_in ? old_name_in : not_given);
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ if (count > 25000) {
+ space->stop_ios = FALSE;
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ /* We temporarily close the .ibd file because we do not trust that
+ operating systems can rename an open file. For the closing we have to
+ wait until there are no pending i/o's or flushes on the file. */
+
+ space->stop_ios = TRUE;
+
+ /* The following code must change when InnoDB supports
+ multiple datafiles per tablespace. */
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ if (node->n_pending > 0
+ || node->n_pending_flushes > 0
+ || node->being_extended) {
+ /* There are pending i/o's or flushes or the file is
+ currently being extended, sleep for a while and
+ retry */
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ goto retry;
+
+ } else if (node->modification_counter > node->flush_counter) {
+ /* Flush the space */
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ fil_flush(id);
+
+ goto retry;
+
+ } else if (node->open) {
+ /* Close the file */
+
+ fil_node_close_file(node, fil_system);
+ }
+
+ /* Check that the old name in the space is right */
+
+ if (old_name_in) {
+ old_name = mem_strdup(old_name_in);
+ ut_a(strcmp(space->name, old_name) == 0);
+ } else {
+ old_name = mem_strdup(space->name);
+ }
+ old_path = mem_strdup(node->name);
+
+ /* Rename the tablespace and the node in the memory cache */
+ new_path = new_path_in ? mem_strdup(new_path_in)
+ : fil_make_ibd_name(new_name, false);
+
+ success = fil_rename_tablespace_in_mem(
+ space, node, new_name, new_path);
+
+ if (success) {
+
+ DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+ goto skip_second_rename; );
+
+ success = os_file_rename(
+ innodb_file_data_key, old_path, new_path);
+
+ DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
+skip_second_rename:
+ success = FALSE; );
+
+ if (!success) {
+ /* We have to revert the changes we made
+ to the tablespace memory cache */
+
+ ut_a(fil_rename_tablespace_in_mem(
+ space, node, old_name, old_path));
+ }
+ }
+
+ space->stop_ios = FALSE;
+
+ mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+ if (success && !recv_recovery_on) {
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name,
+ &mtr);
+ mtr_commit(&mtr);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ mem_free(new_path);
+ mem_free(old_path);
+ mem_free(old_name);
+
+ return(success);
+}
+
+/*******************************************************************//**
+Creates a new InnoDB Symbolic Link (ISL) file. It is always created
+under the 'datadir' of MySQL. The datadir is the directory of a
+running mysqld program. We can refer to it by simply using the path '.'.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_link_file(
+/*=================*/
+ const char* tablename, /*!< in: tablename */
+ const char* filepath) /*!< in: pathname of tablespace */
+{
+ os_file_t file;
+ ibool success;
+ dberr_t err = DB_SUCCESS;
+ char* link_filepath;
+ char* prev_filepath = fil_read_link_file(tablename);
+
+ ut_ad(!srv_read_only_mode);
+
+ if (prev_filepath) {
+ /* Truncate will call this with an existing
+ link file which contains the same filepath. */
+ if (0 == strcmp(prev_filepath, filepath)) {
+ mem_free(prev_filepath);
+ return(DB_SUCCESS);
+ }
+ mem_free(prev_filepath);
+ }
+
+ link_filepath = fil_make_isl_name(tablename);
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, link_filepath,
+ OS_FILE_CREATE, OS_FILE_READ_WRITE, &success);
+
+ if (!success) {
+ /* The following call will print an error message */
+ ulint error = os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot create file ", stderr);
+ ut_print_filename(stderr, link_filepath);
+ fputs(".\n", stderr);
+
+ if (error == OS_FILE_ALREADY_EXISTS) {
+ fputs("InnoDB: The link file: ", stderr);
+ ut_print_filename(stderr, filepath);
+ fputs(" already exists.\n", stderr);
+ err = DB_TABLESPACE_EXISTS;
+
+ } else if (error == OS_FILE_DISK_FULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+
+ } else {
+ err = DB_ERROR;
+ }
+
+ /* file is not open, no need to close it. */
+ mem_free(link_filepath);
+ return(err);
+ }
+
+ if (!os_file_write(link_filepath, file, filepath, 0,
+ strlen(filepath))) {
+ err = DB_ERROR;
+ }
+
+ /* Close the file, we only need it at startup */
+ os_file_close(file);
+
+ mem_free(link_filepath);
+
+ return(err);
+}
+
+/*******************************************************************//**
+Deletes an InnoDB Symbolic Link (ISL) file. */
+UNIV_INTERN
+void
+fil_delete_link_file(
+/*=================*/
+ const char* tablename) /*!< in: name of table */
+{
+ char* link_filepath = fil_make_isl_name(tablename);
+
+ os_file_delete_if_exists(innodb_file_data_key, link_filepath);
+
+ mem_free(link_filepath);
+}
+
+/*******************************************************************//**
+Reads an InnoDB Symbolic Link (ISL) file.
+It is always created under the 'datadir' of MySQL. The name is of the
+form {databasename}/{tablename}, and the ISL file is expected to be in a
+'{databasename}' directory, named '{tablename}.isl'. The caller must free
+the memory of the null-terminated path returned if it is not null.
+@return own: filepath found in link file, NULL if not found. */
+UNIV_INTERN
+char*
+fil_read_link_file(
+/*===============*/
+ const char* name) /*!< in: tablespace name */
+{
+ char* filepath = NULL;
+ char* link_filepath;
+ FILE* file = NULL;
+
+ /* The .isl file is in the 'normal' tablespace location. */
+ link_filepath = fil_make_isl_name(name);
+
+ file = fopen(link_filepath, "r+b");
+
+ mem_free(link_filepath);
+
+ if (file) {
+ filepath = static_cast<char*>(mem_alloc(OS_FILE_MAX_PATH));
+
+ os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+ fclose(file);
+
+ if (strlen(filepath)) {
+ /* Trim whitespace from end of filepath */
+ ulint lastch = strlen(filepath) - 1;
+ while (lastch > 4 && filepath[lastch] <= 0x20) {
+ filepath[lastch--] = 0x00;
+ }
+ srv_normalize_path_for_win(filepath);
+ }
+ }
+
+ return(filepath);
+}
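+
+/* Editorial sketch, not part of the original change: the round trip
+between the ISL helpers above; the table name and remote path are
+illustrative. Disabled with #if 0. */
+#if 0
+static void
+fil_isl_round_trip_sketch(void)
+{
+	char*	filepath;
+
+	/* Writes "./test/t1.isl" containing the remote filepath. */
+	if (fil_create_link_file("test/t1", "/ssd/data/test/t1.ibd")
+	    != DB_SUCCESS) {
+		return;
+	}
+
+	filepath = fil_read_link_file("test/t1");
+
+	if (filepath != NULL) {
+		/* filepath is "/ssd/data/test/t1.ibd" */
+		mem_free(filepath);
+	}
+}
+#endif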
+
+/*******************************************************************//**
+Opens a handle to the file linked to in an InnoDB Symbolic Link file.
+@return TRUE if remote linked tablespace file is found and opened. */
+UNIV_INTERN
+ibool
+fil_open_linked_file(
+/*===============*/
+ const char* tablename, /*!< in: database/tablename */
+ char** remote_filepath,/*!< out: remote filepath */
+ os_file_t* remote_file) /*!< out: remote file handle */
+
+{
+ ibool success;
+
+ *remote_filepath = fil_read_link_file(tablename);
+ if (*remote_filepath == NULL) {
+ return(FALSE);
+ }
+
+	/* Open the tablespace file at the filepath that was
+	found in the link file. */
+ *remote_file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, *remote_filepath,
+ OS_FILE_OPEN, OS_FILE_READ_ONLY,
+ &success);
+
+ if (!success) {
+ char* link_filepath = fil_make_isl_name(tablename);
+
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "A link file was found named '%s' "
+ "but the linked tablespace '%s' "
+ "could not be opened.",
+ link_filepath, *remote_filepath);
+
+ mem_free(link_filepath);
+ mem_free(*remote_filepath);
+ *remote_filepath = NULL;
+ }
+
+ return(success);
+}
+
+/*******************************************************************//**
+Creates a new single-table tablespace in a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply using
+the path '.'. Tables created with CREATE TEMPORARY TABLE are placed in
+the temp dir of the mysqld server.
+
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint space_id, /*!< in: space id */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB */
+ const char* dir_path, /*!< in: NULL or a dir path */
+ ulint flags, /*!< in: tablespace flags */
+ ulint flags2, /*!< in: table flags2 */
+ ulint size) /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+{
+ os_file_t file;
+ ibool ret;
+ dberr_t err;
+ byte* buf2;
+ byte* page;
+ char* path;
+ ibool success;
+ /* TRUE if a table is created with CREATE TEMPORARY TABLE */
+ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
+ bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
+
+ ut_a(space_id > 0);
+ ut_ad(!srv_read_only_mode);
+ ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
+ ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_a(fsp_flags_is_valid(flags));
+
+ if (is_temp) {
+ /* Temporary table filepath */
+ ut_ad(dir_path);
+ path = fil_make_ibd_name(dir_path, true);
+ } else if (has_data_dir) {
+ ut_ad(dir_path);
+ path = os_file_make_remote_pathname(dir_path, tablename, "ibd");
+
+ /* Since this tablespace file will be created in a
+ remote directory, let's create the subdirectories
+ in the path, if they are not there already. */
+ success = os_file_create_subdirs_if_needed(path);
+ if (!success) {
+ err = DB_ERROR;
+ goto error_exit_3;
+ }
+ } else {
+ path = fil_make_ibd_name(tablename, false);
+ }
+
+ file = os_file_create(
+ innodb_file_data_key, path,
+ OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE,
+ &ret);
+
+ if (ret == FALSE) {
+ /* The following call will print an error message */
+ ulint error = os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create file '%s'\n", path);
+
+ if (error == OS_FILE_ALREADY_EXISTS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "The file '%s' already exists though the "
+ "corresponding table did not exist "
+ "in the InnoDB data dictionary. "
+ "Have you moved InnoDB .ibd files "
+ "around without using the SQL commands "
+ "DISCARD TABLESPACE and IMPORT TABLESPACE, "
+ "or did mysqld crash in the middle of "
+ "CREATE TABLE? "
+ "You can resolve the problem by removing "
+ "the file '%s' under the 'datadir' of MySQL.",
+ path, path);
+
+ err = DB_TABLESPACE_EXISTS;
+ goto error_exit_3;
+ }
+
+ if (error == OS_FILE_DISK_FULL) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto error_exit_3;
+ }
+
+ err = DB_ERROR;
+ goto error_exit_3;
+ }
+
+ ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE);
+
+ if (!ret) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto error_exit_2;
+ }
+
+ /* printf("Creating tablespace %s id %lu\n", path, space_id); */
+
+ /* We have to write the space id to the file immediately and flush the
+ file to disk. This is because in crash recovery we must be aware what
+ tablespaces exist and what are their space id's, so that we can apply
+ the log records to the right file. It may take quite a while until
+ buffer pool flush algorithms write anything to the file and flush it to
+ disk. If we would not write here anything, the file would be filled
+ with zeros from the call of os_file_set_size(), until a buffer pool
+ flush would write to it. */
+
+ buf2 = static_cast<byte*>(ut_malloc(3 * UNIV_PAGE_SIZE));
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
+
+ memset(page, '\0', UNIV_PAGE_SIZE);
+
+ /* Add the UNIV_PAGE_SIZE to the table flags and write them to the
+ tablespace header. */
+ flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
+ fsp_header_init_fields(page, space_id, flags);
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+
+ if (!(fsp_flags_is_compressed(flags))) {
+ buf_flush_init_for_writing(page, NULL, 0);
+ ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE);
+ } else {
+ page_zip_des_t page_zip;
+ ulint zip_size;
+
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ page_zip_set_size(&page_zip, zip_size);
+ page_zip.data = page + UNIV_PAGE_SIZE;
+#ifdef UNIV_DEBUG
+ page_zip.m_start =
+#endif /* UNIV_DEBUG */
+ page_zip.m_end = page_zip.m_nonempty =
+ page_zip.n_blobs = 0;
+ buf_flush_init_for_writing(page, &page_zip, 0);
+ ret = os_file_write(path, file, page_zip.data, 0, zip_size);
+ }
+
+ ut_free(buf2);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Could not write the first page to tablespace "
+ "'%s'", path);
+
+ err = DB_ERROR;
+ goto error_exit_2;
+ }
+
+ ret = os_file_flush(file);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File flush of tablespace '%s' failed", path);
+ err = DB_ERROR;
+ goto error_exit_2;
+ }
+
+ if (has_data_dir) {
+ /* Now that the IBD file is created, make the ISL file. */
+ err = fil_create_link_file(tablename, path);
+ if (err != DB_SUCCESS) {
+ goto error_exit_2;
+ }
+ }
+
+ success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE);
+ if (!success || !fil_node_create(path, size, space_id, FALSE)) {
+ err = DB_ERROR;
+ goto error_exit_1;
+ }
+
+#ifndef UNIV_HOTBACKUP
+ {
+ mtr_t mtr;
+ ulint mlog_file_flag = 0;
+
+ if (is_temp) {
+ mlog_file_flag |= MLOG_FILE_FLAG_TEMP;
+ }
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(flags
+ ? MLOG_FILE_CREATE2
+ : MLOG_FILE_CREATE,
+ space_id, mlog_file_flag, flags,
+ tablename, NULL, &mtr);
+
+ mtr_commit(&mtr);
+ }
+#endif
+ err = DB_SUCCESS;
+
+ /* Error code is set. Cleanup the various variables used.
+ These labels reflect the order in which variables are assigned or
+ actions are done. */
+error_exit_1:
+ if (has_data_dir && err != DB_SUCCESS) {
+ fil_delete_link_file(tablename);
+ }
+error_exit_2:
+ os_file_close(file);
+ if (err != DB_SUCCESS) {
+ os_file_delete(innodb_file_data_key, path);
+ }
+error_exit_3:
+ mem_free(path);
+
+ return(err);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Report information about a bad tablespace. */
+static
+void
+fil_report_bad_tablespace(
+/*======================*/
+ const char* filepath, /*!< in: filepath */
+	const char*	check_msg,	/*!< in: error message from
+					fil_check_first_page(), or NULL */
+ ulint found_id, /*!< in: found space ID */
+ ulint found_flags, /*!< in: found flags */
+ ulint expected_id, /*!< in: expected space id */
+ ulint expected_flags) /*!< in: expected flags */
+{
+ if (check_msg) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error %s in file '%s', "
+ "tablespace id=%lu, flags=%lu. "
+ "Please refer to "
+ REFMAN "innodb-troubleshooting-datadict.html "
+ "for how to resolve the issue.",
+ check_msg, filepath,
+ (ulong) expected_id, (ulong) expected_flags);
+ return;
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "In file '%s', tablespace id and flags are %lu and %lu, "
+ "but in the InnoDB data dictionary they are %lu and %lu. "
+ "Have you moved InnoDB .ibd files around without using the "
+ "commands DISCARD TABLESPACE and IMPORT TABLESPACE? "
+ "Please refer to "
+ REFMAN "innodb-troubleshooting-datadict.html "
+ "for how to resolve the issue.",
+ filepath, (ulong) found_id, (ulong) found_flags,
+ (ulong) expected_id, (ulong) expected_flags);
+}
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If this does not succeed, print an error message
+to the .err log. This function is used to open a tablespace when we start
+mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
+
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. This boolean may be initially FALSE, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_open_single_table_tablespace(
+/*=============================*/
+ bool validate, /*!< in: Do we validate tablespace? */
+ bool fix_dict, /*!< in: Can we fix the dictionary? */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* tablename, /*!< in: table name in the
+ databasename/tablename format */
+ const char* path_in) /*!< in: tablespace filepath */
+{
+ dberr_t err = DB_SUCCESS;
+ bool dict_filepath_same_as_default = false;
+ bool link_file_found = false;
+ bool link_file_is_bad = false;
+ fsp_open_info def;
+ fsp_open_info dict;
+ fsp_open_info remote;
+ ulint tablespaces_found = 0;
+ ulint valid_tablespaces_found = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex)));
+
+ if (!fsp_flags_is_valid(flags)) {
+ return(DB_CORRUPTION);
+ }
+
+ /* If the tablespace was relocated, we do not
+ compare the DATA_DIR flag */
+ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR;
+
+ memset(&def, 0, sizeof(def));
+ memset(&dict, 0, sizeof(dict));
+ memset(&remote, 0, sizeof(remote));
+
+ /* Discover the correct filepath. We will always look for an ibd
+ in the default location. If it is remote, it should not be here. */
+ def.filepath = fil_make_ibd_name(tablename, false);
+
+ /* The path_in was read from SYS_DATAFILES. */
+ if (path_in) {
+ if (strcmp(def.filepath, path_in)) {
+ dict.filepath = mem_strdup(path_in);
+ /* possibility of multiple files. */
+ validate = true;
+ } else {
+ dict_filepath_same_as_default = true;
+ }
+ }
+
+ link_file_found = fil_open_linked_file(
+ tablename, &remote.filepath, &remote.file);
+ remote.success = link_file_found;
+ if (remote.success) {
+ /* possibility of multiple files. */
+ validate = true;
+ tablespaces_found++;
+
+ /* A link file was found. MySQL does not allow a DATA
+		DIRECTORY to be the same as the default filepath. */
+ ut_a(strcmp(def.filepath, remote.filepath));
+
+ /* If there was a filepath found in SYS_DATAFILES,
+ we hope it was the same as this remote.filepath found
+ in the ISL file. */
+ if (dict.filepath
+ && (0 == strcmp(dict.filepath, remote.filepath))) {
+ remote.success = FALSE;
+ os_file_close(remote.file);
+ mem_free(remote.filepath);
+ remote.filepath = NULL;
+ tablespaces_found--;
+ }
+ }
+
+ /* Attempt to open the tablespace at other possible filepaths. */
+ if (dict.filepath) {
+ dict.file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &dict.success);
+ if (dict.success) {
+ /* possibility of multiple files. */
+ validate = true;
+ tablespaces_found++;
+ }
+ }
+
+ /* Always look for a file at the default location. */
+ ut_a(def.filepath);
+ def.file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+ OS_FILE_READ_ONLY, &def.success);
+ if (def.success) {
+ tablespaces_found++;
+ }
+
+ /* We have now checked all possible tablespace locations and
+ have a count of how many we found. If things are normal, we
+ only found 1. */
+ if (!validate && tablespaces_found == 1) {
+ goto skip_validate;
+ }
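+
+	/* In the common case there is no DATA DIRECTORY, no ISL file and
+	no SYS_DATAFILES mismatch: exactly one file was found at the
+	default location, validate is still false, and the check above
+	jumps straight to skip_validate. */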
+
+ /* Read the first page of the datadir tablespace, if found. */
+ if (def.success) {
+ def.check_msg = fil_read_first_page(
+ def.file, FALSE, &def.flags, &def.id,
+#ifdef UNIV_LOG_ARCHIVE
+			&def.arch_log_no, &def.arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &def.lsn, &def.lsn);
+ def.valid = !def.check_msg;
+
+ /* Validate this single-table-tablespace with SYS_TABLES,
+ but do not compare the DATA_DIR flag, in case the
+ tablespace was relocated. */
+ if (def.valid && def.id == id
+ && (def.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+ valid_tablespaces_found++;
+ } else {
+ def.valid = false;
+ /* Do not use this tablespace. */
+ fil_report_bad_tablespace(
+ def.filepath, def.check_msg, def.id,
+ def.flags, id, flags);
+ }
+ }
+
+ /* Read the first page of the remote tablespace */
+ if (remote.success) {
+ remote.check_msg = fil_read_first_page(
+ remote.file, FALSE, &remote.flags, &remote.id,
+#ifdef UNIV_LOG_ARCHIVE
+ &remote.arch_log_no, &remote.arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &remote.lsn, &remote.lsn);
+ remote.valid = !remote.check_msg;
+
+ /* Validate this single-table-tablespace with SYS_TABLES,
+ but do not compare the DATA_DIR flag, in case the
+ tablespace was relocated. */
+ if (remote.valid && remote.id == id
+ && (remote.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+ valid_tablespaces_found++;
+ } else {
+ remote.valid = false;
+ /* Do not use this linked tablespace. */
+ fil_report_bad_tablespace(
+ remote.filepath, remote.check_msg, remote.id,
+ remote.flags, id, flags);
+ link_file_is_bad = true;
+ }
+ }
+
+	/* Read the first page of the tablespace at the dictionary
+	filepath, if found. */
+ if (dict.success) {
+ dict.check_msg = fil_read_first_page(
+ dict.file, FALSE, &dict.flags, &dict.id,
+#ifdef UNIV_LOG_ARCHIVE
+ &dict.arch_log_no, &dict.arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &dict.lsn, &dict.lsn);
+ dict.valid = !dict.check_msg;
+
+ /* Validate this single-table-tablespace with SYS_TABLES,
+ but do not compare the DATA_DIR flag, in case the
+ tablespace was relocated. */
+ if (dict.valid && dict.id == id
+ && (dict.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+ valid_tablespaces_found++;
+ } else {
+ dict.valid = false;
+ /* Do not use this tablespace. */
+ fil_report_bad_tablespace(
+ dict.filepath, dict.check_msg, dict.id,
+ dict.flags, id, flags);
+ }
+ }
+
+ /* Make sense of these three possible locations.
+ First, bail out if no tablespace files were found. */
+ if (valid_tablespaces_found == 0) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Could not find a valid tablespace file for '%s'. "
+ "See " REFMAN "innodb-troubleshooting-datadict.html "
+ "for how to resolve the issue.",
+ tablename);
+
+ err = DB_CORRUPTION;
+
+ goto cleanup_and_exit;
+ }
+
+ /* Do not open any tablespaces if more than one tablespace with
+ the correct space ID and flags were found. */
+ if (tablespaces_found > 1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "A tablespace for %s has been found in "
+ "multiple places;", tablename);
+ if (def.success) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Default location; %s, LSN=" LSN_PF
+ ", Space ID=%lu, Flags=%lu",
+ def.filepath, def.lsn,
+ (ulong) def.id, (ulong) def.flags);
+ }
+ if (remote.success) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Remote location; %s, LSN=" LSN_PF
+ ", Space ID=%lu, Flags=%lu",
+ remote.filepath, remote.lsn,
+ (ulong) remote.id, (ulong) remote.flags);
+ }
+ if (dict.success) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Dictionary location; %s, LSN=" LSN_PF
+ ", Space ID=%lu, Flags=%lu",
+ dict.filepath, dict.lsn,
+ (ulong) dict.id, (ulong) dict.flags);
+ }
+
+ /* Force-recovery will allow some tablespaces to be
+ skipped by REDO if there was more than one file found.
+ Unlike during the REDO phase of recovery, we now know
+ if the tablespace is valid according to the dictionary,
+ which was not available then. So if we did not force
+ recovery and there is only one good tablespace, ignore
+ any bad tablespaces. */
+ if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Will not open the tablespace for '%s'",
+ tablename);
+
+ if (def.success != def.valid
+ || dict.success != dict.valid
+ || remote.success != remote.valid) {
+ err = DB_CORRUPTION;
+ } else {
+ err = DB_ERROR;
+ }
+ goto cleanup_and_exit;
+ }
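+
+		/* For example: if a stale copy of the .ibd file was left
+		behind in the datadir after the table was moved to a DATA
+		DIRECTORY, def is open but invalid while remote is valid,
+		and only the stale copy is closed below. */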
+
+ /* There is only one valid tablespace found and we did
+ not use srv_force_recovery during REDO. Use this one
+ tablespace and clean up invalid tablespace pointers */
+ if (def.success && !def.valid) {
+ def.success = false;
+ os_file_close(def.file);
+ tablespaces_found--;
+ }
+ if (dict.success && !dict.valid) {
+ dict.success = false;
+ os_file_close(dict.file);
+ /* Leave dict.filepath so that SYS_DATAFILES
+ can be corrected below. */
+ tablespaces_found--;
+ }
+ if (remote.success && !remote.valid) {
+ remote.success = false;
+ os_file_close(remote.file);
+ mem_free(remote.filepath);
+ remote.filepath = NULL;
+ tablespaces_found--;
+ }
+ }
+
+ /* At this point, there should be only one filepath. */
+ ut_a(tablespaces_found == 1);
+ ut_a(valid_tablespaces_found == 1);
+
+ /* Only fix the dictionary at startup when there is only one thread.
+ Calls to dict_load_table() can be done while holding other latches. */
+ if (!fix_dict) {
+ goto skip_validate;
+ }
+
+ /* We may need to change what is stored in SYS_DATAFILES or
+ SYS_TABLESPACES or adjust the link file.
+ Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does
+ not prevent opening and using the single_table_tablespace either
+ this time or the next, we do not check the return code or fail
+ to open the tablespace. But dict_update_filepath() will issue a
+ warning to the log. */
+ if (dict.filepath) {
+ if (remote.success) {
+ dict_update_filepath(id, remote.filepath);
+ } else if (def.success) {
+ dict_update_filepath(id, def.filepath);
+ if (link_file_is_bad) {
+ fil_delete_link_file(tablename);
+ }
+ } else if (!link_file_found || link_file_is_bad) {
+ ut_ad(dict.success);
+ /* Fix the link file if we got our filepath
+ from the dictionary but a link file did not
+ exist or it did not point to a valid file. */
+ fil_delete_link_file(tablename);
+ fil_create_link_file(tablename, dict.filepath);
+ }
+
+ } else if (remote.success && dict_filepath_same_as_default) {
+ dict_update_filepath(id, remote.filepath);
+
+ } else if (remote.success && path_in == NULL) {
+ /* SYS_DATAFILES record for this space ID was not found. */
+ dict_insert_tablespace_and_filepath(
+ id, tablename, remote.filepath, flags);
+ }
+
+skip_validate:
+ if (err != DB_SUCCESS) {
+ ; // Don't load the tablespace into the cache
+ } else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) {
+ err = DB_ERROR;
+ } else {
+ /* We do not measure the size of the file, that is why
+ we pass the 0 below */
+
+ if (!fil_node_create(remote.success ? remote.filepath :
+ dict.success ? dict.filepath :
+ def.filepath, 0, id, FALSE)) {
+ err = DB_ERROR;
+ }
+ }
+
+cleanup_and_exit:
+ if (remote.success) {
+ os_file_close(remote.file);
+ }
+ if (remote.filepath) {
+ mem_free(remote.filepath);
+ }
+ if (dict.success) {
+ os_file_close(dict.file);
+ }
+ if (dict.filepath) {
+ mem_free(dict.filepath);
+ }
+ if (def.success) {
+ os_file_close(def.file);
+ }
+ mem_free(def.filepath);
+
+ return(err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Allocates a file name for an old version of a single-table tablespace.
+The string must be freed by caller with mem_free()!
+@return own: file name */
+static
+char*
+fil_make_ibbackup_old_name(
+/*=======================*/
+ const char* name) /*!< in: original file name */
+{
+ static const char suffix[] = "_ibbackup_old_vers_";
+ char* path;
+ ulint len = strlen(name);
+
+ path = static_cast<char*>(mem_alloc(len + (15 + sizeof suffix)));
+
+ memcpy(path, name, len);
+ memcpy(path + len, suffix, (sizeof suffix) - 1);
+ ut_sprintf_timestamp_without_extra_chars(
+ path + len + ((sizeof suffix) - 1));
+ return(path);
+}
+#endif /* UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Determine the space id of the .ibd file referred to by the given file
+descriptor, by reading a few pages from the beginning of the file.
+@return true if the space id was successfully identified, false if not. */
+static
+bool
+fil_user_tablespace_find_space_id(
+/*==============================*/
+	fsp_open_info*	fsp)	/*!< in/out: contains the file descriptor,
+			which is used as input; space_id is set as
+			the output */
+{
+ bool st;
+ os_offset_t file_size;
+
+ file_size = os_file_get_size(fsp->file);
+
+ if (file_size == (os_offset_t) -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Could not get file size: %s",
+ fsp->filepath);
+ return(false);
+ }
+
+	/* For each candidate page size, read the space_id from each page
+	and store it in a map. Find out which space_id is agreed on by the
+	majority of the pages. Choose that space_id. */
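+	/* For example: if at page_size 16384 sixty pages pass the
+	corruption checks and 58 of them carry space_id 5, the tolerance
+	loop below (which forgives up to 3 unreadable or torn pages)
+	still settles on space_id 5. */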
+ for (ulint page_size = UNIV_ZIP_SIZE_MIN;
+ page_size <= UNIV_PAGE_SIZE_MAX; page_size <<= 1) {
+
+ /* map[space_id] = count of pages */
+ std::map<ulint, ulint> verify;
+
+ ulint page_count = 64;
+ ulint valid_pages = 0;
+
+ /* Adjust the number of pages to analyze based on file size */
+ while ((page_count * page_size) > file_size) {
+ --page_count;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Page size:%lu Pages to analyze:"
+ "%lu", page_size, page_count);
+
+ byte* buf = static_cast<byte*>(ut_malloc(2*page_size));
+ byte* page = static_cast<byte*>(ut_align(buf, page_size));
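+
+		/* Allocating 2 * page_size guarantees that a page_size-
+		aligned region of page_size bytes fits inside the buffer,
+		wherever ut_malloc() happens to place it. */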
+
+ for (ulint j = 0; j < page_count; ++j) {
+
+			st = os_file_read(fsp->file, page,
+					  j * page_size, page_size);
+
+ if (!st) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "READ FAIL: page_no:%lu", j);
+ continue;
+ }
+
+ bool uncompressed_ok = false;
+
+ /* For uncompressed pages, the page size must be equal
+ to UNIV_PAGE_SIZE. */
+ if (page_size == UNIV_PAGE_SIZE) {
+ uncompressed_ok = !buf_page_is_corrupted(
+ false, page, 0);
+ }
+
+ bool compressed_ok = !buf_page_is_corrupted(
+ false, page, page_size);
+
+ if (uncompressed_ok || compressed_ok) {
+
+ ulint space_id = mach_read_from_4(page
+ + FIL_PAGE_SPACE_ID);
+
+ if (space_id > 0) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "VALID: space:%lu "
+ "page_no:%lu page_size:%lu",
+ space_id, j, page_size);
+ verify[space_id]++;
+ ++valid_pages;
+ }
+ }
+ }
+
+ ut_free(buf);
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Page size: %lu, Possible space_id "
+ "count:%lu", page_size, (ulint) verify.size());
+
+ const ulint pages_corrupted = 3;
+ for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
+
+ for (std::map<ulint, ulint>::iterator
+ m = verify.begin(); m != verify.end(); ++m ) {
+
+ ib_logf(IB_LOG_LEVEL_INFO, "space_id:%lu, "
+ "Number of pages matched: %lu/%lu "
+ "(%lu)", m->first, m->second,
+ valid_pages, page_size);
+
+ if (m->second == (valid_pages - missed)) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+					"Chosen space:%lu", m->first);
+
+ fsp->id = m->first;
+ return(true);
+ }
+ }
+
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Finds the given page_no of the given space id in the doublewrite buffer,
+and copies it to the corresponding .ibd file.
+@return true if the copy was successful, false if not. */
+bool
+fil_user_tablespace_restore_page(
+/*==============================*/
+ fsp_open_info* fsp, /* in: contains space id and .ibd
+ file information */
+	ulint		page_no)	/* in: page_no to obtain from the
+					doublewrite buffer */
+{
+ bool err;
+ ulint flags;
+ ulint zip_size;
+ ulint page_size;
+ ulint buflen;
+ byte* page;
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Restoring page %lu of tablespace %lu",
+ page_no, fsp->id);
+
+ // find if double write buffer has page_no of given space id
+ page = recv_sys->dblwr.find_page(fsp->id, page_no);
+
+ if (!page) {
+ ib_logf(IB_LOG_LEVEL_WARN, "Doublewrite does not have "
+ "page_no=%lu of space: %lu", page_no, fsp->id);
+ err = false;
+ goto out;
+ }
+
+ flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page);
+ zip_size = fsp_flags_get_zip_size(flags);
+ page_size = fsp_flags_get_page_size(flags);
+
+ ut_ad(page_no == page_get_page_no(page));
+
+	buflen = zip_size ? zip_size : page_size;
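+
+	/* The byte offset of page_no within the file is page_no times
+	the physical page size; e.g. page 7 of a tablespace with 8 kB
+	compressed pages starts at byte 57344. */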
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Writing %lu bytes into file: %s",
+ buflen, fsp->filepath);
+
+ err = os_file_write(fsp->filepath, fsp->file, page,
+ (zip_size ? zip_size : page_size) * page_no,
+ buflen);
+
+ os_file_flush(fsp->file);
+out:
+ return(err);
+}
+
+/********************************************************************//**
+Validates the first page of a single-table tablespace file, attempting to
+restore page 0 from the doublewrite buffer if it is corrupted.
+Sets fsp->success to TRUE if the tablespace is valid, FALSE if not. */
+static
+void
+fil_validate_single_table_tablespace(
+/*=================================*/
+ const char* tablename, /*!< in: database/tablename */
+ fsp_open_info* fsp) /*!< in/out: tablespace info */
+{
+ bool restore_attempted = false;
+
+check_first_page:
+ fsp->success = TRUE;
+ if (const char* check_msg = fil_read_first_page(
+ fsp->file, FALSE, &fsp->flags, &fsp->id,
+#ifdef UNIV_LOG_ARCHIVE
+ &fsp->arch_log_no, &fsp->arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &fsp->lsn, &fsp->lsn)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "%s in tablespace %s (table %s)",
+ check_msg, fsp->filepath, tablename);
+ fsp->success = FALSE;
+ }
+
+ if (!fsp->success) {
+ if (!restore_attempted) {
+ if (!fil_user_tablespace_find_space_id(fsp)) {
+ return;
+ }
+ restore_attempted = true;
+
+ if (fsp->id > 0
+ && !fil_user_tablespace_restore_page(fsp, 0)) {
+ return;
+ }
+ goto check_first_page;
+ }
+ return;
+ }
+
+ if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tablespace is not sensible;"
+			" Table: %s Space ID: %lu Filepath: %s",
+ tablename, (ulong) fsp->id, fsp->filepath);
+ fsp->success = FALSE;
+ return;
+ }
+
+ mutex_enter(&fil_system->mutex);
+ fil_space_t* space = fil_space_get_by_id(fsp->id);
+ mutex_exit(&fil_system->mutex);
+ if (space != NULL) {
+ char* prev_filepath = fil_space_get_first_path(fsp->id);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Attempted to open a previously opened tablespace. "
+ "Previous tablespace %s uses space ID: %lu at "
+ "filepath: %s. Cannot open tablespace %s which uses "
+ "space ID: %lu at filepath: %s",
+ space->name, (ulong) space->id, prev_filepath,
+ tablename, (ulong) fsp->id, fsp->filepath);
+
+ mem_free(prev_filepath);
+ fsp->success = FALSE;
+ return;
+ }
+
+ fsp->success = TRUE;
+}
+
+
+/********************************************************************//**
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.cc data structures. */
+static
+void
+fil_load_single_table_tablespace(
+/*=============================*/
+ const char* dbname, /*!< in: database name */
+ const char* filename) /*!< in: file name (not a path),
+ including the .ibd or .isl extension */
+{
+ char* tablename;
+ ulint tablename_len;
+ ulint dbname_len = strlen(dbname);
+ ulint filename_len = strlen(filename);
+ fsp_open_info def;
+ fsp_open_info remote;
+ os_offset_t size;
+#ifdef UNIV_HOTBACKUP
+ fil_space_t* space;
+#endif
+
+ memset(&def, 0, sizeof(def));
+ memset(&remote, 0, sizeof(remote));
+
+ /* The caller assured that the extension is ".ibd" or ".isl". */
+ ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4)
+ || 0 == memcmp(filename + filename_len - 4, ".isl", 4));
+
+ /* Build up the tablename in the standard form database/table. */
+ tablename = static_cast<char*>(
+ mem_alloc(dbname_len + filename_len + 2));
+
+	/* When lower_case_table_names = 2 it is possible that the
+	dbname is in upper case, but while storing it in fil_space_t
+	we must convert it into lower case. */
+	sprintf(tablename, "%s", dbname);
+ tablename[dbname_len] = '\0';
+
+ if (lower_case_file_system) {
+ dict_casedn_str(tablename);
+ }
+
+	sprintf(tablename + dbname_len, "/%s", filename);
+ tablename_len = strlen(tablename) - strlen(".ibd");
+ tablename[tablename_len] = '\0';
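+
+	/* Both ".ibd" and ".isl" are four characters long, so e.g.
+	"test/t1.ibd" and "test/t1.isl" both become "test/t1". */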
+
+	/* There may be both an .ibd and an .isl file in the directory,
+	and it is possible that the .isl file refers to a different
+	.ibd file. If so, we open and compare them the first time
+	one of them is sent to this function. So if this table has
+	already been loaded, there is nothing to do. */
+ mutex_enter(&fil_system->mutex);
+ if (fil_space_get_by_name(tablename)) {
+ mem_free(tablename);
+ mutex_exit(&fil_system->mutex);
+ return;
+ }
+ mutex_exit(&fil_system->mutex);
+
+ /* Build up the filepath of the .ibd tablespace in the datadir.
+ This must be freed independent of def.success. */
+ def.filepath = fil_make_ibd_name(tablename, false);
+
+#ifdef __WIN__
+# ifndef UNIV_HOTBACKUP
+ /* If lower_case_table_names is 0 or 2, then MySQL allows database
+ directory names with upper case letters. On Windows, all table and
+ database names in InnoDB are internally always in lower case. Put the
+ file path to lower case, so that we are consistent with InnoDB's
+ internal data dictionary. */
+
+ dict_casedn_str(def.filepath);
+# endif /* !UNIV_HOTBACKUP */
+#endif
+
+ /* Check for a link file which locates a remote tablespace. */
+ remote.success = fil_open_linked_file(
+ tablename, &remote.filepath, &remote.file);
+
+ /* Read the first page of the remote tablespace */
+ if (remote.success) {
+ fil_validate_single_table_tablespace(tablename, &remote);
+ if (!remote.success) {
+ os_file_close(remote.file);
+ mem_free(remote.filepath);
+ }
+ }
+
+ /* Try to open the tablespace in the datadir. */
+ def.file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE, &def.success);
+
+	/* Read the first page of the datadir tablespace, if found. */
+ if (def.success) {
+ fil_validate_single_table_tablespace(tablename, &def);
+ if (!def.success) {
+ os_file_close(def.file);
+ }
+ }
+
+ if (!def.success && !remote.success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ fprintf(stderr,
+ "InnoDB: Error: could not open single-table"
+ " tablespace file %s\n", def.filepath);
+
+ if (!strncmp(filename,
+ tmp_file_prefix, tmp_file_prefix_length)) {
+ /* Ignore errors for #sql tablespaces. */
+ mem_free(tablename);
+ if (remote.filepath) {
+ mem_free(remote.filepath);
+ }
+ if (def.filepath) {
+ mem_free(def.filepath);
+ }
+ return;
+ }
+no_good_file:
+ fprintf(stderr,
+ "InnoDB: We do not continue the crash recovery,"
+ " because the table may become\n"
+ "InnoDB: corrupt if we cannot apply the log"
+ " records in the InnoDB log to it.\n"
+ "InnoDB: To fix the problem and start mysqld:\n"
+ "InnoDB: 1) If there is a permission problem"
+ " in the file and mysqld cannot\n"
+ "InnoDB: open the file, you should"
+ " modify the permissions.\n"
+ "InnoDB: 2) If the table is not needed, or you"
+ " can restore it from a backup,\n"
+ "InnoDB: then you can remove the .ibd file,"
+ " and InnoDB will do a normal\n"
+ "InnoDB: crash recovery and ignore that table.\n"
+ "InnoDB: 3) If the file system or the"
+ " disk is broken, and you cannot remove\n"
+ "InnoDB: the .ibd file, you can set"
+ " innodb_force_recovery > 0 in my.cnf\n"
+ "InnoDB: and force InnoDB to continue crash"
+ " recovery here.\n");
+will_not_choose:
+ mem_free(tablename);
+ if (remote.filepath) {
+ mem_free(remote.filepath);
+ }
+ if (def.filepath) {
+ mem_free(def.filepath);
+ }
+
+ if (srv_force_recovery > 0) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "innodb_force_recovery was set to %lu. "
+ "Continuing crash recovery even though we "
+ "cannot access the .ibd file of this table.",
+ srv_force_recovery);
+ return;
+ }
+
+ exit(1);
+ }
+
+ if (def.success && remote.success) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tablespaces for %s have been found in two places;\n"
+ "Location 1: SpaceID: %lu LSN: %lu File: %s\n"
+ "Location 2: SpaceID: %lu LSN: %lu File: %s\n"
+ "You must delete one of them.",
+ tablename, (ulong) def.id, (ulong) def.lsn,
+ def.filepath, (ulong) remote.id, (ulong) remote.lsn,
+ remote.filepath);
+
+ def.success = FALSE;
+ os_file_close(def.file);
+ os_file_close(remote.file);
+ goto will_not_choose;
+ }
+
+ /* At this point, only one tablespace is open */
+ ut_a(def.success == !remote.success);
+
+ fsp_open_info* fsp = def.success ? &def : &remote;
+
+ /* Get and test the file size. */
+ size = os_file_get_size(fsp->file);
+
+ if (size == (os_offset_t) -1) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "could not measure the size of single-table "
+ "tablespace file %s", fsp->filepath);
+
+ os_file_close(fsp->file);
+ goto no_good_file;
+ }
+
+ /* Every .ibd file is created >= 4 pages in size. Smaller files
+	cannot be valid. */
+ ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE;
+ if (size < minimum_size) {
+#ifndef UNIV_HOTBACKUP
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "The size of single-table tablespace file %s "
+ "is only " UINT64PF ", should be at least %lu!",
+ fsp->filepath, size, minimum_size);
+ os_file_close(fsp->file);
+ goto no_good_file;
+#else
+ fsp->id = ULINT_UNDEFINED;
+ fsp->flags = 0;
+#endif /* !UNIV_HOTBACKUP */
+ }
+
+#ifdef UNIV_HOTBACKUP
+ if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
+ char* new_path;
+
+ fprintf(stderr,
+ "InnoDB: Renaming tablespace %s of id %lu,\n"
+ "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+ "InnoDB: because its size %" PRId64 " is too small"
+ " (< 4 pages 16 kB each),\n"
+ "InnoDB: or the space id in the file header"
+ " is not sensible.\n"
+			"InnoDB: This can happen in a mysqlbackup run,"
+ " and is not dangerous.\n",
+ fsp->filepath, fsp->id, fsp->filepath, size);
+ os_file_close(fsp->file);
+
+ new_path = fil_make_ibbackup_old_name(fsp->filepath);
+
+ bool success = os_file_rename(
+ innodb_file_data_key, fsp->filepath, new_path);
+
+ ut_a(success);
+
+ mem_free(new_path);
+
+ goto func_exit_after_close;
+ }
+
+ /* A backup may contain the same space several times, if the space got
+ renamed at a sensitive time. Since it is enough to have one version of
+ the space, we rename the file if a space with the same space id
+ already exists in the tablespace memory cache. We rather rename the
+ file than delete it, because if there is a bug, we do not want to
+ destroy valuable data. */
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(fsp->id);
+
+ if (space) {
+ char* new_path;
+
+ fprintf(stderr,
+ "InnoDB: Renaming tablespace %s of id %lu,\n"
+ "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+ "InnoDB: because space %s with the same id\n"
+ "InnoDB: was scanned earlier. This can happen"
+ " if you have renamed tables\n"
+			"InnoDB: during a mysqlbackup run.\n",
+ fsp->filepath, fsp->id, fsp->filepath,
+ space->name);
+ os_file_close(fsp->file);
+
+ new_path = fil_make_ibbackup_old_name(fsp->filepath);
+
+ mutex_exit(&fil_system->mutex);
+
+ bool success = os_file_rename(
+ innodb_file_data_key, fsp->filepath, new_path);
+
+ ut_a(success);
+
+ mem_free(new_path);
+
+ goto func_exit_after_close;
+ }
+ mutex_exit(&fil_system->mutex);
+#endif /* UNIV_HOTBACKUP */
+ ibool file_space_create_success = fil_space_create(
+ tablename, fsp->id, fsp->flags, FIL_TABLESPACE);
+
+ if (!file_space_create_success) {
+ if (srv_force_recovery > 0) {
+ fprintf(stderr,
+ "InnoDB: innodb_force_recovery was set"
+ " to %lu. Continuing crash recovery\n"
+ "InnoDB: even though the tablespace"
+ " creation of this table failed.\n",
+ srv_force_recovery);
+ goto func_exit;
+ }
+
+ /* Exit here with a core dump, stack, etc. */
+ ut_a(file_space_create_success);
+ }
+
+ /* We do not use the size information we have about the file, because
+ the rounding formula for extents and pages is somewhat complex; we
+ let fil_node_open() do that task. */
+
+ if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) {
+ ut_error;
+ }
+
+func_exit:
+ os_file_close(fsp->file);
+
+#ifdef UNIV_HOTBACKUP
+func_exit_after_close:
+#else
+ ut_ad(!mutex_own(&fil_system->mutex));
+#endif
+ mem_free(tablename);
+ if (remote.success) {
+ mem_free(remote.filepath);
+ }
+ mem_free(def.filepath);
+}
+
+/***********************************************************************//**
+A fault-tolerant function that tries to read the next file name in the
+directory. We retry 100 times if os_file_readdir_next_file() returns -1. The
+idea is to read as much good data as we can and jump over bad data.
+@return 0 if ok, -1 if error even after the retries, 1 if at the end
+of the directory */
+static
+int
+fil_file_readdir_next_file(
+/*=======================*/
+ dberr_t* err, /*!< out: this is set to DB_ERROR if an error
+ was encountered, otherwise not changed */
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info) /*!< in/out: buffer where the
+ info is returned */
+{
+ for (ulint i = 0; i < 100; i++) {
+ int ret = os_file_readdir_next_file(dirname, dir, info);
+
+ if (ret != -1) {
+
+ return(ret);
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "os_file_readdir_next_file() returned -1 in "
+ "directory %s, crash recovery may have failed "
+ "for some .ibd files!", dirname);
+
+ *err = DB_ERROR;
+ }
+
+ return(-1);
+}
+
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, and also to know where to apply log records whose
+space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fil_load_single_table_tablespaces(void)
+/*===================================*/
+{
+ int ret;
+ char* dbpath = NULL;
+ ulint dbpath_len = 100;
+ os_file_dir_t dir;
+ os_file_dir_t dbdir;
+ os_file_stat_t dbinfo;
+ os_file_stat_t fileinfo;
+ dberr_t err = DB_SUCCESS;
+
+ /* The datadir of MySQL is always the default directory of mysqld */
+
+ dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);
+
+ if (dir == NULL) {
+
+ return(DB_ERROR);
+ }
+
+ dbpath = static_cast<char*>(mem_alloc(dbpath_len));
+
+ /* Scan all directories under the datadir. They are the database
+ directories of MySQL. */
+
+ ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir,
+ &dbinfo);
+ while (ret == 0) {
+ ulint len;
+ /* printf("Looking at %s in datadir\n", dbinfo.name); */
+
+ if (dbinfo.type == OS_FILE_TYPE_FILE
+ || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {
+
+ goto next_datadir_item;
+ }
+
+ /* We found a symlink or a directory; try opening it to see
+ if a symlink is a directory */
+
+ len = strlen(fil_path_to_mysql_datadir)
+			+ strlen(dbinfo.name) + 2;
+ if (len > dbpath_len) {
+ dbpath_len = len;
+
+ if (dbpath) {
+ mem_free(dbpath);
+ }
+
+ dbpath = static_cast<char*>(mem_alloc(dbpath_len));
+ }
+ ut_snprintf(dbpath, dbpath_len,
+ "%s/%s", fil_path_to_mysql_datadir, dbinfo.name);
+ srv_normalize_path_for_win(dbpath);
+
+ dbdir = os_file_opendir(dbpath, FALSE);
+
+ if (dbdir != NULL) {
+
+ /* We found a database directory; loop through it,
+ looking for possible .ibd files in it */
+
+ ret = fil_file_readdir_next_file(&err, dbpath, dbdir,
+ &fileinfo);
+ while (ret == 0) {
+
+ if (fileinfo.type == OS_FILE_TYPE_DIR) {
+
+ goto next_file_item;
+ }
+
+ /* We found a symlink or a file */
+ if (strlen(fileinfo.name) > 4
+ && (0 == strcmp(fileinfo.name
+ + strlen(fileinfo.name) - 4,
+ ".ibd")
+ || 0 == strcmp(fileinfo.name
+ + strlen(fileinfo.name) - 4,
+ ".isl"))) {
+ /* The name ends in .ibd or .isl;
+ try opening the file */
+ fil_load_single_table_tablespace(
+ dbinfo.name, fileinfo.name);
+ }
+next_file_item:
+ ret = fil_file_readdir_next_file(&err,
+ dbpath, dbdir,
+ &fileinfo);
+ }
+
+ if (0 != os_file_closedir(dbdir)) {
+ fputs("InnoDB: Warning: could not"
+ " close database directory ", stderr);
+ ut_print_filename(stderr, dbpath);
+ putc('\n', stderr);
+
+ err = DB_ERROR;
+ }
+ }
+
+next_datadir_item:
+ ret = fil_file_readdir_next_file(&err,
+ fil_path_to_mysql_datadir,
+ dir, &dbinfo);
+ }
+
+ mem_free(dbpath);
+
+ if (0 != os_file_closedir(dir)) {
+ fprintf(stderr,
+ "InnoDB: Error: could not close MySQL datadir\n");
+
+ return(DB_ERROR);
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+	ib_int64_t version)/*!< in: the tablespace_version should be
+			this; pass -1 to ignore the version check */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL || space->stop_new_ops) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ if (version != ((ib_int64_t)-1)
+ && space->tablespace_version != version) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(space != NULL);
+}
+
+/*******************************************************************//**
+Report that a tablespace for a table was not found. */
+static
+void
+fil_report_missing_tablespace(
+/*===========================*/
+ const char* name, /*!< in: table name */
+ ulint space_id) /*!< in: table's space id */
+{
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(index_name, sizeof(index_name), name, TRUE);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Table %s in the InnoDB data dictionary has tablespace id %lu, "
+ "but tablespace with that id or name does not exist. Have "
+ "you deleted or moved .ibd files? This may also be a table "
+ "created with CREATE TEMPORARY TABLE whose .ibd and .frm "
+ "files MySQL automatically removed, but the table still "
+ "exists in the InnoDB internal data dictionary.",
+ name, space_id);
+}
+
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name used in
+ fil_space_create(). Either the
+ standard 'dbname/tablename' format
+ or table->dir_path_of_temp_table */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist,
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found from
+ memory */
+ bool adjust_space, /*!< in: whether to adjust space id
+ when find table space mismatch */
+ mem_heap_t* heap, /*!< in: heap memory */
+ table_id_t table_id) /*!< in: table id */
+{
+ fil_space_t* fnamespace;
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ /* Look if there is a space with the same id */
+
+ space = fil_space_get_by_id(id);
+
+ /* Look if there is a space with the same name; the name is the
+ directory path from the datadir to the file */
+
+ fnamespace = fil_space_get_by_name(name);
+ if (space && space == fnamespace) {
+ /* Found */
+
+ if (mark_space) {
+ space->mark = TRUE;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+	/* Info from "fnamespace" comes from the ibd file itself; it can
+	differ from the data obtained from the system tables, since the
+	file is not transactional. If adjust_space is set, and the
+	mismatch is between a user table and its temp table, we shall
+	adjust the ibd file name according to the system table info. */
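+	/* For example: after a crash during ALTER TABLE, the file on
+	disk may still carry a temporary name such as "test/#sql-ib123"
+	while the data dictionary already refers to "test/t1"; the
+	renames below bring the file back in sync with the dictionary. */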
+ if (adjust_space
+ && space != NULL
+ && row_is_mysql_tmp_table_name(space->name)
+ && !row_is_mysql_tmp_table_name(name)) {
+
+ mutex_exit(&fil_system->mutex);
+
+ DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space",
+ DBUG_SUICIDE(););
+
+ if (fnamespace) {
+ char* tmp_name;
+
+ tmp_name = dict_mem_create_temporary_tablename(
+ heap, name, table_id);
+
+ fil_rename_tablespace(fnamespace->name, fnamespace->id,
+ tmp_name, NULL);
+ }
+
+ DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space",
+ DBUG_SUICIDE(););
+
+ fil_rename_tablespace(space->name, id, name, NULL);
+
+ DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space",
+ DBUG_SUICIDE(););
+
+ mutex_enter(&fil_system->mutex);
+ fnamespace = fil_space_get_by_name(name);
+ ut_ad(space == fnamespace);
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ if (!print_error_if_does_not_exist) {
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ if (space == NULL) {
+ if (fnamespace == NULL) {
+ if (print_error_if_does_not_exist) {
+ fil_report_missing_tablespace(name, id);
+ }
+ } else {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary has"
+ " tablespace id %lu,\n"
+ "InnoDB: but a tablespace with that id"
+ " does not exist. There is\n"
+ "InnoDB: a tablespace of name %s and id %lu,"
+ " though. Have\n"
+ "InnoDB: you deleted or moved .ibd files?\n",
+ (ulong) id, fnamespace->name,
+ (ulong) fnamespace->id);
+ }
+error_exit:
+ fputs("InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+ "InnoDB: for how to resolve the issue.\n", stderr);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ if (0 != strcmp(space->name, name)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary has"
+ " tablespace id %lu,\n"
+ "InnoDB: but the tablespace with that id"
+ " has name %s.\n"
+ "InnoDB: Have you deleted or moved .ibd files?\n",
+ (ulong) id, space->name);
+
+ if (fnamespace != NULL) {
+ fputs("InnoDB: There is a tablespace"
+ " with the right name\n"
+ "InnoDB: ", stderr);
+ ut_print_filename(stderr, fnamespace->name);
+ fprintf(stderr, ", but its id is %lu.\n",
+ (ulong) fnamespace->id);
+ }
+
+ goto error_exit;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return space id, ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+ const char* tablename) /*!< in: table name in the standard
+ 'databasename/tablename' format */
+{
+ fil_space_t* fnamespace;
+ ulint id = ULINT_UNDEFINED;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ /* Look if there is a space with the same name. */
+
+ fnamespace = fil_space_get_by_name(tablename);
+
+ if (fnamespace) {
+ id = fnamespace->id;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(id);
+}
+
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend)/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+ byte* buf2;
+ byte* buf;
+ ulint buf_size;
+ ulint start_page_no;
+ ulint file_start_page_no;
+ ulint page_size;
+ ulint pages_added;
+ ibool success;
+
+ ut_ad(!srv_read_only_mode);
+
+retry:
+ pages_added = 0;
+ success = TRUE;
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ space = fil_space_get_by_id(space_id);
+ ut_a(space);
+
+ if (space->size >= size_after_extend) {
+ /* Space already big enough */
+
+ *actual_size = space->size;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ page_size = fsp_flags_get_zip_size(space->flags);
+ if (!page_size) {
+ page_size = UNIV_PAGE_SIZE;
+ }
+
+ node = UT_LIST_GET_LAST(space->chain);
+
+ if (!node->being_extended) {
+ /* Mark this node as undergoing extension. This flag
+ is used by other threads to wait for the extension
+		operation to finish. */
+ node->being_extended = TRUE;
+ } else {
+ /* Another thread is currently extending the file. Wait
+ for it to finish.
+		It would have been better to use an event-driven
+		mechanism, but the entire module is peppered with
+		polling code. */
+ mutex_exit(&fil_system->mutex);
+ os_thread_sleep(100000);
+ goto retry;
+ }
+
+ if (!fil_node_prepare_for_io(node, fil_system, space)) {
+ /* The tablespace data file, such as .ibd file, is missing */
+ node->being_extended = false;
+ mutex_exit(&fil_system->mutex);
+
+ return(false);
+ }
+
+ /* At this point it is safe to release fil_system mutex. No
+ other thread can rename, delete or close the file because
+ we have set the node->being_extended flag. */
+ mutex_exit(&fil_system->mutex);
+
+ start_page_no = space->size;
+ file_start_page_no = space->size - node->size;
+
+ /* Extend at most 64 pages at a time */
+ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
+ buf2 = static_cast<byte*>(mem_alloc(buf_size + page_size));
+ buf = static_cast<byte*>(ut_align(buf2, page_size));
+
+ memset(buf, 0, buf_size);
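+
+	/* For example: extending a 16 kB-page tablespace from page 100
+	to page 1000 writes the 900 new pages as zero-filled chunks of
+	at most 64 pages (1 MB) each. */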
+
+ while (start_page_no < size_after_extend) {
+ ulint n_pages
+ = ut_min(buf_size / page_size,
+ size_after_extend - start_page_no);
+
+ os_offset_t offset
+ = ((os_offset_t) (start_page_no - file_start_page_no))
+ * page_size;
+#ifdef UNIV_HOTBACKUP
+ success = os_file_write(node->name, node->handle, buf,
+ offset, page_size * n_pages);
+#else
+ success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
+ node->name, node->handle, buf,
+ offset, page_size * n_pages,
+ NULL, NULL);
+#endif /* UNIV_HOTBACKUP */
+ if (success) {
+ os_has_said_disk_full = FALSE;
+ } else {
+ /* Let us measure the size of the file to determine
+ how much we were able to extend it */
+ os_offset_t size;
+
+ size = os_file_get_size(node->handle);
+ ut_a(size != (os_offset_t) -1);
+
+ n_pages = ((ulint) (size / page_size))
+ - node->size - pages_added;
+
+ pages_added += n_pages;
+ break;
+ }
+
+ start_page_no += n_pages;
+ pages_added += n_pages;
+ }
+
+ mem_free(buf2);
+
+ mutex_enter(&fil_system->mutex);
+
+ ut_a(node->being_extended);
+
+ space->size += pages_added;
+ node->size += pages_added;
+ node->being_extended = FALSE;
+
+ fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
+
+ *actual_size = space->size;
+
+#ifndef UNIV_HOTBACKUP
+ if (space_id == 0) {
+ ulint pages_per_mb = (1024 * 1024) / page_size;
+
+ /* Keep the last data file size info up to date, rounded to
+ full megabytes */
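+		/* For example: with 16 kB pages, pages_per_mb is 64, so
+		a node of 130 pages is recorded as 128 pages (2 MB). */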
+
+ srv_data_file_sizes[srv_n_data_files - 1]
+ = (node->size / pages_per_mb) * pages_per_mb;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /*
+ printf("Extended %s to %lu, actual size %lu pages\n", space->name,
+ size_after_extend, *actual_size); */
+ mutex_exit(&fil_system->mutex);
+
+ fil_flush(space_id);
+
+ return(success);
+}
+
+#ifdef UNIV_HOTBACKUP
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+mysqlbackup --apply-log phase we extended the spaces on-demand so that log
+records could be applied, but that may have left spaces still too small
+compared to the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void)
+/*======================================*/
+{
+ fil_space_t* space;
+ byte* buf;
+ ulint actual_size;
+ ulint size_in_header;
+ dberr_t error;
+ ibool success;
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space) {
+ ut_a(space->purpose == FIL_TABLESPACE);
+
+ mutex_exit(&fil_system->mutex); /* no need to protect with a
+ mutex, because this is a
+ single-threaded operation */
+ error = fil_read(TRUE, space->id,
+ fsp_flags_get_zip_size(space->flags),
+ 0, 0, UNIV_PAGE_SIZE, buf, NULL);
+ ut_a(error == DB_SUCCESS);
+
+ size_in_header = fsp_get_size_low(buf);
+
+ success = fil_extend_space_to_desired_size(
+ &actual_size, space->id, size_in_header);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: could not extend the"
+ " tablespace of %s\n"
+ "InnoDB: to the size stored in header,"
+ " %lu pages;\n"
+ "InnoDB: size after extension %lu pages\n"
+ "InnoDB: Check that you have free disk space"
+ " and retry!\n",
+ space->name, size_in_header, actual_size);
+ ut_a(success);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ mem_free(buf);
+}
+#endif
+
+/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
+
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if succeed */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_free_now, /*!< in: number of free extents now */
+ ulint n_to_reserve) /*!< in: how many one wants to reserve */
+{
+ fil_space_t* space;
+ ibool success;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ if (space->n_reserved_extents + n_to_reserve > n_free_now) {
+ success = FALSE;
+ } else {
+ space->n_reserved_extents += n_to_reserve;
+ success = TRUE;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(success);
+}
+
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_reserved) /*!< in: how many one reserved */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+ ut_a(space->n_reserved_extents >= n_reserved);
+
+ space->n_reserved_extents -= n_reserved;
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ulint n;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ n = space->n_reserved_extents;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(n);
+}
+
+/*============================ FILE I/O ================================*/
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex.
+@return false if the file can't be opened, otherwise true */
+static
+bool
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space */
+{
+ ut_ad(node && system && space);
+ ut_ad(mutex_own(&(system->mutex)));
+
+ if (system->n_open > system->max_n_open + 5) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: open files %lu"
+ " exceeds the limit %lu\n",
+ (ulong) system->n_open,
+ (ulong) system->max_n_open);
+ }
+
+ if (node->open == FALSE) {
+ /* File is closed: open it */
+ ut_a(node->n_pending == 0);
+
+ if (!fil_node_open_file(node, system, space)) {
+ return(false);
+ }
+ }
+
+ if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) {
+ /* The node is in the LRU list, remove it */
+
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+ }
+
+ node->n_pending++;
+
+ return(true);
+}
+
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ ulint type) /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
+{
+ ut_ad(node);
+ ut_ad(system);
+ ut_ad(mutex_own(&(system->mutex)));
+
+ ut_a(node->n_pending > 0);
+
+ node->n_pending--;
+
+ if (type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ system->modification_counter++;
+ node->modification_counter = system->modification_counter;
+
+ if (fil_buffering_disabled(node->space)) {
+
+ /* We don't need to keep track of unflushed
+ changes as user has explicitly disabled
+ buffering. */
+ ut_ad(!node->space->is_in_unflushed_spaces);
+ node->flush_counter = node->modification_counter;
+
+ } else if (!node->space->is_in_unflushed_spaces) {
+
+ node->space->is_in_unflushed_spaces = true;
+ UT_LIST_ADD_FIRST(unflushed_spaces,
+ system->unflushed_spaces,
+ node->space);
+ }
+ }
+
+ if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) {
+
+ /* The node must be put back to the LRU list */
+ UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+ }
+}
+
+/********************************************************************//**
+Report information about an invalid page access. */
+static
+void
+fil_report_invalid_page_access(
+/*===========================*/
+ ulint block_offset, /*!< in: block offset */
+ ulint space_id, /*!< in: space id */
+ const char* space_name, /*!< in: space name */
+ ulint byte_offset, /*!< in: byte offset */
+ ulint len, /*!< in: I/O length */
+ ulint type) /*!< in: I/O type */
+{
+ fprintf(stderr,
+ "InnoDB: Error: trying to access page number %lu"
+ " in space %lu,\n"
+ "InnoDB: space name %s,\n"
+ "InnoDB: which is outside the tablespace bounds.\n"
+ "InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n"
+ "InnoDB: If you get this error at mysqld startup,"
+ " please check that\n"
+ "InnoDB: your my.cnf matches the ibdata files"
+ " that you have in the\n"
+ "InnoDB: MySQL server.\n",
+ (ulong) block_offset, (ulong) space_id, space_name,
+ (ulong) byte_offset, (ulong) len, (ulong) type);
+}
+
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INTERN
+dberr_t
+fil_io(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ bool sync, /*!< in: true if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+ void* buf, /*!< in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ ulint mode;
+ fil_space_t* space;
+ fil_node_t* node;
+ ibool ret;
+ ulint is_log;
+ ulint wake_later;
+ os_offset_t offset;
+ ibool ignore_nonexistent_pages;
+
+ is_log = type & OS_FILE_LOG;
+ type = type & ~OS_FILE_LOG;
+
+ wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
+ type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+ ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES;
+ type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES;
+
+ ut_ad(byte_offset < UNIV_PAGE_SIZE);
+ ut_ad(!zip_size || !byte_offset);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(buf);
+ ut_ad(len > 0);
+ ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT));
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX"
+#endif
+#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN
+# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN"
+#endif
+ ut_ad(fil_validate_skip());
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+ /* ibuf bitmap pages must be read in the sync aio mode: */
+ ut_ad(recv_no_ibuf_operations
+ || type == OS_FILE_WRITE
+ || !ibuf_bitmap_page(zip_size, block_offset)
+ || sync
+ || is_log);
+# endif /* UNIV_LOG_DEBUG */
+ if (sync) {
+ mode = OS_AIO_SYNC;
+ } else if (is_log) {
+ mode = OS_AIO_LOG;
+ } else if (type == OS_FILE_READ
+ && !recv_no_ibuf_operations
+ && ibuf_page(space_id, zip_size, block_offset, NULL)) {
+ mode = OS_AIO_IBUF;
+ } else {
+ mode = OS_AIO_NORMAL;
+ }
+#else /* !UNIV_HOTBACKUP */
+ ut_a(sync);
+ mode = OS_AIO_SYNC;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (type == OS_FILE_READ) {
+ srv_stats.data_read.add(len);
+ } else if (type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ srv_stats.data_written.add(len);
+ }
+
+ /* Reserve the fil_system mutex and make sure that we can open at
+ least one file while holding it, if the file is not already open */
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ space = fil_space_get_by_id(space_id);
+
+ /* If we are deleting a tablespace we don't allow any read
+	operations on it. However, we do allow write operations. */
+ if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) {
+ mutex_exit(&fil_system->mutex);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Trying to do i/o to a tablespace which does "
+ "not exist. i/o type %lu, space id %lu, "
+ "page no. %lu, i/o length %lu bytes",
+ (ulong) type, (ulong) space_id, (ulong) block_offset,
+ (ulong) len);
+
+ return(DB_TABLESPACE_DELETED);
+ }
+
+ ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE);
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ for (;;) {
+ if (node == NULL) {
+ if (ignore_nonexistent_pages) {
+ mutex_exit(&fil_system->mutex);
+ return(DB_ERROR);
+ }
+
+ fil_report_invalid_page_access(
+ block_offset, space_id, space->name,
+ byte_offset, len, type);
+
+ ut_error;
+
+ } else if (fil_is_user_tablespace_id(space->id)
+ && node->size == 0) {
+
+ /* We do not know the size of a single-table tablespace
+ before we open the file */
+ break;
+ } else if (node->size > block_offset) {
+ /* Found! */
+ break;
+ } else {
+ block_offset -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+
+ /* Open file if closed */
+ if (!fil_node_prepare_for_io(node, fil_system, space)) {
+ if (space->purpose == FIL_TABLESPACE
+ && fil_is_user_tablespace_id(space->id)) {
+ mutex_exit(&fil_system->mutex);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Trying to do i/o to a tablespace which "
+ "exists without .ibd data file. "
+ "i/o type %lu, space id %lu, page no %lu, "
+ "i/o length %lu bytes",
+ (ulong) type, (ulong) space_id,
+ (ulong) block_offset, (ulong) len);
+
+ return(DB_TABLESPACE_DELETED);
+ }
+
+		/* The tablespace is for the log. Currently, we just assert
+		here instead of propagating an error back through the
+		callers of fil_io(). Also, if the log files are missing,
+		it would be hard to promise that the server can continue
+		running. */
+ ut_a(0);
+ }
+
+ /* Check that at least the start offset is within the bounds of a
+ single-table tablespace, including rollback tablespaces. */
+ if (UNIV_UNLIKELY(node->size <= block_offset)
+ && space->id != 0 && space->purpose == FIL_TABLESPACE) {
+
+ fil_report_invalid_page_access(
+ block_offset, space_id, space->name, byte_offset,
+ len, type);
+
+ ut_error;
+ }
+
+ /* Now we have made the changes in the data structures of fil_system */
+ mutex_exit(&fil_system->mutex);
+
+	/* Calculate the byte offset of the i/o within the file */
+
+ if (!zip_size) {
+ offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT)
+ + byte_offset;
+
+ ut_a(node->size - block_offset
+ >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
+ / UNIV_PAGE_SIZE));
+ } else {
+ ulint zip_size_shift;
+ switch (zip_size) {
+ case 1024: zip_size_shift = 10; break;
+ case 2048: zip_size_shift = 11; break;
+ case 4096: zip_size_shift = 12; break;
+ case 8192: zip_size_shift = 13; break;
+ case 16384: zip_size_shift = 14; break;
+ default: ut_error;
+ }
+ offset = ((os_offset_t) block_offset << zip_size_shift)
+ + byte_offset;
+ ut_a(node->size - block_offset
+ >= (len + (zip_size - 1)) / zip_size);
+ }
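+
+	/* For example: block_offset 3 with zip_size 8192 gives offset
+	3 << 13 = 24576, whereas with uncompressed 16 kB pages the same
+	block_offset gives 3 << 14 = 49152 plus byte_offset (byte_offset
+	must be 0 for compressed pages, as asserted above). */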
+
+ /* Do aio */
+
+ ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+
+#ifdef UNIV_HOTBACKUP
+ /* In mysqlbackup do normal i/o, not aio */
+ if (type == OS_FILE_READ) {
+ ret = os_file_read(node->handle, buf, offset, len);
+ } else {
+ ut_ad(!srv_read_only_mode);
+ ret = os_file_write(node->name, node->handle, buf,
+ offset, len);
+ }
+#else
+ /* Queue the aio request */
+ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+ offset, len, node, message);
+#endif /* UNIV_HOTBACKUP */
+ ut_a(ret);
+
+ if (mode == OS_AIO_SYNC) {
+ /* The i/o operation is already completed when we return from
+ os_aio: */
+
+ mutex_enter(&fil_system->mutex);
+
+ fil_node_complete_io(node, fil_system, type);
+
+ mutex_exit(&fil_system->mutex);
+
+ ut_ad(fil_validate_skip());
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is also responsible
+for invoking the i/o handler of the completed request. The aio array of
+pending requests is divided into segments (see os0file.cc for more info).
+The thread specifies which segment it wants to wait for.
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment) /*!< in: the number of the segment in the aio
+ array to wait for */
+{
+ ibool ret;
+ fil_node_t* fil_node;
+ void* message;
+ ulint type;
+
+ ut_ad(fil_validate_skip());
+
+ if (srv_use_native_aio) {
+ srv_set_io_thread_op_info(segment, "native aio handle");
+#ifdef WIN_ASYNC_IO
+ ret = os_aio_windows_handle(
+ segment, 0, &fil_node, &message, &type);
+#elif defined(LINUX_NATIVE_AIO)
+ ret = os_aio_linux_handle(
+ segment, &fil_node, &message, &type);
+#else
+ ut_error;
+ ret = 0; /* Eliminate compiler warning */
+#endif /* WIN_ASYNC_IO */
+ } else {
+ srv_set_io_thread_op_info(segment, "simulated aio handle");
+
+ ret = os_aio_simulated_handle(
+ segment, &fil_node, &message, &type);
+ }
+
+ ut_a(ret);
+ if (fil_node == NULL) {
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+ return;
+ }
+
+ srv_set_io_thread_op_info(segment, "complete io for fil node");
+
+ mutex_enter(&fil_system->mutex);
+
+ fil_node_complete_io(fil_node, fil_system, type);
+
+ mutex_exit(&fil_system->mutex);
+
+ ut_ad(fil_validate_skip());
+
+ /* Do the i/o handling */
+ /* IMPORTANT: since i/o handling for reads will read also the insert
+ buffer in tablespace 0, you have to be very careful not to introduce
+ deadlocks in the i/o system. We keep tablespace 0 data files always
+ open, and use a special i/o thread to serve insert buffer requests. */
+
+ if (fil_node->space->purpose == FIL_TABLESPACE) {
+ srv_set_io_thread_op_info(segment, "complete io for buf page");
+ buf_page_io_complete(static_cast<buf_page_t*>(message));
+ } else {
+ srv_set_io_thread_op_info(segment, "complete io for log");
+ log_io_complete(static_cast<log_group_t*>(message));
+ }
+}
+#endif /* UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+ ulint space_id) /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ os_file_t file;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (!space || space->stop_new_ops) {
+ mutex_exit(&fil_system->mutex);
+
+ return;
+ }
+
+ if (fil_buffering_disabled(space)) {
+
+ /* No need to flush. User has explicitly disabled
+ buffering. */
+ ut_ad(!space->is_in_unflushed_spaces);
+ ut_ad(fil_space_is_flushed(space));
+ ut_ad(space->n_pending_flushes == 0);
+
+#ifdef UNIV_DEBUG
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+ ut_ad(node->modification_counter
+ == node->flush_counter);
+ ut_ad(node->n_pending_flushes == 0);
+ }
+#endif /* UNIV_DEBUG */
+
+ mutex_exit(&fil_system->mutex);
+ return;
+ }
+
+	space->n_pending_flushes++;	/* prevent dropping of the space
+					while we are flushing */
+ for (node = UT_LIST_GET_FIRST(space->chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+
+		ib_int64_t	old_mod_counter = node->modification_counter;
+
+ if (old_mod_counter <= node->flush_counter) {
+ continue;
+ }
+
+ ut_a(node->open);
+
+ if (space->purpose == FIL_TABLESPACE) {
+ fil_n_pending_tablespace_flushes++;
+ } else {
+ fil_n_pending_log_flushes++;
+ fil_n_log_flushes++;
+ }
+#ifdef __WIN__
+ if (node->is_raw_disk) {
+
+ goto skip_flush;
+ }
+#endif /* __WIN__ */
+retry:
+ if (node->n_pending_flushes > 0) {
+ /* We want to avoid calling os_file_flush() on
+ the file twice at the same time, because we do
+			not know what bugs OSes may contain in file
+ i/o */
+
+ ib_int64_t sig_count =
+ os_event_reset(node->sync_event);
+
+ mutex_exit(&fil_system->mutex);
+
+ os_event_wait_low(node->sync_event, sig_count);
+
+ mutex_enter(&fil_system->mutex);
+
+ if (node->flush_counter >= old_mod_counter) {
+
+ goto skip_flush;
+ }
+
+ goto retry;
+ }
+
+ ut_a(node->open);
+ file = node->handle;
+ node->n_pending_flushes++;
+
+ mutex_exit(&fil_system->mutex);
+
+ os_file_flush(file);
+
+ mutex_enter(&fil_system->mutex);
+
+ os_event_set(node->sync_event);
+
+ node->n_pending_flushes--;
+skip_flush:
+ if (node->flush_counter < old_mod_counter) {
+ node->flush_counter = old_mod_counter;
+
+ if (space->is_in_unflushed_spaces
+ && fil_space_is_flushed(space)) {
+
+ space->is_in_unflushed_spaces = false;
+
+ UT_LIST_REMOVE(
+ unflushed_spaces,
+ fil_system->unflushed_spaces,
+ space);
+ }
+ }
+
+ if (space->purpose == FIL_TABLESPACE) {
+ fil_n_pending_tablespace_flushes--;
+ } else {
+ fil_n_pending_log_flushes--;
+ }
+ }
+
+ space->n_pending_flushes--;
+
+ mutex_exit(&fil_system->mutex);
+}
+
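+/* The retry loop in fil_flush() above relies on the signal-count idiom
+of os_event to serialize flushes of one file without sleeping while
+holding the mutex. A minimal sketch of the idiom (illustrative, not a
+definitive description of the os_event interface):
+
+	ib_int64_t	sig_count = os_event_reset(event);
+	mutex_exit(&mutex);
+	os_event_wait_low(event, sig_count);
+	mutex_enter(&mutex);
+
+os_event_wait_low() returns immediately if the event was set after the
+os_event_reset() call, so a wakeup delivered between mutex_exit() and
+the wait cannot be lost; the waiter then re-checks the flush counters
+under the mutex. */
+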
+/**********************************************************************//**
+Flushes to disk the writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose) /*!< in: FIL_TABLESPACE, FIL_LOG */
+{
+ fil_space_t* space;
+ ulint* space_ids;
+ ulint n_space_ids;
+ ulint i;
+
+ mutex_enter(&fil_system->mutex);
+
+ n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces);
+ if (n_space_ids == 0) {
+
+ mutex_exit(&fil_system->mutex);
+ return;
+ }
+
+ /* Assemble a list of space ids to flush. Previously, we
+ traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
+ on a space that was just removed from the list by fil_flush().
+ Thus, the space could be dropped and the memory overwritten. */
+ space_ids = static_cast<ulint*>(
+ mem_alloc(n_space_ids * sizeof *space_ids));
+
+ n_space_ids = 0;
+
+ for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces);
+ space;
+ space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
+
+ if (space->purpose == purpose && !space->stop_new_ops) {
+
+ space_ids[n_space_ids++] = space->id;
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ /* Flush the spaces. It will not hurt to call fil_flush() on
+ a non-existing space id. */
+ for (i = 0; i < n_space_ids; i++) {
+
+ fil_flush(space_ids[i]);
+ }
+
+ mem_free(space_ids);
+}
+
+/** Functor to validate the space list. */
+struct Check {
+ void operator()(const fil_node_t* elem)
+ {
+ ut_a(elem->open || !elem->n_pending);
+ }
+};
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void)
+/*==============*/
+{
+ fil_space_t* space;
+ fil_node_t* fil_node;
+ ulint n_open = 0;
+ ulint i;
+
+ mutex_enter(&fil_system->mutex);
+
+ /* Look for spaces in the hash table */
+
+ for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
+
+ for (space = static_cast<fil_space_t*>(
+ HASH_GET_FIRST(fil_system->spaces, i));
+ space != 0;
+ space = static_cast<fil_space_t*>(
+ HASH_GET_NEXT(hash, space))) {
+
+ UT_LIST_VALIDATE(
+ chain, fil_node_t, space->chain, Check());
+
+ for (fil_node = UT_LIST_GET_FIRST(space->chain);
+ fil_node != 0;
+ fil_node = UT_LIST_GET_NEXT(chain, fil_node)) {
+
+ if (fil_node->n_pending > 0) {
+ ut_a(fil_node->open);
+ }
+
+ if (fil_node->open) {
+ n_open++;
+ }
+ }
+ }
+ }
+
+ ut_a(fil_system->n_open == n_open);
+
+ UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU);
+
+ for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+ fil_node != 0;
+ fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) {
+
+ ut_a(fil_node->n_pending == 0);
+ ut_a(!fil_node->being_extended);
+ ut_a(fil_node->open);
+ ut_a(fil_space_belongs_in_lru(fil_node->space));
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+ fil_addr_t addr) /*!< in: address */
+{
+ return(addr.page == FIL_NULL);
+}
+
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
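+/* FIL_PAGE_PREV and FIL_PAGE_NEXT link the pages of one B-tree level
+into a doubly linked list; FIL_NULL marks the end of the chain. An
+illustrative forward walk, assuming a hypothetical fetch_page() helper
+that returns the page frame:
+
+	for (ulint no = first_page_no; no != FIL_NULL; ) {
+		const byte*	page = fetch_page(space_id, no);
+		no = fil_page_get_next(page);
+	}
+*/
+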
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type) /*!< in: type */
+{
+ ut_ad(page);
+
+ mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to the page,
+the return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_2(page + FIL_PAGE_TYPE));
+}
+
+/****************************************************************//**
+Closes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_close(void)
+/*===========*/
+{
+#ifndef UNIV_HOTBACKUP
+ /* The mutex should already have been freed. */
+ ut_ad(fil_system->mutex.magic_n == 0);
+#endif /* !UNIV_HOTBACKUP */
+
+ hash_table_free(fil_system->spaces);
+
+ hash_table_free(fil_system->name_hash);
+
+ ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0);
+ ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0);
+ ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0);
+
+ mem_free(fil_system);
+
+ fil_system = NULL;
+}
+
+/********************************************************************//**
+Initializes a buffer control block when the buf_pool is created. */
+static
+void
+fil_buf_block_init(
+/*===============*/
+ buf_block_t* block, /*!< in: pointer to control block */
+ byte* frame) /*!< in: pointer to buffer frame */
+{
+ UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE);
+
+ block->frame = frame;
+
+ block->page.io_fix = BUF_IO_NONE;
+ /* There are assertions that check for this. */
+ block->page.buf_fix_count = 1;
+ block->page.state = BUF_BLOCK_READY_FOR_USE;
+
+ page_zip_des_init(&block->page.zip);
+}
+
+struct fil_iterator_t {
+ os_file_t file; /*!< File handle */
+ const char* filepath; /*!< File path name */
+ os_offset_t start; /*!< From where to start */
+ os_offset_t end; /*!< Where to stop */
+ os_offset_t file_size; /*!< File size in bytes */
+ ulint page_size; /*!< Page size */
+ ulint n_io_buffers; /*!< Number of pages to use
+ for IO */
+ byte* io_buffer; /*!< Buffer to use for IO */
+};
+
+/********************************************************************//**
+TODO: This can be made parallel trivially by chunking up the file and creating
+a callback per thread. The main benefit would be to use multiple CPUs for
+checksums and compressed tables. Currently we have to process compressed
+tables block by block, and we decompress/compress and copy too much data;
+both are CPU intensive.
+
+Iterate over all the pages in the tablespace.
+@param iter - Tablespace iterator
+@param block - block to use for IO
+@param callback - Callback to inspect and update page contents
+@retval DB_SUCCESS or error code */
+static
+dberr_t
+fil_iterate(
+/*========*/
+ const fil_iterator_t& iter,
+ buf_block_t* block,
+ PageCallback& callback)
+{
+ os_offset_t offset;
+ ulint page_no = 0;
+ ulint space_id = callback.get_space_id();
+ ulint n_bytes = iter.n_io_buffers * iter.page_size;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* TODO: For compressed tables we do a lot of useless
+ copying for non-index pages. Unfortunately, it is
+ required by buf_zip_decompress() */
+
+ for (offset = iter.start; offset < iter.end; offset += n_bytes) {
+
+ byte* io_buffer = iter.io_buffer;
+
+ block->frame = io_buffer;
+
+ if (callback.get_zip_size() > 0) {
+ page_zip_des_init(&block->page.zip);
+ page_zip_set_size(&block->page.zip, iter.page_size);
+ block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
+ ut_d(block->page.zip.m_external = true);
+ ut_ad(iter.page_size == callback.get_zip_size());
+
+ /* Zip IO is done in the compressed page buffer. */
+ io_buffer = block->page.zip.data;
+ } else {
+ io_buffer = iter.io_buffer;
+ }
+
+ /* We have to read the exact number of bytes. Otherwise the
+ InnoDB IO functions croak on failed reads. */
+
+ n_bytes = static_cast<ulint>(
+ ut_min(static_cast<os_offset_t>(n_bytes),
+ iter.end - offset));
+
+ ut_ad(n_bytes > 0);
+ ut_ad(!(n_bytes % iter.page_size));
+
+ if (!os_file_read(iter.file, io_buffer, offset,
+ (ulint) n_bytes)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed");
+
+ return(DB_IO_ERROR);
+ }
+
+ bool updated = false;
+ os_offset_t page_off = offset;
+ ulint n_pages_read = (ulint) n_bytes / iter.page_size;
+
+ for (ulint i = 0; i < n_pages_read; ++i) {
+
+ buf_block_set_file_page(block, space_id, page_no++);
+
+ dberr_t err;
+
+ if ((err = callback(page_off, block)) != DB_SUCCESS) {
+
+ return(err);
+
+ } else if (!updated) {
+ updated = buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE;
+ }
+
+ buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+ buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+
+ page_off += iter.page_size;
+ block->frame += iter.page_size;
+ }
+
+ /* A page was updated in the set, write back to disk. */
+ if (updated
+ && !os_file_write(
+ iter.filepath, iter.file, io_buffer,
+ offset, (ulint) n_bytes)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+ dict_table_t* table,
+ ulint n_io_buffers,
+ PageCallback& callback)
+{
+ dberr_t err;
+ os_file_t file;
+ char* filepath;
+
+ ut_a(n_io_buffers > 0);
+ ut_ad(!srv_read_only_mode);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_1",
+ return(DB_CORRUPTION););
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ dict_get_and_save_data_dir_path(table, false);
+ ut_a(table->data_dir_path);
+
+ filepath = os_file_make_remote_pathname(
+ table->data_dir_path, table->name, "ibd");
+ } else {
+ filepath = fil_make_ibd_name(table->name, false);
+ }
+
+ {
+ ibool success;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key, filepath,
+ OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+
+ DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
+ {
+ static bool once;
+
+ if (!once || ut_rnd_interval(0, 10) == 5) {
+ once = true;
+ success = FALSE;
+ os_file_close(file);
+ }
+ });
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Trying to import a tablespace, but could not "
+ "open the tablespace file %s", filepath);
+
+ mem_free(filepath);
+
+ return(DB_TABLESPACE_NOT_FOUND);
+
+ } else {
+ err = DB_SUCCESS;
+ }
+ }
+
+ callback.set_file(filepath, file);
+
+ os_offset_t file_size = os_file_get_size(file);
+ ut_a(file_size != (os_offset_t) -1);
+
+ /* The block we will use for every physical page */
+ buf_block_t block;
+
+ memset(&block, 0x0, sizeof(block));
+
+	/* Allocate a page to read in the tablespace header, so that we
+	can determine the page size and zip_size (if it is compressed).
+	We allocate an extra page in case it is a compressed table, and
+	one more page to ensure alignment. */
+
+ void* page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE);
+ byte* page = static_cast<byte*>(ut_align(page_ptr, UNIV_PAGE_SIZE));
+
+ fil_buf_block_init(&block, page);
+
+ /* Read the first page and determine the page and zip size. */
+
+ if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) {
+
+ err = DB_IO_ERROR;
+
+ } else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) {
+ fil_iterator_t iter;
+
+ iter.file = file;
+ iter.start = 0;
+ iter.end = file_size;
+ iter.filepath = filepath;
+ iter.file_size = file_size;
+ iter.n_io_buffers = n_io_buffers;
+ iter.page_size = callback.get_page_size();
+
+ /* Compressed pages can't be optimised for block IO for now.
+ We do the IMPORT page by page. */
+
+ if (callback.get_zip_size() > 0) {
+ iter.n_io_buffers = 1;
+ ut_a(iter.page_size == callback.get_zip_size());
+ }
+
+		/* Add an extra page for the compressed page scratch area. */
+
+ void* io_buffer = mem_alloc(
+ (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE);
+
+ iter.io_buffer = static_cast<byte*>(
+ ut_align(io_buffer, UNIV_PAGE_SIZE));
+
+ err = fil_iterate(iter, &block, callback);
+
+ mem_free(io_buffer);
+ }
+
+ if (err == DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk");
+
+ if (!os_file_flush(file)) {
+ ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!");
+ err = DB_IO_ERROR;
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!");
+ }
+ }
+
+ os_file_close(file);
+
+ mem_free(page_ptr);
+ mem_free(filepath);
+
+ return(err);
+}
+
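+/* A minimal sketch of a PageCallback implementation that merely counts
+the pages visited. It assumes the virtuals exercised above (init(),
+operator()() and get_space_id()); the real implementations live in the
+IMPORT code:
+
+	struct PageCounter : public PageCallback {
+		ulint	m_space;
+		ulint	m_n_pages;
+
+		PageCounter(ulint space) : m_space(space), m_n_pages(0) {}
+
+		virtual dberr_t init(os_offset_t, buf_block_t* block)
+		{
+			return(set_zip_size(block->frame));
+		}
+
+		virtual dberr_t operator()(os_offset_t, buf_block_t*)
+		{
+			++m_n_pages;
+			return(DB_SUCCESS);
+		}
+
+		virtual ulint get_space_id() const
+		{
+			return(m_space);
+		}
+	};
+*/
+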
+/**
+Set the compressed page size from the tablespace header page.
+@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */
+dberr_t
+PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW
+{
+ m_zip_size = fsp_header_get_zip_size(page);
+
+ if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) {
+ return(DB_CORRUPTION);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+UNIV_INTERN
+void
+fil_delete_file(
+/*============*/
+ const char* ibd_name) /*!< in: filepath of the ibd
+ tablespace */
+{
+ /* Force a delete of any stale .ibd files that are lying around. */
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name);
+
+ os_file_delete_if_exists(innodb_file_data_key, ibd_name);
+
+ char* cfg_name = fil_make_cfg_name(ibd_name);
+
+ os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+ mem_free(cfg_name);
+}
+
+/**
+Iterate over all the spaces in the space list and fetch the
+tablespace names. Returns a copy of each name; the caller must free
+the copies using delete[].
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fil_get_space_names(
+/*================*/
+ space_name_list_t& space_name_list)
+ /*!< in/out: List to append to */
+{
+ fil_space_t* space;
+ dberr_t err = DB_SUCCESS;
+
+ mutex_enter(&fil_system->mutex);
+
+ for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+ space != NULL;
+ space = UT_LIST_GET_NEXT(space_list, space)) {
+
+ if (space->purpose == FIL_TABLESPACE) {
+ ulint len;
+ char* name;
+
+ len = strlen(space->name);
+ name = new(std::nothrow) char[len + 1];
+
+ if (name == 0) {
+ /* Caller to free elements allocated so far. */
+ err = DB_OUT_OF_MEMORY;
+ break;
+ }
+
+ memcpy(name, space->name, len);
+ name[len] = 0;
+
+ space_name_list.push_back(name);
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(err);
+}
+
+/****************************************************************//**
+Generate redo logs for swapping two .ibd files */
+UNIV_INTERN
+void
+fil_mtr_rename_log(
+/*===============*/
+ ulint old_space_id, /*!< in: tablespace id of the old
+ table. */
+ const char* old_name, /*!< in: old table name */
+ ulint new_space_id, /*!< in: tablespace id of the new
+ table */
+ const char* new_name, /*!< in: new table name */
+ const char* tmp_name, /*!< in: temp table name used while
+ swapping */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ if (old_space_id != TRX_SYS_SPACE) {
+ fil_op_write_log(MLOG_FILE_RENAME, old_space_id,
+ 0, 0, old_name, tmp_name, mtr);
+ }
+
+ if (new_space_id != TRX_SYS_SPACE) {
+ fil_op_write_log(MLOG_FILE_RENAME, new_space_id,
+ 0, 0, new_name, old_name, mtr);
+ }
+}
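+
+/* Illustration: for a table rebuild that swaps the old .ibd file with
+its rebuilt copy, the two records written above describe the rename
+cycle old_name -> tmp_name followed by new_name -> old_name, so that
+crash recovery can redo the swap consistently. */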
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
new file mode 100644
index 00000000000..d1bb22ed7a9
--- /dev/null
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -0,0 +1,4123 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fsp/fsp0fsp.cc
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+
+#ifdef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "page0page.h"
+#include "page0zip.h"
+#ifdef UNIV_HOTBACKUP
+# include "fut0lst.h"
+#else /* UNIV_HOTBACKUP */
+# include "sync0sync.h"
+# include "fut0fut.h"
+# include "srv0srv.h"
+# include "ibuf0ibuf.h"
+# include "btr0btr.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "log0log.h"
+#endif /* UNIV_HOTBACKUP */
+#include "dict0mem.h"
+#include "srv0start.h"
+
+
+#ifndef UNIV_HOTBACKUP
+/** Flag to indicate if we have printed the tablespace full error. */
+static ibool fsp_tbs_full_error_printed = FALSE;
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. */
+static
+void
+fsp_free_extent(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Frees an extent of a segment to the space free list. */
+static
+void
+fseg_free_extent(
+/*=============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how
+many pages are currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ fseg_inode_t* header, /*!< in: segment inode */
+ ulint* used, /*!< out: number of pages used (not
+ more than reserved) */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/********************************************************************//**
+Marks a page used. The page must reside within the extents of the given
+segment. */
+static __attribute__((nonnull))
+void
+fseg_mark_page_used(
+/*================*/
+ fseg_inode_t* seg_inode,/*!< in: segment inode */
+ ulint page, /*!< in: page offset */
+ xdes_t* descr, /*!< in: extent descriptor */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Returns the first extent descriptor for a segment. We think of the extent
+lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE.
+@return the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Puts new extents to the free list if
+there are free extents above the free limit. If an extent happens
+to contain an extent descriptor page, the extent is put to
+the FSP_FREE_FRAG list with the page marked as used. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+ ibool init_space, /*!< in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in/out: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ UNIV_COLD __attribute__((nonnull));
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+/*=====================*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* seg_inode, /*!< in/out: segment inode */
+ ulint hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized.
+ If init_mtr!=mtr, but the page is already
+ latched in mtr, do not initialize the page. */
+ __attribute__((warn_unused_result, nonnull));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+ page_t* page) /*!< in: header page (page 0 in the tablespace) */
+{
+ return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets a pointer to the space header and x-locks its page.
+@return pointer to the space header, page x-locked */
+UNIV_INLINE
+fsp_header_t*
+fsp_get_space_header(
+/*=================*/
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+ fsp_header_t* header;
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_ad(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(id || !zip_size);
+
+ block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr);
+ header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header));
+ ut_ad(zip_size == fsp_flags_get_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + header)));
+ return(header);
+}
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+xdes_mtr_get_bit(
+/*=============*/
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+ return(xdes_get_bit(descr, bit, offset));
+}
+
+/**********************************************************************//**
+Sets a descriptor bit of a page. */
+UNIV_INLINE
+void
+xdes_set_bit(
+/*=========*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ ibool val, /*!< in: bit value */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint index;
+ ulint byte_index;
+ ulint bit_index;
+ ulint descr_byte;
+
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
+ ut_ad(offset < FSP_EXTENT_SIZE);
+
+ index = bit + XDES_BITS_PER_PAGE * offset;
+
+ byte_index = index / 8;
+ bit_index = index % 8;
+
+ descr_byte = mtr_read_ulint(descr + XDES_BITMAP + byte_index,
+ MLOG_1BYTE, mtr);
+ descr_byte = ut_bit_set_nth(descr_byte, bit_index, val);
+
+ mlog_write_ulint(descr + XDES_BITMAP + byte_index, descr_byte,
+ MLOG_1BYTE, mtr);
+}
+
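+/* Worked example of the bitmap arithmetic above, assuming
+XDES_BITS_PER_PAGE == 2 (one free bit and one clean bit per page):
+setting XDES_FREE_BIT (bit 0) of the page at offset 5 in the extent
+gives index = 0 + 2 * 5 = 10, hence byte_index = 10 / 8 = 1 and
+bit_index = 10 % 8 = 2, i.e. bit 2 of the second bitmap byte. */
+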
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. Starts from hint
+and scans upward; at the end of the extent the search is wrapped to
+the start of the extent.
+@return bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit(
+/*==========*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ibool val, /*!< in: desired bit value */
+ ulint hint, /*!< in: hint of which bit position would
+ be desirable */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(val <= TRUE);
+ ut_ad(hint < FSP_EXTENT_SIZE);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (i = hint; i < FSP_EXTENT_SIZE; i++) {
+ if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ for (i = 0; i < hint; i++) {
+ if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. Scans the extent in
+a direction opposite to xdes_find_bit.
+@return bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit_downward(
+/*===================*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ibool val, /*!< in: desired bit value */
+ ulint hint, /*!< in: hint of which bit position would
+ be desirable */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(val <= TRUE);
+ ut_ad(hint < FSP_EXTENT_SIZE);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (i = hint + 1; i > 0; i--) {
+ if (val == xdes_mtr_get_bit(descr, bit, i - 1, mtr)) {
+
+ return(i - 1);
+ }
+ }
+
+ for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) {
+ if (val == xdes_mtr_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns the number of used pages in a descriptor.
+@return number of pages used */
+UNIV_INLINE
+ulint
+xdes_get_n_used(
+/*============*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint count = 0;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (ulint i = 0; i < FSP_EXTENT_SIZE; ++i) {
+ if (FALSE == xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Returns true if extent contains no used pages.
+@return TRUE if totally free */
+UNIV_INLINE
+ibool
+xdes_is_free(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ if (0 == xdes_get_n_used(descr, mtr)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Returns true if extent contains no free pages.
+@return TRUE if full */
+UNIV_INLINE
+ibool
+xdes_is_full(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Sets the state of an xdes. */
+UNIV_INLINE
+void
+xdes_set_state(
+/*===========*/
+ xdes_t* descr, /*!< in/out: descriptor */
+ ulint state, /*!< in: state to set */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(descr && mtr);
+ ut_ad(state >= XDES_FREE);
+ ut_ad(state <= XDES_FSEG);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(descr + XDES_STATE, state, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint state;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+ state = mtr_read_ulint(descr + XDES_STATE, MLOG_4BYTES, mtr);
+ ut_ad(state - 1 < XDES_FSEG);
+ return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+UNIV_INLINE
+void
+xdes_init(
+/*======*/
+ xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0);
+
+ for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) {
+ mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr);
+ }
+
+ xdes_set_state(descr, XDES_FREE, mtr);
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page. The page where the extent
+descriptor resides is x-locked. This function no longer extends the data
+file.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset is >= the free limit */
+UNIV_INLINE __attribute__((nonnull, warn_unused_result))
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+/*===============================*/
+ fsp_header_t* sp_header, /*!< in/out: space header, x-latched
+ in mtr */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset; if equal
+ to the free limit, we try to
+ add new extents to the space
+ free list */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint limit;
+ ulint size;
+ ulint zip_size;
+ ulint descr_page_no;
+ page_t* descr_page;
+
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+ /* Read free limit and space size */
+ limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
+ size = mach_read_from_4(sp_header + FSP_SIZE);
+ zip_size = fsp_flags_get_zip_size(
+ mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
+
+ if ((offset >= size) || (offset >= limit)) {
+ return(NULL);
+ }
+
+ descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+ if (descr_page_no == 0) {
+ /* It is on the space header page */
+
+ descr_page = page_align(sp_header);
+ } else {
+ buf_block_t* block;
+
+ block = buf_page_get(space, zip_size, descr_page_no,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ descr_page = buf_block_get_frame(block);
+ }
+
+ return(descr_page + XDES_ARR_OFFSET
+ + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page. The page where the
+extent descriptor resides is x-locked. If the page offset is equal to
+the free limit of the space, adds new extents from above the free limit
+to the space free list, unless the free limit equals the space size.
+This is necessary to make the descriptor defined, as descriptors are
+uninitialized above the free limit.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds the free limit */
+static __attribute__((nonnull, warn_unused_result))
+xdes_t*
+xdes_get_descriptor(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page offset; if equal to the free limit,
+ we try to add new extents to the space free list */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+ fsp_header_t* sp_header;
+
+ block = buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+ return(xdes_get_descriptor_with_space_hdr(sp_header, space, offset,
+ mtr));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor if the file address
+of the descriptor list node is known. The page where the
+extent descriptor resides is x-locked.
+@return pointer to the extent descriptor */
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t lst_node,/*!< in: file address of the list node
+ contained in the descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ xdes_t* descr;
+
+ ut_ad(mtr);
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+ descr = fut_get_ptr(space, zip_size, lst_node, RW_X_LATCH, mtr)
+ - XDES_FLST_NODE;
+
+ return(descr);
+}
+
+/********************************************************************//**
+Returns the page offset of the first page in the extent described by a descriptor.
+@return offset of the first page in extent */
+UNIV_INLINE
+ulint
+xdes_get_offset(
+/*============*/
+ const xdes_t* descr) /*!< in: extent descriptor */
+{
+ ut_ad(descr);
+
+ return(page_get_page_no(page_align(descr))
+ + ((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE)
+ * FSP_EXTENT_SIZE);
+}
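+
+/* Worked example of the two mappings above, assuming 16KB pages and
+FSP_EXTENT_SIZE == 64: page offset 40000 is covered by the descriptor
+page at offset (40000 / 16384) * 16384 = 32768, in XDES array slot
+(40000 - 32768) / 64 = 113; conversely, xdes_get_offset() maps that
+slot back to the extent start 32768 + 113 * 64 = 40000. */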
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page_low(
+/*===================*/
+ buf_block_t* block) /*!< in: pointer to a page */
+{
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+ block->check_index_page_at_flush = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (page_zip) {
+ memset(page, 0, UNIV_PAGE_SIZE);
+ memset(page_zip->data, 0, page_zip_get_size(page_zip));
+ mach_write_to_4(page + FIL_PAGE_OFFSET,
+ buf_block_get_page_no(block));
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ buf_block_get_space(block));
+ memcpy(page_zip->data + FIL_PAGE_OFFSET,
+ page + FIL_PAGE_OFFSET, 4);
+ memcpy(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ return;
+ }
+
+ memset(page, 0, UNIV_PAGE_SIZE);
+ mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ buf_block_get_space(block));
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page(
+/*===============*/
+ buf_block_t* block, /*!< in: pointer to a page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_init_file_page_low(block);
+
+ mlog_write_initial_log_record(buf_block_get_frame(block),
+ MLOG_INIT_FILE_PAGE, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ buf_block_t* block) /*!< in: block or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (block) {
+ fsp_init_file_page_low(block);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Initializes the fsp system. */
+UNIV_INTERN
+void
+fsp_init(void)
+/*==========*/
+{
+	/* UNIV_PAGE_SIZE must be a multiple of FSP_EXTENT_SIZE */
+ ut_a(0 == (UNIV_PAGE_SIZE % FSP_EXTENT_SIZE));
+ ut_a(UNIV_PAGE_SIZE);
+
+#if UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX
+# error "UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX != 0"
+#endif
+#if UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN
+# error "UNIV_ZIP_SIZE_MIN % FSP_EXTENT_SIZE_MIN != 0"
+#endif
+
+ /* Does nothing at the moment */
+}
+
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */
+{
+ ut_a(fsp_flags_is_valid(flags));
+
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
+ space_id);
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
+ flags);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates
+the insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint size, /*!< in: current size in blocks */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* header;
+ buf_block_t* block;
+ page_t* page;
+ ulint flags;
+ ulint zip_size;
+
+ ut_ad(mtr);
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+ zip_size = fsp_flags_get_zip_size(flags);
+ block = buf_page_create(space, 0, zip_size, mtr);
+ buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ /* The prior contents of the file page should be ignored */
+
+ fsp_init_file_page(block, mtr);
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
+ MLOG_2BYTES, mtr);
+
+ header = FSP_HEADER_OFFSET + page;
+
+ mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_SPACE_FLAGS, flags,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr);
+
+ flst_init(header + FSP_FREE, mtr);
+ flst_init(header + FSP_FREE_FRAG, mtr);
+ flst_init(header + FSP_FULL_FRAG, mtr);
+ flst_init(header + FSP_SEG_INODES_FULL, mtr);
+ flst_init(header + FSP_SEG_INODES_FREE, mtr);
+
+ mlog_write_ull(header + FSP_SEG_ID, 1, mtr);
+ if (space == 0) {
+ fsp_fill_free_list(FALSE, space, header, mtr);
+ btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
+ 0, 0, DICT_IBUF_ID_MIN + space,
+ dict_ind_redundant, mtr);
+ } else {
+ fsp_fill_free_list(TRUE, space, header, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ulint fsp_id;
+ ulint id;
+
+ fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+
+ id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ DBUG_EXECUTE_IF("fsp_header_get_space_id_failure",
+ id = ULINT_UNDEFINED;);
+
+ if (id != fsp_id) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+			"Space id in fsp header %lu, but in the page header "
+ "%lu", fsp_id, id);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ return(id);
+}
+
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ut_ad(!page_offset(page));
+
+ return(mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page));
+}
+
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ulint flags = fsp_header_get_flags(page);
+
+ return(fsp_flags_get_zip_size(flags));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint size_inc, /*!< in: size increment in pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* header;
+ ulint size;
+ ulint flags;
+
+ ut_ad(mtr);
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+ header = fsp_get_space_header(space,
+ fsp_flags_get_zip_size(flags),
+ mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES,
+ mtr);
+}
+
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void)
+/*================================*/
+{
+ fsp_header_t* header;
+ ulint size;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+ header = fsp_get_space_header(0, 0, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(size);
+}
+
+/***********************************************************************//**
+Tries to extend a single-table tablespace so that a page would fit in the
+data file.
+@return TRUE if success */
+static UNIV_COLD __attribute__((nonnull, warn_unused_result))
+ibool
+fsp_try_extend_data_file_with_pages(
+/*================================*/
+ ulint space, /*!< in: space */
+ ulint page_no, /*!< in: page number */
+ fsp_header_t* header, /*!< in/out: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ibool success;
+ ulint actual_size;
+ ulint size;
+
+ ut_a(space != 0);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ ut_a(page_no >= size);
+
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ page_no + 1);
+ /* actual_size now has the space size in pages; it may be less than
+ we wanted if we ran out of disk space */
+
+ mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr);
+
+ return(success);
+}
+
+/***********************************************************************//**
+Tries to extend the last data file of a tablespace if it is auto-extending.
+@return FALSE if not auto-extending */
+static UNIV_COLD __attribute__((nonnull))
+ibool
+fsp_try_extend_data_file(
+/*=====================*/
+ ulint* actual_increase,/*!< out: actual increase in pages, where
+ we measure the tablespace size from
+ what the header field says; it may be
+					the actual file size rounded down to
+					a full megabyte */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in/out: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint size;
+ ulint zip_size;
+ ulint new_size;
+ ulint old_size;
+ ulint size_increase;
+ ulint actual_size;
+ ibool success;
+
+ *actual_increase = 0;
+
+ if (space == 0 && !srv_auto_extend_last_data_file) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to FALSE as dealing with this
+ error requires server restart. */
+ if (fsp_tbs_full_error_printed == FALSE) {
+ fprintf(stderr,
+ "InnoDB: Error: Data file(s) ran"
+ " out of space.\n"
+ "Please add another data file or"
+ " use \'autoextend\' for the last"
+ " data file.\n");
+ fsp_tbs_full_error_printed = TRUE;
+ }
+ return(FALSE);
+ }
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ zip_size = fsp_flags_get_zip_size(
+ mach_read_from_4(header + FSP_SPACE_FLAGS));
+
+ old_size = size;
+
+ if (space == 0) {
+ if (!srv_last_file_size_max) {
+ size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ } else {
+ if (srv_last_file_size_max
+ < srv_data_file_sizes[srv_n_data_files - 1]) {
+
+ fprintf(stderr,
+ "InnoDB: Error: Last data file size"
+ " is %lu, max size allowed %lu\n",
+ (ulong) srv_data_file_sizes[
+ srv_n_data_files - 1],
+ (ulong) srv_last_file_size_max);
+ }
+
+ size_increase = srv_last_file_size_max
+ - srv_data_file_sizes[srv_n_data_files - 1];
+ if (size_increase > SRV_AUTO_EXTEND_INCREMENT) {
+ size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ }
+ }
+ } else {
+		/* We extend single-table tablespaces one extent at a
+		time at first, but by more for bigger tablespaces. It
+		is not enough to always extend by one extent, because
+		some extents are frag page extents. */
+ ulint extent_size; /*!< one megabyte, in pages */
+
+ if (!zip_size) {
+ extent_size = FSP_EXTENT_SIZE;
+ } else {
+ extent_size = FSP_EXTENT_SIZE
+ * UNIV_PAGE_SIZE / zip_size;
+ }
+
+ if (size < extent_size) {
+ /* Let us first extend the file to extent_size */
+ success = fsp_try_extend_data_file_with_pages(
+ space, extent_size - 1, header, mtr);
+ if (!success) {
+ new_size = mtr_read_ulint(header + FSP_SIZE,
+ MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
+
+ return(FALSE);
+ }
+
+ size = extent_size;
+ }
+
+ if (size < 32 * extent_size) {
+ size_increase = extent_size;
+ } else {
+ /* Below in fsp_fill_free_list() we assume
+ that we add at most FSP_FREE_ADD extents at
+ a time */
+ size_increase = FSP_FREE_ADD * extent_size;
+ }
+ }
+
+ if (size_increase == 0) {
+
+ return(TRUE);
+ }
+
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ size + size_increase);
+ if (!success) {
+
+		return(FALSE);
+ }
+
+ /* We ignore any fragments of a full megabyte when storing the size
+ to the space header */
+
+ if (!zip_size) {
+ new_size = ut_calc_align_down(actual_size,
+ (1024 * 1024) / UNIV_PAGE_SIZE);
+ } else {
+ new_size = ut_calc_align_down(actual_size,
+ (1024 * 1024) / zip_size);
+ }
+ mlog_write_ulint(header + FSP_SIZE, new_size, MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
+
+ return(TRUE);
+}
+
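+/* A worked example of the growth policy above for a single-table
+tablespace with 16KB uncompressed pages, where extent_size ==
+FSP_EXTENT_SIZE == 64 pages == 1MB, and assuming FSP_FREE_ADD == 4:
+a file below 1MB is first extended to exactly one extent; until it
+reaches 32 extents (32MB) it then grows by one extent (1MB) at a time;
+beyond that it grows by FSP_FREE_ADD extents (4MB) at a time. */
+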
+/**********************************************************************//**
+Puts new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+ ibool init_space, /*!< in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in/out: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint limit;
+ ulint size;
+ ulint zip_size;
+ xdes_t* descr;
+ ulint count = 0;
+ ulint frag_n_used;
+ ulint actual_increase;
+ ulint i;
+ mtr_t ibuf_mtr;
+
+ ut_ad(header && mtr);
+ ut_ad(page_offset(header) == FSP_HEADER_OFFSET);
+
+ /* Check if we can fill free list from above the free list limit */
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
+
+ zip_size = fsp_flags_get_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + header));
+ ut_a(ut_is_2pow(zip_size));
+ ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
+
+ if (space == 0 && srv_auto_extend_last_data_file
+ && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+ /* Try to increase the last data file size */
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ }
+
+ if (space != 0 && !init_space
+ && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+ /* Try to increase the .ibd file size */
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ }
+
+ i = limit;
+
+ while ((init_space && i < 1)
+ || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) {
+
+ ibool init_xdes;
+ if (zip_size) {
+ init_xdes = ut_2pow_remainder(i, zip_size) == 0;
+ } else {
+ init_xdes = ut_2pow_remainder(i, UNIV_PAGE_SIZE) == 0;
+ }
+
+ mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
+ MLOG_4BYTES, mtr);
+
+ if (UNIV_UNLIKELY(init_xdes)) {
+
+ buf_block_t* block;
+
+ /* We are going to initialize a new descriptor page
+ and a new ibuf bitmap page: the prior contents of the
+ pages should be ignored. */
+
+ if (i > 0) {
+ block = buf_page_create(
+ space, i, zip_size, mtr);
+ buf_page_get(space, zip_size, i,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block,
+ SYNC_FSP_PAGE);
+
+ fsp_init_file_page(block, mtr);
+ mlog_write_ulint(buf_block_get_frame(block)
+ + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_XDES,
+ MLOG_2BYTES, mtr);
+ }
+
+ /* Initialize the ibuf bitmap page in a separate
+ mini-transaction because it is low in the latching
+ order, and we must be able to release its latch
+ before returning from the fsp routine */
+
+ mtr_start(&ibuf_mtr);
+
+ block = buf_page_create(space,
+ i + FSP_IBUF_BITMAP_OFFSET,
+ zip_size, &ibuf_mtr);
+ buf_page_get(space, zip_size,
+ i + FSP_IBUF_BITMAP_OFFSET,
+ RW_X_LATCH, &ibuf_mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ fsp_init_file_page(block, &ibuf_mtr);
+
+ ibuf_bitmap_page_init(block, &ibuf_mtr);
+
+ mtr_commit(&ibuf_mtr);
+ }
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, i,
+ mtr);
+ xdes_init(descr, mtr);
+
+ if (UNIV_UNLIKELY(init_xdes)) {
+
+ /* The first page in the extent is a descriptor page
+ and the second is an ibuf bitmap page: mark them
+ used */
+
+ xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr);
+ xdes_set_bit(descr, XDES_FREE_BIT,
+ FSP_IBUF_BITMAP_OFFSET, FALSE, mtr);
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+
+ flst_add_last(header + FSP_FREE_FRAG,
+ descr + XDES_FLST_NODE, mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used + 2, MLOG_4BYTES, mtr);
+ } else {
+ flst_add_last(header + FSP_FREE,
+ descr + XDES_FLST_NODE, mtr);
+ count++;
+ }
+
+ i += FSP_EXTENT_SIZE;
+ }
+}
+
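+/* Illustration of the init_xdes case above: with 16KB uncompressed
+pages, every page offset that is a multiple of UNIV_PAGE_SIZE (0, 16384,
+32768, ...) starts a new descriptor-page group. The first page of the
+group holds the XDES array, and the page at FSP_IBUF_BITMAP_OFFSET
+within the group holds the ibuf bitmap; both are marked used, so their
+extent goes on FSP_FREE_FRAG rather than FSP_FREE. */
+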
+/**********************************************************************//**
+Allocates a new free extent.
+@return extent descriptor, NULL if cannot be allocated */
+static
+xdes_t*
+fsp_alloc_free_extent(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint of which extent would be desirable: any
+ page offset in the extent goes; the hint must not
+ be > FSP_FREE_LIMIT */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* header;
+ fil_addr_t first;
+ xdes_t* descr;
+
+ ut_ad(mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+ if (descr && (xdes_get_state(descr, mtr) == XDES_FREE)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Take the first extent in the free list */
+ first = flst_get_first(header + FSP_FREE, mtr);
+
+ if (fil_addr_is_null(first)) {
+ fsp_fill_free_list(FALSE, space, header, mtr);
+
+ first = flst_get_first(header + FSP_FREE, mtr);
+ }
+
+ if (fil_addr_is_null(first)) {
+
+ return(NULL); /* No free extents left */
+ }
+
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+ }
+
+ flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
+
+ return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a space. */
+static __attribute__((nonnull))
+void
+fsp_alloc_from_free_frag(
+/*=====================*/
+ fsp_header_t* header, /*!< in/out: tablespace header */
+ xdes_t* descr, /*!< in/out: extent descriptor */
+ ulint bit, /*!< in: slot to allocate in the extent */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint frag_n_used;
+
+ ut_ad(xdes_get_state(descr, mtr) == XDES_FREE_FRAG);
+ ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT, bit, mtr));
+ xdes_set_bit(descr, XDES_FREE_BIT, bit, FALSE, mtr);
+
+ /* Update the FRAG_N_USED field */
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ mtr);
+ frag_n_used++;
+ mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+ mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* The fragment is full: move it to another list */
+ flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+ flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+ mtr);
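+		/* Pages of a full extent are no longer counted in
+		FSP_FRAG_N_USED: subtract the whole extent, undoing the
+		per-page increments made while it filled up. */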
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+ mtr);
+ }
+}
+
+/**********************************************************************//**
+Gets a buffer block for an allocated page.
+
+NOTE: If init_mtr != mtr, the block will only be initialized if it was
+not previously x-latched. It is assumed that the block has been
+x-latched only by mtr, and freed in mtr in that case.
+
+@return block, initialized if init_mtr==mtr
+or rw_lock_x_lock_count(&block->lock) == 1 */
+static
+buf_block_t*
+fsp_page_create(
+/*============*/
+ ulint space, /*!< in: space id of the allocated page */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the allocated page */
+ mtr_t* mtr, /*!< in: mini-transaction of the allocation */
+ mtr_t* init_mtr) /*!< in: mini-transaction for initializing
+ the page */
+{
+ buf_block_t* block
+ = buf_page_create(space, page_no, zip_size, init_mtr);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)
+ == rw_lock_own(&block->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Mimic buf_page_get(), but avoid the buf_pool->page_hash lookup. */
+ rw_lock_x_lock(&block->lock);
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ mutex_exit(&block->mutex);
+ mtr_memo_push(init_mtr, block, MTR_MEMO_PAGE_X_FIX);
+
+ if (init_mtr == mtr
+ || rw_lock_get_x_lock_count(&block->lock) == 1) {
+
+ /* Initialize the page, unless it was already
+ X-latched in mtr. (In this case, we would want to
+ allocate another page that has not been freed in mtr.) */
+ ut_ad(init_mtr == mtr
+ || !mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ fsp_init_file_page(block, init_mtr);
+ }
+
+ return(block);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a space. The page is marked as used.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+static __attribute__((nonnull, warn_unused_result))
+buf_block_t*
+fsp_alloc_free_page(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mini-transaction in which the
+ page should be initialized
+ (may be the same as mtr) */
+{
+ fsp_header_t* header;
+ fil_addr_t first;
+ xdes_t* descr;
+ ulint free;
+ ulint page_no;
+ ulint space_size;
+
+ ut_ad(mtr);
+ ut_ad(init_mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ /* Get the hinted descriptor */
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+ if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Else take the first extent in free_frag list */
+ first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+
+ if (fil_addr_is_null(first)) {
+ /* There are no partially full fragments: allocate
+ a free extent and add it to the FREE_FRAG list. NOTE
+ that the allocation may have as a side-effect that an
+ extent containing a descriptor page is added to the
+			FREE_FRAG list. But we will allocate our page from
+			the free extent anyway. */
+
+ descr = fsp_alloc_free_extent(space, zip_size,
+ hint, mtr);
+
+ if (descr == NULL) {
+ /* No free space left */
+
+ return(NULL);
+ }
+
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(header + FSP_FREE_FRAG,
+ descr + XDES_FLST_NODE, mtr);
+ } else {
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ first, mtr);
+ }
+
+ /* Reset the hint */
+ hint = 0;
+ }
+
+ /* Now we have in descr an extent with at least one free page. Look
+ for a free page in the extent. */
+
+ free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
+ hint % FSP_EXTENT_SIZE, mtr);
+ if (free == ULINT_UNDEFINED) {
+
+ ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
+ putc('\n', stderr);
+
+ ut_error;
+ }
+
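+	/* E.g. with the default 64-page extents, an extent starting at
+	page 128 whose lowest free bit is 5 yields page_no 128 + 5 = 133. */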
+ page_no = xdes_get_offset(descr) + free;
+
+ space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ if (space_size <= page_no) {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+
+ ut_a(space != 0);
+ if (page_no >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to extend a"
+ " single-table tablespace %lu\n"
+ "InnoDB: by single page(s) though the"
+ " space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size,
+ (ulong) page_no);
+ return(NULL);
+ }
+ if (!fsp_try_extend_data_file_with_pages(space, page_no,
+ header, mtr)) {
+ /* No disk space left */
+ return(NULL);
+ }
+ }
+
+ fsp_alloc_from_free_frag(header, descr, free, mtr);
+ return(fsp_page_create(space, zip_size, page_no, mtr, init_mtr));
+}
+
+/**********************************************************************//**
+Frees a single page of a space. The page is marked as free and clean. */
+static
+void
+fsp_free_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* header;
+ xdes_t* descr;
+ ulint state;
+ ulint frag_n_used;
+
+ ut_ad(mtr);
+
+ /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
+
+ state = xdes_get_state(descr, mtr);
+
+ if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) {
+ fprintf(stderr,
+ "InnoDB: Error: File space extent descriptor"
+ " of page %lu has state %lu\n",
+ (ulong) page,
+ (ulong) state);
+ fputs("InnoDB: Dump of descriptor: ", stderr);
+ ut_print_buf(stderr, ((byte*) descr) - 50, 200);
+ putc('\n', stderr);
+ /* Crash in debug version, so that we get a core dump
+ of this corruption. */
+ ut_ad(0);
+
+ if (state == XDES_FREE) {
+			/* We add some fault tolerance here: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ ut_error;
+ }
+
+ if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+ page % FSP_EXTENT_SIZE, mtr)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: File space extent descriptor"
+ " of page %lu says it is free\n"
+ "InnoDB: Dump of descriptor: ", (ulong) page);
+ ut_print_buf(stderr, ((byte*) descr) - 50, 200);
+ putc('\n', stderr);
+ /* Crash in debug version, so that we get a core dump
+ of this corruption. */
+ ut_ad(0);
+
+		/* We add some fault tolerance here: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+ xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ mtr);
+ if (state == XDES_FULL_FRAG) {
+ /* The fragment was full: move it to another list */
+ flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
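+		/* The extent re-enters the FSP_FRAG_N_USED accounting
+		with all of its pages counted except the one being freed,
+		mirroring the subtraction in fsp_alloc_from_free_frag(). */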
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used + FSP_EXTENT_SIZE - 1,
+ MLOG_4BYTES, mtr);
+ } else {
+ ut_a(frag_n_used > 0);
+ mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1,
+ MLOG_4BYTES, mtr);
+ }
+
+ if (xdes_is_free(descr, mtr)) {
+ /* The extent has become free: move it to another list */
+ flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ fsp_free_extent(space, zip_size, page, mtr);
+ }
+
+ mtr->n_freed_pages++;
+}
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. */
+static
+void
+fsp_free_extent(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* header;
+ xdes_t* descr;
+
+ ut_ad(mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
+
+ if (xdes_get_state(descr, mtr) == XDES_FREE) {
+
+ ut_print_buf(stderr, (byte*) descr - 500, 1000);
+ putc('\n', stderr);
+
+ ut_error;
+ }
+
+ xdes_init(descr, mtr);
+
+ flst_add_last(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
+}
+
+/**********************************************************************//**
+Returns the nth inode slot on an inode page.
+@return segment inode */
+UNIV_INLINE
+fseg_inode_t*
+fsp_seg_inode_page_get_nth_inode(
+/*=============================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint i, /*!< in: inode index on page */
+ ulint zip_size __attribute__((unused)),
+ /*!< in: compressed page size, or 0 */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in/out: mini-transaction */
+{
+ ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+
+ return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i);
+}
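+
+/* With the default 16 KB page size, FSP_SEG_INODES_PER_PAGE evaluates
+to 85, each inode being FSEG_INODE_SIZE = 192 bytes. */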
+
+/**********************************************************************//**
+Looks for a used segment inode on a segment inode page.
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(
+/*=========================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+ fseg_inode_t* inode;
+
+ for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ inode = fsp_seg_inode_page_get_nth_inode(
+ page, i, zip_size, mtr);
+
+ if (mach_read_from_8(inode + FSEG_ID)) {
+ /* This is used */
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Looks for an unused segment inode on a segment inode page.
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_free(
+/*=========================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint i, /*!< in: search forward starting from this index */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ fseg_inode_t* inode;
+
+ inode = fsp_seg_inode_page_get_nth_inode(
+ page, i, zip_size, mtr);
+
+ if (!mach_read_from_8(inode + FSEG_ID)) {
+ /* This is unused */
+ return(i);
+ }
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode page.
+@return TRUE if the page could be allocated */
+static
+ibool
+fsp_alloc_seg_inode_page(
+/*=====================*/
+ fsp_header_t* space_header, /*!< in: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fseg_inode_t* inode;
+ buf_block_t* block;
+ page_t* page;
+ ulint space;
+ ulint zip_size;
+
+ ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+ space = page_get_space_id(page_align(space_header));
+
+ zip_size = fsp_flags_get_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+
+ block = fsp_alloc_free_page(space, zip_size, 0, mtr, mtr);
+
+ if (block == NULL) {
+
+ return(FALSE);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+
+ block->check_index_page_at_flush = FALSE;
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE,
+ MLOG_2BYTES, mtr);
+
+ for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ inode = fsp_seg_inode_page_get_nth_inode(
+ page, i, zip_size, mtr);
+
+ mlog_write_ull(inode + FSEG_ID, 0, mtr);
+ }
+
+ flst_add_last(
+ space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode.
+@return segment inode, or NULL if not enough space */
+static
+fseg_inode_t*
+fsp_alloc_seg_inode(
+/*================*/
+ fsp_header_t* space_header, /*!< in: space header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint page_no;
+ buf_block_t* block;
+ page_t* page;
+ fseg_inode_t* inode;
+ ibool success;
+ ulint zip_size;
+ ulint n;
+
+ ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+ if (flst_get_len(space_header + FSP_SEG_INODES_FREE, mtr) == 0) {
+ /* Allocate a new segment inode page */
+
+ success = fsp_alloc_seg_inode_page(space_header, mtr);
+
+ if (!success) {
+
+ return(NULL);
+ }
+ }
+
+ page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page;
+
+ zip_size = fsp_flags_get_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+ block = buf_page_get(page_get_space_id(page_align(space_header)),
+ zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ page = buf_block_get_frame(block);
+
+ n = fsp_seg_inode_page_find_free(page, 0, zip_size, mtr);
+
+ ut_a(n != ULINT_UNDEFINED);
+
+ inode = fsp_seg_inode_page_get_nth_inode(page, n, zip_size, mtr);
+
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1,
+ zip_size, mtr)) {
+ /* There are no other unused headers left on the page: move it
+ to another list */
+
+ flst_remove(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ flst_add_last(space_header + FSP_SEG_INODES_FULL,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ ut_ad(!mach_read_from_8(inode + FSEG_ID)
+ || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ return(inode);
+}
+
+/**********************************************************************//**
+Frees a file segment inode. */
+static
+void
+fsp_free_seg_inode(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* page;
+ fsp_header_t* space_header;
+
+ page = page_align(inode);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_free(page, 0, zip_size, mtr)) {
+
+ /* Move the page to another list */
+
+ flst_remove(space_header + FSP_SEG_INODES_FULL,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ flst_add_last(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ mlog_write_ull(inode + FSEG_ID, 0, mtr);
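+	/* Poison the magic number so that any stale reference to this
+	freed inode trips the FSEG_MAGIC_N assertions elsewhere. */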
+ mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr);
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_used(page, zip_size, mtr)) {
+
+ /* There are no other used headers left on the page: free it */
+
+ flst_remove(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ fsp_free_page(space, zip_size, page_get_page_no(page), mtr);
+ }
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched.
+@return segment inode, page x-latched; NULL if the inode is free */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+/*===============*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fil_addr_t inode_addr;
+ fseg_inode_t* inode;
+
+ inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
+ inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET);
+ ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE));
+
+ inode = fut_get_ptr(space, zip_size, inode_addr, RW_X_LATCH, mtr);
+
+ if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) {
+
+ inode = NULL;
+ } else {
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ }
+
+ return(inode);
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched.
+@return segment inode, page x-latched */
+static
+fseg_inode_t*
+fseg_inode_get(
+/*===========*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fseg_inode_t* inode
+ = fseg_inode_try_get(header, space, zip_size, mtr);
+ ut_a(inode);
+ return(inode);
+}
+
+/**********************************************************************//**
+Gets the page number from the nth fragment page slot.
+@return page number, FIL_NULL if not in use */
+UNIV_INLINE
+ulint
+fseg_get_nth_frag_page_no(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint n, /*!< in: slot index */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in/out: mini-transaction */
+{
+ ut_ad(inode && mtr);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ return(mach_read_from_4(inode + FSEG_FRAG_ARR
+ + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/**********************************************************************//**
+Sets the page number in the nth fragment page slot. */
+UNIV_INLINE
+void
+fseg_set_nth_frag_page_no(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint n, /*!< in: slot index */
+ ulint page_no,/*!< in: page number to set */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(inode && mtr);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ mlog_write_ulint(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+ ulint page_no;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(inode, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+ ulint page_no;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(
+ inode, FSEG_FRAG_ARR_N_SLOTS - i - 1, mtr);
+
+ if (page_no != FIL_NULL) {
+
+ return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Calculates reserved fragment page slots.
+@return number of fragment pages */
+static
+ulint
+fseg_get_n_frag_pages(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i, mtr)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched; NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+			the inode and the other for the segment); in that case
+			there is no need to do the check for this individual
+ operation */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint flags;
+ ulint zip_size;
+ fsp_header_t* space_header;
+ fseg_inode_t* inode;
+ ib_id_t seg_id;
+ buf_block_t* block = 0; /* remove warning */
+ fseg_header_t* header = 0; /* remove warning */
+ rw_lock_t* latch;
+ ibool success;
+ ulint n_reserved;
+ ulint i;
+
+ ut_ad(mtr);
+ ut_ad(byte_offset + FSEG_HEADER_SIZE
+ <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ if (page != 0) {
+ block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
+ header = byte_offset + buf_block_get_frame(block);
+ }
+
+ mtr_x_lock(latch, mtr);
+
+ if (rw_lock_get_x_lock_count(latch) == 1) {
+ /* This thread did not own the latch before this call: free
+ excess pages from the insert buffer free list */
+
+ if (space == IBUF_SPACE_ID) {
+ ibuf_free_excess_pages();
+ }
+ }
+
+ if (!has_done_reservation) {
+ success = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (!success) {
+ return(NULL);
+ }
+ }
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ inode = fsp_alloc_seg_inode(space_header, mtr);
+
+ if (inode == NULL) {
+
+ goto funct_exit;
+ }
+
+ /* Read the next segment id from space header and increment the
+ value in space header */
+
+ seg_id = mach_read_from_8(space_header + FSP_SEG_ID);
+
+ mlog_write_ull(space_header + FSP_SEG_ID, seg_id + 1, mtr);
+
+ mlog_write_ull(inode + FSEG_ID, seg_id, mtr);
+ mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr);
+
+ flst_init(inode + FSEG_FREE, mtr);
+ flst_init(inode + FSEG_NOT_FULL, mtr);
+ flst_init(inode + FSEG_FULL, mtr);
+
+ mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE,
+ MLOG_4BYTES, mtr);
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr);
+ }
+
+ if (page == 0) {
+ block = fseg_alloc_free_page_low(space, zip_size,
+ inode, 0, FSP_UP, mtr, mtr);
+
+ if (block == NULL) {
+
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ goto funct_exit;
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+
+ header = byte_offset + buf_block_get_frame(block);
+ mlog_write_ulint(buf_block_get_frame(block) + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr);
+ }
+
+ mlog_write_ulint(header + FSEG_HDR_OFFSET,
+ page_offset(inode), MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(header + FSEG_HDR_PAGE_NO,
+ page_get_page_no(page_align(inode)),
+ MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSEG_HDR_SPACE, space, MLOG_4BYTES, mtr);
+
+funct_exit:
+ if (!has_done_reservation) {
+
+ fil_space_release_free_extents(space, n_reserved);
+ }
+
+ return(block);
+}
+
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched; NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ return(fseg_create_general(space, page, byte_offset, FALSE, mtr));
+}
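+
+/* For example, btr_create() uses this wrapper with page = 0 to allocate
+a fresh root page for a new index tree (a sketch; see btr0btr.cc for the
+exact call):
+
+	block = fseg_create(space, 0,
+			    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
+*/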
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint* used, /*!< out: number of pages used (not
+ more than reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint ret;
+
+ ut_ad(inode && used && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+
+ *used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr)
+ + fseg_get_n_frag_pages(inode, mtr);
+
+ ret = fseg_get_n_frag_pages(inode, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr);
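+	/* E.g. a segment holding 3 fragment pages plus 1 FREE, 2 NOT_FULL
+	and 1 FULL extent reserves 3 + 64 * 4 = 259 pages with the default
+	64-page extents. */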
+
+ return(ret);
+}
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint ret;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ ret = fseg_n_reserved_pages_low(inode, used, mtr);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward. */
+static
+void
+fseg_fill_free_list(
+/*================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint which extent would be good as
+ the first extent */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ xdes_t* descr;
+ ulint i;
+ ib_id_t seg_id;
+ ulint reserved;
+ ulint used;
+
+ ut_ad(inode && mtr);
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
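+	/* With the definitions earlier in this file (FSEG_FREE_LIST_LIMIT
+	of 40 extents, 64-page extents by default), the segment must
+	already reserve at least 40 * 64 = 2560 pages before a free list
+	is maintained. */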
+ if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
+
+ /* The segment is too small to allow extents in free list */
+
+ return;
+ }
+
+ if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+ /* Free list is not empty */
+
+ return;
+ }
+
+ for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
+ descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+
+ if ((descr == NULL)
+ || (XDES_FREE != xdes_get_state(descr, mtr))) {
+
+ /* We cannot allocate the desired extent: stop */
+
+ return;
+ }
+
+ descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+ xdes_set_state(descr, XDES_FSEG, mtr);
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ mlog_write_ull(descr + XDES_ID, seg_id, mtr);
+
+ flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+ hint += FSP_EXTENT_SIZE;
+ }
+}
+
+/*********************************************************************//**
+Allocates a free extent for the segment: looks first in the free list of the
+segment, then tries to allocate from the space free list. NOTE that the extent
+returned still resides in the segment free list; it is not yet taken off it!
+@return extent descriptor, still in the segment free list;
+NULL if no extent could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+/*===================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ xdes_t* descr;
+ ib_id_t seg_id;
+ fil_addr_t first;
+
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+ /* Segment free list is not empty, allocate from it */
+
+ first = flst_get_first(inode + FSEG_FREE, mtr);
+
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+ } else {
+ /* Segment free list was empty, allocate from space */
+ descr = fsp_alloc_free_extent(space, zip_size, 0, mtr);
+
+ if (descr == NULL) {
+
+ return(NULL);
+ }
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+
+ xdes_set_state(descr, XDES_FSEG, mtr);
+ mlog_write_ull(descr + XDES_ID, seg_id, mtr);
+ flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(inode, space, zip_size,
+ xdes_get_offset(descr) + FSP_EXTENT_SIZE,
+ mtr);
+ }
+
+ return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+/*=====================*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* seg_inode, /*!< in/out: segment inode */
+ ulint hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized.
+ If init_mtr!=mtr, but the page is already
+ latched in mtr, do not initialize the page. */
+{
+ fsp_header_t* space_header;
+ ulint space_size;
+ ib_id_t seg_id;
+ ulint used;
+ ulint reserved;
+ xdes_t* descr; /*!< extent of the hinted page */
+ ulint ret_page; /*!< the allocated page offset, FIL_NULL
+ if could not be allocated */
+ xdes_t* ret_descr; /*!< the extent of the allocated page */
+ ibool success;
+ ulint n;
+
+ ut_ad(mtr);
+ ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ seg_id = mach_read_from_8(seg_inode + FSEG_ID);
+
+ ut_ad(seg_id);
+
+ reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space,
+ hint, mtr);
+ if (descr == NULL) {
+ /* Hint outside space or too high above free limit: reset
+ hint */
+ /* The file space header page is always allocated. */
+ hint = 0;
+ descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+ }
+
+ /* In the big if-else below we look for ret_page and ret_descr */
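+	/* The cases below try, in order of preference: 1. take the
+	hinted page from an extent that already belongs to the segment;
+	2. claim the hinted free extent for the segment and take the
+	hinted page; 3. take the lowest or highest page of some free
+	extent of the segment, when the insert direction is known;
+	4. take any free page of the hinted page's extent, if that
+	extent belongs to the segment; 5. take any unused page of the
+	segment; 6. allocate an individual fragment page from the
+	space; 7. allocate a whole new extent and take its first page. */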
+ /*-------------------------------------------------------------*/
+ if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+ hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
+take_hinted_page:
+ /* 1. We can take the hinted page
+ =================================*/
+ ret_descr = descr;
+ ret_page = hint;
+ /* Skip the check for extending the tablespace. If the
+ page hint were not within the size of the tablespace,
+ we would have got (descr == NULL) above and reset the hint. */
+ goto got_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if (xdes_get_state(descr, mtr) == XDES_FREE
+ && reserved - used < reserved / FSEG_FILLFACTOR
+ && used >= FSEG_FRAG_LIMIT) {
+
+ /* 2. We allocate the free extent from space and can take
+ =========================================================
+ the hinted page
+ ===============*/
+ ret_descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+ ut_a(ret_descr == descr);
+
+ xdes_set_state(ret_descr, XDES_FSEG, mtr);
+ mlog_write_ull(ret_descr + XDES_ID, seg_id, mtr);
+ flst_add_last(seg_inode + FSEG_FREE,
+ ret_descr + XDES_FLST_NODE, mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(seg_inode, space, zip_size,
+ hint + FSP_EXTENT_SIZE, mtr);
+ goto take_hinted_page;
+ /*-----------------------------------------------------------*/
+ } else if ((direction != FSP_NO_DIR)
+ && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+ && (used >= FSEG_FRAG_LIMIT)
+ && (!!(ret_descr
+ = fseg_alloc_free_extent(seg_inode,
+ space, zip_size, mtr)))) {
+
+ /* 3. We take any free extent (which was already assigned above
+ ===============================================================
+ in the if-condition to ret_descr) and take the lowest or
+ ========================================================
+ highest page in it, depending on the direction
+ ==============================================*/
+ ret_page = xdes_get_offset(ret_descr);
+
+ if (direction == FSP_DOWN) {
+ ret_page += FSP_EXTENT_SIZE - 1;
+ }
+ /*-----------------------------------------------------------*/
+ } else if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+ && mach_read_from_8(descr + XDES_ID) == seg_id
+ && (!xdes_is_full(descr, mtr))) {
+
+ /* 4. We can take the page from the same extent as the
+ ======================================================
+ hinted page (and the extent already belongs to the
+ ==================================================
+ segment)
+ ========*/
+ ret_descr = descr;
+ ret_page = xdes_get_offset(ret_descr)
+ + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+ hint % FSP_EXTENT_SIZE, mtr);
+ /*-----------------------------------------------------------*/
+ } else if (reserved - used > 0) {
+ /* 5. We take any unused page from the segment
+ ==============================================*/
+ fil_addr_t first;
+
+ if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) {
+ first = flst_get_first(seg_inode + FSEG_NOT_FULL,
+ mtr);
+ } else if (flst_get_len(seg_inode + FSEG_FREE, mtr) > 0) {
+ first = flst_get_first(seg_inode + FSEG_FREE, mtr);
+ } else {
+ ut_error;
+ return(NULL);
+ }
+
+ ret_descr = xdes_lst_get_descriptor(space, zip_size,
+ first, mtr);
+ ret_page = xdes_get_offset(ret_descr)
+ + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+ 0, mtr);
+ /*-----------------------------------------------------------*/
+ } else if (used < FSEG_FRAG_LIMIT) {
+ /* 6. We allocate an individual page from the space
+ ===================================================*/
+ buf_block_t* block = fsp_alloc_free_page(
+ space, zip_size, hint, mtr, init_mtr);
+
+ if (block != NULL) {
+ /* Put the page in the fragment page array of the
+ segment */
+ n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+ ut_a(n != ULINT_UNDEFINED);
+
+ fseg_set_nth_frag_page_no(
+ seg_inode, n, buf_block_get_page_no(block),
+ mtr);
+ }
+
+ /* fsp_alloc_free_page() invoked fsp_init_file_page()
+ already. */
+ return(block);
+ /*-----------------------------------------------------------*/
+ } else {
+ /* 7. We allocate a new extent and take its first page
+ ======================================================*/
+ ret_descr = fseg_alloc_free_extent(seg_inode,
+ space, zip_size, mtr);
+
+ if (ret_descr == NULL) {
+ ret_page = FIL_NULL;
+ } else {
+ ret_page = xdes_get_offset(ret_descr);
+ }
+ }
+
+ if (ret_page == FIL_NULL) {
+ /* Page could not be allocated */
+
+ return(NULL);
+ }
+
+ if (space != 0) {
+ space_size = fil_space_get_size(space);
+
+ if (space_size <= ret_page) {
+ /* It must be that we are extending a single-table
+ tablespace whose size is still < 64 pages */
+
+ if (ret_page >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error (2): trying to extend"
+ " a single-table tablespace %lu\n"
+ "InnoDB: by single page(s) though"
+ " the space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size,
+ (ulong) ret_page);
+ return(NULL);
+ }
+
+ success = fsp_try_extend_data_file_with_pages(
+ space, ret_page, space_header, mtr);
+ if (!success) {
+ /* No disk space left */
+ return(NULL);
+ }
+ }
+ }
+
+got_hinted_page:
+ /* ret_descr == NULL if the block was allocated from free_frag
+ (XDES_FREE_FRAG) */
+ if (ret_descr != NULL) {
+ /* At this point we know the extent and the page offset.
+ The extent is still in the appropriate list (FSEG_NOT_FULL
+ or FSEG_FREE), and the page is not yet marked as used. */
+
+ ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr)
+ == ret_descr);
+
+ ut_ad(xdes_mtr_get_bit(
+ ret_descr, XDES_FREE_BIT,
+ ret_page % FSP_EXTENT_SIZE, mtr));
+
+ fseg_mark_page_used(seg_inode, ret_page, ret_descr, mtr);
+ }
+
+ return(fsp_page_create(
+ space, fsp_flags_get_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS
+ + space_header)),
+ ret_page, mtr, init_mtr));
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ ulint hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has
+ already done the reservation for the page
+			with fsp_reserve_free_extents; in that case there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized.
+ If init_mtr!=mtr, but the page is already
+ latched in mtr, do not initialize the page. */
+{
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ rw_lock_t* latch;
+ buf_block_t* block;
+ ulint n_reserved;
+
+ space = page_get_space_id(page_align(seg_header));
+
+ latch = fil_space_get_latch(space, &flags);
+
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ if (rw_lock_get_x_lock_count(latch) == 1) {
+ /* This thread did not own the latch before this call: free
+ excess pages from the insert buffer free list */
+
+ if (space == IBUF_SPACE_ID) {
+ ibuf_free_excess_pages();
+ }
+ }
+
+ inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+ if (!has_done_reservation
+ && !fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr)) {
+ return(NULL);
+ }
+
+ block = fseg_alloc_free_page_low(space, zip_size,
+ inode, hint, direction,
+ mtr, init_mtr);
+ if (!has_done_reservation) {
+ fil_space_release_free_extents(space, n_reserved);
+ }
+
+ return(block);
+}
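+
+/* The B-tree page allocator is a typical caller: it passes the
+PAGE_BTR_SEG_LEAF or PAGE_BTR_SEG_TOP segment header from the index
+root page, the direction of the page split, and the caller's
+reservation status (see btr_page_alloc_low() in btr0btr.cc). */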
+
+/**********************************************************************//**
+Checks that we have at least 2 frag pages free in the first extent of a
+single-table tablespace, and that they are also physically initialized in
+the data file; that is, we have already extended the data file so that those
+pages are inside it. If not, this function extends the tablespace with pages.
+@return TRUE if there were >= 2 free pages, or we were able to extend */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+ ulint space, /*!< in: space id, must be != 0 */
+ fsp_header_t* space_header, /*!< in: header of that space,
+ x-latched */
+ ulint size, /*!< in: size of the tablespace in
+ pages, must be < FSP_EXTENT_SIZE/2 */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ xdes_t* descr;
+ ulint n_used;
+
+ ut_a(space != 0);
+ ut_a(size < FSP_EXTENT_SIZE / 2);
+
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0,
+ mtr);
+ n_used = xdes_get_n_used(descr, mtr);
+
+ ut_a(n_used <= size);
+
+ if (size >= n_used + 2) {
+
+ return(TRUE);
+ }
+
+ return(fsp_try_extend_data_file_with_pages(space, n_used + 1,
+ space_header, mtr));
+}
+
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64-page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 2 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ ulint* n_reserved,/*!< out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
+ ulint space, /*!< in: space id */
+ ulint n_ext, /*!< in: number of extents to reserve */
+ ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fsp_header_t* space_header;
+ rw_lock_t* latch;
+ ulint n_free_list_ext;
+ ulint free_limit;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint n_free;
+ ulint n_free_up;
+ ulint reserve;
+ ibool success;
+ ulint n_pages_added;
+
+ ut_ad(mtr);
+ *n_reserved = n_ext;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+try_again:
+ size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ if (size < FSP_EXTENT_SIZE / 2) {
+ /* Use different rules for small single-table tablespaces */
+ *n_reserved = 0;
+ return(fsp_reserve_free_pages(space, space_header, size, mtr));
+ }
+
+ n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
+
+ free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, mtr);
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
+ n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+ if (n_free_up > 0) {
+ n_free_up--;
+ if (!zip_size) {
+ n_free_up -= n_free_up
+ / (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+ } else {
+ n_free_up -= n_free_up
+ / (zip_size / FSP_EXTENT_SIZE);
+ }
+ }
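+	/* With the default 16 KB pages, an extent descriptor page recurs
+	every UNIV_PAGE_SIZE = 16384 pages, i.e. once per 256 extents.
+	E.g. 400 extents above the limit count as 400 - 1 - 399/256
+	= 398 truly free ones. */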
+
+ n_free = n_free_list_ext + n_free_up;
+
+ if (alloc_type == FSP_NORMAL) {
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function below! */
+
+ reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
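+		/* E.g. with the default 16 KB pages (64-page extents),
+		a 65536-page (1 GB) tablespace holds 1024 extents, so
+		reserve = 2 + (1024 * 2) / 200 = 12 extents. */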
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ } else if (alloc_type == FSP_UNDO) {
+ /* We reserve 0.5 % of the space size to cleaning operations */
+
+ reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
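+		/* For the same 1 GB example tablespace this gives
+		reserve = 1 + 1024 / 200 = 6 extents. */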
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ } else {
+ ut_a(alloc_type == FSP_CLEANING);
+ }
+
+ success = fil_space_reserve_free_extents(space, n_free, n_ext);
+
+ if (success) {
+ return(TRUE);
+ }
+try_to_extend:
+ success = fsp_try_extend_data_file(&n_pages_added, space,
+ space_header, mtr);
+ if (success && n_pages_added > 0) {
+
+ goto try_again;
+ }
+
+ return(FALSE);
+}
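+
+/* Callers pair the reservation with fil_space_release_free_extents(),
+following the pattern used by fseg_create_general() above (a sketch):
+
+	ulint	n_reserved;
+
+	if (!fsp_reserve_free_extents(&n_reserved, space, 2,
+				      FSP_NORMAL, mtr)) {
+		return(NULL);
+	}
+	...allocate the pages...
+	fil_space_release_free_extents(space, n_reserved);
+*/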
+
+/**********************************************************************//**
+This function should be used to get information on how much new data we
+can still insert into the tablespace without running out of space. Only
+free extents are taken into account, and we also subtract the safety margin
+required by the above function fsp_reserve_free_extents.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* space_header;
+ ulint n_free_list_ext;
+ ulint free_limit;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint n_free;
+ ulint n_free_up;
+ ulint reserve;
+ rw_lock_t* latch;
+ mtr_t mtr;
+
+ /* The convoluted mutex acquire is to overcome latching order
+ issues: The problem is that the fil_mutex is at a lower level
+ than the tablespace latch and the buffer pool mutex. We have to
+ first prevent any operations on the file system by acquiring the
+ dictionary mutex. Then acquire the tablespace latch to obey the
+ latching order and then release the dictionary mutex. That way we
+ ensure that the tablespace instance can't be freed while we are
+ examining its contents (see fil_space_free()).
+
+	However, there is one further complication: we release the fil_mutex
+	when we need to invalidate the pages in the buffer pool, and we
+ reacquire the fil_mutex when deleting and freeing the tablespace
+ instance in fil0fil.cc. Here we need to account for that situation
+ too. */
+
+ mutex_enter(&dict_sys->mutex);
+
+ /* At this stage there is no guarantee that the tablespace even
+ exists in the cache. */
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+
+ mutex_exit(&dict_sys->mutex);
+
+ return(ULLINT_UNDEFINED);
+ }
+
+ mtr_start(&mtr);
+
+ latch = fil_space_get_latch(space, &flags);
+
+ /* This should ensure that the tablespace instance can't be freed
+ by another thread. However, the tablespace pages can still be freed
+ from the buffer pool. We need to check for that again. */
+
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, &mtr);
+
+ mutex_exit(&dict_sys->mutex);
+
+ /* At this point it is possible for the tablespace to be deleted and
+ its pages removed from the buffer pool. We need to check for that
+ situation. However, the tablespace instance can't be deleted because
+ our latching above should ensure that. */
+
+ if (fil_tablespace_is_being_deleted(space)) {
+
+ mtr_commit(&mtr);
+
+ return(ULLINT_UNDEFINED);
+ }
+
+ /* From here on even if the user has dropped the tablespace, the
+ pages _must_ still exist in the buffer pool and the tablespace
+ instance _must_ be in the file system hash table. */
+
+ space_header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ n_free_list_ext = flst_get_len(space_header + FSP_FREE, &mtr);
+
+ free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ if (size < FSP_EXTENT_SIZE) {
+ ut_a(space != 0); /* This must be a single-table
+ tablespace */
+
+ return(0); /* TODO: count free frag pages and
+ return a value based on that */
+ }
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
+ n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+ if (n_free_up > 0) {
+ n_free_up--;
+ if (!zip_size) {
+ n_free_up -= n_free_up
+ / (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+ } else {
+ n_free_up -= n_free_up
+ / (zip_size / FSP_EXTENT_SIZE);
+ }
+ }
+
+ n_free = n_free_list_ext + n_free_up;
+
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function above! */
+
+ reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+ if (reserve > n_free) {
+ return(0);
+ }
+
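+	/* E.g. 100 spare extents of 64 pages at 16 KB per page amount
+	to 100 * 64 * 16 = 102400 kB (100 MB) of available space. */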
+ if (!zip_size) {
+ return((ullint) (n_free - reserve)
+ * FSP_EXTENT_SIZE
+ * (UNIV_PAGE_SIZE / 1024));
+ } else {
+ return((ullint) (n_free - reserve)
+ * FSP_EXTENT_SIZE
+ * (zip_size / 1024));
+ }
+}
+
+/********************************************************************//**
+Marks a page used. The page must reside within the extents of the given
+segment. */
+static __attribute__((nonnull))
+void
+fseg_mark_page_used(
+/*================*/
+ fseg_inode_t* seg_inode,/*!< in: segment inode */
+ ulint page, /*!< in: page offset */
+ xdes_t* descr, /*!< in: extent descriptor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint not_full_n_used;
+
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+
+ ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr)
+ == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr));
+
+ if (xdes_is_free(descr, mtr)) {
+ /* We move the extent from the free list to the
+ NOT_FULL list */
+ flst_remove(seg_inode + FSEG_FREE, descr + XDES_FLST_NODE,
+ mtr);
+ flst_add_last(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ }
+
+ ut_ad(xdes_mtr_get_bit(
+ descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr));
+
+ /* We mark the page as used */
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr);
+
+ not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ not_full_n_used++;
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used,
+ MLOG_4BYTES, mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* We move the extent from the NOT_FULL list to the
+ FULL list */
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ flst_add_last(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+
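+		/* Pages of a full extent are not counted in
+		FSEG_NOT_FULL_N_USED: subtract the whole extent here;
+		fseg_free_page_low() adds it back when a page is freed. */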
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - FSP_EXTENT_SIZE,
+ MLOG_4BYTES, mtr);
+ }
+}
+
+/**********************************************************************//**
+Frees a single page of a segment. */
+static
+void
+fseg_free_page_low(
+/*===============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ xdes_t* descr;
+ ulint not_full_n_used;
+ ulint state;
+ ib_id_t descr_id;
+ ib_id_t seg_id;
+ ulint i;
+
+ ut_ad(seg_inode && mtr);
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+ /* Drop search system page hash index if the page is found in
+ the pool and is hashed */
+
+ btr_search_drop_page_hash_when_freed(space, zip_size, page);
+
+ descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+ if (xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+ page % FSP_EXTENT_SIZE, mtr)) {
+ fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+ stderr);
+ ut_print_buf(stderr, descr, 40);
+
+ fprintf(stderr, "\n"
+ "InnoDB: Serious error! InnoDB is trying to"
+ " free page %lu\n"
+ "InnoDB: though it is already marked as free"
+ " in the tablespace!\n"
+ "InnoDB: The tablespace free space info is corrupt.\n"
+ "InnoDB: You may need to dump your"
+ " InnoDB tables and recreate the whole\n"
+ "InnoDB: database!\n", (ulong) page);
+crash:
+ fputs("InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+ ut_error;
+ }
+
+ state = xdes_get_state(descr, mtr);
+
+ if (state != XDES_FSEG) {
+ /* The page is in the fragment pages of the segment */
+
+ for (i = 0;; i++) {
+ if (fseg_get_nth_frag_page_no(seg_inode, i, mtr)
+ == page) {
+
+ fseg_set_nth_frag_page_no(seg_inode, i,
+ FIL_NULL, mtr);
+ break;
+ }
+ }
+
+ fsp_free_page(space, zip_size, page, mtr);
+
+ return;
+ }
+
+ /* If we get here, the page is in some extent of the segment */
+
+ descr_id = mach_read_from_8(descr + XDES_ID);
+ seg_id = mach_read_from_8(seg_inode + FSEG_ID);
+#if 0
+ fprintf(stderr,
+ "InnoDB: InnoDB is freeing space %lu page %lu,\n"
+ "InnoDB: which belongs to descr seg %llu\n"
+ "InnoDB: segment %llu.\n",
+ (ulong) space, (ulong) page,
+ (ullint) descr_id,
+ (ullint) seg_id);
+#endif /* 0 */
+ if (UNIV_UNLIKELY(descr_id != seg_id)) {
+ fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+ stderr);
+ ut_print_buf(stderr, descr, 40);
+ fputs("\nInnoDB: Dump of the segment inode: ", stderr);
+ ut_print_buf(stderr, seg_inode, 40);
+ putc('\n', stderr);
+
+ fprintf(stderr,
+ "InnoDB: Serious error: InnoDB is trying to"
+ " free space %lu page %lu,\n"
+ "InnoDB: which does not belong to"
+ " segment %llu but belongs\n"
+ "InnoDB: to segment %llu.\n",
+ (ulong) space, (ulong) page,
+ (ullint) descr_id,
+ (ullint) seg_id);
+ goto crash;
+ }
+
+ not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* The fragment is full: move it to another list */
+ flst_remove(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ flst_add_last(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used + FSP_EXTENT_SIZE - 1,
+ MLOG_4BYTES, mtr);
+ } else {
+ ut_a(not_full_n_used > 0);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - 1, MLOG_4BYTES, mtr);
+ }
+
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+ xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+ if (xdes_is_free(descr, mtr)) {
+ /* The extent has become free: free it to space */
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ fsp_free_extent(space, zip_size, page, mtr);
+ }
+
+ mtr->n_freed_pages++;
+}
+
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint flags;
+ ulint zip_size;
+ fseg_inode_t* seg_inode;
+ rw_lock_t* latch;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ seg_inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+ fseg_free_page_low(seg_inode, space, zip_size, page, mtr);
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ buf_page_set_file_page_was_freed(space, page);
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+}
+
+/**********************************************************************//**
+Checks if a single page of a segment is free.
+@return true if free */
+UNIV_INTERN
+bool
+fseg_page_is_free(
+/*==============*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page) /*!< in: page offset */
+{
+ mtr_t mtr;
+ ibool is_free;
+ ulint flags;
+ rw_lock_t* latch;
+ xdes_t* descr;
+ ulint zip_size;
+ fseg_inode_t* seg_inode;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_tf_get_zip_size(flags);
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode = fseg_inode_get(seg_header, space, zip_size, &mtr);
+
+ ut_a(seg_inode);
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+ descr = xdes_get_descriptor(space, zip_size, page, &mtr);
+ ut_a(descr);
+
+ is_free = xdes_mtr_get_bit(
+ descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(is_free);
+}
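
fseg_page_is_free() boils down to a single probe of the extent descriptor bitmap. The self-contained model below shows that probe, assuming two state bits per page with the FREE bit in the low position; the real XDES entry carries more fields (segment id, list node, state), so this is an illustration of the indexing, not the on-page layout.

```cpp
#include <bitset>
#include <cstdint>

constexpr uint32_t EXTENT_SIZE   = 64;  /* pages per extent */
constexpr uint32_t BITS_PER_PAGE = 2;   /* FREE bit + CLEAN bit */

struct ExtentDescr {
	std::bitset<EXTENT_SIZE * BITS_PER_PAGE> bitmap;

	/* page % EXTENT_SIZE selects the slot inside this extent,
	exactly like the "page % FSP_EXTENT_SIZE" expressions above. */
	bool page_is_free(uint32_t page) const
	{
		return bitmap[(page % EXTENT_SIZE) * BITS_PER_PAGE];
	}

	void set_free(uint32_t page, bool free)
	{
		bitmap[(page % EXTENT_SIZE) * BITS_PER_PAGE] = free;
	}
};

int main()
{
	ExtentDescr d;
	d.set_free(130, true);              /* slot 130 % 64 == 2 */
	return d.page_is_free(130) ? 0 : 1; /* exits 0 */
}
```
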
+
+/**********************************************************************//**
+Frees an extent of a segment to the space free list. */
+static
+void
+fseg_free_extent(
+/*=============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: a page in the extent */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint first_page_in_extent;
+ xdes_t* descr;
+ ulint not_full_n_used;
+ ulint descr_n_used;
+ ulint i;
+
+ ut_ad(seg_inode && mtr);
+
+ descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+ ut_a(xdes_get_state(descr, mtr) == XDES_FSEG);
+ ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+
+ first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+ for (i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (!xdes_mtr_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+
+ /* Drop search system page hash index if the page is
+ found in the pool and is hashed */
+
+ btr_search_drop_page_hash_when_freed(
+ space, zip_size, first_page_in_extent + i);
+ }
+ }
+
+ if (xdes_is_full(descr, mtr)) {
+ flst_remove(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ } else if (xdes_is_free(descr, mtr)) {
+ flst_remove(seg_inode + FSEG_FREE,
+ descr + XDES_FLST_NODE, mtr);
+ } else {
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+
+ not_full_n_used = mtr_read_ulint(
+ seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr);
+
+ descr_n_used = xdes_get_n_used(descr, mtr);
+ ut_a(not_full_n_used >= descr_n_used);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - descr_n_used,
+ MLOG_4BYTES, mtr);
+ }
+
+ fsp_free_extent(space, zip_size, page, mtr);
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ for (i = 0; i < FSP_EXTENT_SIZE; i++) {
+
+ buf_page_set_file_page_was_freed(space,
+ first_page_in_extent + i);
+ }
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+}
+
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment by
+repeatedly calling this function in different mini-transactions. Doing
+the freeing in a single mini-transaction might result in too big a
+mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n;
+ ulint page;
+ xdes_t* descr;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ ulint header_page;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+ header_page = page_get_page_no(page_align(header));
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ descr = xdes_get_descriptor(space, zip_size, header_page, mtr);
+
+ /* Check that the header resides on a page which has not been
+ freed yet */
+
+ ut_a(xdes_mtr_get_bit(descr, XDES_FREE_BIT,
+ header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
+
+ inode = fseg_inode_try_get(header, space, zip_size, mtr);
+
+ if (UNIV_UNLIKELY(inode == NULL)) {
+ fprintf(stderr, "double free of inode from %u:%u\n",
+ (unsigned) space, (unsigned) header_page);
+ return(TRUE);
+ }
+
+ descr = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ page = xdes_get_offset(descr);
+
+ fseg_free_extent(inode, space, zip_size, page, mtr);
+
+ return(FALSE);
+ }
+
+ /* Free a frag page */
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ return(TRUE);
+ }
+
+ fseg_free_page_low(inode, space, zip_size,
+ fseg_get_nth_frag_page_no(inode, n, mtr), mtr);
+
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
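
A caller is expected to drive fseg_free_step() in a loop, committing the mini-transaction between steps, exactly as the doc comment above prescribes. The self-contained model below captures the order of work: whole extents first, then fragment pages one at a time, then the segment inode. The types are hypothetical, and unlike the real function (which may free the inode in the same call as the last fragment page) the model frees the inode in a separate final step.

```cpp
#include <cassert>
#include <vector>

struct Segment {
	int              n_extents;   /* extents on the segment's lists */
	std::vector<int> frag_pages;  /* individually allocated pages */
	bool             inode_freed = false;
};

/* Returns true when freeing has completed, like fseg_free_step(). */
bool seg_free_step(Segment& s)
{
	if (s.n_extents > 0) {        /* free one whole extent per step */
		--s.n_extents;
		return false;
	}
	if (!s.frag_pages.empty()) {  /* then one fragment page per step */
		s.frag_pages.pop_back();
		return false;
	}
	s.inode_freed = true;         /* nothing left: release the inode */
	return true;
}

int main()
{
	Segment s{2, {10, 11, 12}};
	int steps = 0;
	/* Commit and restart the mini-transaction between iterations. */
	while (!seg_free_step(s)) {
		++steps;
	}
	assert(steps == 5 && s.inode_freed);
	return 0;
}
```
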
+
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n;
+ ulint page;
+ xdes_t* descr;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ mtr_x_lock(latch, mtr);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ descr = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ page = xdes_get_offset(descr);
+
+ fseg_free_extent(inode, space, zip_size, page, mtr);
+
+ return(FALSE);
+ }
+
+ /* Free a frag page */
+
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ ut_error;
+ }
+
+ page_no = fseg_get_nth_frag_page_no(inode, n, mtr);
+
+ if (page_no == page_get_page_no(page_align(header))) {
+
+ return(TRUE);
+ }
+
+ fseg_free_page_low(inode, space, zip_size, page_no, mtr);
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Returns the first extent descriptor for a segment. We think of the extent
+lists of the segment concatenated in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE.
+@return the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fil_addr_t first;
+ xdes_t* descr;
+
+ ut_ad(inode && mtr);
+
+ ut_ad(space == page_get_space_id(page_align(inode)));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ first = fil_addr_null;
+
+ if (flst_get_len(inode + FSEG_FULL, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_FULL, mtr);
+
+ } else if (flst_get_len(inode + FSEG_NOT_FULL, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_NOT_FULL, mtr);
+
+ } else if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_FREE, mtr);
+ }
+
+ if (first.page == FIL_NULL) {
+
+ return(NULL);
+ }
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+
+ return(descr);
+}
+
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+static
+ibool
+fseg_validate_low(
+/*==============*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr2) /*!< in/out: mini-transaction */
+{
+ ulint space;
+ ib_id_t seg_id;
+ mtr_t mtr;
+ xdes_t* descr;
+ fil_addr_t node_addr;
+ ulint n_used = 0;
+ ulint n_used2 = 0;
+
+ ut_ad(mtr_memo_contains_page(mtr2, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ space = page_get_space_id(page_align(inode));
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+ n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr2);
+ flst_validate(inode + FSEG_FREE, mtr2);
+ flst_validate(inode + FSEG_NOT_FULL, mtr2);
+ flst_validate(inode + FSEG_FULL, mtr2);
+
+ /* Validate FSEG_FREE list */
+ node_addr = flst_get_first(inode + FSEG_FREE, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == 0);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(mach_read_from_8(descr + XDES_ID) == seg_id);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSEG_NOT_FULL list */
+
+ node_addr = flst_get_first(inode + FSEG_NOT_FULL, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) > 0);
+ ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(mach_read_from_8(descr + XDES_ID) == seg_id);
+
+ n_used2 += xdes_get_n_used(descr, &mtr);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSEG_FULL list */
+
+ node_addr = flst_get_first(inode + FSEG_FULL, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(mach_read_from_8(descr + XDES_ID) == seg_id);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ ut_a(n_used == n_used2);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fseg_inode_t* inode;
+ ibool ret;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+
+ space = page_get_space_id(page_align(header));
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ ret = fseg_validate_low(inode, mtr);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Writes info of a segment. */
+static
+void
+fseg_print_low(
+/*===========*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint space;
+ ulint n_used;
+ ulint n_frag;
+ ulint n_free;
+ ulint n_not_full;
+ ulint n_full;
+ ulint reserved;
+ ulint used;
+ ulint page_no;
+ ib_id_t seg_id;
+
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ space = page_get_space_id(page_align(inode));
+ page_no = page_get_page_no(page_align(inode));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
+ seg_id = mach_read_from_8(inode + FSEG_ID);
+
+ n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ n_frag = fseg_get_n_frag_pages(inode, mtr);
+ n_free = flst_get_len(inode + FSEG_FREE, mtr);
+ n_not_full = flst_get_len(inode + FSEG_NOT_FULL, mtr);
+ n_full = flst_get_len(inode + FSEG_FULL, mtr);
+
+ fprintf(stderr,
+ "SEGMENT id %llu space %lu; page %lu;"
+ " res %lu used %lu; full ext %lu\n"
+ "fragm pages %lu; free extents %lu;"
+ " not full extents %lu: pages %lu\n",
+ (ullint) seg_id,
+ (ulong) space, (ulong) page_no,
+ (ulong) reserved, (ulong) used, (ulong) n_full,
+ (ulong) n_frag, (ulong) n_free, (ulong) n_not_full,
+ (ulong) n_used);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+}
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+
+ space = page_get_space_id(page_align(header));
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ fseg_print_low(inode, mtr);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* header;
+ fseg_inode_t* seg_inode;
+ page_t* seg_inode_page;
+ rw_lock_t* latch;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint free_limit;
+ ulint frag_n_used;
+ mtr_t mtr;
+ mtr_t mtr2;
+ xdes_t* descr;
+ fil_addr_t node_addr;
+ fil_addr_t next_node_addr;
+ ulint descr_count = 0;
+ ulint n_used = 0;
+ ulint n_used2 = 0;
+ ulint n_full_frag_pages;
+ ulint n;
+ ulint seg_inode_len_free;
+ ulint seg_inode_len_full;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+ ut_a(ut_is_2pow(zip_size));
+ ut_a(zip_size <= UNIV_ZIP_SIZE_MAX);
+ ut_a(!zip_size || zip_size >= UNIV_ZIP_SIZE_MIN);
+
+ /* First start a mini-transaction mtr2 to lock out all other
+ threads from the fsp system */
+ mtr_start(&mtr2);
+ mtr_x_lock(latch, &mtr2);
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+ free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, &mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+ MLOG_4BYTES, &mtr);
+
+ n_full_frag_pages = FSP_EXTENT_SIZE
+ * flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+ if (UNIV_UNLIKELY(free_limit > size)) {
+
+ ut_a(space != 0);
+ ut_a(size < FSP_EXTENT_SIZE);
+ }
+
+ flst_validate(header + FSP_FREE, &mtr);
+ flst_validate(header + FSP_FREE_FRAG, &mtr);
+ flst_validate(header + FSP_FULL_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Validate FSP_FREE list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == 0);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FREE);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSP_FREE_FRAG list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FREE_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) > 0);
+ ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FREE_FRAG);
+
+ n_used += xdes_get_n_used(descr, &mtr);
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSP_FULL_FRAG list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FULL_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FULL_FRAG);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate segments */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+ seg_inode_len_full = flst_get_len(header + FSP_SEG_INODES_FULL, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+ do {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ ut_a(mach_read_from_8(seg_inode + FSEG_ID) != 0);
+ fseg_validate_low(seg_inode, &mtr);
+
+ descr_count += flst_get_len(seg_inode + FSEG_FREE,
+ &mtr);
+ descr_count += flst_get_len(seg_inode + FSEG_FULL,
+ &mtr);
+ descr_count += flst_get_len(seg_inode + FSEG_NOT_FULL,
+ &mtr);
+
+ n_used2 += fseg_get_n_frag_pages(seg_inode, &mtr);
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+ seg_inode_len_free = flst_get_len(header + FSP_SEG_INODES_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ if (mach_read_from_8(seg_inode + FSEG_ID)) {
+ fseg_validate_low(seg_inode, &mtr);
+
+ descr_count += flst_get_len(
+ seg_inode + FSEG_FREE, &mtr);
+ descr_count += flst_get_len(
+ seg_inode + FSEG_FULL, &mtr);
+ descr_count += flst_get_len(
+ seg_inode + FSEG_NOT_FULL, &mtr);
+ n_used2 += fseg_get_n_frag_pages(
+ seg_inode, &mtr);
+ }
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ ut_a(descr_count * FSP_EXTENT_SIZE == free_limit);
+ if (!zip_size) {
+ ut_a(n_used + n_full_frag_pages
+ == n_used2 + 2 * ((free_limit + (UNIV_PAGE_SIZE - 1))
+ / UNIV_PAGE_SIZE)
+ + seg_inode_len_full + seg_inode_len_free);
+ } else {
+ ut_a(n_used + n_full_frag_pages
+ == n_used2 + 2 * ((free_limit + (zip_size - 1))
+ / zip_size)
+ + seg_inode_len_full + seg_inode_len_free);
+ }
+ ut_a(frag_n_used == n_used);
+
+ mtr_commit(&mtr2);
+
+ return(TRUE);
+}
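
One way to read the final assertions (our interpretation, not spelled out in the source): every used page of a FREE_FRAG or FULL_FRAG extent is either a fragment page owned by some segment (n_used2), one of the two bookkeeping pages in each page-size run of the file (the extent descriptor page and the insert buffer bitmap page, hence the factor 2), or a segment inode page. A tiny check with hypothetical numbers:

```cpp
#include <cassert>

int main()
{
	const unsigned long page_size  = 16384;          /* UNIV_PAGE_SIZE */
	const unsigned long free_limit = 2 * page_size;  /* initialized area */

	/* XDES page + ibuf bitmap page per page_size run: the "2 *" term. */
	const unsigned long bookkeeping
		= 2 * ((free_limit + page_size - 1) / page_size);  /* = 4 */

	const unsigned long inode_pages = 1;   /* inode list lengths */
	const unsigned long seg_frag    = 25;  /* n_used2: segment frag pages */

	/* Pages used in fragment extents, i.e. n_used + n_full_frag_pages */
	const unsigned long frag_used = seg_frag + bookkeeping + inode_pages;
	assert(frag_used == 30);
	return 0;
}
```
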
+
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* header;
+ fseg_inode_t* seg_inode;
+ page_t* seg_inode_page;
+ rw_lock_t* latch;
+ ulint flags;
+ ulint zip_size;
+ ulint size;
+ ulint free_limit;
+ ulint frag_n_used;
+ fil_addr_t node_addr;
+ fil_addr_t next_node_addr;
+ ulint n_free;
+ ulint n_free_frag;
+ ulint n_full_frag;
+ ib_id_t seg_id;
+ ulint n;
+ ulint n_segs = 0;
+ mtr_t mtr;
+ mtr_t mtr2;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ /* First start a mini-transaction mtr2 to lock out all other
+ threads from the fsp system */
+
+ mtr_start(&mtr2);
+
+ mtr_x_lock(latch, &mtr2);
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES,
+ &mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ &mtr);
+ n_free = flst_get_len(header + FSP_FREE, &mtr);
+ n_free_frag = flst_get_len(header + FSP_FREE_FRAG, &mtr);
+ n_full_frag = flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+ seg_id = mach_read_from_8(header + FSP_SEG_ID);
+
+ fprintf(stderr,
+ "FILE SPACE INFO: id %lu\n"
+ "size %lu, free limit %lu, free extents %lu\n"
+ "not full frag extents %lu: used pages %lu,"
+ " full frag extents %lu\n"
+ "first seg id not used %llu\n",
+ (ulong) space,
+ (ulong) size, (ulong) free_limit, (ulong) n_free,
+ (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag,
+ (ullint) seg_id);
+
+ mtr_commit(&mtr);
+
+ /* Print segments */
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ ut_a(mach_read_from_8(seg_inode + FSEG_ID) != 0);
+ fseg_print_low(seg_inode, &mtr);
+
+ n_segs++;
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ if (mach_read_from_8(seg_inode + FSEG_ID)) {
+
+ fseg_print_low(seg_inode, &mtr);
+ n_segs++;
+ }
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_commit(&mtr2);
+
+ fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/fts/Makefile.query b/storage/innobase/fts/Makefile.query
new file mode 100644
index 00000000000..12dcd833064
--- /dev/null
+++ b/storage/innobase/fts/Makefile.query
@@ -0,0 +1,16 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all: fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+ $(LEX) -P$(subst lex,,$*) -o $*.cc --header-file=../include/$*.h $<
+
+.y.cc:
+ $(YACC) -p $(PREFIX) -o $*.cc -d $<
+ mv $*.h ../include
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
new file mode 100644
index 00000000000..030b972440f
--- /dev/null
+++ b/storage/innobase/fts/fts0ast.cc
@@ -0,0 +1,744 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+ FTS_PASS_FIRST, /*!< First visit pass,
+ process operators excluding
+ FTS_EXIST and FTS_IGNORE */
+ FTS_PASS_EXIST, /*!< Exist visit pass,
+ process operator FTS_EXIST */
+ FTS_PASS_IGNORE /*!< Ignore visit pass,
+ process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
+@return new empty node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+ fts_ast_node_t* node;
+
+ node = (fts_ast_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0x0, sizeof(*node));
+
+ return(node);
+}
+
+/******************************************************************//**
+Create an operator fts_ast_node_t.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_oper_t oper) /*!< in: ast operator */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_OPER;
+ node->oper = oper;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it
+@return new node or a node list with tokenized words */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const fts_ast_string_t* ptr) /*!< in: ast term string */
+{
+ fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg);
+ ulint len = ptr->len;
+ ulint cur_pos = 0;
+ fts_ast_node_t* node = NULL;
+ fts_ast_node_t* node_list = NULL;
+ fts_ast_node_t* first_node = NULL;
+
+ /* Scan the incoming string and filter out any "non-word" characters */
+ while (cur_pos < len) {
+ fts_string_t str;
+ ulint offset;
+ ulint cur_len;
+
+ cur_len = innobase_mysql_fts_get_token(
+ state->charset,
+ reinterpret_cast<const byte*>(ptr->str) + cur_pos,
+ reinterpret_cast<const byte*>(ptr->str) + len,
+ &str, &offset);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+
+ if (str.f_n_char > 0) {
+ /* Ignore a subsequent term (after the first one) that
+ is shorter than fts_min_token_size, and any term that
+ is longer than fts_max_token_size. This keeps the
+ behavior consistent with MyISAM. */
+ if ((first_node && (str.f_n_char < fts_min_token_size))
+ || str.f_n_char > fts_max_token_size) {
+ continue;
+ }
+
+ node = fts_ast_node_create();
+
+ node->type = FTS_AST_TERM;
+
+ node->term.ptr = fts_ast_string_create(
+ str.f_str, str.f_len);
+
+ fts_ast_state_add_node(
+ static_cast<fts_ast_state_t*>(arg), node);
+
+ if (first_node) {
+ /* There is more than one word, create
+ a list to organize them */
+ if (!node_list) {
+ node_list = fts_ast_create_node_list(
+ static_cast<fts_ast_state_t*>(
+ arg),
+ first_node);
+ }
+
+ fts_ast_add_node(node_list, node);
+ } else {
+ first_node = node;
+ }
+ }
+ }
+
+ return((node_list != NULL) ? node_list : first_node);
+}
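
The interesting pattern in fts_ast_create_node_term() is the lazy list creation: the first token is kept as a bare node, and a wrapping list node is only created once a second token shows up. Below is a standalone sketch of that pattern, with a whitespace tokenizer standing in for innobase_mysql_fts_get_token() and all names hypothetical.

```cpp
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

struct Node {
	std::string       term;      /* set for leaf nodes */
	std::vector<Node> children;  /* set for the list node */
};

Node parse_terms(const std::string& input)
{
	std::istringstream    in(input);
	std::string           word;
	std::unique_ptr<Node> first;
	std::unique_ptr<Node> list;

	while (in >> word) {  /* stands in for the FTS tokenizer */
		Node leaf{word, {}};

		if (!first) {
			first = std::make_unique<Node>(leaf);
		} else {
			if (!list) {
				/* Second word: create the list lazily and
				move the first node under it. */
				list = std::make_unique<Node>(
					Node{"", {*first}});
			}
			list->children.push_back(leaf);
		}
	}

	if (list)  return *list;   /* several words: return the list */
	if (first) return *first;  /* single word: return the bare node */
	return {};
}

int main()
{
	Node n = parse_terms("multiple word query");
	std::cout << n.children.size() << "\n";  /* prints 3 */
	return 0;
}
```
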
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ const fts_ast_string_t* ptr) /*!< in: ast text string */
+{
+ ulint len = ptr->len;
+ fts_ast_node_t* node = NULL;
+
+ /* Once we come here, the string must have at least 2 quotes ""
+ around the query string, which could be empty. Also the query
+ string may contain 0x00 in it, we don't treat it as null-terminated. */
+ ut_ad(len >= 2);
+ ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"');
+
+ if (len == 2) {
+ /* If the query string contains nothing except quotes,
+ it's obviously an invalid query. */
+ return(NULL);
+ }
+
+ node = fts_ast_node_create();
+
+ /*!< We ignore the actual quotes "" */
+ len -= 2;
+
+ node->type = FTS_AST_TEXT;
+ /*!< Skip copying the first quote */
+ node->text.ptr = fts_ast_string_create(
+ reinterpret_cast<const byte*>(ptr->str + 1), len);
+ node->text.distance = ULINT_UNDEFINED;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the expr and is responsible
+for freeing it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it.
+@return new node */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr) /*!< in: ast expr instance */
+{
+ fts_ast_node_t* node = fts_ast_node_create();
+
+ node->type = FTS_AST_SUBEXP_LIST;
+ node->list.head = node->list.tail = expr;
+
+ fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+ return(node);
+}
+
+/******************************************************************//**
+Free an expr list node elements. */
+static
+void
+fts_ast_free_list(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: ast node to free */
+{
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ for (node = node->list.head;
+ node != NULL;
+ node = fts_ast_free_node(node)) {
+
+ /*!< No op */
+ }
+}
+
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node) /*!< in: the node to free */
+{
+ fts_ast_node_t* next_node;
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ if (node->text.ptr) {
+ fts_ast_string_free(node->text.ptr);
+ node->text.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_TERM:
+ if (node->term.ptr) {
+ fts_ast_string_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+ break;
+
+ case FTS_AST_LIST:
+ case FTS_AST_SUBEXP_LIST:
+ fts_ast_free_list(node);
+ node->list.head = node->list.tail = NULL;
+ break;
+
+ case FTS_AST_OPER:
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /*!< Get next node before freeing the node itself */
+ next_node = node->next;
+
+ ut_free(node);
+
+ return(next_node);
+}
+
+/******************************************************************//**
+This AST takes ownership of the expr and is responsible
+for freeing it.
+@return in param "list" */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* node, /*!< in: list instance */
+ fts_ast_node_t* elem) /*!< in: node to add to list */
+{
+ if (!elem) {
+ return(NULL);
+ }
+
+ ut_a(!elem->next);
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ if (!node->list.head) {
+ ut_a(!node->list.tail);
+
+ node->list.head = node->list.tail = elem;
+ } else {
+ ut_a(node->list.tail);
+
+ node->list.tail->next = elem;
+ node->list.tail = elem;
+ }
+
+ return(node);
+}
+
+/******************************************************************//**
+For tracking node allocations, in case there is an error during
+parsing. */
+UNIV_INTERN
+void
+fts_ast_state_add_node(
+/*===================*/
+ fts_ast_state_t*state, /*!< in: ast instance */
+ fts_ast_node_t* node) /*!< in: node to add to ast */
+{
+ if (!state->list.head) {
+ ut_a(!state->list.tail);
+
+ state->list.head = state->list.tail = node;
+ } else {
+ state->list.tail->next_alloc = node;
+ state->list.tail = node;
+ }
+}
+
+/******************************************************************//**
+Set the wildcard attribute of a term. */
+UNIV_INTERN
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node) /*!< in/out: set attribute of
+ a term node */
+{
+ if (!node) {
+ return;
+ }
+
+ /* If it's a node list, the wildcard should be set on the tail node */
+ if (node->type == FTS_AST_LIST) {
+ ut_ad(node->list.tail != NULL);
+ node = node->list.tail;
+ }
+
+ ut_a(node->type == FTS_AST_TERM);
+ ut_a(!node->term.wildcard);
+
+ node->term.wildcard = TRUE;
+}
+
+/******************************************************************//**
+Set the proximity attribute of a text node. */
+UNIV_INTERN
+void
+fts_ast_term_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance) /*!< in: the text proximity
+ distance */
+{
+ if (node == NULL) {
+ return;
+ }
+
+ ut_a(node->type == FTS_AST_TEXT);
+ ut_a(node->text.distance == ULINT_UNDEFINED);
+
+ node->text.distance = distance;
+}
+
+/******************************************************************//**
+Free node and expr allocations. */
+UNIV_INTERN
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state) /*!< in: ast state to free */
+{
+ fts_ast_node_t* node = state->list.head;
+
+ /* Free the nodes that were allocated during parsing. */
+ while (node) {
+ fts_ast_node_t* next = node->next_alloc;
+
+ if (node->type == FTS_AST_TEXT && node->text.ptr) {
+ fts_ast_string_free(node->text.ptr);
+ node->text.ptr = NULL;
+ } else if (node->type == FTS_AST_TERM && node->term.ptr) {
+ fts_ast_string_free(node->term.ptr);
+ node->term.ptr = NULL;
+ }
+
+ ut_free(node);
+ node = next;
+ }
+
+ state->root = state->list.head = state->list.tail = NULL;
+}
+
+/******************************************************************//**
+Print an ast node. */
+UNIV_INTERN
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node) /*!< in: ast node to print */
+{
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ printf("TEXT: ");
+ fts_ast_string_print(node->text.ptr);
+ break;
+
+ case FTS_AST_TERM:
+ printf("TERM: ");
+ fts_ast_string_print(node->term.ptr);
+ break;
+
+ case FTS_AST_LIST:
+ printf("LIST: ");
+ node = node->list.head;
+
+ while (node) {
+ fts_ast_node_print(node);
+ node = node->next;
+ }
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ printf("SUBEXP_LIST: ");
+ node = node->list.head;
+
+ while (node) {
+ fts_ast_node_print(node);
+ node = node->next;
+ }
+ break;
+
+ case FTS_AST_OPER:
+ printf("OPER: %d\n", node->oper);
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Traverse the AST - in-order traversal, except for the FTS_EXIST and FTS_IGNORE
+nodes, which will be ignored in the first pass of each level, and visited in a
+second and third pass after all other nodes in the same level are visited.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: current operator */
+ fts_ast_node_t* node, /*!< in: current root node */
+ fts_ast_callback visitor, /*!< in: callback function */
+ void* arg, /*!< in: arg for callback */
+ bool* has_ignore) /*!< out: true, if the operator
+ was ignored during processing,
+ currently we ignore FTS_EXIST
+ and FTS_IGNORE operators */
+{
+ dberr_t error = DB_SUCCESS;
+ fts_ast_node_t* oper_node = NULL;
+ fts_ast_node_t* start_node;
+ bool revisit = false;
+ bool will_be_ignored = false;
+ fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST;
+
+ start_node = node->list.head;
+
+ ut_a(node->type == FTS_AST_LIST
+ || node->type == FTS_AST_SUBEXP_LIST);
+
+ if (oper == FTS_EXIST_SKIP) {
+ visit_pass = FTS_PASS_EXIST;
+ } else if (oper == FTS_IGNORE_SKIP) {
+ visit_pass = FTS_PASS_IGNORE;
+ }
+
+ /* In the first pass of the tree, at the leaf level of the
+ tree, FTS_EXIST and FTS_IGNORE operation will be ignored.
+ It will be repeated at the level above the leaf level.
+
+ The basic idea here is that when we encounter FTS_EXIST or
+ FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP
+ or FTS_IGNORE_SKIP, and the term and text nodes under those
+ operators are skipped in the first pass. We have two passes during
+ the revisit:
+ We process nodes with FTS_EXIST_SKIP in the exist pass, and then
+ process nodes with FTS_IGNORE_SKIP in the ignore pass.
+
+ The order must be strictly followed, or we will get wrong results.
+ For example, we have a query 'a +b -c d +e -f'.
+ first pass: process 'a' and 'd' by union;
+ exist pass: process '+b' and '+e' by intersection;
+ ignore pass: process '-c' and '-f' by difference. */
+
+ for (node = node->list.head;
+ node && (error == DB_SUCCESS);
+ node = node->next) {
+
+ switch(node->type) {
+ case FTS_AST_LIST:
+ if (visit_pass != FTS_PASS_FIRST) {
+ break;
+ }
+
+ error = fts_ast_visit(oper, node, visitor,
+ arg, &will_be_ignored);
+
+ /* If will_be_ignored is set to true, then
+ we encountered and ignored an FTS_EXIST or FTS_IGNORE
+ operator. */
+ if (will_be_ignored) {
+ revisit = true;
+ /* Remember the operator for this list node: in a
+ query such as '-abc&def', the ignored operator comes
+ from the node preceding the list. */
+ node->oper = oper;
+ }
+
+ break;
+
+ case FTS_AST_OPER:
+ oper = node->oper;
+ oper_node = node;
+
+ /* Change the operator for revisit */
+ if (oper == FTS_EXIST) {
+ oper_node->oper = FTS_EXIST_SKIP;
+ } else if (oper == FTS_IGNORE) {
+ oper_node->oper = FTS_IGNORE_SKIP;
+ }
+
+ break;
+
+ default:
+ if (node->visited) {
+ continue;
+ }
+
+ ut_a(oper == FTS_NONE || !oper_node
+ || oper_node->oper == oper
+ || oper_node->oper == FTS_EXIST_SKIP
+ || oper_node->oper == FTS_IGNORE_SKIP);
+
+ if (oper == FTS_EXIST || oper == FTS_IGNORE) {
+ *has_ignore = true;
+ continue;
+ }
+
+ /* Process leaf node according to its pass. */
+ if (oper == FTS_EXIST_SKIP
+ && visit_pass == FTS_PASS_EXIST) {
+ error = visitor(FTS_EXIST, node, arg);
+ node->visited = true;
+ } else if (oper == FTS_IGNORE_SKIP
+ && visit_pass == FTS_PASS_IGNORE) {
+ error = visitor(FTS_IGNORE, node, arg);
+ node->visited = true;
+ } else if (visit_pass == FTS_PASS_FIRST) {
+ error = visitor(oper, node, arg);
+ node->visited = true;
+ }
+ }
+ }
+
+ if (revisit) {
+ /* Exist pass processes the skipped FTS_EXIST operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST
+ && node->oper != FTS_IGNORE) {
+ error = fts_ast_visit(FTS_EXIST_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+
+ /* Ignore pass processes the skipped FTS_IGNORE operation. */
+ for (node = start_node;
+ node && error == DB_SUCCESS;
+ node = node->next) {
+
+ if (node->type == FTS_AST_LIST) {
+ error = fts_ast_visit(FTS_IGNORE_SKIP, node,
+ visitor, arg, &will_be_ignored);
+ }
+ }
+ }
+
+ return(error);
+}
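
To make the pass ordering concrete, here is a behavioral model of the comment's own example query 'a +b -c d +e -f' over a toy inverted index: plain terms are unioned in the first pass, '+' terms intersected in the exist pass, and '-' terms subtracted in the ignore pass. The set-based evaluation and the doc IDs are assumptions for illustration, not the InnoDB query engine.

```cpp
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

typedef std::set<int> Docs;

Docs query_eval(const std::map<std::string, Docs>& index,
		const std::vector<std::string>& query)
{
	Docs result;

	/* Run one pass over the whole token list for one operator. */
	auto pass = [&](char op) {
		for (const std::string& tok : query) {
			char c = (tok[0] == '+' || tok[0] == '-')
				? tok[0] : ' ';
			if (c != op) {
				continue;
			}
			const Docs& d = index.at(
				c == ' ' ? tok : tok.substr(1));
			Docs next;
			if (c == ' ') {        /* first pass: union */
				std::set_union(
					result.begin(), result.end(),
					d.begin(), d.end(),
					std::inserter(next, next.end()));
			} else if (c == '+') { /* exist pass: intersect */
				std::set_intersection(
					result.begin(), result.end(),
					d.begin(), d.end(),
					std::inserter(next, next.end()));
			} else {               /* ignore pass: subtract */
				std::set_difference(
					result.begin(), result.end(),
					d.begin(), d.end(),
					std::inserter(next, next.end()));
			}
			result = std::move(next);
		}
	};

	pass(' ');  /* 'a' and 'd' by union */
	pass('+');  /* '+b' and '+e' by intersection */
	pass('-');  /* '-c' and '-f' by difference */
	return result;
}

int main()
{
	std::map<std::string, Docs> index = {
		{"a", {1, 2, 3}}, {"b", {2, 3}}, {"c", {3}},
		{"d", {2, 4}},    {"e", {2, 3}}, {"f", {5}}};

	for (int id : query_eval(index, {"a", "+b", "-c", "d", "+e", "-f"})) {
		std::printf("%d ", id);  /* prints: 2 */
	}
	return 0;
}
```
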
+
+/**
+Create an ast string object, with NUL-terminator, so the string
+has one more byte than len
+@param[in] str pointer to string
+@param[in] len length of the string
+@return ast string with NUL-terminator */
+UNIV_INTERN
+fts_ast_string_t*
+fts_ast_string_create(
+ const byte* str,
+ ulint len)
+{
+ fts_ast_string_t* ast_str;
+
+ ut_ad(len > 0);
+
+ ast_str = static_cast<fts_ast_string_t*>
+ (ut_malloc(sizeof(fts_ast_string_t)));
+ ast_str->str = static_cast<byte*>(ut_malloc(len + 1));
+
+ ast_str->len = len;
+ memcpy(ast_str->str, str, len);
+ ast_str->str[len] = '\0';
+
+ return(ast_str);
+}
+
+/**
+Free an ast string instance
+@param[in,out] ast_str string to free */
+UNIV_INTERN
+void
+fts_ast_string_free(
+ fts_ast_string_t* ast_str)
+{
+ if (ast_str != NULL) {
+ ut_free(ast_str->str);
+ ut_free(ast_str);
+ }
+}
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in] ast_str string to translate
+@param[in] base the base
+@return translated number */
+UNIV_INTERN
+ulint
+fts_ast_string_to_ul(
+ const fts_ast_string_t* ast_str,
+ int base)
+{
+ return(strtoul(reinterpret_cast<const char*>(ast_str->str),
+ NULL, base));
+}
+
+/**
+Print the ast string
+@param[in] ast_str string to print
+UNIV_INTERN
+void
+fts_ast_string_print(
+ const fts_ast_string_t* ast_str)
+{
+ for (ulint i = 0; i < ast_str->len; ++i) {
+ printf("%c", ast_str->str[i]);
+ }
+
+ printf("\n");
+}
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_oper_name_get(fts_ast_oper_t oper)
+{
+ switch(oper) {
+ case FTS_NONE:
+ return("FTS_NONE");
+ case FTS_IGNORE:
+ return("FTS_IGNORE");
+ case FTS_EXIST:
+ return("FTS_EXIST");
+ case FTS_NEGATE:
+ return("FTS_NEGATE");
+ case FTS_INCR_RATING:
+ return("FTS_INCR_RATING");
+ case FTS_DECR_RATING:
+ return("FTS_DECR_RATING");
+ case FTS_DISTANCE:
+ return("FTS_DISTANCE");
+ case FTS_IGNORE_SKIP:
+ return("FTS_IGNORE_SKIP");
+ case FTS_EXIST_SKIP:
+ return("FTS_EXIST_SKIP");
+ }
+ ut_ad(0);
+}
+
+const char*
+fts_ast_node_type_get(fts_ast_type_t type)
+{
+ switch (type) {
+ case FTS_AST_OPER:
+ return("FTS_AST_OPER");
+ case FTS_AST_NUMB:
+ return("FTS_AST_NUMB");
+ case FTS_AST_TERM:
+ return("FTS_AST_TERM");
+ case FTS_AST_TEXT:
+ return("FTS_AST_TEXT");
+ case FTS_AST_LIST:
+ return("FTS_AST_LIST");
+ case FTS_AST_SUBEXP_LIST:
+ return("FTS_AST_SUBEXP_LIST");
+ }
+ ut_ad(0);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/fts/fts0blex.cc b/storage/innobase/fts/fts0blex.cc
new file mode 100644
index 00000000000..7d0acb00a3b
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.cc
@@ -0,0 +1,1957 @@
+#include "univ.i"
+#line 2 "fts0blex.cc"
+
+#line 4 "fts0blex.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif /* defined (__STDC__) */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index. If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0brestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+#define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = yyg->yy_hold_char; \
+ YY_RESTORE_YY_MORE_OFFSET \
+ yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via fts0brestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+ ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+ : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0bensure_buffer_stack (yyscan_t yyscanner );
+static void fts0b_load_buffer_state (yyscan_t yyscanner );
+static void fts0b_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0b_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void *fts0brealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void fts0bfree (void * , yyscan_t yyscanner __attribute__((unused)) );
+
+#define yy_new_buffer fts0b_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ fts0bensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ fts0bensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ yyg->yytext_ptr = yy_bp; \
+ yyleng = static_cast<int>(yy_cp - yy_bp); \
+ yyg->yy_hold_char = *yy_cp; \
+ *yy_cp = '\0'; \
+ yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static yyconst flex_int16_t yy_accept[19] =
+ { 0,
+ 4, 4, 8, 4, 1, 6, 1, 7, 7, 2,
+ 3, 4, 1, 1, 0, 5, 3, 0
+ } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 1, 5, 1, 1, 6, 1, 1, 7,
+ 7, 7, 7, 1, 7, 1, 1, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 1, 1, 7,
+ 1, 7, 1, 7, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 7, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static yyconst flex_int32_t yy_meta[9] =
+ { 0,
+ 1, 2, 3, 4, 5, 5, 5, 1
+ } ;
+
+static yyconst flex_int16_t yy_base[22] =
+ { 0,
+ 0, 0, 22, 0, 7, 23, 0, 14, 23, 23,
+ 7, 0, 0, 0, 5, 23, 0, 23, 11, 12,
+ 16
+ } ;
+
+static yyconst flex_int16_t yy_def[22] =
+ { 0,
+ 18, 1, 18, 19, 19, 18, 20, 21, 18, 18,
+ 19, 19, 5, 20, 21, 18, 11, 0, 18, 18,
+ 18
+ } ;
+
+static yyconst flex_int16_t yy_nxt[32] =
+ { 0,
+ 4, 5, 6, 7, 8, 9, 10, 11, 13, 16,
+ 14, 12, 12, 14, 17, 14, 15, 15, 16, 15,
+ 15, 18, 3, 18, 18, 18, 18, 18, 18, 18,
+ 18
+ } ;
+
+static yyconst flex_int16_t yy_chk[32] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 5, 15,
+ 5, 19, 19, 20, 11, 20, 21, 21, 8, 21,
+ 21, 3, 18, 18, 18, 18, 18, 18, 18, 18,
+ 18
+ } ;
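+
+/* Editorial note: the tables above drive the scanner's DFA.  For an input
+ * byte c, the matching loop in fts0blex() below is equivalent to this
+ * sketch, where state 18 is this scanner's jam state and states >= 19 use
+ * the meta-equivalence classes:
+ *
+ *	int yy_c = yy_ec[(unsigned char) c];	(byte -> character class)
+ *	while (yy_chk[yy_base[state] + yy_c] != state) {
+ *		state = yy_def[state];		(fall back to default state)
+ *		if (state >= 19)
+ *			yy_c = yy_meta[yy_c];	(compressed class)
+ *	}
+ *	state = yy_nxt[yy_base[state] + yy_c];	(take the transition)
+ */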
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0blex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0blex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1
+#line 484 "fts0blex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+{
+
+ /* User-defined. Not touched by flex. */
+ YY_EXTRA_TYPE yyextra_r;
+
+ /* The rest are the same as the globals declared in the non-reentrant scanner. */
+ FILE *yyin_r, *yyout_r;
+ size_t yy_buffer_stack_top; /**< index of top of stack. */
+ size_t yy_buffer_stack_max; /**< capacity of stack. */
+ YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+ char yy_hold_char;
+ int yy_n_chars;
+ int yyleng_r;
+ char *yy_c_buf_p;
+ int yy_init;
+ int yy_start;
+ int yy_did_buffer_switch_on_eof;
+ int yy_start_stack_ptr;
+ int yy_start_stack_depth;
+ int *yy_start_stack;
+ yy_state_type yy_last_accepting_state;
+ char* yy_last_accepting_cpos;
+
+ int yylineno_r;
+ int yy_flex_debug_r;
+
+ char *yytext_r;
+ int yy_more_flag;
+ int yy_more_len;
+
+}; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0blex_init (yyscan_t* scanner);
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". The number of characters read, or
+ * YY_NULL, is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ int n; \
+ for ( n = 0; n < static_cast<int>(max_size) && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = static_cast<int>(fread(buf, 1, max_size, yyin))) \
+ == 0 && ferror(yyin) ) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
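+
+/* Editorial note: the default YY_INPUT above has two modes.  For interactive
+ * buffers it reads one character at a time with getc(), stopping at a
+ * newline; otherwise it fread()s up to max_size bytes, retrying any read
+ * that fails with EINTR.  This scanner is built never-interactive, so only
+ * the fread() branch is exercised here.
+ */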
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp, *yy_bp;
+ register int yy_act;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+#line 43 "fts0blex.l"
+
+
+#line 712 "fts0blex.cc"
+
+ if ( !yyg->yy_init )
+ {
+ yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! yyg->yy_start )
+ yyg->yy_start = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ fts0bensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+ }
+
+ fts0b_load_buffer_state(yyscanner );
+ }
+
+ while ( 1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = yyg->yy_c_buf_p;
+
+ /* Support of yytext. */
+ *yy_cp = yyg->yy_hold_char;
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = yyg->yy_start;
+yy_match:
+ do
+ {
+ register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 18 );
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = yyg->yy_hold_char;
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 45 "fts0blex.l"
+/* Ignore whitespace */ ;
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 47 "fts0blex.l"
+{
+ val->oper = fts0bget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 53 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_NUMB);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 59 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 65 "fts0blex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 71 "fts0blex.l"
+
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 73 "fts0blex.l"
+ECHO;
+ YY_BREAK
+#line 843 "fts0blex.cc"
+case YY_STATE_EOF(INITIAL):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = yyg->yy_hold_char;
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * fts0blex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++yyg->yy_c_buf_p;
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ yyg->yy_did_buffer_switch_on_eof = 0;
+
+ if ( fts0bwrap(yyscanner ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p =
+ yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ yyg->yy_c_buf_p =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+} /* end of fts0blex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - process the text matched so far, then refill
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ register char *source = yyg->yytext_ptr;
+ register int number_to_move, i;
+ int ret_val;
+
+ if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+ else
+ {
+ int num_to_read = static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1);
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+ int yy_c_buf_p_offset =
+ (int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = static_cast<int>(b->yy_buf_size * 2);
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+				/* Include room for 2 EOB chars. */
+ fts0brealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = 0;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1);
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ yyg->yy_n_chars, num_to_read);
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ if ( yyg->yy_n_chars == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ fts0brestart(yyin ,yyscanner);
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0brealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ }
+
+ yyg->yy_n_chars += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+ yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_current_state = yyg->yy_start;
+
+ for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+ {
+ register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+ register int yy_is_jam;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+ register char *yy_cp = yyg->yy_c_buf_p;
+
+ register YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 19 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ yy_is_jam = (yy_current_state == 18);
+
+ return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (yyscan_t yyscanner)
+#else
+ static int input (yyscan_t yyscanner)
+#endif
+
+{
+ int c;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+ if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ /* This was really a NUL. */
+ *yyg->yy_c_buf_p = '\0';
+
+ else
+ { /* need more input */
+ int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+ ++yyg->yy_c_buf_p;
+
+ switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_get_next_buffer()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ fts0brestart(yyin ,yyscanner);
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( fts0bwrap(yyscanner ) )
+ return EOF;
+
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput(yyscanner);
+#else
+ return input(yyscanner);
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+ break;
+ }
+ }
+ }
+
+	c = *(unsigned char *) yyg->yy_c_buf_p;	/* cast for 8-bit chars */
+ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */
+ yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+void fts0brestart (FILE * input_file , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! YY_CURRENT_BUFFER ){
+ fts0bensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ fts0b_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+ }
+
+ fts0b_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+ fts0b_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * fts0bpop_buffer_state();
+ * fts0bpush_buffer_state(new_buffer);
+ */
+ fts0bensure_buffer_stack (yyscanner);
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ fts0b_load_buffer_state(yyscanner );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (fts0bwrap()) processing, but the only time this flag
+ * is looked at is after fts0bwrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0b_load_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+YY_BUFFER_STATE fts0b_create_buffer (FILE * file, int size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) fts0balloc(b->yy_buf_size + 2 ,yyscanner );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0b_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ fts0b_init_buffer(b,file ,yyscanner);
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with fts0b_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+void fts0b_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ fts0bfree((void *) b->yy_ch_buf ,yyscanner );
+
+ fts0bfree((void *) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0brestart() or at EOF.
+ */
+static void fts0b_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner)
+
+{
+ int oerrno = errno;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ fts0b_flush_buffer(b ,yyscanner);
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then fts0b_init_buffer was _probably_
+ * called from fts0brestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+void fts0b_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ fts0b_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ * @param yyscanner The scanner object.
+ */
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (new_buffer == NULL)
+ return;
+
+ fts0bensure_buffer_stack(yyscanner);
+
+ /* This block is copied from fts0b_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ yyg->yy_buffer_stack_top++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from fts0b_switch_to_buffer. */
+ fts0b_load_buffer_state(yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ * @param yyscanner The scanner object.
+ */
+void fts0bpop_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if (yyg->yy_buffer_stack_top > 0)
+ --yyg->yy_buffer_stack_top;
+
+ if (YY_CURRENT_BUFFER) {
+ fts0b_load_buffer_state(yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void fts0bensure_buffer_stack (yyscan_t yyscanner)
+{
+ int num_to_alloc;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (!yyg->yy_buffer_stack) {
+
+		/* The first allocation is just for one element, since we
+		 * don't know if this scanner will even need a stack; the
+		 * stack is grown on demand by the realloc branch below.
+		 */
+ num_to_alloc = 1;
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0balloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+
+ memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ yyg->yy_buffer_stack_top = 0;
+ return;
+ }
+
+ if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ int grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = static_cast<int>(yyg->yy_buffer_stack_max + grow_size);
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0brealloc
+ (yyg->yy_buffer_stack,
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0bensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ }
+}
+
+/** Set up the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0b_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ if ( size < 2 ||
+ base[size-2] != YY_END_OF_BUFFER_CHAR ||
+ base[size-1] != YY_END_OF_BUFFER_CHAR )
+ /* They forgot to leave room for the EOB's. */
+ return 0;
+
+ b = (YY_BUFFER_STATE) fts0balloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_buffer()" );
+
+ b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */
+ b->yy_buf_pos = b->yy_ch_buf = base;
+ b->yy_is_our_buffer = 0;
+ b->yy_input_file = 0;
+ b->yy_n_chars = static_cast<int>(b->yy_buf_size);
+ b->yy_is_interactive = 0;
+ b->yy_at_bol = 1;
+ b->yy_fill_buffer = 0;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ fts0b_switch_to_buffer(b ,yyscanner );
+
+ return b;
+}
+
+/** Set up the input buffer state to scan a string. The next call to fts0blex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ * fts0b_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0b_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+ return fts0b_scan_bytes(yystr,static_cast<int>(strlen(yystr)), yyscanner);
+}
+
+/** Set up the input buffer state to scan the given bytes. The next call to fts0blex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+ char *buf;
+ yy_size_t n;
+ int i;
+
+ /* Get memory for full buffer, including space for trailing EOB's. */
+ n = _yybytes_len + 2;
+ buf = (char *) fts0balloc(n ,yyscanner );
+ if ( ! buf )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0b_scan_bytes()" );
+
+ for ( i = 0; i < _yybytes_len; ++i )
+ buf[i] = yybytes[i];
+
+ buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+ b = fts0b_scan_buffer(buf,n ,yyscanner);
+ if ( ! b )
+ YY_FATAL_ERROR( "bad buffer in fts0b_scan_bytes()" );
+
+ /* It's okay to grow etc. this buffer, and we should throw it
+ * away when we're done.
+ */
+ b->yy_is_our_buffer = 1;
+
+ return b;
+}
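+
+/* Editorial sketch (names and query text are hypothetical): scanning an
+ * in-memory string with the two entry points above.  fts0b_scan_string()
+ * copies the NUL-terminated string; fts0b_scan_bytes() is for buffers that
+ * may contain embedded NULs.
+ *
+ *	yyscan_t scanner;
+ *
+ *	fts0blex_init(&scanner);
+ *	YY_BUFFER_STATE buf = fts0b_scan_string("apple +banana", scanner);
+ *	(... call fts_blexer() until it returns 0 ...)
+ *	fts0b_delete_buffer(buf, scanner);
+ *	fts0blex_destroy(scanner);
+ */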
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)))
+{
+ (void) fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = yyg->yy_hold_char; \
+ yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+ yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+ *yyg->yy_c_buf_p = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_lineno (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_column (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0bget_in (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0bget_out (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0bget_leng (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+
+char *fts0bget_text (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number The line number to set.
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_lineno (int line_number , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* lineno is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ yy_fatal_error( "fts0bset_lineno called with no buffer" , yyscanner);
+
+ yylineno = line_number;
+}
+
+/** Set the current column.
+ * @param column_no The column number to set.
+ * @param yyscanner The scanner object.
+ */
+void fts0bset_column (int column_no , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* column is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ yy_fatal_error( "fts0bset_column called with no buffer" , yyscanner);
+
+ yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0b_switch_to_buffer
+ */
+void fts0bset_in (FILE * in_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyin = in_str ;
+}
+
+void fts0bset_out (FILE * out_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyout = out_str ;
+}
+
+int fts0bget_debug (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yy_flex_debug;
+}
+
+void fts0bset_debug (int bdebug , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0blex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0blex_init(yyscan_t* ptr_yy_globals)
+
+{
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), NULL );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* fts0blex_init_extra has the same functionality as fts0blex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0balloc in
+ * the yyextra field.
+ */
+
+int fts0blex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+
+{
+ struct yyguts_t dummy_yyguts;
+
+ fts0bset_extra (yy_user_defined, &dummy_yyguts);
+
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) fts0balloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in
+ yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ fts0bset_extra (yy_user_defined, *ptr_yy_globals);
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from fts0blex_destroy(), so don't allocate here.
+ */
+
+ yyg->yy_buffer_stack = 0;
+ yyg->yy_buffer_stack_top = 0;
+ yyg->yy_buffer_stack_max = 0;
+ yyg->yy_c_buf_p = (char *) 0;
+ yyg->yy_init = 0;
+ yyg->yy_start = 0;
+
+ yyg->yy_start_stack_ptr = 0;
+ yyg->yy_start_stack_depth = 0;
+ yyg->yy_start_stack = NULL;
+
+ /* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = (FILE *) 0;
+ yyout = (FILE *) 0;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * fts0blex_init()
+ */
+ return 0;
+}
+
+/* fts0blex_destroy is for both reentrant and non-reentrant scanners. */
+int fts0blex_destroy (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ fts0b_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ fts0bpop_buffer_state(yyscanner);
+ }
+
+ /* Destroy the stack itself. */
+ fts0bfree(yyg->yy_buffer_stack ,yyscanner);
+ yyg->yy_buffer_stack = NULL;
+
+ /* Destroy the start condition stack. */
+ fts0bfree(yyg->yy_start_stack ,yyscanner );
+ yyg->yy_start_stack = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * fts0blex() is called, initialization will occur. */
+ yy_init_globals( yyscanner);
+
+ /* Destroy the main struct (reentrant only). */
+ fts0bfree ( yyscanner , yyscanner );
+ yyscanner = NULL;
+ return 0;
+}
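+
+/* Editorial sketch of the full reentrant lifecycle, using the YY_DECL
+ * defined earlier (int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)).  The
+ * loop shape is illustrative; the real caller is the FTS query parser.
+ *
+ *	yyscan_t	scanner;
+ *	YYSTYPE		val;
+ *	int		token;
+ *
+ *	if (fts0blex_init(&scanner) == 0) {
+ *		fts0b_scan_bytes(query, query_len, scanner);
+ *		while ((token = fts_blexer(&val, scanner)) != 0) {
+ *			(dispatch on FTS_TERM / FTS_NUMB / FTS_TEXT
+ *			or a single-character operator)
+ *		}
+ *		fts0blex_destroy(scanner);
+ *	}
+ */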
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{
+ register int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{
+ register int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+void *fts0balloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+ return (void *) malloc( size );
+}
+
+void *fts0brealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return (void *) realloc( (char *) ptr, size );
+}
+
+void fts0bfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{
+ free( (char *) ptr ); /* see fts0brealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 73 "fts0blex.l"
+
diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l
new file mode 100644
index 00000000000..ae6e8ffaa48
--- /dev/null
+++ b/storage/innobase/fts/fts0blex.l
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0blex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+%%
+
+[\t ]+ /* Ignore whitespace */ ;
+
+[*()+\-<>~@] {
+ val->oper = fts0bget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+
+[0-9]+ {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_NUMB);
+}
+
+[^" \n*()+\-<>~@%]* {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+
+\"[^\"\n]*\" {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+
+\n
+
+%%
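+
+/*
+ * Editorial example (not part of the grammar): given the boolean query
+ *
+ *	+apple -"red fruit" 42
+ *
+ * the rules above emit, in order: the operator '+', FTS_TERM for "apple",
+ * the operator '-', FTS_TEXT for the quoted phrase (quotes included in the
+ * token), and FTS_NUMB for "42".  Whitespace is discarded by the first rule.
+ */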
diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc
new file mode 100644
index 00000000000..5b4ae5c39f7
--- /dev/null
+++ b/storage/innobase/fts/fts0config.cc
@@ -0,0 +1,564 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0config.cc
+Full Text Search configuration table.
+
+Created 2007/5/9 Sunny Bains
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0sel.h"
+
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/******************************************************************//**
+Callback function for fetching the config value.
+@return always returns TRUE */
+static
+ibool
+fts_config_fetch_value(
+/*===================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to
+ ib_vector_t */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_string_t* value = static_cast<fts_string_t*>(user_arg);
+
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+ if (len != UNIV_SQL_NULL) {
+ ulint max_len = ut_min(value->f_len - 1, len);
+
+ memcpy(value->f_str, data, max_len);
+ value->f_len = max_len;
+ value->f_str[value->f_len] = '\0';
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Get value from the config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+{
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ ulint name_len = strlen(name);
+
+ info = pars_info_create();
+
+ *value->f_str = '\0';
+ ut_a(value->f_len > 0);
+
+ pars_info_bind_function(info, "my_func", fts_config_fetch_value,
+ value);
+
+ /* The len field of value must be set to the max bytes that
+ it can hold. On a successful read, the len field will be set
+ to the actual number of bytes copied to value. */
+ pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+ fts_table->suffix = "CONFIG";
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM \"%s\""
+ " WHERE key = :name;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ trx->op_info = "getting FTS config value";
+
+ error = fts_eval_sql(trx, graph);
+
+ mutex_enter(&dict_sys->mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys->mutex);
+
+ return(error);
+}
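+
+/* Editorial sketch of a typical call (the parameter name is illustrative;
+ * the allocation pattern mirrors the helpers later in this file):
+ *
+ *	fts_string_t	value;
+ *
+ *	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ *	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+ *
+ *	dberr_t err = fts_config_get_value(
+ *		trx, &fts_table, "synced_doc_id", &value);
+ *
+ *	ut_free(value.f_str);
+ */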
+
+/*********************************************************************//**
+Create the config table name for retrieving index specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+{
+ ulint len;
+ char* name;
+
+ /* The format of the config name is: name_<index_id>. */
+ len = strlen(param);
+
+ /* Caller is responsible for deleting name. */
+ name = static_cast<char*>(ut_malloc(
+ len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2));
+ strcpy(name, param);
+ name[len] = '_';
+
+ fts_write_object_id(index->id, name + len + 1,
+ DICT_TF2_FLAG_IS_SET(index->table,
+ DICT_TF2_FTS_AUX_HEX_NAME));
+
+ return(name);
+}
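+
+/* Editorial note: for a hypothetical param "optimize_limit" and an index
+ * with id 20, the function above returns "optimize_limit_20"; the id is
+ * rendered in hex when DICT_TF2_FTS_AUX_HEX_NAME is set on the table. */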
+
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+{
+ char* name;
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+ index->table);
+
+	/* We are responsible for freeing name. */
+ name = fts_config_create_index_param_name(param, index);
+
+ error = fts_config_get_value(trx, &fts_table, name, value);
+
+ ut_free(name);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+{
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ undo_no_t undo_no;
+ undo_no_t n_rows_updated;
+ ulint name_len = strlen(name);
+
+ info = pars_info_create();
+
+ pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+ pars_info_bind_varchar_literal(info, "value",
+ value->f_str, value->f_len);
+
+ fts_table->suffix = "CONFIG";
+
+ graph = fts_parse_sql(
+ fts_table, info,
+ "BEGIN UPDATE \"%s\" SET value = :value WHERE key = :name;");
+
+ trx->op_info = "setting FTS config value";
+
+ undo_no = trx->undo_no;
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+ n_rows_updated = trx->undo_no - undo_no;
+
+ /* Check if we need to do an insert. */
+ if (n_rows_updated == 0) {
+ info = pars_info_create();
+
+ pars_info_bind_varchar_literal(
+ info, "name", (byte*) name, name_len);
+
+ pars_info_bind_varchar_literal(
+ info, "value", value->f_str, value->f_len);
+
+ graph = fts_parse_sql(
+ fts_table, info,
+ "BEGIN\n"
+ "INSERT INTO \"%s\" VALUES(:name, :value);");
+
+ trx->op_info = "inserting FTS config value";
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(fts_table, NULL, graph);
+ }
+
+ return(error);
+}
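+
+/* Editorial note on the function above: it is an upsert.  The UPDATE runs
+ * first, and the number of rows it touched is inferred from how far
+ * trx->undo_no advanced (each row modification generates one undo record);
+ * the INSERT is issued only when no row was updated. */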
+
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+	fts_string_t*	value)		/*!< in: value to write to
+					the config table */
+{
+ char* name;
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+ index->table);
+
+	/* We are responsible for freeing name. */
+ name = fts_config_create_index_param_name(param, index);
+
+ error = fts_config_set_value(trx, &fts_table, name, value);
+
+ ut_free(name);
+
+ return(error);
+}
+
+/******************************************************************//**
+Get a ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ error = fts_config_get_index_value(trx, index, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n",
+ ut_strerr(error), name);
+ } else {
+ *int_value = strtoul((char*) value.f_str, NULL, 10);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set a ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ // FIXME: Get rid of snprintf
+ ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+ value.f_len = ut_snprintf(
+ (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+ error = fts_config_set_index_value(trx, index, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n",
+ ut_strerr(error), name);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+
+/******************************************************************//**
+Get a ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value.*/
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ error = fts_config_get_value(trx, fts_table, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: Error: (%s) reading `%s'\n",
+ ut_strerr(error), name);
+ } else {
+ *int_value = strtoul((char*) value.f_str, NULL, 10);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ // FIXME: Get rid of snprintf
+ ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+ value.f_len = ut_snprintf(
+ (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+ error = fts_config_set_value(trx, fts_table, name, &value);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: Error: (%s) writing `%s'\n",
+ ut_strerr(error), name);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
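+
+/* Usage sketch (illustrative only): a get/set round trip for a numeric
+parameter such as FTS_SYNCED_DOC_ID would look like
+
+ ulint val;
+
+ error = fts_config_set_ulint(trx, &fts_table, FTS_SYNCED_DOC_ID, 1);
+ error = fts_config_get_ulint(trx, &fts_table, FTS_SYNCED_DOC_ID, &val);
+
+The value travels through the CONFIG table as a decimal string, which is
+why the getters parse it back with strtoul(). */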
+
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed
+ FTS table */
+ const char* name, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this
+ much */
+{
+ dberr_t error;
+ fts_string_t value;
+ que_t* graph = NULL;
+ ulint name_len = strlen(name);
+ pars_info_t* info = pars_info_create();
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ *value.f_str = '\0';
+
+ pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len);
+
+ pars_info_bind_function(
+ info, "my_func", fts_config_fetch_value, &value);
+
+ fts_table->suffix = "CONFIG";
+
+ graph = fts_parse_sql(
+ fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM \"%s\""
+ " WHERE key = :name FOR UPDATE;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ trx->op_info = "read FTS config value";
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(fts_table, NULL, graph);
+
+ if (UNIV_UNLIKELY(error == DB_SUCCESS)) {
+ ulint int_value;
+
+ int_value = strtoul((char*) value.f_str, NULL, 10);
+
+ int_value += delta;
+
+ ut_a(FTS_MAX_CONFIG_VALUE_LEN > FTS_MAX_INT_LEN);
+
+ // FIXME: Get rid of snprintf
+ value.f_len = ut_snprintf(
+ (char*) value.f_str, FTS_MAX_INT_LEN, "%lu", int_value);
+
+ fts_config_set_value(trx, fts_table, name, &value);
+ }
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while incrementing %s.\n", ut_strerr(error), name);
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
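+
+/* Usage sketch (illustrative only): bumping the deleted-document counter
+kept under FTS_TOTAL_DELETED_COUNT by one would read
+
+ error = fts_config_increment_value(
+  trx, &fts_table, FTS_TOTAL_DELETED_COUNT, 1);
+
+The current value is fetched with the SELECT ... FOR UPDATE above,
+incremented by delta, and written back via fts_config_set_value(). */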
+
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_index_value(
+/*=============================*/
+ trx_t* trx, /*!< transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* param, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this
+ much */
+{
+ char* name;
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE,
+ index->table);
+
+ /* We are responsible for freeing name. */
+ name = fts_config_create_index_param_name(param, index);
+
+ error = fts_config_increment_value(trx, &fts_table, name, delta);
+
+ ut_free(name);
+
+ return(error);
+}
+
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
new file mode 100644
index 00000000000..ef940ca78aa
--- /dev/null
+++ b/storage/innobase/fts/fts0fts.cc
@@ -0,0 +1,7550 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+#include <vector>
+
+#include "ha_prototypes.h"
+
+#define FTS_MAX_ID_LEN 32
+
+/** Column name from the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB "cache_size_in_mb"
+
+/** Verify whether an aux table name is an obsolete table
+by looking up the keyword in the obsolete table names */
+#define FTS_IS_OBSOLETE_AUX_TABLE(table_name) \
+ (strstr((table_name), "DOC_ID") != NULL \
+ || strstr((table_name), "ADDED") != NULL \
+ || strstr((table_name), "STOPWORDS") != NULL)
+
+/** This is the maximum FTS cache size for each table, and it is
+a configurable variable */
+UNIV_INTERN ulong fts_max_cache_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+UNIV_INTERN bool fts_need_sync = false;
+
+/** Variable specifying the total memory allocated for FTS cache */
+UNIV_INTERN ulong fts_max_total_cache_size;
+
+/** This is the FTS result cache limit for each query, and it is
+a configurable variable */
+UNIV_INTERN ulong fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS token size */
+UNIV_INTERN ulong fts_max_token_size;
+
+/** Variable specifying the minimum FTS token size */
+UNIV_INTERN ulong fts_min_token_size;
+
+
+// FIXME: testing
+ib_time_t elapsed_time = 0;
+ulint n_nodes = 0;
+
+/** Error condition reported by fts_utf8_decode() */
+const ulint UTF8_ERROR = 0xFFFFFFFF;
+
+/** The cache size permissible lower limit (1K) */
+static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1;
+
+/** The cache size permissible upper limit (1G) */
+static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024;
+
+/** Time to sleep after DEADLOCK error before retrying operation. */
+static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000;
+
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t fts_cache_rw_lock_key;
+UNIV_INTERN mysql_pfs_key_t fts_cache_init_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t fts_delete_mutex_key;
+UNIV_INTERN mysql_pfs_key_t fts_optimize_mutex_key;
+UNIV_INTERN mysql_pfs_key_t fts_bg_threads_mutex_key;
+UNIV_INTERN mysql_pfs_key_t fts_doc_id_mutex_key;
+UNIV_INTERN mysql_pfs_key_t fts_pll_tokenize_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** Variable to record innodb_fts_internal_tbl_name for the information
+schema table INNODB_FTS_INSERTED etc. */
+UNIV_INTERN char* fts_internal_tbl_name = NULL;
+
+/** InnoDB default stopword list:
+There are different versions of stopwords; the stop words listed
+below come from the "Google Stopword" list. Reference:
+http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list.
+The final version of the InnoDB default stopword list is still
+pending a decision */
+const char *fts_default_stopword[] =
+{
+ "a",
+ "about",
+ "an",
+ "are",
+ "as",
+ "at",
+ "be",
+ "by",
+ "com",
+ "de",
+ "en",
+ "for",
+ "from",
+ "how",
+ "i",
+ "in",
+ "is",
+ "it",
+ "la",
+ "of",
+ "on",
+ "or",
+ "that",
+ "the",
+ "this",
+ "to",
+ "was",
+ "what",
+ "when",
+ "where",
+ "who",
+ "will",
+ "with",
+ "und",
+ "the",
+ "www",
+ NULL
+};
+
+/** For storing table info when checking for orphaned tables. */
+struct fts_aux_table_t {
+ table_id_t id; /*!< Table id */
+ table_id_t parent_id; /*!< Parent table id */
+ table_id_t index_id; /*!< Table FT index id */
+ char* name; /*!< Name of the table */
+};
+
+/** SQL statements for creating the ancillary common FTS tables. */
+static const char* fts_create_common_tables_sql = {
+ "BEGIN\n"
+ ""
+ "CREATE TABLE \"%s_DELETED\" (\n"
+ " doc_id BIGINT UNSIGNED\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DELETED\"(doc_id);\n"
+ ""
+ "CREATE TABLE \"%s_DELETED_CACHE\" (\n"
+ " doc_id BIGINT UNSIGNED\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND "
+ "ON \"%s_DELETED_CACHE\"(doc_id);\n"
+ ""
+ "CREATE TABLE \"%s_BEING_DELETED\" (\n"
+ " doc_id BIGINT UNSIGNED\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND "
+ "ON \"%s_BEING_DELETED\"(doc_id);\n"
+ ""
+ "CREATE TABLE \"%s_BEING_DELETED_CACHE\" (\n"
+ " doc_id BIGINT UNSIGNED\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND "
+ "ON \"%s_BEING_DELETED_CACHE\"(doc_id);\n"
+ ""
+ "CREATE TABLE \"%s_CONFIG\" (\n"
+ " key CHAR(50),\n"
+ " value CHAR(50) NOT NULL\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_CONFIG\"(key);\n"
+};
+
+#ifdef FTS_DOC_STATS_DEBUG
+/** Template for creating the FTS auxiliary index specific tables. This is
+mainly designed for future statistics work */
+static const char* fts_create_index_tables_sql = {
+ "BEGIN\n"
+ ""
+ "CREATE TABLE \"%s_DOC_ID\" (\n"
+ " doc_id BIGINT UNSIGNED,\n"
+ " word_count INTEGER UNSIGNED NOT NULL\n"
+ ") COMPACT;\n"
+ "CREATE UNIQUE CLUSTERED INDEX IND ON \"%s_DOC_ID\"(doc_id);\n"
+};
+#endif
+
+/** Template for creating the word index on the ancillary FTS INDEX tables. */
+static const char* fts_create_index_sql = {
+ "BEGIN\n"
+ ""
+ "CREATE UNIQUE CLUSTERED INDEX FTS_INDEX_TABLE_IND "
+ "ON \"%s\"(word, first_doc_id);\n"
+};
+
+/** FTS auxiliary table suffixes that are common to all FT indexes. */
+static const char* fts_common_tables[] = {
+ "BEING_DELETED",
+ "BEING_DELETED_CACHE",
+ "CONFIG",
+ "DELETED",
+ "DELETED_CACHE",
+ NULL
+};
+
+/** FTS auxiliary INDEX split intervals. */
+const fts_index_selector_t fts_index_selector[] = {
+ { 9, "INDEX_1" },
+ { 65, "INDEX_2" },
+ { 70, "INDEX_3" },
+ { 75, "INDEX_4" },
+ { 80, "INDEX_5" },
+ { 85, "INDEX_6" },
+ { 0 , NULL }
+};
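+
+/* These intervals drive the horizontal split of the auxiliary index
+table: fts_get_suffix(i) maps selector slot i to its INDEX_* table, and
+the six tables together partition the word space. The routing of a word
+to a slot by its leading character is assumed to live in the selector
+lookup code (fts_select_index()) and is not shown in this file. */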
+
+/** Default config values for FTS indexes on a table. */
+static const char* fts_config_table_insert_values_sql =
+ "BEGIN\n"
+ "\n"
+ "INSERT INTO \"%s\" VALUES('"
+ FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n"
+ ""
+ "INSERT INTO \"%s\" VALUES('"
+ FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n"
+ ""
+ "INSERT INTO \"%s\" VALUES ('"
+ FTS_SYNCED_DOC_ID "', '0');\n"
+ ""
+ "INSERT INTO \"%s\" VALUES ('"
+ FTS_TOTAL_DELETED_COUNT "', '0');\n"
+ "" /* Note: 0 == FTS_TABLE_STATE_RUNNING */
+ "INSERT INTO \"%s\" VALUES ('"
+ FTS_TABLE_STATE "', '0');\n";
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync(
+/*=====*/
+ fts_sync_t* sync) /*!< in: sync state */
+ __attribute__((nonnull));
+
+/****************************************************************//**
+Release all resources held by the words rb tree, e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+ ib_rbt_t* words) /*!< in: rb tree of words */
+ __attribute__((nonnull));
+#ifdef FTS_CACHE_SIZE_DEBUG
+/****************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+ fts_sync_t* sync); /*!< in: sync state */
+#endif
+
+/*********************************************************************//**
+This function fetches the document that was just inserted, right before
+we commit the transaction; it tokenizes the inserted text data
+and inserts it into the FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* fts_indexes __attribute__((unused)));
+ /*!< in: affected fts indexes */
+#ifdef FTS_DOC_STATS_DEBUG
+/****************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went fine */
+static
+dberr_t
+fts_is_word_in_index(
+/*=================*/
+ trx_t* trx, /*!< in: FTS query state */
+ que_t** graph, /*!< out: Query graph */
+ fts_table_t* fts_table, /*!< in: table instance */
+ const fts_string_t* word, /*!< in: the word to check */
+ ibool* found) /*!< out: TRUE if exists */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/******************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ const char* table_name, /*!< in: table name, or NULL */
+ doc_id_t doc_id, /*!< in: last document id */
+ trx_t* trx) /*!< in: update trx, or NULL */
+ __attribute__((nonnull(1)));
+/********************************************************************
+Check if we should stop. */
+UNIV_INLINE
+ibool
+fts_is_stop_signalled(
+/*==================*/
+ fts_t* fts) /*!< in: fts instance */
+{
+ ibool stop_signalled = FALSE;
+
+ mutex_enter(&fts->bg_threads_mutex);
+
+ if (fts->fts_status & BG_THREAD_STOP) {
+
+ stop_signalled = TRUE;
+ }
+
+ mutex_exit(&fts->bg_threads_mutex);
+
+ return(stop_signalled);
+}
+
+/****************************************************************//**
+This function loads the default InnoDB stopword list */
+static
+void
+fts_load_default_stopword(
+/*======================*/
+ fts_stopword_t* stopword_info) /*!< in: stopword info */
+{
+ fts_string_t str;
+ mem_heap_t* heap;
+ ib_alloc_t* allocator;
+ ib_rbt_t* stop_words;
+
+ allocator = stopword_info->heap;
+ heap = static_cast<mem_heap_t*>(allocator->arg);
+
+ if (!stopword_info->cached_stopword) {
+ /* For default stopword, we always use fts_utf8_string_cmp() */
+ stopword_info->cached_stopword = rbt_create(
+ sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp);
+ }
+
+ stop_words = stopword_info->cached_stopword;
+
+ str.f_n_char = 0;
+
+ for (ulint i = 0; fts_default_stopword[i]; ++i) {
+ char* word;
+ fts_tokenizer_word_t new_word;
+
+ /* We are going to duplicate the value below. */
+ word = const_cast<char*>(fts_default_stopword[i]);
+
+ new_word.nodes = ib_vector_create(
+ allocator, sizeof(fts_node_t), 4);
+
+ str.f_len = ut_strlen(word);
+ str.f_str = reinterpret_cast<byte*>(word);
+
+ fts_utf8_string_dup(&new_word.text, &str, heap);
+
+ rbt_insert(stop_words, &new_word, &new_word);
+ }
+
+ stopword_info->status = STOPWORD_FROM_DEFAULT;
+}
+
+/****************************************************************//**
+Callback function to read a single stopword value.
+@return always TRUE */
+static
+ibool
+fts_read_stopword(
+/*==============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ ib_alloc_t* allocator;
+ fts_stopword_t* stopword_info;
+ sel_node_t* sel_node;
+ que_node_t* exp;
+ ib_rbt_t* stop_words;
+ dfield_t* dfield;
+ fts_string_t str;
+ mem_heap_t* heap;
+ ib_rbt_bound_t parent;
+
+ sel_node = static_cast<sel_node_t*>(row);
+ stopword_info = static_cast<fts_stopword_t*>(user_arg);
+
+ stop_words = stopword_info->cached_stopword;
+ allocator = static_cast<ib_alloc_t*>(stopword_info->heap);
+ heap = static_cast<mem_heap_t*>(allocator->arg);
+
+ exp = sel_node->select_list;
+
+ /* We only need to read the first column */
+ dfield = que_node_get_val(exp);
+
+ str.f_n_char = 0;
+ str.f_str = static_cast<byte*>(dfield_get_data(dfield));
+ str.f_len = dfield_get_len(dfield);
+
+ /* Only create a new node if the value does not already exist */
+ if (str.f_len != UNIV_SQL_NULL
+ && rbt_search(stop_words, &parent, &str) != 0) {
+
+ fts_tokenizer_word_t new_word;
+
+ new_word.nodes = ib_vector_create(
+ allocator, sizeof(fts_node_t), 4);
+
+ new_word.text.f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, str.f_len + 1));
+
+ memcpy(new_word.text.f_str, str.f_str, str.f_len);
+
+ new_word.text.f_n_char = 0;
+ new_word.text.f_len = str.f_len;
+ new_word.text.f_str[str.f_len] = 0;
+
+ rbt_insert(stop_words, &new_word, &new_word);
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Load user-defined stopwords from the designated user table
+@return TRUE if load operation is successful */
+static
+ibool
+fts_load_user_stopword(
+/*===================*/
+ fts_t* fts, /*!< in: FTS struct */
+ const char* stopword_table_name, /*!< in: Stopword table
+ name */
+ fts_stopword_t* stopword_info) /*!< in: Stopword info */
+{
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error = DB_SUCCESS;
+ ibool ret = TRUE;
+ trx_t* trx;
+ ibool has_lock = fts->fts_status & TABLE_DICT_LOCKED;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "Load user stopword table into FTS cache";
+
+ if (!has_lock) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ /* Validate that the user table exists and is in the
+ right format */
+ stopword_info->charset = fts_valid_stopword_table(stopword_table_name);
+ if (!stopword_info->charset) {
+ ret = FALSE;
+ goto cleanup;
+ } else if (!stopword_info->cached_stopword) {
+ /* Create the stopword RB tree with the stopword column
+ charset. All comparison will use this charset */
+ stopword_info->cached_stopword = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ stopword_info->charset);
+
+ }
+
+ info = pars_info_create();
+
+ pars_info_bind_id(info, TRUE, "table_stopword", stopword_table_name);
+
+ pars_info_bind_function(info, "my_func", fts_read_stopword,
+ stopword_info);
+
+ graph = fts_parse_sql_no_dict_lock(
+ NULL,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT value "
+ " FROM $table_stopword;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ stopword_info->status = STOPWORD_USER_TABLE;
+ break;
+ } else {
+
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading user stopword table. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error '%s' "
+ "while reading user stopword table.\n",
+ ut_strerr(error));
+ ret = FALSE;
+ break;
+ }
+ }
+ }
+
+ que_graph_free(graph);
+
+cleanup:
+ if (!has_lock) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ trx_free_for_background(trx);
+ return(ret);
+}
+
+/******************************************************************//**
+Initialize the index cache. */
+static
+void
+fts_index_cache_init(
+/*=================*/
+ ib_alloc_t* allocator, /*!< in: the allocator to use */
+ fts_index_cache_t* index_cache) /*!< in: index cache */
+{
+ ulint i;
+
+ ut_a(index_cache->words == NULL);
+
+ index_cache->words = rbt_create_arg_cmp(
+ sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+ index_cache->charset);
+
+ ut_a(index_cache->doc_stats == NULL);
+
+ index_cache->doc_stats = ib_vector_create(
+ allocator, sizeof(fts_doc_stats_t), 4);
+
+ for (i = 0; fts_index_selector[i].value; ++i) {
+ ut_a(index_cache->ins_graph[i] == NULL);
+ ut_a(index_cache->sel_graph[i] == NULL);
+ }
+}
+
+/*********************************************************************//**
+Initialize FTS cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+ fts_cache_t* cache) /*!< in: cache to initialize */
+{
+ ulint i;
+
+ /* Just to make sure */
+ ut_a(cache->sync_heap->arg == NULL);
+
+ cache->sync_heap->arg = mem_heap_create(1024);
+
+ cache->total_size = 0;
+
+ mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ cache->deleted_doc_ids = ib_vector_create(
+ cache->sync_heap, sizeof(fts_update_t), 4);
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+
+ /* Reset the cache data for all the FTS indexes. */
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ fts_index_cache_init(cache->sync_heap, index_cache);
+ }
+}
+
+/****************************************************************//**
+Create an FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+ dict_table_t* table) /*!< in: table that owns the FTS cache */
+{
+ mem_heap_t* heap;
+ fts_cache_t* cache;
+
+ heap = static_cast<mem_heap_t*>(mem_heap_create(512));
+
+ cache = static_cast<fts_cache_t*>(
+ mem_heap_zalloc(heap, sizeof(*cache)));
+
+ cache->cache_heap = heap;
+
+ rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE);
+
+ rw_lock_create(
+ fts_cache_init_rw_lock_key, &cache->init_lock,
+ SYNC_FTS_CACHE_INIT);
+
+ mutex_create(
+ fts_delete_mutex_key, &cache->deleted_lock, SYNC_FTS_OPTIMIZE);
+
+ mutex_create(
+ fts_optimize_mutex_key, &cache->optimize_lock,
+ SYNC_FTS_OPTIMIZE);
+
+ mutex_create(
+ fts_doc_id_mutex_key, &cache->doc_id_lock, SYNC_FTS_OPTIMIZE);
+
+ /* This is the heap used to create the cache itself. */
+ cache->self_heap = ib_heap_allocator_create(heap);
+
+ /* This is a transient heap, used for storing sync data. */
+ cache->sync_heap = ib_heap_allocator_create(heap);
+ cache->sync_heap->arg = NULL;
+
+ fts_need_sync = false;
+
+ cache->sync = static_cast<fts_sync_t*>(
+ mem_heap_zalloc(heap, sizeof(fts_sync_t)));
+
+ cache->sync->table = table;
+
+ /* Create the index cache vector that will hold the inverted indexes. */
+ cache->indexes = ib_vector_create(
+ cache->self_heap, sizeof(fts_index_cache_t), 2);
+
+ fts_cache_init(cache);
+
+ cache->stopword_info.cached_stopword = NULL;
+ cache->stopword_info.charset = NULL;
+
+ cache->stopword_info.heap = cache->self_heap;
+
+ cache->stopword_info.status = STOPWORD_NOT_INIT;
+
+ return(cache);
+}
+
+/*******************************************************************//**
+Add a newly created index into the FTS cache */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+ dict_index_t* index, /*!< in: FTS index to be added */
+ dict_table_t* table) /*!< in: table */
+{
+ fts_t* fts = table->fts;
+ fts_cache_t* cache;
+ fts_index_cache_t* index_cache;
+
+ ut_ad(fts);
+ cache = table->fts->cache;
+
+ rw_lock_x_lock(&cache->init_lock);
+
+ ib_vector_push(fts->indexes, &index);
+
+ index_cache = fts_find_index_cache(cache, index);
+
+ if (!index_cache) {
+ /* Add new index cache structure */
+ index_cache = fts_cache_index_cache_create(table, index);
+ }
+
+ rw_lock_x_unlock(&cache->init_lock);
+}
+
+/*******************************************************************//**
+Recalibrate the get_doc structures after the index_cache entries in
+cache->indexes have changed */
+static
+void
+fts_reset_get_doc(
+/*==============*/
+ fts_cache_t* cache) /*!< in: FTS index cache */
+{
+ fts_get_doc_t* get_doc;
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+ ib_vector_reset(cache->get_docs);
+
+ for (i = 0; i < ib_vector_size(cache->indexes); i++) {
+ fts_index_cache_t* ind_cache;
+
+ ind_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_push(cache->get_docs, NULL));
+
+ memset(get_doc, 0x0, sizeof(*get_doc));
+
+ get_doc->index_cache = ind_cache;
+ }
+
+ ut_ad(ib_vector_size(cache->get_docs)
+ == ib_vector_size(cache->indexes));
+}
+
+/*******************************************************************//**
+Check whether an index is in the table->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_dict_index(
+/*==============*/
+ dict_table_t* table, /*!< in: Table */
+ dict_index_t* index_check) /*!< in: index to be checked */
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index == index_check) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Check whether an index is in the fts->cache->indexes list
+@return TRUE if it exists */
+static
+ibool
+fts_in_index_cache(
+/*===============*/
+ dict_table_t* table, /*!< in: Table */
+ dict_index_t* index) /*!< in: index to be checked */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(table->fts->cache->indexes, i));
+
+ if (index_cache->index == index) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and in the table->indexes list
+@return TRUE if all indexes match */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+ dict_table_t* table) /*!< in: Table where indexes are dropped */
+{
+ ulint i;
+
+ if (!table->fts || !table->fts->cache) {
+ return(TRUE);
+ }
+
+ ut_a(ib_vector_size(table->fts->indexes)
+ == ib_vector_size(table->fts->cache->indexes));
+
+ for (i = 0; i < ib_vector_size(table->fts->indexes); i++) {
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(table->fts->indexes, i));
+
+ if (!fts_in_index_cache(table, index)) {
+ return(FALSE);
+ }
+
+ if (!fts_in_dict_index(table, index)) {
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fts_drop_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table where indexes are dropped */
+ dict_index_t* index, /*!< in: Index to be dropped */
+ trx_t* trx) /*!< in: Transaction for the drop */
+{
+ ib_vector_t* indexes = table->fts->indexes;
+ dberr_t err = DB_SUCCESS;
+
+ ut_a(indexes);
+
+ if ((ib_vector_size(indexes) == 1
+ && (index == static_cast<dict_index_t*>(
+ ib_vector_getp(table->fts->indexes, 0))))
+ || ib_vector_is_empty(indexes)) {
+ doc_id_t current_doc_id;
+ doc_id_t first_doc_id;
+
+ /* If we are dropping the only FTS index of the table,
+ remove it from optimize thread */
+ fts_optimize_remove_table(table);
+
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+
+ /* If Doc ID column is not added internally by FTS index,
+ we can drop all FTS auxiliary tables. Otherwise, we will
+ need to keep some common table such as CONFIG table, so
+ as to keep track of incrementing Doc IDs */
+ if (!DICT_TF2_FLAG_IS_SET(
+ table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ err = fts_drop_tables(trx, table);
+
+ err = fts_drop_index_tables(trx, index);
+
+ fts_free(table);
+
+ return(err);
+ }
+
+ current_doc_id = table->fts->cache->next_doc_id;
+ first_doc_id = table->fts->cache->first_doc_id;
+ fts_cache_clear(table->fts->cache);
+ fts_cache_destroy(table->fts->cache);
+ table->fts->cache = fts_cache_create(table);
+ table->fts->cache->next_doc_id = current_doc_id;
+ table->fts->cache->first_doc_id = first_doc_id;
+ } else {
+ fts_cache_t* cache = table->fts->cache;
+ fts_index_cache_t* index_cache;
+
+ rw_lock_x_lock(&cache->init_lock);
+
+ index_cache = fts_find_index_cache(cache, index);
+
+ if (index_cache != NULL) {
+ if (index_cache->words) {
+ fts_words_free(index_cache->words);
+ rbt_free(index_cache->words);
+ }
+
+ ib_vector_remove(cache->indexes, *(void**) index_cache);
+ }
+
+ if (cache->get_docs) {
+ fts_reset_get_doc(cache);
+ }
+
+ rw_lock_x_unlock(&cache->init_lock);
+ }
+
+ err = fts_drop_index_tables(trx, index);
+
+ ib_vector_remove(indexes, (const void*) index);
+
+ return(err);
+}
+
+/****************************************************************//**
+Free the query graph but check whether dict_sys->mutex is already
+held */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+ fts_table_t* fts_table, /*!< in: FTS table */
+ const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
+ que_t* graph) /*!< in: query graph */
+{
+ ibool has_dict = FALSE;
+
+ if (fts_table && fts_table->table) {
+ ut_ad(fts_table->table->fts);
+
+ has_dict = fts_table->table->fts->fts_status
+ & TABLE_DICT_LOCKED;
+ } else if (index_cache) {
+ ut_ad(index_cache->index->table->fts);
+
+ has_dict = index_cache->index->table->fts->fts_status
+ & TABLE_DICT_LOCKED;
+ }
+
+ if (!has_dict) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ que_graph_free(graph);
+
+ if (!has_dict) {
+ mutex_exit(&dict_sys->mutex);
+ }
+}
+
+/****************************************************************//**
+Get the character set of an FTS index. */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+ dict_index_t* index) /*!< in: FTS index */
+{
+ CHARSET_INFO* charset = NULL;
+ dict_field_t* field;
+ ulint prtype;
+
+ field = dict_index_get_nth_field(index, 0);
+ prtype = field->col->prtype;
+
+ charset = innobase_get_fts_charset(
+ (int) (prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype));
+
+#ifdef FTS_DEBUG
+ /* Set up charset info for this index. Note that all
+ fields of the FTS index should have the same charset. */
+ for (ulint i = 1; i < index->n_fields; i++) {
+ CHARSET_INFO* fld_charset;
+
+ field = dict_index_get_nth_field(index, i);
+ prtype = field->col->prtype;
+
+ fld_charset = innobase_get_fts_charset(
+ (int)(prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype));
+
+ /* All FTS columns should have the same charset */
+ if (charset) {
+ ut_a(charset == fld_charset);
+ } else {
+ charset = fld_charset;
+ }
+ }
+#endif
+
+ return(charset);
+}
+
+/****************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+ dict_table_t* table, /*!< in: table with FTS index */
+ dict_index_t* index) /*!< in: FTS index */
+{
+ ulint n_bytes;
+ fts_index_cache_t* index_cache;
+ fts_cache_t* cache = table->fts->cache;
+
+ ut_a(cache != NULL);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+
+ /* Must not already exist in the cache vector. */
+ ut_a(fts_find_index_cache(cache, index) == NULL);
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_push(cache->indexes, NULL));
+
+ memset(index_cache, 0x0, sizeof(*index_cache));
+
+ index_cache->index = index;
+
+ index_cache->charset = fts_index_get_charset(index);
+
+ n_bytes = sizeof(que_t*) * sizeof(fts_index_selector);
+
+ index_cache->ins_graph = static_cast<que_t**>(
+ mem_heap_zalloc(static_cast<mem_heap_t*>(
+ cache->self_heap->arg), n_bytes));
+
+ index_cache->sel_graph = static_cast<que_t**>(
+ mem_heap_zalloc(static_cast<mem_heap_t*>(
+ cache->self_heap->arg), n_bytes));
+
+ fts_index_cache_init(cache->sync_heap, index_cache);
+
+ if (cache->get_docs) {
+ fts_reset_get_doc(cache);
+ }
+
+ return(index_cache);
+}
+
+/****************************************************************//**
+Release all resources held by the words rb tree, e.g., the node ilist. */
+static
+void
+fts_words_free(
+/*===========*/
+ ib_rbt_t* words) /*!< in: rb tree of words */
+{
+ const ib_rbt_node_t* rbt_node;
+
+ /* Free the resources held by a word. */
+ for (rbt_node = rbt_first(words);
+ rbt_node != NULL;
+ rbt_node = rbt_first(words)) {
+
+ ulint i;
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ /* Free the ilists of this word. */
+ for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+ fts_node_t* fts_node = static_cast<fts_node_t*>(
+ ib_vector_get(word->nodes, i));
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ /* NOTE: We are responsible for freeing the node */
+ ut_free(rbt_remove_node(words, rbt_node));
+ }
+}
+
+/*********************************************************************//**
+Clear cache. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+ fts_cache_t* cache) /*!< in: cache */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ ulint j;
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ fts_words_free(index_cache->words);
+
+ rbt_free(index_cache->words);
+
+ index_cache->words = NULL;
+
+ for (j = 0; fts_index_selector[j].value; ++j) {
+
+ if (index_cache->ins_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->ins_graph[j]);
+
+ index_cache->ins_graph[j] = NULL;
+ }
+
+ if (index_cache->sel_graph[j] != NULL) {
+
+ fts_que_graph_free_check_lock(
+ NULL, index_cache,
+ index_cache->sel_graph[j]);
+
+ index_cache->sel_graph[j] = NULL;
+ }
+ }
+
+ index_cache->doc_stats = NULL;
+ }
+
+ mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+ cache->sync_heap->arg = NULL;
+
+ cache->total_size = 0;
+
+ mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ cache->deleted_doc_ids = NULL;
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index cache else NULL */
+UNIV_INLINE
+fts_index_cache_t*
+fts_get_index_cache(
+/*================*/
+ fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX)
+ || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ if (index_cache->index == index) {
+
+ return(index_cache);
+ }
+ }
+
+ return(NULL);
+}
+
+#ifdef FTS_DEBUG
+/*********************************************************************//**
+Search the index cache for a get_doc structure.
+@return the fts_get_doc_t item else NULL */
+static
+fts_get_doc_t*
+fts_get_index_get_doc(
+/*==================*/
+ fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_EX));
+#endif
+
+ for (i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ if (get_doc->index_cache->index == index) {
+
+ return(get_doc);
+ }
+ }
+
+ return(NULL);
+}
+#endif
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+ fts_cache_t* cache) /*!< in: cache*/
+{
+ rw_lock_free(&cache->lock);
+ rw_lock_free(&cache->init_lock);
+ mutex_free(&cache->optimize_lock);
+ mutex_free(&cache->deleted_lock);
+ mutex_free(&cache->doc_id_lock);
+
+ if (cache->stopword_info.cached_stopword) {
+ rbt_free(cache->stopword_info.cached_stopword);
+ }
+
+ if (cache->sync_heap->arg) {
+ mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
+ }
+
+ mem_heap_free(cache->cache_heap);
+}
+
+/**********************************************************************//**
+Find an existing word, or if not found, create one and return it.
+@return specified word token */
+static
+fts_tokenizer_word_t*
+fts_tokenizer_word_get(
+/*===================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_index_cache_t*
+ index_cache, /*!< in: index cache */
+ fts_string_t* text) /*!< in: node text */
+{
+ fts_tokenizer_word_t* word;
+ ib_rbt_bound_t parent;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+ /* If it is a stopword, do not index it */
+ if (cache->stopword_info.cached_stopword != NULL
+ && rbt_search(cache->stopword_info.cached_stopword,
+ &parent, text) == 0) {
+
+ return(NULL);
+ }
+
+ /* Check if we found a match, if not then add word to tree. */
+ if (rbt_search(index_cache->words, &parent, text) != 0) {
+ mem_heap_t* heap;
+ fts_tokenizer_word_t new_word;
+
+ heap = static_cast<mem_heap_t*>(cache->sync_heap->arg);
+
+ new_word.nodes = ib_vector_create(
+ cache->sync_heap, sizeof(fts_node_t), 4);
+
+ fts_utf8_string_dup(&new_word.text, text, heap);
+
+ parent.last = rbt_add_node(
+ index_cache->words, &parent, &new_word);
+
+ /* Take into account the RB tree memory use and the vector. */
+ cache->total_size += sizeof(new_word)
+ + sizeof(ib_rbt_node_t)
+ + text->f_len
+ + (sizeof(fts_node_t) * 4)
+ + sizeof(*new_word.nodes);
+
+ ut_ad(rbt_validate(index_cache->words));
+ }
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+ return(word);
+}
+
+/**********************************************************************//**
+Add the given doc_id/word positions to the given node's ilist. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+{
+ ulint i;
+ byte* ptr;
+ byte* ilist;
+ ulint enc_len;
+ ulint last_pos;
+ byte* ptr_start;
+ ulint doc_id_delta;
+
+#ifdef UNIV_SYNC_DEBUG
+ if (cache) {
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+ }
+#endif
+ ut_ad(doc_id >= node->last_doc_id);
+
+ /* Calculate the space required to store the ilist. */
+ doc_id_delta = (ulint)(doc_id - node->last_doc_id);
+ enc_len = fts_get_encoded_len(doc_id_delta);
+
+ last_pos = 0;
+ for (i = 0; i < ib_vector_size(positions); i++) {
+ ulint pos = *(static_cast<ulint*>(
+ ib_vector_get(positions, i)));
+
+ ut_ad(last_pos == 0 || pos > last_pos);
+
+ enc_len += fts_get_encoded_len(pos - last_pos);
+ last_pos = pos;
+ }
+
+ /* The 0x00 byte at the end of the token positions list. */
+ enc_len++;
+
+ if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) {
+ /* No need to allocate more space, we can fit in the new
+ data at the end of the old one. */
+ ilist = NULL;
+ ptr = node->ilist + node->ilist_size;
+ } else {
+ ulint new_size = node->ilist_size + enc_len;
+
+ /* Over-reserve space by a fixed size for small lengths and
+ by 20% for lengths >= 48 bytes. */
+ if (new_size < 16) {
+ new_size = 16;
+ } else if (new_size < 32) {
+ new_size = 32;
+ } else if (new_size < 48) {
+ new_size = 48;
+ } else {
+ new_size = (ulint)(1.2 * new_size);
+ }
+
+ ilist = static_cast<byte*>(ut_malloc(new_size));
+ ptr = ilist + node->ilist_size;
+
+ node->ilist_size_alloc = new_size;
+ }
+
+ ptr_start = ptr;
+
+ /* Encode the new fragment. */
+ ptr += fts_encode_int(doc_id_delta, ptr);
+
+ last_pos = 0;
+ for (i = 0; i < ib_vector_size(positions); i++) {
+ ulint pos = *(static_cast<ulint*>(
+ ib_vector_get(positions, i)));
+
+ ptr += fts_encode_int(pos - last_pos, ptr);
+ last_pos = pos;
+ }
+
+ *ptr++ = 0;
+
+ ut_a(enc_len == (ulint)(ptr - ptr_start));
+
+ if (ilist) {
+ /* Copy old ilist to the start of the new one and switch the
+ new one into place in the node. */
+ if (node->ilist_size > 0) {
+ memcpy(ilist, node->ilist, node->ilist_size);
+ ut_free(node->ilist);
+ }
+
+ node->ilist = ilist;
+ }
+
+ node->ilist_size += enc_len;
+
+ if (cache) {
+ cache->total_size += enc_len;
+ }
+
+ if (node->first_doc_id == FTS_NULL_DOC_ID) {
+ node->first_doc_id = doc_id;
+ }
+
+ node->last_doc_id = doc_id;
+ ++node->doc_count;
+}
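+
+/* Illustrative sketch of the ilist bytes appended above: for a node with
+last_doc_id == 10, adding doc_id 15 with positions {3, 10, 17} appends
+
+ fts_encode_int(5)  doc id delta (15 - 10)
+ fts_encode_int(3)  first position
+ fts_encode_int(7)  position delta (10 - 3)
+ fts_encode_int(7)  position delta (17 - 10)
+ 0x00               end of this doc's position list
+
+The exact byte values depend on the variable-length encoding in
+fts_encode_int(); the deltas and the trailing 0x00 follow directly from
+the loops above. */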
+
+/**********************************************************************//**
+Add document to the cache. */
+static
+void
+fts_cache_add_doc(
+/*==============*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_index_cache_t*
+ index_cache, /*!< in: index cache */
+ doc_id_t doc_id, /*!< in: doc id to add */
+ ib_rbt_t* tokens) /*!< in: document tokens */
+{
+ const ib_rbt_node_t* node;
+ ulint n_words;
+ fts_doc_stats_t* doc_stats;
+
+ if (!tokens) {
+ return;
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->lock, RW_LOCK_EX));
+#endif
+
+ n_words = rbt_size(tokens);
+
+ for (node = rbt_first(tokens); node; node = rbt_first(tokens)) {
+
+ fts_tokenizer_word_t* word;
+ fts_node_t* fts_node = NULL;
+ fts_token_t* token = rbt_value(fts_token_t, node);
+
+ /* Find and/or add token to the cache. */
+ word = fts_tokenizer_word_get(
+ cache, index_cache, &token->text);
+
+ if (!word) {
+ ut_free(rbt_remove_node(tokens, node));
+ continue;
+ }
+
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+ || doc_id < fts_node->last_doc_id) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+
+ cache->total_size += sizeof(*fts_node);
+ }
+
+ fts_cache_node_add_positions(
+ cache, fts_node, doc_id, token->positions);
+
+ ut_free(rbt_remove_node(tokens, node));
+ }
+
+ ut_a(rbt_empty(tokens));
+
+ /* Add to doc ids processed so far. */
+ doc_stats = static_cast<fts_doc_stats_t*>(
+ ib_vector_push(index_cache->doc_stats, NULL));
+
+ doc_stats->doc_id = doc_id;
+ doc_stats->word_count = n_words;
+
+ /* Add the doc stats memory usage too. */
+ cache->total_size += sizeof(*doc_stats);
+
+ if (doc_id > cache->sync->max_doc_id) {
+ cache->sync->max_doc_id = doc_id;
+ }
+}
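+
+/* Note on the rollover above: a word's postings are kept as a vector of
+fts_node_t fragments, and a fresh fragment is started whenever the
+current one grows past FTS_ILIST_MAX_SIZE or a doc id arrives that is
+smaller than the fragment's last_doc_id, since the ilist delta encoding
+requires monotonically increasing doc ids within a fragment. */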
+
+/****************************************************************//**
+Drops a table. If the table can't be found we return DB_FAIL, which
+callers treat as a non-error.
+@return DB_SUCCESS, DB_FAIL if the table was not found, or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_table(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const char* table_name) /*!< in: table to drop */
+{
+ dict_table_t* table;
+ dberr_t error = DB_SUCCESS;
+
+ /* Check that the table exists in our data dictionary.
+ Similar to regular drop table case, we will open table with
+ DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT option */
+ table = dict_table_open_on_name(
+ table_name, TRUE, FALSE,
+ static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
+
+ if (table != 0) {
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* Pass nonatomic=false (don't allow data dict unlock),
+ because the transaction may hold locks on SYS_* tables from
+ previous calls to fts_drop_table(). */
+ error = row_drop_table_for_mysql(table_name, trx, true, false);
+
+ if (error != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to drop FTS index aux table %s: %s",
+ table_name, ut_strerr(error));
+ }
+ } else {
+ error = DB_FAIL;
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Rename a single auxiliary table due to database name change.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_rename_one_aux_table(
+/*=====================*/
+ const char* new_name, /*!< in: new parent tbl name */
+ const char* fts_table_old_name, /*!< in: old aux tbl name */
+ trx_t* trx) /*!< in: transaction */
+{
+ char fts_table_new_name[MAX_TABLE_NAME_LEN];
+ ulint new_db_name_len = dict_get_db_name_len(new_name);
+ ulint old_db_name_len = dict_get_db_name_len(fts_table_old_name);
+ ulint table_new_name_len = strlen(fts_table_old_name)
+ + new_db_name_len - old_db_name_len;
+
+ /* The new and old database names must differ here; if they
+ were the same there would be nothing to do */
+ ut_ad((new_db_name_len != old_db_name_len)
+ || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0);
+
+ /* Get the database name from "new_name", and table name
+ from the fts_table_old_name */
+ strncpy(fts_table_new_name, new_name, new_db_name_len);
+ strncpy(fts_table_new_name + new_db_name_len,
+ strchr(fts_table_old_name, '/'),
+ table_new_name_len - new_db_name_len);
+ fts_table_new_name[table_new_name_len] = 0;
+
+ return(row_rename_table_for_mysql(
+ fts_table_old_name, fts_table_new_name, trx, false));
+}
+
+/****************************************************************//**
+Rename the auxiliary tables for all FTS indexes of a table. This rename
+is due to a database name change
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user Table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint i;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ /* Rename common auxiliary tables */
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+ char* old_table_name;
+ dberr_t err = DB_SUCCESS;
+
+ fts_table.suffix = fts_common_tables[i];
+
+ old_table_name = fts_get_table_name(&fts_table);
+
+ err = fts_rename_one_aux_table(new_name, old_table_name, trx);
+
+ mem_free(old_table_name);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ fts_t* fts = table->fts;
+
+ /* Rename index specific auxiliary tables */
+ for (i = 0; fts->indexes != 0 && i < ib_vector_size(fts->indexes);
+ ++i) {
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (ulint j = 0; fts_index_selector[j].value; ++j) {
+ dberr_t err;
+ char* old_table_name;
+
+ fts_table.suffix = fts_get_suffix(j);
+
+ old_table_name = fts_get_table_name(&fts_table);
+
+ err = fts_rename_one_aux_table(
+ new_name, old_table_name, trx);
+
+ DBUG_EXECUTE_IF("fts_rename_failure",
+ err = DB_DEADLOCK;
+ fts_sql_rollback(trx););
+
+ mem_free(old_table_name);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Drops the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_common_tables(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: table with an FTS
+ index */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+ dberr_t err;
+ char* table_name;
+
+ fts_table->suffix = fts_common_tables[i];
+
+ table_name = fts_get_table_name(fts_table);
+
+ err = fts_drop_table(trx, table_name);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS && err != DB_FAIL) {
+ error = err;
+ }
+
+ mem_free(table_name);
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Since we do a horizontal split on the index table, we need to drop
+all the split tables.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: FTS index */
+
+{
+ ulint i;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (i = 0; fts_index_selector[i].value; ++i) {
+ dberr_t err;
+ char* table_name;
+
+ fts_table.suffix = fts_get_suffix(i);
+
+ table_name = fts_get_table_name(&fts_table);
+
+ err = fts_drop_table(trx, table_name);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS && err != DB_FAIL) {
+ error = err;
+ }
+
+ mem_free(table_name);
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Drops FTS auxiliary tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: Index to drop */
+{
+ dberr_t error = DB_SUCCESS;
+
+#ifdef FTS_DOC_STATS_DEBUG
+ fts_table_t fts_table;
+ static const char* index_tables[] = {
+ "DOC_ID",
+ NULL
+ };
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ dberr_t err = fts_drop_index_split_tables(trx, index);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+
+#ifdef FTS_DOC_STATS_DEBUG
+ FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+
+ for (ulint i = 0; index_tables[i] != NULL; ++i) {
+ char* table_name;
+
+ fts_table.suffix = index_tables[i];
+
+ table_name = fts_get_table_name(&fts_table);
+
+ err = fts_drop_table(trx, table_name);
+
+ /* We only return the status of the last error. */
+ if (err != DB_SUCCESS && err != DB_FAIL) {
+ error = err;
+ }
+
+ mem_free(table_name);
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ return(error);
+}
+
+/****************************************************************//**
+Drops FTS ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_drop_all_index_tables(
+/*======================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_t* fts) /*!< in: fts instance */
+{
+ dberr_t error = DB_SUCCESS;
+
+ for (ulint i = 0;
+ fts->indexes != 0 && i < ib_vector_size(fts->indexes);
+ ++i) {
+
+ dberr_t err;
+ dict_index_t* index;
+
+ index = static_cast<dict_index_t*>(
+ ib_vector_getp(fts->indexes, i));
+
+ err = fts_drop_index_tables(trx, index);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on a
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_tables(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table has the FTS index */
+{
+ dberr_t error;
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ /* TODO: This is not atomic and can cause problems during recovery. */
+
+ error = fts_drop_common_tables(trx, &fts_table);
+
+ if (error == DB_SUCCESS) {
+ error = fts_drop_all_index_tables(trx, table->fts);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Prepare the SQL, so that all '%s' are replaced by the common prefix.
+@return sql string, use mem_free() to free the memory */
+static
+char*
+fts_prepare_sql(
+/*============*/
+ fts_table_t* fts_table, /*!< in: table name info */
+ const char* my_template) /*!< in: sql template */
+{
+ char* sql;
+ char* name_prefix;
+
+ name_prefix = fts_get_table_name_prefix(fts_table);
+ sql = ut_strreplace(my_template, "%s", name_prefix);
+ mem_free(name_prefix);
+
+ return(sql);
+}
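+
+/* Illustrative example (the aux table naming format is assumed here):
+for a parent table "test/t1" the prefix returned by
+fts_get_table_name_prefix() is of the form "test/FTS_<table_id>", so the
+template line
+
+ CREATE TABLE "%s_DELETED" (...)
+
+expands to
+
+ CREATE TABLE "test/FTS_<table_id>_DELETED" (...)
+
+after the ut_strreplace() call above. */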
+
+/*********************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been called
+before this.
+@return DB_SUCCESS on success */
+UNIV_INTERN
+dberr_t
+fts_create_common_tables(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table with FTS index */
+ const char* name, /*!< in: table name normalized.*/
+ bool skip_doc_id_index)/*!< in: Skip index on doc id */
+{
+ char* sql;
+ dberr_t error;
+ que_t* graph;
+ fts_table_t fts_table;
+ mem_heap_t* heap = mem_heap_create(1024);
+ pars_info_t* info;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ error = fts_drop_common_tables(trx, &fts_table);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Create the FTS tables that are common to an FTS index. */
+ sql = fts_prepare_sql(&fts_table, fts_create_common_tables_sql);
+ graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+ mem_free(sql);
+
+ error = fts_eval_sql(trx, graph);
+
+ que_graph_free(graph);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Write the default settings to the config table. */
+ fts_table.suffix = "CONFIG";
+ graph = fts_parse_sql_no_dict_lock(
+ &fts_table, NULL, fts_config_table_insert_values_sql);
+
+ error = fts_eval_sql(trx, graph);
+
+ que_graph_free(graph);
+
+ if (error != DB_SUCCESS || skip_doc_id_index) {
+
+ goto func_exit;
+ }
+
+ info = pars_info_create();
+
+ pars_info_bind_id(info, TRUE, "table_name", name);
+ pars_info_bind_id(info, TRUE, "index_name", FTS_DOC_ID_INDEX_NAME);
+ pars_info_bind_id(info, TRUE, "doc_id_col_name", FTS_DOC_ID_COL_NAME);
+
+ /* Create the FTS DOC_ID index on the hidden column. Currently this
+ is common for any FT index created on the table. */
+ graph = fts_parse_sql_no_dict_lock(
+ NULL,
+ info,
+ mem_heap_printf(
+ heap,
+ "BEGIN\n"
+ ""
+ "CREATE UNIQUE INDEX $index_name ON $table_name("
+ "$doc_id_col_name);\n"));
+
+ error = fts_eval_sql(trx, graph);
+ que_graph_free(graph);
+
+func_exit:
+ if (error != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/*************************************************************//**
+Create one FTS auxiliary index table for an FTS index.
+@return new table instance on success, else NULL */
+static
+dict_table_t*
+fts_create_one_index_table(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t*
+ index, /*!< in: the index instance */
+ fts_table_t* fts_table, /*!< in: fts_table structure */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ dict_field_t* field;
+ dict_table_t* new_table = NULL;
+ char* table_name = fts_get_table_name(fts_table);
+ dberr_t error;
+ CHARSET_INFO* charset;
+ ulint flags2 = 0;
+
+ ut_ad(index->type & DICT_FTS);
+
+ if (srv_file_per_table) {
+ flags2 = DICT_TF2_USE_TABLESPACE;
+ }
+
+ new_table = dict_mem_table_create(table_name, 0, 5, 1, flags2);
+
+ field = dict_index_get_nth_field(index, 0);
+ charset = innobase_get_fts_charset(
+ (int)(field->col->prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(field->col->prtype));
+
+ if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+ dict_mem_table_add_col(new_table, heap, "word", DATA_VARCHAR,
+ field->col->prtype, FTS_MAX_WORD_LEN);
+ } else {
+ dict_mem_table_add_col(new_table, heap, "word", DATA_VARMYSQL,
+ field->col->prtype, FTS_MAX_WORD_LEN);
+ }
+
+ dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ sizeof(doc_id_t));
+
+ dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ sizeof(doc_id_t));
+
+ dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED, 4);
+
+ dict_mem_table_add_col(new_table, heap, "ilist", DATA_BLOB,
+ 4130048, 0);
+
+ error = row_create_table_for_mysql(new_table, trx, false);
+
+ if (error != DB_SUCCESS) {
+ trx->error_state = error;
+ dict_mem_table_free(new_table);
+ new_table = NULL;
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Fail to create FTS index table %s", table_name);
+ }
+
+ mem_free(table_name);
+
+ return(new_table);
+}
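+
+/* The auxiliary index table built above has this schema (sketch; the
+SQL types are inferred from the column definitions):
+
+ word          VARCHAR or VARMYSQL, up to FTS_MAX_WORD_LEN
+ first_doc_id  BIGINT UNSIGNED NOT NULL
+ last_doc_id   BIGINT UNSIGNED NOT NULL
+ doc_count     INT UNSIGNED NOT NULL
+ ilist         BLOB
+
+The unique clustered index on (word, first_doc_id) is added separately
+via fts_create_index_sql in fts_create_index_tables_low(). */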
+
+/*************************************************************//**
+Create the auxiliary index tables for an FTS index; this is the
+lower-level worker called by fts_create_index_tables().
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables_low(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t*
+ index, /*!< in: the index instance */
+ const char* table_name, /*!< in: the table name */
+ table_id_t table_id) /*!< in: the table id */
+{
+ ulint i;
+ que_t* graph;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = table_id;
+ fts_table.parent = table_name;
+ fts_table.table = index->table;
+
+#ifdef FTS_DOC_STATS_DEBUG
+ char* sql;
+
+ /* Create the FTS auxiliary tables that are specific
+ to an FTS index. */
+ sql = fts_prepare_sql(&fts_table, fts_create_index_tables_sql);
+
+ graph = fts_parse_sql_no_dict_lock(NULL, NULL, sql);
+ mem_free(sql);
+
+ error = fts_eval_sql(trx, graph);
+ que_graph_free(graph);
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ for (i = 0; fts_index_selector[i].value && error == DB_SUCCESS; ++i) {
+ dict_table_t* new_table;
+
+ /* Create the FTS auxiliary tables that are specific
+ to an FTS index. We need to preserve the table_id %s
+ which fts_parse_sql_no_dict_lock() will fill in for us. */
+ fts_table.suffix = fts_get_suffix(i);
+
+ new_table = fts_create_one_index_table(
+ trx, index, &fts_table, heap);
+
+ if (!new_table) {
+ error = DB_FAIL;
+ break;
+ }
+
+ graph = fts_parse_sql_no_dict_lock(
+ &fts_table, NULL, fts_create_index_sql);
+
+ error = fts_eval_sql(trx, graph);
+ que_graph_free(graph);
+ }
+
+ if (error != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: the index instance */
+{
+ dberr_t err;
+ dict_table_t* table;
+
+ table = dict_table_get_low(index->table_name);
+ ut_a(table != NULL);
+
+ err = fts_create_index_tables_low(trx, index, table->name, table->id);
+
+ if (err == DB_SUCCESS) {
+ trx_commit(trx);
+ }
+
+ return(err);
+}
+#if 0
+/******************************************************************//**
+Return string representation of state. */
+static
+const char*
+fts_get_state_str(
+/*==============*/
+ /* out: string representation of state */
+ fts_row_state state) /*!< in: state */
+{
+ switch (state) {
+ case FTS_INSERT:
+ return("INSERT");
+
+ case FTS_MODIFY:
+ return("MODIFY");
+
+ case FTS_DELETE:
+ return("DELETE");
+
+ case FTS_NOTHING:
+ return("NOTHING");
+
+ case FTS_INVALID:
+ return("INVALID");
+
+ default:
+ return("UNKNOWN");
+ }
+}
+#endif
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+ fts_row_state old_state, /*!< in: existing state of row */
+ fts_row_state event) /*!< in: new event */
+{
+ /* The rules for transforming states:
+
+ I = inserted
+ M = modified
+ D = deleted
+ N = nothing
+
+ M+D -> D:
+
+ If the row existed before the transaction started and it is modified
+ during the transaction, followed by a deletion of the row, only the
+ deletion will be signaled.
+
+ M+ -> M:
+
+ If the row existed before the transaction started and it is modified
+ more than once during the transaction, only the last modification
+ will be signaled.
+
+ IM*D -> N:
+
+ If a new row is added during the transaction (and possibly modified
+ after its initial insertion) but it is deleted before the end of the
+ transaction, nothing will be signaled.
+
+ IM* -> I:
+
+ If a new row is added during the transaction and modified after its
+ initial insertion, only the addition will be signaled.
+
+ M*DI -> M:
+
+ If the row existed before the transaction started and it is deleted,
+ then re-inserted, only a modification will be signaled. Note that
+ this case is only possible if the table is using the row's primary
+ key for FTS row ids, since those can be re-inserted by the user,
+ which is not true for InnoDB generated row ids.
+
+ It is easily seen that the above rules decompose such that we do not
+ need to store the row's entire history of events. Instead, we can
+ store just one state for the row and update that when new events
+ arrive. Then we can implement the above rules as a two-dimensional
+ look-up table, and get checking of invalid combinations "for free"
+ in the process. */
+
+ /* The lookup table for transforming states. old_state is the
+ Y-axis, event is the X-axis. */
+ static const fts_row_state table[4][4] = {
+ /* I M D N */
+ /* I */ { FTS_INVALID, FTS_INSERT, FTS_NOTHING, FTS_INVALID },
+ /* M */ { FTS_INVALID, FTS_MODIFY, FTS_DELETE, FTS_INVALID },
+ /* D */ { FTS_MODIFY, FTS_INVALID, FTS_INVALID, FTS_INVALID },
+ /* N */ { FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+ };
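+	/* Example: table[FTS_MODIFY][FTS_DELETE] == FTS_DELETE, which
+	implements the M+D -> D rule above. */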
+
+ fts_row_state result;
+
+ ut_a(old_state < FTS_INVALID);
+ ut_a(event < FTS_INVALID);
+
+ result = table[(int) old_state][(int) event];
+ ut_a(result != FTS_INVALID);
+
+ return(result);
+}
+
+/******************************************************************//**
+Create a savepoint instance.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+	ib_vector_t*	savepoints,	/*!< in/out: savepoints vector */
+ const char* name, /*!< in: savepoint name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_push(savepoints, NULL));
+
+ memset(savepoint, 0x0, sizeof(*savepoint));
+
+ if (name) {
+ savepoint->name = mem_heap_strdup(heap, name);
+ }
+
+ savepoint->tables = rbt_create(
+ sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+ return(savepoint);
+}
+
+/******************************************************************//**
+Create an FTS trx.
+@return FTS trx */
+static
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+ trx_t* trx) /*!< in/out: InnoDB
+ transaction */
+{
+ fts_trx_t* ftt;
+ ib_alloc_t* heap_alloc;
+ mem_heap_t* heap = mem_heap_create(1024);
+ trx_named_savept_t* savep;
+
+ ut_a(trx->fts_trx == NULL);
+
+ ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t)));
+ ftt->trx = trx;
+ ftt->heap = heap;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ /* Default instance has no name and no heap. */
+ fts_savepoint_create(ftt->savepoints, NULL, NULL);
+ fts_savepoint_create(ftt->last_stmt, NULL, NULL);
+
+	/* Copy any savepoints that were already set. */
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+ fts_savepoint_take(trx, ftt, savep->name);
+ }
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Create an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+ fts_trx_t* fts_trx, /*!< in: FTS trx */
+ dict_table_t* table) /*!< in: table */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = table;
+ ftt->fts_trx = fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+ const fts_trx_table_t* ftt_src) /*!< in: FTS trx */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = ftt_src->table;
+ ftt->fts_trx = ftt_src->fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ /* Copy the rb tree values to the new savepoint. */
+ rbt_merge_uniq(ftt->rows, ftt_src->rows);
+
+ /* These are only added on commit. At this stage we only have
+ the updated row state. */
+ ut_a(ftt_src->added_doc_ids == NULL);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Initialize the FTS trx instance.
+@return FTS trx instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: FTS table instance */
+ ib_vector_t* savepoints) /*!< in: Savepoints */
+{
+ fts_trx_table_t* ftt;
+ ib_rbt_bound_t parent;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+
+ tables = savepoint->tables;
+ rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, parent.last);
+ ftt = *fttp;
+ } else {
+ ftt = fts_trx_table_create(trx->fts_trx, table);
+ rbt_add_node(tables, &parent, &ftt);
+ }
+
+ ut_a(ftt->table == table);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+static
+void
+fts_trx_table_add_op(
+/*=================*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected */
+{
+ ib_rbt_t* rows;
+ ib_rbt_bound_t parent;
+
+ rows = ftt->rows;
+ rbt_search(rows, &parent, &doc_id);
+
+ /* Row id found, update state, and if new state is FTS_NOTHING,
+ we delete the row from our tree. */
+ if (parent.result == 0) {
+ fts_trx_row_t* row = rbt_value(fts_trx_row_t, parent.last);
+
+ row->state = fts_trx_row_get_new_state(row->state, state);
+
+ if (row->state == FTS_NOTHING) {
+ if (row->fts_indexes) {
+ ib_vector_free(row->fts_indexes);
+ }
+
+ ut_free(rbt_remove_node(rows, parent.last));
+ row = NULL;
+ } else if (row->fts_indexes != NULL) {
+ ib_vector_free(row->fts_indexes);
+ row->fts_indexes = fts_indexes;
+ }
+
+ } else { /* Row-id not found, create a new one. */
+ fts_trx_row_t row;
+
+ row.doc_id = doc_id;
+ row.state = state;
+ row.fts_indexes = fts_indexes;
+
+ rbt_add_node(rows, &parent, &row);
+ }
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: new doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected
+ (NULL=all) */
+{
+ fts_trx_table_t* tran_ftt;
+ fts_trx_table_t* stmt_ftt;
+
+ if (!trx->fts_trx) {
+ trx->fts_trx = fts_trx_create(trx);
+ }
+
+ tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints);
+ stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt);
+
+ fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes);
+ fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes);
+}
+
+/******************************************************************//**
+Fetch callback that converts a textual document id to a binary value and
+stores it in the given place.
+@return always FALSE */
+static
+ibool
+fts_fetch_store_doc_id(
+/*===================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: doc_id_t* to store
+ doc_id in */
+{
+ int n_parsed;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ doc_id_t* doc_id = static_cast<doc_id_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ char buf[32];
+
+ ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+ ut_a(len > 0 && len < sizeof(buf));
+
+ memcpy(buf, dfield_get_data(dfield), len);
+ buf[len] = '\0';
+
+ n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id);
+ ut_a(n_parsed == 1);
+
+ return(FALSE);
+}
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/******************************************************************//**
+Get the max cache size in bytes. If there is an error reading the
+value we simply print an error message here and return the default
+value to the caller.
+@return max cache size in bytes */
+static
+ulint
+fts_get_max_cache_size(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: table instance */
+{
+ dberr_t error;
+ fts_string_t value;
+ ulint cache_size_in_mb;
+
+ /* Set to the default value. */
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+ if (error == DB_SUCCESS) {
+
+ value.f_str[value.f_len] = 0;
+ cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+ if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Warning: FTS max cache size "
+				"(%lu) out of range. Minimum value is "
+				"%luMB and the maximum value is %luMB, "
+ "setting cache size to upper limit\n",
+ cache_size_in_mb,
+ FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+ FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+ cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+ } else if (cache_size_in_mb
+ < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Warning: FTS max cache size "
+				"(%lu) out of range. Minimum value is "
+				"%luMB and the maximum value is %luMB, "
+ "setting cache size to lower limit\n",
+ cache_size_in_mb,
+ FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB,
+ FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB);
+
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+ }
+ } else {
+ ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB: Error: (%s) reading max cache "
+			"config value from config table\n", ut_strerr(error));
+ }
+
+ ut_free(value.f_str);
+
+ return(cache_size_in_mb * 1024 * 1024);
+}
+#endif /* FTS_CACHE_SIZE_DEBUG */
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: for this index */
+ ulint* total) /* out: total words */
+{
+ dberr_t error;
+ fts_string_t value;
+
+ *total = 0;
+
+ /* We set the length of value to the max bytes it can hold. This
+ information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = static_cast<byte*>(ut_malloc(value.f_len + 1));
+
+ error = fts_config_get_index_value(
+ trx, index, FTS_TOTAL_WORD_COUNT, &value);
+
+ if (error == DB_SUCCESS) {
+
+ value.f_str[value.f_len] = 0;
+ *total = strtoul((char*) value.f_str, NULL, 10);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) reading total words "
+ "value from config table\n", ut_strerr(error));
+ }
+
+ ut_free(value.f_str);
+
+ return(error);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We would do so after each FTS index build or
+table truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const dict_table_t* table, /*!< in: table */
+ const char* table_name, /*!< in: table name, or NULL */
+ doc_id_t doc_id) /*!< in: DOC ID to set */
+{
+ table->fts->cache->synced_doc_id = doc_id;
+ table->fts->cache->next_doc_id = doc_id + 1;
+
+ table->fts->cache->first_doc_id = table->fts->cache->next_doc_id;
+
+ fts_update_sync_doc_id(
+ table, table_name, table->fts->cache->synced_doc_id, trx);
+
+}
+
+/*********************************************************************//**
+Get the next available document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+{
+ fts_cache_t* cache = table->fts->cache;
+
+ /* If the Doc ID system has not yet been initialized, we
+ will consult the CONFIG table and user table to re-establish
+ the initial value of the Doc ID */
+
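+	/* Either the Doc ID system was already initialized
+	(first_doc_id != 0), or fts_init_doc_id() did not establish an
+	initial value here. */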
+ if (cache->first_doc_id != 0 || !fts_init_doc_id(table)) {
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ *doc_id = FTS_NULL_DOC_ID;
+ return(DB_SUCCESS);
+ }
+
+ /* Otherwise, simply increment the value in cache */
+ mutex_enter(&cache->doc_id_lock);
+ *doc_id = ++cache->next_doc_id;
+ mutex_exit(&cache->doc_id_lock);
+ } else {
+ mutex_enter(&cache->doc_id_lock);
+ *doc_id = cache->next_doc_id;
+ mutex_exit(&cache->doc_id_lock);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+This function fetches the Doc ID from the CONFIG table, compares it
+with the Doc ID supplied, and stores the larger of the two back to
+the CONFIG table.
+@return DB_SUCCESS if OK */
+static __attribute__((nonnull))
+dberr_t
+fts_cmp_set_sync_doc_id(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id_cmp, /*!< in: Doc ID to compare */
+	ibool			read_only,	/*!< in: TRUE if we should
+						only read the synced_doc_id */
+ doc_id_t* doc_id) /*!< out: larger document id
+ after comparing "doc_id_cmp"
+ to the one stored in CONFIG
+ table */
+{
+ trx_t* trx;
+ pars_info_t* info;
+ dberr_t error;
+ fts_table_t fts_table;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+retry:
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ fts_table.suffix = "CONFIG";
+ fts_table.table_id = table->id;
+ fts_table.type = FTS_COMMON_TABLE;
+ fts_table.table = table;
+
+ fts_table.parent = table->name;
+
+ trx = trx_allocate_for_background();
+
+ trx->op_info = "update the next FTS document id";
+
+ info = pars_info_create();
+
+ pars_info_bind_function(
+ info, "my_func", fts_fetch_store_doc_id, doc_id);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS SELECT value FROM \"%s\""
+ " WHERE key = 'synced_doc_id' FOR UPDATE;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ *doc_id = 0;
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+ // FIXME: We need to retry deadlock errors
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (read_only) {
+ goto func_exit;
+ }
+
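+	/* If no Doc ID was supplied for comparison, adopt the value read
+	from the CONFIG table; the stored value is the next Doc ID, hence
+	the -1 (fts_update_sync_doc_id() writes doc_id + 1). Otherwise
+	keep the larger of the two. */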
+ if (doc_id_cmp == 0 && *doc_id) {
+ cache->synced_doc_id = *doc_id - 1;
+ } else {
+ cache->synced_doc_id = ut_max(doc_id_cmp, *doc_id);
+ }
+
+ mutex_enter(&cache->doc_id_lock);
+	/* For each sync operation, we increment next_doc_id by 1
+	to mark the sync operation. */
+ if (cache->next_doc_id < cache->synced_doc_id + 1) {
+ cache->next_doc_id = cache->synced_doc_id + 1;
+ }
+ mutex_exit(&cache->doc_id_lock);
+
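+	/* Persist the synced Doc ID only if the supplied value exceeds
+	the one read back from the CONFIG table. */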
+ if (doc_id_cmp > *doc_id) {
+ error = fts_update_sync_doc_id(
+ table, table->name, cache->synced_doc_id, trx);
+ }
+
+ *doc_id = cache->next_doc_id;
+
+func_exit:
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ *doc_id = 0;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while getting next doc id.\n", ut_strerr(error));
+
+ fts_sql_rollback(trx);
+
+ if (error == DB_DEADLOCK) {
+ os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT);
+ goto retry;
+ }
+ }
+
+ trx_free_for_background(trx);
+
+ return(error);
+}
+
+/*********************************************************************//**
+Update the last document id. This function could create a new
+transaction to update the last document id.
+@return DB_SUCCESS if OK */
+static
+dberr_t
+fts_update_sync_doc_id(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ const char* table_name, /*!< in: table name, or NULL */
+ doc_id_t doc_id, /*!< in: last document id */
+ trx_t* trx) /*!< in: update trx, or NULL */
+{
+ byte id[FTS_MAX_ID_LEN];
+ pars_info_t* info;
+ fts_table_t fts_table;
+ ulint id_len;
+ que_t* graph = NULL;
+ dberr_t error;
+ ibool local_trx = FALSE;
+ fts_cache_t* cache = table->fts->cache;
+
+ fts_table.suffix = "CONFIG";
+ fts_table.table_id = table->id;
+ fts_table.type = FTS_COMMON_TABLE;
+ fts_table.table = table;
+ if (table_name) {
+ fts_table.parent = table_name;
+ } else {
+ fts_table.parent = table->name;
+ }
+
+ if (!trx) {
+ trx = trx_allocate_for_background();
+
+ trx->op_info = "setting last FTS document id";
+ local_trx = TRUE;
+ }
+
+ info = pars_info_create();
+
+ id_len = ut_snprintf(
+ (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1);
+
+ pars_info_bind_varchar_literal(info, "doc_id", id, id_len);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "BEGIN "
+ "UPDATE \"%s\" SET value = :doc_id"
+ " WHERE key = 'synced_doc_id';");
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+
+ if (local_trx) {
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ cache->synced_doc_id = doc_id;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "(%s) while updating last doc id.",
+ ut_strerr(error));
+
+ fts_sql_rollback(trx);
+ }
+ trx_free_for_background(trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void)
+/*====================*/
+{
+ fts_doc_ids_t* fts_doc_ids;
+ mem_heap_t* heap = mem_heap_create(512);
+
+ fts_doc_ids = static_cast<fts_doc_ids_t*>(
+ mem_heap_alloc(heap, sizeof(*fts_doc_ids)));
+
+ fts_doc_ids->self_heap = ib_heap_allocator_create(heap);
+
+ fts_doc_ids->doc_ids = static_cast<ib_vector_t*>(ib_vector_create(
+ fts_doc_ids->self_heap, sizeof(fts_update_t), 32));
+
+ return(fts_doc_ids);
+}
+
+/*********************************************************************//**
+Free a fts_doc_ids_t. */
+
+void
+fts_doc_ids_free(
+/*=============*/
+ fts_doc_ids_t* fts_doc_ids)
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(
+ fts_doc_ids->self_heap->arg);
+
+ memset(fts_doc_ids, 0, sizeof(*fts_doc_ids));
+
+ mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the insertion of a new row.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_add(
+/*====*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ dict_table_t* table = ftt->table;
+ dberr_t error = DB_SUCCESS;
+ doc_id_t doc_id = row->doc_id;
+
+ ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY);
+
+ fts_add_doc_by_id(ftt, doc_id, row->fts_indexes);
+
+ if (error == DB_SUCCESS) {
+ mutex_enter(&table->fts->cache->deleted_lock);
+ ++table->fts->cache->added;
+ mutex_exit(&table->fts->cache->deleted_lock);
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
+ && doc_id >= table->fts->cache->next_doc_id) {
+ table->fts->cache->next_doc_id = doc_id + 1;
+ }
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the deletion of a row.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_delete(
+/*=======*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ que_t* graph;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ doc_id_t write_doc_id;
+ dict_table_t* table = ftt->table;
+ doc_id_t doc_id = row->doc_id;
+ trx_t* trx = ftt->fts_trx->trx;
+ pars_info_t* info = pars_info_create();
+ fts_cache_t* cache = table->fts->cache;
+
+	/* We do not index documents whose Doc ID value is 0. */
+ if (doc_id == FTS_NULL_DOC_ID) {
+ ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID));
+ return(error);
+ }
+
+ ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+
+ FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+	/* It is possible we are updating a record that has not yet been
+	sync-ed into the cache since the last crash (deleting a doc will
+	not initialize the sync). Avoid any added-counter accounting
+	until the FTS cache is re-established and sync-ed. */
+ if (table->fts->fts_status & ADDED_TABLE_SYNCED
+ && doc_id > cache->synced_doc_id) {
+ mutex_enter(&table->fts->cache->deleted_lock);
+
+		/* The Doc ID could belong to those left in the
+		ADDED table from the last crash, so we need to check
+		whether it is less than first_doc_id when we initialize
+		the Doc ID system after reboot. */
+ if (doc_id >= table->fts->cache->first_doc_id
+ && table->fts->cache->added > 0) {
+ --table->fts->cache->added;
+ }
+
+ mutex_exit(&table->fts->cache->deleted_lock);
+
+ /* Only if the row was really deleted. */
+ ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
+ }
+
+ /* Note the deleted document for OPTIMIZE to purge. */
+ if (error == DB_SUCCESS) {
+
+ trx->op_info = "adding doc id to FTS DELETED";
+
+ info->graph_owns_us = TRUE;
+
+ fts_table.suffix = "DELETED";
+
+ graph = fts_parse_sql(
+ &fts_table,
+ info,
+ "BEGIN INSERT INTO \"%s\" VALUES (:doc_id);");
+
+ error = fts_eval_sql(trx, graph);
+
+ fts_que_graph_free(graph);
+ } else {
+ pars_info_free(info);
+ }
+
+ /* Increment the total deleted count, this is used to calculate the
+ number of documents indexed. */
+ if (error == DB_SUCCESS) {
+ mutex_enter(&table->fts->cache->deleted_lock);
+
+ ++table->fts->cache->deleted;
+
+ mutex_exit(&table->fts->cache->deleted_lock);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Do commit-phase steps necessary for the modification of a row.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_modify(
+/*=======*/
+ fts_trx_table_t* ftt, /*!< in: FTS trx table */
+ fts_trx_row_t* row) /*!< in: row */
+{
+ dberr_t error;
+
+ ut_a(row->state == FTS_MODIFY);
+
+ error = fts_delete(ftt, row);
+
+ if (error == DB_SUCCESS) {
+ error = fts_add(ftt, row);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Create a new document id.
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+dberr_t
+fts_create_doc_id(
+/*==============*/
+ dict_table_t* table, /*!< in: row is of this table. */
+ dtuple_t* row, /* in/out: add doc id value to this
+ row. This is the current row that is
+ being inserted. */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ doc_id_t doc_id;
+ dberr_t error = DB_SUCCESS;
+
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ if (table->fts->cache->first_doc_id == FTS_NULL_DOC_ID) {
+ error = fts_get_next_doc_id(table, &doc_id);
+ }
+ return(error);
+ }
+
+ error = fts_get_next_doc_id(table, &doc_id);
+
+ if (error == DB_SUCCESS) {
+ dfield_t* dfield;
+ doc_id_t* write_doc_id;
+
+ ut_a(doc_id > 0);
+
+ dfield = dtuple_get_nth_field(row, table->fts->doc_col);
+ write_doc_id = static_cast<doc_id_t*>(
+ mem_heap_alloc(heap, sizeof(*write_doc_id)));
+
+ ut_a(doc_id != FTS_NULL_DOC_ID);
+ ut_a(sizeof(doc_id) == dfield->type.len);
+ fts_write_doc_id((byte*) write_doc_id, doc_id);
+
+ dfield_set_data(dfield, write_doc_id, sizeof(*write_doc_id));
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_commit_table(
+/*=============*/
+ fts_trx_table_t* ftt) /*!< in: FTS table to commit*/
+{
+ const ib_rbt_node_t* node;
+ ib_rbt_t* rows;
+ dberr_t error = DB_SUCCESS;
+ fts_cache_t* cache = ftt->table->fts->cache;
+ trx_t* trx = trx_allocate_for_background();
+
+ rows = ftt->rows;
+
+ ftt->fts_trx->trx = trx;
+
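+	/* Lazily create cache->get_docs, re-checking under the cache
+	init lock (double-checked locking). */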
+ if (cache->get_docs == NULL) {
+ rw_lock_x_lock(&cache->init_lock);
+ if (cache->get_docs == NULL) {
+ cache->get_docs = fts_get_docs_create(cache);
+ }
+ rw_lock_x_unlock(&cache->init_lock);
+ }
+
+ for (node = rbt_first(rows);
+ node != NULL && error == DB_SUCCESS;
+ node = rbt_next(rows, node)) {
+
+ fts_trx_row_t* row = rbt_value(fts_trx_row_t, node);
+
+ switch (row->state) {
+ case FTS_INSERT:
+ error = fts_add(ftt, row);
+ break;
+
+ case FTS_MODIFY:
+ error = fts_modify(ftt, row);
+ break;
+
+ case FTS_DELETE:
+ error = fts_delete(ftt, row);
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ fts_sql_commit(trx);
+
+ trx_free_for_background(trx);
+
+ return(error);
+}
+
+/*********************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_commit(
+/*=======*/
+ trx_t* trx) /*!< in: transaction */
+{
+ const ib_rbt_node_t* node;
+ dberr_t error;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables), error = DB_SUCCESS;
+ node != NULL && error == DB_SUCCESS;
+ node = rbt_next(tables, node)) {
+
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ error = fts_commit_table(*ftt);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Initialize a document. */
+UNIV_INTERN
+void
+fts_doc_init(
+/*=========*/
+ fts_doc_t* doc) /*!< in: doc to initialize */
+{
+ mem_heap_t* heap = mem_heap_create(32);
+
+ memset(doc, 0, sizeof(*doc));
+
+ doc->self_heap = ib_heap_allocator_create(heap);
+}
+
+/*********************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(doc->self_heap->arg);
+
+ if (doc->tokens) {
+ rbt_free(doc->tokens);
+ }
+
+#ifdef UNIV_DEBUG
+ memset(doc, 0, sizeof(*doc));
+#endif /* UNIV_DEBUG */
+
+ mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Callback function for fetch that stores a row id at the location
+pointed to by user_arg.
+The column's type must be DATA_FIXBINARY, DATA_BINARY_TYPE, length = 8.
+@return always returns NULL */
+UNIV_INTERN
+void*
+fts_fetch_row_id(
+/*=============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: data pointer */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_FIXBINARY);
+ ut_a(dtype_get_prtype(type) & DATA_BINARY_TYPE);
+ ut_a(len == 8);
+
+ memcpy(user_arg, dfield_get_data(dfield), 8);
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_doc_t* result_doc = static_cast<fts_doc_t*>(user_arg);
+ dfield_t* dfield;
+ ulint len;
+ ulint doc_len;
+ fts_doc_t doc;
+ CHARSET_INFO* doc_charset = NULL;
+ ulint field_no = 0;
+
+ len = 0;
+
+ fts_doc_init(&doc);
+ doc.found = TRUE;
+
+ exp = node->select_list;
+ doc_len = 0;
+
+ doc_charset = result_doc->charset;
+
+ /* Copy each indexed column content into doc->text.f_str */
+ while (exp) {
+ dfield = que_node_get_val(exp);
+ len = dfield_get_len(dfield);
+
+ /* NULL column */
+ if (len == UNIV_SQL_NULL) {
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ if (!doc_charset) {
+ ulint prtype = dfield->type.prtype;
+ doc_charset = innobase_get_fts_charset(
+ (int)(prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype));
+ }
+
+ doc.charset = doc_charset;
+
+ if (dfield_is_ext(dfield)) {
+			/* We ignore columns that are stored externally;
+			indexing them could result in too many words to
+			search. */
+ exp = que_node_get_next(exp);
+ continue;
+ } else {
+ doc.text.f_n_char = 0;
+
+ doc.text.f_str = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ doc.text.f_len = len;
+ }
+
+ if (field_no == 0) {
+ fts_tokenize_document(&doc, result_doc);
+ } else {
+ fts_tokenize_document_next(&doc, doc_len, result_doc);
+ }
+
+ exp = que_node_get_next(exp);
+
+ doc_len += (exp) ? len + 1 : len;
+
+ field_no++;
+ }
+
+ ut_ad(doc_charset);
+
+ if (!result_doc->charset) {
+ result_doc->charset = doc_charset;
+ }
+
+ fts_doc_free(&doc);
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Fetch and tokenize the document. */
+static
+void
+fts_fetch_doc_from_rec(
+/*===================*/
+ fts_get_doc_t* get_doc, /*!< in: FTS index's get_doc struct */
+ dict_index_t* clust_index, /*!< in: cluster index */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ ulint* offsets, /*!< in: offsets */
+ fts_doc_t* doc) /*!< out: fts doc to hold parsed
+ documents */
+{
+ dict_index_t* index;
+ dict_table_t* table;
+ const rec_t* clust_rec;
+ ulint num_field;
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos;
+ ulint i;
+ ulint doc_len = 0;
+ ulint processed_doc = 0;
+
+ if (!get_doc) {
+ return;
+ }
+
+ index = get_doc->index_cache->index;
+ table = get_doc->index_cache->index->table;
+
+ clust_rec = btr_pcur_get_rec(pcur);
+
+ num_field = dict_index_get_n_fields(index);
+
+ for (i = 0; i < num_field; i++) {
+ ifield = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ifield);
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ if (!get_doc->index_cache->charset) {
+ ulint prtype = ifield->col->prtype;
+
+ get_doc->index_cache->charset =
+ innobase_get_fts_charset(
+ (int) (prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype));
+ }
+
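+		/* Externally stored (off-page) column data must be copied
+		out; otherwise we can point directly into the record. */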
+ if (rec_offs_nth_extern(offsets, clust_pos)) {
+ doc->text.f_str =
+ btr_rec_copy_externally_stored_field(
+ clust_rec, offsets,
+ dict_table_zip_size(table),
+ clust_pos, &doc->text.f_len,
+ static_cast<mem_heap_t*>(
+ doc->self_heap->arg));
+ } else {
+ doc->text.f_str = (byte*) rec_get_nth_field(
+ clust_rec, offsets, clust_pos,
+ &doc->text.f_len);
+ }
+
+ doc->found = TRUE;
+ doc->charset = get_doc->index_cache->charset;
+
+ /* Null Field */
+ if (doc->text.f_len == UNIV_SQL_NULL) {
+ continue;
+ }
+
+ if (processed_doc == 0) {
+ fts_tokenize_document(doc, NULL);
+ } else {
+ fts_tokenize_document_next(doc, doc_len, NULL);
+ }
+
+ processed_doc++;
+ doc_len += doc->text.f_len + 1;
+ }
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, tokenizes the inserted text data, and inserts it into
+the FTS auxiliary table and its cache.
+@return TRUE if successful */
+static
+ulint
+fts_add_doc_by_id(
+/*==============*/
+ fts_trx_table_t*ftt, /*!< in: FTS trx table */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* fts_indexes __attribute__((unused)))
+ /*!< in: affected fts indexes */
+{
+ mtr_t mtr;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dict_table_t* table;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ fts_get_doc_t* get_doc;
+ doc_id_t temp_doc_id;
+ dict_index_t* clust_index;
+ dict_index_t* fts_id_index;
+ ibool is_id_cluster;
+ fts_cache_t* cache = ftt->table->fts->cache;
+
+ ut_ad(cache->get_docs);
+
+ /* If Doc ID has been supplied by the user, then the table
+ might not yet be sync-ed */
+
+ if (!(ftt->table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+ fts_init_index(ftt->table, FALSE);
+ }
+
+ /* Get the first FTS index's get_doc */
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, 0));
+ ut_ad(get_doc);
+
+ table = get_doc->index_cache->index->table;
+
+ heap = mem_heap_create(512);
+
+ clust_index = dict_table_get_first_index(table);
+ fts_id_index = dict_table_get_index_on_name(
+ table, FTS_DOC_ID_INDEX_NAME);
+
+	/* Check whether the index on FTS_DOC_ID is the clustered index */
+ is_id_cluster = (clust_index == fts_id_index);
+
+ mtr_start(&mtr);
+ btr_pcur_init(&pcur);
+
+ /* Search based on Doc ID. Here, we'll need to consider the case
+ when there is no primary index on Doc ID */
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+ dfield->type.mtype = DATA_INT;
+ dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE;
+
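+	/* Write the Doc ID in storage (big-endian) byte order, matching
+	how the column value is stored in the index. */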
+ mach_write_to_8((byte*) &temp_doc_id, doc_id);
+ dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id));
+
+ btr_pcur_open_with_no_init(
+ fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ &pcur, 0, &mtr);
+
+ /* If we have a match, add the data to doc structure */
+ if (btr_pcur_get_low_match(&pcur) == 1) {
+ const rec_t* rec;
+ btr_pcur_t* doc_pcur;
+ const rec_t* clust_rec;
+ btr_pcur_t clust_pcur;
+ ulint* offsets = NULL;
+ ulint num_idx = ib_vector_size(cache->get_docs);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Doc could be deleted */
+ if (page_rec_is_infimum(rec)
+ || rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+
+ goto func_exit;
+ }
+
+ if (is_id_cluster) {
+ clust_rec = rec;
+ doc_pcur = &pcur;
+ } else {
+ dtuple_t* clust_ref;
+ ulint n_fields;
+
+ btr_pcur_init(&clust_pcur);
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ clust_ref = dtuple_create(heap, n_fields);
+ dict_index_copy_types(clust_ref, clust_index, n_fields);
+
+ row_build_row_ref_in_tuple(
+ clust_ref, rec, fts_id_index, NULL, NULL);
+
+ btr_pcur_open_with_no_init(
+ clust_index, clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &clust_pcur, 0, &mtr);
+
+ doc_pcur = &clust_pcur;
+ clust_rec = btr_pcur_get_rec(&clust_pcur);
+
+ }
+
+ offsets = rec_get_offsets(clust_rec, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ for (ulint i = 0; i < num_idx; ++i) {
+ fts_doc_t doc;
+ dict_table_t* table;
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ table = get_doc->index_cache->index->table;
+
+ fts_doc_init(&doc);
+
+ fts_fetch_doc_from_rec(
+ get_doc, clust_index, doc_pcur, offsets, &doc);
+
+ if (doc.found) {
+ ibool success __attribute__((unused));
+
+ btr_pcur_store_position(doc_pcur, &mtr);
+ mtr_commit(&mtr);
+
+ rw_lock_x_lock(&table->fts->cache->lock);
+
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL,
+ NULL, TRUE, TRUE);
+ }
+
+ fts_cache_add_doc(
+ table->fts->cache,
+ get_doc->index_cache,
+ doc_id, doc.tokens);
+
+ rw_lock_x_unlock(&table->fts->cache->lock);
+
+ DBUG_EXECUTE_IF(
+ "fts_instrument_sync",
+ fts_sync(cache->sync);
+ );
+
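+				/* Flush the cache to the auxiliary
+				tables once it outgrows the configured
+				maximum size, or when a sync has been
+				requested. */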
+ if (cache->total_size > fts_max_cache_size
+ || fts_need_sync) {
+ fts_sync(cache->sync);
+ }
+
+ mtr_start(&mtr);
+
+ if (i < num_idx - 1) {
+
+ success = btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, doc_pcur,
+ &mtr);
+
+ ut_ad(success);
+ }
+ }
+
+ fts_doc_free(&doc);
+ }
+
+ if (!is_id_cluster) {
+ btr_pcur_close(doc_pcur);
+ }
+ }
+func_exit:
+ mtr_commit(&mtr);
+
+ btr_pcur_close(&pcur);
+
+ mem_heap_free(heap);
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Callback function to read a single ulint column.
+@return always TRUE */
+static
+ibool
+fts_read_ulint(
+/*===========*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ulint */
+{
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ ulint* value = static_cast<ulint*>(user_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+
+ *value = static_cast<ulint>(mach_read_from_4(
+ static_cast<const byte*>(data)));
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+ dict_table_t* table) /*!< in: user table */
+{
+ dict_index_t* index;
+ dict_field_t* dfield __attribute__((unused)) = NULL;
+ doc_id_t doc_id = 0;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+
+ index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+
+ if (!index) {
+ return(0);
+ }
+
+ dfield = dict_index_get_nth_field(index, 0);
+
+#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. */
+ ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+#endif
+
+ mtr_start(&mtr);
+
+	/* Fetch the largest value in the index. */
+ btr_pcur_open_at_index_side(
+ false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ if (!page_is_empty(btr_pcur_get_page(&pcur))) {
+ const rec_t* rec = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ ulint len;
+ const void* data;
+
+ rec_offs_init(offsets_);
+
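+		/* The cursor was opened at the rightmost end of the
+		index; walk backwards past any non-user records (e.g.
+		the supremum) to the last user record, which holds the
+		largest Doc ID. */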
+ do {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ break;
+ }
+ } while (btr_pcur_move_to_prev(&pcur, &mtr));
+
+ if (!rec) {
+ goto func_exit;
+ }
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ data = rec_get_nth_field(rec, offsets, 0, &len);
+
+ doc_id = static_cast<doc_id_t>(fts_read_doc_id(
+ static_cast<const byte*>(data)));
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ return(doc_id);
+}
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+UNIV_INTERN
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to
+ fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+	ulint		option,		/*!< in: search option:
+					FTS_FETCH_DOC_BY_ID_EQUAL or
+					FTS_FETCH_DOC_BY_ID_LARGE */
+ fts_sql_callback
+ callback, /*!< in: callback to read */
+ void* arg) /*!< in: callback arg */
+{
+ pars_info_t* info;
+ dberr_t error;
+ const char* select_str;
+ doc_id_t write_doc_id;
+ dict_index_t* index;
+ trx_t* trx = trx_allocate_for_background();
+ que_t* graph;
+
+ trx->op_info = "fetching indexed FTS document";
+
+ /* The FTS index can be supplied by caller directly with
+ "index_to_use", otherwise, get it from "get_doc" */
+ index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+ if (get_doc && get_doc->get_document_graph) {
+ info = get_doc->get_document_graph->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+ pars_info_bind_function(info, "my_func", callback, arg);
+
+ select_str = fts_get_select_columns_str(index, info, info->heap);
+ pars_info_bind_id(info, TRUE, "table_name", index->table_name);
+
+ if (!get_doc || !get_doc->get_document_graph) {
+ if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s FROM $table_name"
+ " WHERE %s = :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ select_str, FTS_DOC_ID_COL_NAME));
+ } else {
+ ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+			/* This is used for crash recovery of a table with
+			a hidden DOC ID column or FTS indexes. We will scan
+			the table to re-process user table rows whose DOC ID
+			or FTS indexed documents had not been sync-ed to disk
+			before the crash.
+			In the case that all fulltext indexes were dropped
+			for a table, we keep the "hidden" FTS_DOC_ID
+			column, and this scan retrieves the largest
+			DOC ID in use in the table to determine the
+			appropriate next DOC ID.
+			In the case that fulltext index(es) exist, this
+			operation will re-tokenize any docs that have not
+			been sync-ed to disk, and re-prime the FTS
+			cache. */
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s, %s FROM $table_name"
+ " WHERE %s > :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ FTS_DOC_ID_COL_NAME,
+ select_str, FTS_DOC_ID_COL_NAME));
+ }
+ if (get_doc) {
+ get_doc->get_document_graph = graph;
+ }
+ } else {
+ graph = get_doc->get_document_graph;
+ }
+
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ fts_sql_rollback(trx);
+ }
+
+ trx_free_for_background(trx);
+
+ if (!get_doc) {
+ fts_que_graph_free(graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+{
+ pars_info_t* info;
+ dberr_t error;
+ ib_uint32_t doc_count;
+ ib_time_t start_time;
+ doc_id_t last_doc_id;
+ doc_id_t first_doc_id;
+
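+	/* Reuse the bound-parameter info of the caller's cached query
+	graph if one was supplied; otherwise create fresh bindings. */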
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+ fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+ fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+ ut_a(node->last_doc_id >= node->first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ mach_write_to_4((byte*) &doc_count, node->doc_count);
+ pars_info_bind_int4_literal(
+ info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+ /* Set copy_name to FALSE since it's a static. */
+ pars_info_bind_literal(
+ info, "ilist", node->ilist, node->ilist_size,
+ DATA_BLOB, DATA_BINARY_TYPE);
+
+ if (!*graph) {
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN\n"
+ "INSERT INTO \"%s\" VALUES "
+ "(:token, :first_doc_id,"
+ " :last_doc_id, :doc_count, :ilist);");
+ }
+
+ start_time = ut_time();
+ error = fts_eval_sql(trx, *graph);
+ elapsed_time += ut_time() - start_time;
+ ++n_nodes;
+
+ return(error);
+}
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_add_deleted_cache(
+/*=======================*/
+ fts_sync_t* sync, /*!< in: sync state */
+ ib_vector_t* doc_ids) /*!< in: doc ids to add */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ fts_table_t fts_table;
+ doc_id_t dummy = 0;
+ dberr_t error = DB_SUCCESS;
+ ulint n_elems = ib_vector_size(doc_ids);
+
+ ut_a(ib_vector_size(doc_ids) > 0);
+
+ ib_vector_sort(doc_ids, fts_update_doc_id_cmp);
+
+ info = pars_info_create();
+
+ fts_bind_doc_id(info, "doc_id", &dummy);
+
+ FTS_INIT_FTS_TABLE(
+ &fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table);
+
+ graph = fts_parse_sql(
+ &fts_table,
+ info,
+ "BEGIN INSERT INTO \"%s\" VALUES (:doc_id);");
+
+ for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) {
+ fts_update_t* update;
+ doc_id_t write_doc_id;
+
+ update = static_cast<fts_update_t*>(ib_vector_get(doc_ids, i));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+
+ error = fts_eval_sql(sync->trx, graph);
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write the words and ilist to disk.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_write_words(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_index_cache_t*
+ index_cache) /*!< in: index cache */
+{
+ fts_table_t fts_table;
+ ulint n_nodes = 0;
+ ulint n_words = 0;
+ const ib_rbt_node_t* rbt_node;
+ dberr_t error = DB_SUCCESS;
+ ibool print_error = FALSE;
+#ifdef FTS_DOC_STATS_DEBUG
+ dict_table_t* table = index_cache->index->table;
+ ulint n_new_words = 0;
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ FTS_INIT_INDEX_TABLE(
+ &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index);
+
+ n_words = rbt_size(index_cache->words);
+
+ /* We iterate over the entire tree, even if there is an error,
+ since we want to free the memory used during caching. */
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node;
+ rbt_node = rbt_first(index_cache->words)) {
+
+ ulint i;
+ ulint selected;
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ selected = fts_select_index(
+ index_cache->charset, word->text.f_str,
+ word->text.f_len);
+
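+		/* fts_select_index() maps the word to one of the
+		auxiliary INDEX table partitions; the suffix selects
+		that table. */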
+ fts_table.suffix = fts_get_suffix(selected);
+
+#ifdef FTS_DOC_STATS_DEBUG
+ /* Check if the word exists in the FTS index and if not
+ then we need to increment the total word count stats. */
+ if (error == DB_SUCCESS && fts_enable_diag_print) {
+ ibool found = FALSE;
+
+ error = fts_is_word_in_index(
+ trx,
+ &index_cache->sel_graph[selected],
+ &fts_table,
+ &word->text, &found);
+
+ if (error == DB_SUCCESS && !found) {
+
+ ++n_new_words;
+ }
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ n_nodes += ib_vector_size(word->nodes);
+
+ /* We iterate over all the nodes even if there was an error,
+ this is to free the memory of the fts_node_t elements. */
+ for (i = 0; i < ib_vector_size(word->nodes); ++i) {
+
+ fts_node_t* fts_node = static_cast<fts_node_t*>(
+ ib_vector_get(word->nodes, i));
+
+ if (error == DB_SUCCESS) {
+
+ error = fts_write_node(
+ trx,
+ &index_cache->ins_graph[selected],
+ &fts_table, &word->text, fts_node);
+ }
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ if (error != DB_SUCCESS && !print_error) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error (%s) writing "
+ "word node to FTS auxiliary index "
+ "table.\n", ut_strerr(error));
+
+ print_error = TRUE;
+ }
+
+		/* NOTE: We are responsible for freeing the node. */
+ ut_free(rbt_remove_node(index_cache->words, rbt_node));
+ }
+
+#ifdef FTS_DOC_STATS_DEBUG
+ if (error == DB_SUCCESS && n_new_words > 0 && fts_enable_diag_print) {
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
+
+ /* Increment the total number of words in the FTS index */
+ error = fts_config_increment_index_value(
+ trx, index_cache->index, FTS_TOTAL_WORD_COUNT,
+ n_new_words);
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ if (fts_enable_diag_print) {
+ printf("Avg number of nodes: %lf\n",
+ (double) n_nodes / (double) (n_words > 1 ? n_words : 1));
+ }
+
+ return(error);
+}
+
+#ifdef FTS_DOC_STATS_DEBUG
+/*********************************************************************//**
+Write a single document's statistics to disk.
+@return DB_SUCCESS if all went well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_write_doc_stat(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ que_t** graph, /* out: query graph */
+ const fts_doc_stats_t* doc_stat) /*!< in: doc stats to write */
+{
+ pars_info_t* info;
+ doc_id_t doc_id;
+ dberr_t error = DB_SUCCESS;
+ ib_uint32_t word_count;
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ /* Convert to "storage" byte order. */
+ mach_write_to_4((byte*) &word_count, doc_stat->word_count);
+ pars_info_bind_int4_literal(
+ info, "count", (const ib_uint32_t*) &word_count);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &doc_id, doc_stat->doc_id);
+ fts_bind_doc_id(info, "doc_id", &doc_id);
+
+ if (!*graph) {
+ fts_table_t fts_table;
+
+ FTS_INIT_INDEX_TABLE(
+ &fts_table, "DOC_ID", FTS_INDEX_TABLE, index);
+
+ *graph = fts_parse_sql(
+ &fts_table,
+ info,
+ "BEGIN INSERT INTO \"%s\" VALUES (:doc_id, :count);");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout writing to FTS doc_id. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while writing to FTS doc_id.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write document statistics to disk.
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync_write_doc_stats(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ const fts_index_cache_t*index_cache) /*!< in: index cache */
+{
+ dberr_t error = DB_SUCCESS;
+ que_t* graph = NULL;
+ fts_doc_stats_t* doc_stat;
+
+ if (ib_vector_is_empty(index_cache->doc_stats)) {
+ return(DB_SUCCESS);
+ }
+
+	doc_stat = static_cast<fts_doc_stats_t*>(
+ ib_vector_pop(index_cache->doc_stats));
+
+ while (doc_stat) {
+ error = fts_sync_write_doc_stat(
+ trx, index_cache->index, &graph, doc_stat);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ if (ib_vector_is_empty(index_cache->doc_stats)) {
+ break;
+ }
+
+		doc_stat = static_cast<fts_doc_stats_t*>(
+ ib_vector_pop(index_cache->doc_stats));
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free_check_lock(NULL, index_cache, graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Callback to check the existence of a word.
+@return always FALSE */
+static
+ibool
+fts_lookup_word(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ ibool* found = static_cast<ibool*>(user_arg);
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ ulint len = dfield_get_len(dfield);
+
+ if (len != UNIV_SQL_NULL && len != 0) {
+ *found = TRUE;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Check whether a particular word (term) exists in the FTS index.
+@return DB_SUCCESS if all went well else error code */
+static
+dberr_t
+fts_is_word_in_index(
+/*=================*/
+ trx_t* trx, /*!< in: FTS query state */
+ que_t** graph, /* out: Query graph */
+ fts_table_t* fts_table, /*!< in: table instance */
+ const fts_string_t*
+ word, /*!< in: the word to check */
+ ibool* found) /* out: TRUE if exists */
+{
+ pars_info_t* info;
+ dberr_t error;
+
+ trx->op_info = "looking up word in FTS index";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ pars_info_bind_function(info, "my_func", fts_lookup_word, found);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ if (*graph == NULL) {
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count\n"
+ " FROM \"%s\"\n"
+ " WHERE word = :word "
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS index. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while reading FTS index.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ return(error);
+}
+#endif /* FTS_DOC_STATS_DEBUG */
+
+/*********************************************************************//**
+Begin Sync, create transaction, acquire locks, etc. */
+static
+void
+fts_sync_begin(
+/*===========*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ n_nodes = 0;
+ elapsed_time = 0;
+
+ sync->start_time = ut_time();
+
+ sync->trx = trx_allocate_for_background();
+
+ if (fts_enable_diag_print) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "FTS SYNC for table %s, deleted count: %ld size: "
+ "%lu bytes",
+ sync->table->name,
+ ib_vector_size(cache->deleted_doc_ids),
+ cache->total_size);
+ }
+}
+
+/*********************************************************************//**
+Run SYNC on the table, i.e., write out data from the index specific
+cache to the FTS aux INDEX table and FTS aux doc id stats table.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_index(
+/*===========*/
+ fts_sync_t* sync, /*!< in: sync state */
+ fts_index_cache_t* index_cache) /*!< in: index cache */
+{
+ trx_t* trx = sync->trx;
+ dberr_t error = DB_SUCCESS;
+
+ trx->op_info = "doing SYNC index";
+
+ if (fts_enable_diag_print) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "SYNC words: %ld", rbt_size(index_cache->words));
+ }
+
+ ut_ad(rbt_validate(index_cache->words));
+
+ error = fts_sync_write_words(trx, index_cache);
+
+#ifdef FTS_DOC_STATS_DEBUG
+ /* FTS_RESOLVE: the word counter info in auxiliary table "DOC_ID"
+ is not used currently for ranking. We disable fts_sync_write_doc_stats()
+ for now */
+ /* Write the per doc statistics that will be used for ranking. */
+ if (error == DB_SUCCESS) {
+
+ error = fts_sync_write_doc_stats(trx, index_cache);
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ return(error);
+}
+
+/*********************************************************************//**
+Commit the SYNC, change state of processed doc ids etc.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_sync_commit(
+/*============*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ dberr_t error;
+ trx_t* trx = sync->trx;
+ fts_cache_t* cache = sync->table->fts->cache;
+ doc_id_t last_doc_id;
+
+ trx->op_info = "doing SYNC commit";
+
+	/* After each SYNC, update the CONFIG table with the max doc id
+	we just synced to the index table. */
+ error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE,
+ &last_doc_id);
+
+ /* Get the list of deleted documents that are either in the
+ cache or were headed there but were deleted before the add
+ thread got to them. */
+
+ if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) {
+
+ error = fts_sync_add_deleted_cache(
+ sync, cache->deleted_doc_ids);
+ }
+
+ /* We need to do this within the deleted lock since fts_delete() can
+ attempt to add a deleted doc id to the cache deleted id array. */
+ fts_cache_clear(cache);
+ DEBUG_SYNC_C("fts_deleted_doc_ids_clear");
+ fts_cache_init(cache);
+ rw_lock_x_unlock(&cache->lock);
+
+ if (error == DB_SUCCESS) {
+
+ fts_sql_commit(trx);
+
+ } else if (error != DB_SUCCESS) {
+
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) during SYNC.\n",
+ ut_strerr(error));
+ }
+
+ if (fts_enable_diag_print && elapsed_time) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "SYNC for table %s: SYNC time : %lu secs: "
+ "elapsed %lf ins/sec",
+ sync->table->name,
+ (ulong) (ut_time() - sync->start_time),
+ (double) n_nodes/ (double) elapsed_time);
+ }
+
+ trx_free_for_background(trx);
+
+ return(error);
+}
+
+/*********************************************************************//**
+Rollback a sync operation */
+static
+void
+fts_sync_rollback(
+/*==============*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ trx_t* trx = sync->trx;
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ rw_lock_x_unlock(&cache->lock);
+
+ fts_sql_rollback(trx);
+ trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+static
+dberr_t
+fts_sync(
+/*=====*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ fts_cache_t* cache = sync->table->fts->cache;
+
+ rw_lock_x_lock(&cache->lock);
+
+ fts_sync_begin(sync);
+
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
+
+ if (index_cache->index->to_be_dropped) {
+ continue;
+ }
+
+ error = fts_sync_index(sync, index_cache);
+
+ if (error != DB_SUCCESS && !sync->interrupted) {
+
+ break;
+ }
+ }
+
+ DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
+ sync->interrupted = true;
+ error = DB_INTERRUPTED;
+ );
+
+ if (error == DB_SUCCESS && !sync->interrupted) {
+ error = fts_sync_commit(sync);
+ } else {
+ fts_sync_rollback(sync);
+ }
+
+ /* We need to check whether an optimize is required, for that
+ we make copies of the two variables that control the trigger. These
+ variables can change behind our back and we don't want to hold the
+ lock for longer than is needed. */
+ mutex_enter(&cache->deleted_lock);
+
+ cache->added = 0;
+ cache->deleted = 0;
+
+ mutex_exit(&cache->deleted_lock);
+
+ return(error);
+}
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sync_table(
+/*===========*/
+ dict_table_t* table) /*!< in: table */
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(table->fts);
+
+ if (!dict_table_is_discarded(table) && table->fts->cache) {
+ err = fts_sync(table->fts->cache->sync);
+ }
+
+ return(err);
+}
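+
+/* Example (illustrative sketch, assuming the caller already holds a
+valid dict_table_t* with FTS initialized):
+
+	dberr_t	err = fts_sync_table(table);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"FTS SYNC of table %s returned %s",
+			table->name, ut_strerr(err));
+	}
+
+Discarded tablespaces and tables without a cache are skipped inside
+fts_sync_table() itself, so the caller needs no extra checks. */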
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions.
+@return number of characters handled in this call */
+static
+ulint
+fts_process_token(
+/*==============*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	fts_doc_t*	result,		/*!< out: if provided, save
+					result here */
+ ulint start_pos, /*!< in: start position in text */
+ ulint add_pos) /*!< in: add this position to all
+ tokens from this tokenization */
+{
+ ulint ret;
+ fts_string_t str;
+ ulint offset = 0;
+ fts_doc_t* result_doc;
+
+ /* Determine where to save the result. */
+ result_doc = (result) ? result : doc;
+
+ /* The length of a string in characters is set here only. */
+ ret = innobase_mysql_fts_get_token(
+ doc->charset, doc->text.f_str + start_pos,
+ doc->text.f_str + doc->text.f_len, &str, &offset);
+
+	/* Ignore strings whose character count is less than
+	"fts_min_token_size" or more than "fts_max_token_size" */
+
+ if (str.f_n_char >= fts_min_token_size
+ && str.f_n_char <= fts_max_token_size) {
+
+ mem_heap_t* heap;
+ fts_string_t t_str;
+ fts_token_t* token;
+ ib_rbt_bound_t parent;
+ ulint newlen;
+
+ heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+ t_str.f_n_char = str.f_n_char;
+
+ t_str.f_len = str.f_len * doc->charset->casedn_multiply + 1;
+
+ t_str.f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, t_str.f_len));
+
+ newlen = innobase_fts_casedn_str(
+ doc->charset, (char*) str.f_str, str.f_len,
+ (char*) t_str.f_str, t_str.f_len);
+
+ t_str.f_len = newlen;
+ t_str.f_str[newlen] = 0;
+
+ /* Add the word to the document statistics. If the word
+ hasn't been seen before we create a new entry for it. */
+ if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+ fts_token_t new_token;
+
+ new_token.text.f_len = newlen;
+ new_token.text.f_str = t_str.f_str;
+ new_token.text.f_n_char = t_str.f_n_char;
+
+ new_token.positions = ib_vector_create(
+ result_doc->self_heap, sizeof(ulint), 32);
+
+ ut_a(new_token.text.f_n_char >= fts_min_token_size);
+ ut_a(new_token.text.f_n_char <= fts_max_token_size);
+
+ parent.last = rbt_add_node(
+ result_doc->tokens, &parent, &new_token);
+
+ ut_ad(rbt_validate(result_doc->tokens));
+ }
+
+#ifdef FTS_CHARSET_DEBUG
+ offset += start_pos + add_pos;
+#endif /* FTS_CHARSET_DEBUG */
+
+ offset += start_pos + ret - str.f_len + add_pos;
+
+ token = rbt_value(fts_token_t, parent.last);
+ ib_vector_push(token->positions, &offset);
+ }
+
+ return(ret);
+}
+
+/******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+	fts_doc_t*	doc,		/*!< in/out: document to
+					tokenize */
+	fts_doc_t*	result)		/*!< out: if provided, save
+					the result token here */
+{
+ ulint inc;
+
+ ut_a(!doc->tokens);
+ ut_a(doc->charset);
+
+ doc->tokens = rbt_create_arg_cmp(
+ sizeof(fts_token_t), innobase_fts_text_cmp, doc->charset);
+
+ for (ulint i = 0; i < doc->text.f_len; i += inc) {
+ inc = fts_process_token(doc, result, i, 0);
+ ut_a(inc > 0);
+ }
+}
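+
+/* Example (illustrative): for doc->text = "hello world" the loop above
+first calls fts_process_token() at position 0, which consumes the bytes
+up to and including "hello" and returns that byte count; the next
+iteration resumes where the previous one stopped, so every token is
+visited exactly once. Tokens outside the [fts_min_token_size,
+fts_max_token_size] range are skipped but their bytes are still
+consumed, which is what guarantees inc > 0 and loop termination. */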
+
+/******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+ fts_doc_t* doc, /*!< in/out: document to
+ tokenize */
+ ulint add_pos, /*!< in: add this position to all
+ tokens from this tokenization */
+ fts_doc_t* result) /*!< out: if provided, save
+ the result token here */
+{
+ ulint inc;
+
+ ut_a(doc->tokens);
+
+ for (ulint i = 0; i < doc->text.f_len; i += inc) {
+ inc = fts_process_token(doc, result, i, add_pos);
+ ut_a(inc > 0);
+ }
+}
+
+/********************************************************************
+Create the vector of fts_get_doc_t instances.
+@return vector of fts_get_doc_t instances */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+	fts_cache_t*	cache)	/*!< in: fts cache */
+{
+ ulint i;
+ ib_vector_t* get_docs;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_EX));
+#endif
+ /* We need one instance of fts_get_doc_t per index. */
+ get_docs = ib_vector_create(
+ cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+ /* Create the get_doc instance, we need one of these
+ per FTS index. */
+ for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+ dict_index_t** index;
+ fts_get_doc_t* get_doc;
+
+ index = static_cast<dict_index_t**>(
+ ib_vector_get(cache->indexes, i));
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_push(get_docs, NULL));
+
+ memset(get_doc, 0x0, sizeof(*get_doc));
+
+ get_doc->index_cache = fts_get_index_cache(cache, *index);
+ get_doc->cache = cache;
+
+ /* Must find the index cache. */
+ ut_a(get_doc->index_cache != NULL);
+ }
+
+ return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+ ib_vector_t* get_docs) /*!< in: Doc retrieval vector */
+{
+ ulint i;
+
+ /* Release the get doc graphs if any. */
+ for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+ fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(get_docs, i));
+
+ if (get_doc->get_document_graph != NULL) {
+
+ ut_a(get_doc->index_cache);
+
+ fts_que_graph_free(get_doc->get_document_graph);
+ get_doc->get_document_graph = NULL;
+ }
+ }
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+ const dict_table_t* table) /*!< in: table */
+{
+ doc_id_t max_doc_id = 0;
+
+ rw_lock_x_lock(&table->fts->cache->lock);
+
+ /* Return if the table is already initialized for DOC ID */
+ if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+ rw_lock_x_unlock(&table->fts->cache->lock);
+ return(0);
+ }
+
+ DEBUG_SYNC_C("fts_initialize_doc_id");
+
+ /* Then compare this value with the ID value stored in the CONFIG
+ table. The larger one will be our new initial Doc ID */
+ fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating the index (and adding the doc id column), so there is
+	no need to recover documents. */
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ fts_init_index((dict_table_t*) table, TRUE);
+ }
+
+ table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+ table->fts->cache->first_doc_id = max_doc_id;
+
+ rw_lock_x_unlock(&table->fts->cache->lock);
+
+ ut_ad(max_doc_id > 0);
+
+ return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+ const ib_vector_t* fts_indexes, /*!< in: affected FTS indexes */
+ const fts_get_doc_t* get_doc) /*!< in: info for reading
+ document */
+{
+ ulint i;
+ dict_index_t* index = get_doc->index_cache->index;
+
+ for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+ const dict_index_t* updated_fts_index;
+
+ updated_fts_index = static_cast<const dict_index_t*>(
+ ib_vector_getp_const(fts_indexes, i));
+
+ ut_a(updated_fts_index != NULL);
+
+ if (updated_fts_index == index) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+ fts_table_t* fts_table) /*!< in: fts table to read */
+{
+ trx_t* trx;
+ pars_info_t* info;
+ que_t* graph;
+ dberr_t error;
+ ulint count = 0;
+
+ trx = trx_allocate_for_background();
+
+ trx->op_info = "fetching FT table rows count";
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_read_ulint, &count);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT COUNT(*) "
+ " FROM \"%s\";\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+
+ break; /* Exit the loop. */
+ } else {
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS table. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while reading FTS table.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ trx_free_for_background(trx);
+
+ return(count);
+}
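+
+/* The callback bound above, fts_read_ulint(), has the same shape as
+fts_lookup_word(): it is invoked once per fetched row with the
+sel_node_t* and the user argument. A minimal callback of that shape
+(illustrative sketch, not the actual fts_read_ulint() definition)
+would be:
+
+	static
+	ibool
+	my_read_ulint(
+		void*	row,
+		void*	user_arg)
+	{
+		sel_node_t*	node = static_cast<sel_node_t*>(row);
+		dfield_t*	dfield = que_node_get_val(
+			node->select_list);
+
+		*static_cast<ulint*>(user_arg) = mach_read_from_4(
+			static_cast<const byte*>(
+				dfield_get_data(dfield)));
+
+		return(TRUE);
+	}
+*/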
+
+#ifdef FTS_CACHE_SIZE_DEBUG
+/*********************************************************************//**
+Read the max cache size parameter from the config table. */
+static
+void
+fts_update_max_cache_size(
+/*======================*/
+ fts_sync_t* sync) /*!< in: sync state */
+{
+ trx_t* trx;
+ fts_table_t fts_table;
+
+ trx = trx_allocate_for_background();
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table);
+
+ /* The size returned is in bytes. */
+ sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table);
+
+ fts_sql_commit(trx);
+
+ trx_free_for_background(trx);
+}
+#endif /* FTS_CACHE_SIZE_DEBUG */
+
+/*********************************************************************//**
+Free the modified rows of a table. */
+UNIV_INLINE
+void
+fts_trx_table_rows_free(
+/*====================*/
+ ib_rbt_t* rows) /*!< in: rbt of rows to free */
+{
+ const ib_rbt_node_t* node;
+
+ for (node = rbt_first(rows); node; node = rbt_first(rows)) {
+ fts_trx_row_t* row;
+
+ row = rbt_value(fts_trx_row_t, node);
+
+ if (row->fts_indexes != NULL) {
+ /* This vector shouldn't be using the
+ heap allocator. */
+ ut_a(row->fts_indexes->allocator->arg == NULL);
+
+ ib_vector_free(row->fts_indexes);
+ row->fts_indexes = NULL;
+ }
+
+ ut_free(rbt_remove_node(rows, node));
+ }
+
+ ut_a(rbt_empty(rows));
+ rbt_free(rows);
+}
+
+/*********************************************************************//**
+Free an FTS savepoint instance. */
+UNIV_INLINE
+void
+fts_savepoint_free(
+/*===============*/
+ fts_savepoint_t* savepoint) /*!< in: savepoint instance */
+{
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables = savepoint->tables;
+
+ /* Nothing to free! */
+ if (tables == NULL) {
+ return;
+ }
+
+ for (node = rbt_first(tables); node; node = rbt_first(tables)) {
+ fts_trx_table_t* ftt;
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, node);
+ ftt = *fttp;
+
+ /* This can be NULL if a savepoint was released. */
+ if (ftt->rows != NULL) {
+ fts_trx_table_rows_free(ftt->rows);
+ ftt->rows = NULL;
+ }
+
+ /* This can be NULL if a savepoint was released. */
+ if (ftt->added_doc_ids != NULL) {
+ fts_doc_ids_free(ftt->added_doc_ids);
+ ftt->added_doc_ids = NULL;
+ }
+
+		/* Free the "docs added" query graph, if one was created. */
+ if (ftt->docs_added_graph) {
+ fts_que_graph_free(ftt->docs_added_graph);
+ }
+
+ /* NOTE: We are responsible for free'ing the node */
+ ut_free(rbt_remove_node(tables, node));
+ }
+
+ ut_a(rbt_empty(tables));
+ rbt_free(tables);
+ savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx. */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx)	/*!< in, own: FTS trx */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(fts_trx->savepoints, i));
+
+ /* The default savepoint name must be NULL. */
+ if (i == 0) {
+ ut_a(savepoint->name == NULL);
+ }
+
+ fts_savepoint_free(savepoint);
+ }
+
+ for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(fts_trx->last_stmt, i));
+
+ /* The default savepoint name must be NULL. */
+ if (i == 0) {
+ ut_a(savepoint->name == NULL);
+ }
+
+ fts_savepoint_free(savepoint);
+ }
+
+ if (fts_trx->heap) {
+ mem_heap_free(fts_trx->heap);
+ }
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ dtuple_t* row) /*!< in: row whose FTS doc id we
+ want to extract.*/
+{
+ dfield_t* field;
+ doc_id_t doc_id = 0;
+
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+ ut_a(dfield_get_len(field) == sizeof(doc_id));
+ ut_a(dfield_get_type(field)->mtype == DATA_INT);
+
+ doc_id = fts_read_doc_id(
+ static_cast<const byte*>(dfield_get_data(field)));
+
+ return(doc_id);
+}
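+
+/* Example (illustrative): the hidden FTS_DOC_ID column is stored as an
+8-byte big-endian integer, so a field whose bytes are
+
+	00 00 00 00 00 00 01 02
+
+decodes to doc_id 258 (0x102). Writing goes the other way through
+fts_write_doc_id(); see fts_update_doc_id() below. */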
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ const rec_t* rec, /*!< in: rec */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint len;
+ const byte* data;
+ ulint col_no;
+ doc_id_t doc_id = 0;
+ dict_index_t* clust_index;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* my_heap = heap;
+
+ ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+ clust_index = dict_table_get_first_index(table);
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(
+ rec, clust_index, offsets, ULINT_UNDEFINED, &my_heap);
+
+ col_no = dict_col_get_clust_pos(
+ &table->cols[table->fts->doc_col], clust_index);
+ ut_ad(col_no != ULINT_UNDEFINED);
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ ut_a(len == 8);
+ ut_ad(8 == sizeof(doc_id));
+ doc_id = static_cast<doc_id_t>(mach_read_from_8(data));
+
+ if (my_heap && !heap) {
+ mem_heap_free(my_heap);
+ }
+
+ return(doc_id);
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+ const fts_cache_t* cache, /*!< in: cache to search */
+ const dict_index_t* index) /*!< in: index to search for */
+{
+	/* We cast away the const because our internal function takes a
+	non-const cache arg and returns a non-const pointer. */
+ return(static_cast<fts_index_cache_t*>(
+ fts_get_index_cache((fts_cache_t*) cache, index)));
+}
+
+/*********************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*index_cache, /*!< in: cache to search */
+ const fts_string_t* text) /*!< in: word to search for */
+{
+ ib_rbt_bound_t parent;
+ const ib_vector_t* nodes = NULL;
+#ifdef UNIV_SYNC_DEBUG
+ dict_table_t* table = index_cache->index->table;
+ fts_cache_t* cache = table->fts->cache;
+
+ ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_EX));
+#endif
+
+ /* Lookup the word in the rb tree */
+ if (rbt_search(index_cache->words, &parent, text) == 0) {
+ const fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+ nodes = word->nodes;
+ }
+
+ return(nodes);
+}
+
+/*********************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+	const fts_cache_t*	cache,		/*!< in: cache to search */
+ doc_id_t doc_id) /*!< in: doc id to search for */
+{
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&cache->deleted_lock));
+#endif
+
+ for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
+ const fts_update_t* update;
+
+ update = static_cast<const fts_update_t*>(
+ ib_vector_get_const(cache->deleted_doc_ids, i));
+
+ if (doc_id == update->doc_id) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Append deleted doc ids to vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ const fts_cache_t* cache, /*!< in: cache to use */
+ ib_vector_t* vector) /*!< in: append to this vector */
+{
+ ulint i;
+
+ mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+
+ if (cache->deleted_doc_ids == NULL) {
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+ return;
+ }
+
+
+ for (i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
+ fts_update_t* update;
+
+ update = static_cast<fts_update_t*>(
+ ib_vector_get(cache->deleted_doc_ids, i));
+
+ ib_vector_push(vector, &update->doc_id);
+ }
+
+ mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return TRUE if the thread started, else FALSE (i.e. timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+ dict_table_t* table, /*!< in: table to which the thread
+ is attached */
+ ulint max_wait) /*!< in: time in microseconds, if
+ set to 0 then it disables
+ timeout checking */
+{
+ ulint count = 0;
+ ibool done = FALSE;
+
+ ut_a(max_wait == 0 || max_wait >= FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+ for (;;) {
+ fts_t* fts = table->fts;
+
+ mutex_enter(&fts->bg_threads_mutex);
+
+ if (fts->fts_status & BG_THREAD_READY) {
+
+ done = TRUE;
+ }
+
+ mutex_exit(&fts->bg_threads_mutex);
+
+ if (!done) {
+ os_thread_sleep(FTS_MAX_BACKGROUND_THREAD_WAIT);
+
+ if (max_wait > 0) {
+
+ max_wait -= FTS_MAX_BACKGROUND_THREAD_WAIT;
+
+ /* We ignore the residual value. */
+ if (max_wait < FTS_MAX_BACKGROUND_THREAD_WAIT) {
+ break;
+ }
+ }
+
+ ++count;
+ } else {
+ break;
+ }
+
+ if (count >= FTS_BACKGROUND_THREAD_WAIT_COUNT) {
+ ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: Error: the background thread "
+ "for the FTS table %s refuses to start\n",
+ table->name);
+
+ count = 0;
+ }
+ }
+
+ return(done);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+ dict_table_t* table, /*!< in/out: Table with FTS index */
+ mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */
+{
+ dict_mem_table_add_col(
+ table, heap,
+ FTS_DOC_ID_COL_NAME,
+ DATA_INT,
+ dtype_form_prtype(
+ DATA_NOT_NULL | DATA_UNSIGNED
+ | DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0),
+ sizeof(doc_id_t));
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
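+
+/* Note: this is the same definition that a user-supplied doc id column
+must match, i.e. in SQL terms (illustrative):
+
+	FTS_DOC_ID BIGINT UNSIGNED NOT NULL
+
+DATA_INT with DATA_UNSIGNED and a length of sizeof(doc_id_t) == 8
+corresponds to BIGINT UNSIGNED. */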
+
+/*********************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* ufield, /*!< out: update node */
+ doc_id_t* next_doc_id) /*!< in/out: buffer for writing */
+{
+ doc_id_t doc_id;
+ dberr_t error = DB_SUCCESS;
+
+ if (*next_doc_id) {
+ doc_id = *next_doc_id;
+ } else {
+ /* Get the new document id that will be added. */
+ error = fts_get_next_doc_id(table, &doc_id);
+ }
+
+ if (error == DB_SUCCESS) {
+ dict_index_t* clust_index;
+
+ ufield->exp = NULL;
+
+ ufield->new_val.len = sizeof(doc_id);
+
+ clust_index = dict_table_get_first_index(table);
+
+ ufield->field_no = dict_col_get_clust_pos(
+ &table->cols[table->fts->doc_col], clust_index);
+
+		/* It is possible that we update a record that has
+		not yet been sync-ed since the last crash. */
+
+ /* Convert to storage byte order. */
+ ut_a(doc_id != FTS_NULL_DOC_ID);
+ fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+ ufield->new_val.data = next_doc_id;
+ }
+
+ return(doc_id);
+}
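+
+/* Usage note: *next_doc_id doubles as the storage buffer that
+ufield->new_val.data points into, so it must stay alive until the row
+update has been applied. A hypothetical caller (names assumed):
+
+	doc_id_t	next_doc_id = 0;   -- 0 means: fetch a new id
+	doc_id_t	doc_id = fts_update_doc_id(
+		table, ufield, &next_doc_id);
+
+On return, next_doc_id holds doc_id in storage (big-endian) byte
+order. */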
+
+/*********************************************************************//**
+Check if the table has an FTS index. This is the non-inline version
+of dict_table_has_fts_index().
+@return TRUE if table has an FTS index */
+UNIV_INTERN
+ibool
+fts_dict_table_has_fts_index(
+/*=========================*/
+ dict_table_t* table) /*!< in: table */
+{
+ return(dict_table_has_fts_index(table));
+}
+
+/*********************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+ dict_table_t* table) /*!< in/out: table with FTS indexes */
+{
+ fts_t* fts;
+ ib_alloc_t* heap_alloc;
+ mem_heap_t* heap;
+
+ ut_a(!table->fts);
+
+ heap = mem_heap_create(512);
+
+ fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+
+ memset(fts, 0x0, sizeof(*fts));
+
+ fts->fts_heap = heap;
+
+ fts->doc_col = ULINT_UNDEFINED;
+
+ mutex_create(
+ fts_bg_threads_mutex_key, &fts->bg_threads_mutex,
+ SYNC_FTS_BG_THREADS);
+
+ heap_alloc = ib_heap_allocator_create(heap);
+ fts->indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4);
+ dict_table_get_all_fts_indexes(table, fts->indexes);
+
+ return(fts);
+}
+
+/*********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+ dict_table_t* table) /*!< in/out: table with FTS indexes */
+{
+ fts_t* fts = table->fts;
+
+ mutex_free(&fts->bg_threads_mutex);
+
+ ut_ad(!fts->add_wq);
+
+ if (fts->cache) {
+ fts_cache_clear(fts->cache);
+ fts_cache_destroy(fts->cache);
+ fts->cache = NULL;
+ }
+
+ mem_heap_free(fts->fts_heap);
+
+ table->fts = NULL;
+}
+
+/*********************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+ dict_table_t* table, /*!< in: table with FTS indexes */
+ fts_t* fts) /*!< in: fts instance that needs
+ to be informed about shutdown */
+{
+ mutex_enter(&fts->bg_threads_mutex);
+
+ fts->fts_status |= BG_THREAD_STOP;
+
+ mutex_exit(&fts->bg_threads_mutex);
+
+}
+
+/*********************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+ dict_table_t* table, /*!< in: table with FTS indexes */
+ fts_t* fts) /*!< in: fts instance to shutdown */
+{
+ mutex_enter(&fts->bg_threads_mutex);
+
+ ut_a(fts->fts_status & BG_THREAD_STOP);
+
+ dict_table_wait_for_bg_threads_to_exit(table, 20000);
+
+ mutex_exit(&fts->bg_threads_mutex);
+}
+
+/*********************************************************************//**
+Copy an FTS savepoint. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+ const fts_savepoint_t* src, /*!< in: source savepoint */
+ fts_savepoint_t* dst) /*!< out: destination savepoint */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_t* tables;
+
+ tables = src->tables;
+
+ for (node = rbt_first(tables); node; node = rbt_next(tables, node)) {
+
+ fts_trx_table_t* ftt_dst;
+ const fts_trx_table_t** ftt_src;
+
+ ftt_src = rbt_value(const fts_trx_table_t*, node);
+
+ ftt_dst = fts_trx_table_clone(*ftt_src);
+
+ rbt_insert(dst->tables, &ftt_dst, &ftt_dst);
+ }
+}
+
+/*********************************************************************//**
+Take a FTS savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ fts_trx_t* fts_trx, /*!< in: fts transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ mem_heap_t* heap;
+ fts_savepoint_t* savepoint;
+ fts_savepoint_t* last_savepoint;
+
+ ut_a(name != NULL);
+
+ heap = fts_trx->heap;
+
+ /* The implied savepoint must exist. */
+ ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+ last_savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(fts_trx->savepoints));
+ savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+ if (last_savepoint->tables != NULL) {
+ fts_savepoint_copy(last_savepoint, savepoint);
+ }
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name.
+@return ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+ ib_vector_t* savepoints, /*!< in: savepoints */
+ const char* name) /*!< in: savepoint name */
+{
+ ulint i;
+
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ for (i = 1; i < ib_vector_size(savepoints); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i));
+
+ if (strcmp(name, savepoint->name) == 0) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by name. All savepoints created
+after the named savepoint are also released. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ ulint i;
+ ib_vector_t* savepoints;
+ ulint top_of_stack = 0;
+
+ ut_a(name != NULL);
+
+ savepoints = trx->fts_trx->savepoints;
+
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ /* Skip the implied savepoint (first element). */
+ for (i = 1; i < ib_vector_size(savepoints); ++i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i));
+
+ /* Even though we release the resources that are part
+ of the savepoint, we don't (always) actually delete the
+ entry. We simply set the savepoint name to NULL. Therefore
+ we have to skip deleted/released entries. */
+ if (savepoint->name != NULL
+ && strcmp(name, savepoint->name) == 0) {
+ break;
+
+ /* Track the previous savepoint instance that will
+ be at the top of the stack after the release. */
+ } else if (savepoint->name != NULL) {
+ /* We need to delete all entries
+ greater than this element. */
+ top_of_stack = i;
+ }
+ }
+
+	/* Only if we found an element to release. */
+ if (i < ib_vector_size(savepoints)) {
+ fts_savepoint_t* last_savepoint;
+ fts_savepoint_t* top_savepoint;
+ ib_rbt_t* tables;
+
+ ut_a(top_of_stack < ib_vector_size(savepoints));
+
+ /* Exchange tables between last savepoint and top savepoint */
+ last_savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+ top_savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, top_of_stack));
+ tables = top_savepoint->tables;
+ top_savepoint->tables = last_savepoint->tables;
+ last_savepoint->tables = tables;
+
+ /* Skip the implied savepoint. */
+ for (i = ib_vector_size(savepoints) - 1;
+ i > top_of_stack;
+ --i) {
+
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_get(savepoints, i));
+
+ /* Skip savepoints that were released earlier. */
+ if (savepoint->name != NULL) {
+ savepoint->name = NULL;
+ fts_savepoint_free(savepoint);
+ }
+
+ ib_vector_pop(savepoints);
+ }
+
+ /* Make sure we don't delete the implied savepoint. */
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ /* This must hold. */
+ ut_a(ib_vector_size(savepoints) == (top_of_stack + 1));
+ }
+}
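+
+/* Example (illustrative): with the savepoint stack
+[implied, "a", "b", "c"], releasing "b" finds top_of_stack = 1 ("a"),
+swaps the tables of the current top ("c") into "a", then pops and
+frees "b" and "c". The stack ends up as [implied, "a"], with "a"
+carrying the state accumulated up to the release. Entries released
+earlier are recognized by their NULL name and skipped by the scan. */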
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+
+ fts_trx_t* fts_trx;
+ fts_savepoint_t* savepoint;
+
+ fts_trx = trx->fts_trx;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_pop(fts_trx->last_stmt));
+ fts_savepoint_free(savepoint);
+
+ ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+ savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations in last stmt */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+ fts_trx_table_t* s_ftt, /*!< in: Transaction FTS table */
+ fts_trx_table_t* l_ftt) /*!< in: last stmt FTS table */
+{
+ ib_rbt_t* s_rows;
+ ib_rbt_t* l_rows;
+ const ib_rbt_node_t* node;
+
+ l_rows = l_ftt->rows;
+ s_rows = s_ftt->rows;
+
+ for (node = rbt_first(l_rows);
+ node;
+ node = rbt_next(l_rows, node)) {
+ fts_trx_row_t* l_row = rbt_value(fts_trx_row_t, node);
+ ib_rbt_bound_t parent;
+
+ rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+ if (parent.result == 0) {
+ fts_trx_row_t* s_row = rbt_value(
+ fts_trx_row_t, parent.last);
+
+ switch (l_row->state) {
+ case FTS_INSERT:
+ ut_free(rbt_remove_node(s_rows, parent.last));
+ break;
+
+ case FTS_DELETE:
+ if (s_row->state == FTS_NOTHING) {
+ s_row->state = FTS_INSERT;
+ } else if (s_row->state == FTS_DELETE) {
+ ut_free(rbt_remove_node(
+ s_rows, parent.last));
+ }
+ break;
+
+			/* FIXME: Check if FTS_MODIFY needs to be addressed */
+ case FTS_MODIFY:
+ case FTS_NOTHING:
+ break;
+ default:
+ ut_error;
+ }
+ }
+ }
+}
+
+/**********************************************************************//**
+Rollback the FTS changes made by the last statement. */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ib_vector_t* savepoints;
+ fts_savepoint_t* savepoint;
+ fts_savepoint_t* last_stmt;
+ fts_trx_t* fts_trx;
+ ib_rbt_bound_t parent;
+ const ib_rbt_node_t* node;
+ ib_rbt_t* l_tables;
+ ib_rbt_t* s_tables;
+
+ fts_trx = trx->fts_trx;
+ savepoints = fts_trx->savepoints;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+ last_stmt = static_cast<fts_savepoint_t*>(
+ ib_vector_last(fts_trx->last_stmt));
+
+ l_tables = last_stmt->tables;
+ s_tables = savepoint->tables;
+
+ for (node = rbt_first(l_tables);
+ node;
+ node = rbt_next(l_tables, node)) {
+
+ fts_trx_table_t** l_ftt;
+
+ l_ftt = rbt_value(fts_trx_table_t*, node);
+
+ rbt_search_cmp(
+ s_tables, &parent, &(*l_ftt)->table->id,
+ fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** s_ftt;
+
+ s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+ fts_undo_last_stmt(*s_ftt, *l_ftt);
+ }
+ }
+}
+
+/**********************************************************************//**
+Rollback to the savepoint identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ ulint i;
+ ib_vector_t* savepoints;
+
+ ut_a(name != NULL);
+
+ savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the top of the stack up to
+ and including the instance that was found. */
+ i = fts_savepoint_lookup(savepoints, name);
+
+ if (i != ULINT_UNDEFINED) {
+ fts_savepoint_t* savepoint;
+
+ ut_a(i > 0);
+
+ while (ib_vector_size(savepoints) > i) {
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_pop(savepoints));
+
+ if (savepoint->name != NULL) {
+ /* Since name was allocated on the heap, the
+ memory will be released when the transaction
+ completes. */
+ savepoint->name = NULL;
+
+ fts_savepoint_free(savepoint);
+ }
+ }
+
+		/* Pop all elements from the top of the stack that may
+ have been released. We have to be careful that we don't
+ delete the implied savepoint. */
+
+ for (savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(savepoints));
+ ib_vector_size(savepoints) > 1
+ && savepoint->name == NULL;
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(savepoints))) {
+
+ ib_vector_pop(savepoints);
+ }
+
+ /* Make sure we don't delete the implied savepoint. */
+ ut_a(ib_vector_size(savepoints) > 0);
+
+ /* Restore the savepoint. */
+ fts_savepoint_take(trx, trx->fts_trx, name);
+ }
+}
+
+/**********************************************************************//**
+Check if a table is an FTS auxiliary table name.
+@return TRUE if the name matches an auxiliary table name pattern */
+static
+ibool
+fts_is_aux_table_name(
+/*==================*/
+ fts_aux_table_t*table, /*!< out: table info */
+ const char* name, /*!< in: table name */
+ ulint len) /*!< in: length of table name */
+{
+ const char* ptr;
+ char* end;
+ char my_name[MAX_FULL_NAME_LEN + 1];
+
+ ut_ad(len <= MAX_FULL_NAME_LEN);
+ ut_memcpy(my_name, name, len);
+ my_name[len] = 0;
+ end = my_name + len;
+
+ ptr = static_cast<const char*>(memchr(my_name, '/', len));
+
+ if (ptr != NULL) {
+ /* We will start the match after the '/' */
+ ++ptr;
+ len = end - ptr;
+ }
+
+	/* All auxiliary tables are prefixed with "FTS_" and the name
+	length will be at least 20 bytes. */
+ if (ptr != NULL && len > 20 && strncmp(ptr, "FTS_", 4) == 0) {
+ ulint i;
+
+ /* Skip the prefix. */
+ ptr += 4;
+ len -= 4;
+
+ /* Try and read the table id. */
+ if (!fts_read_object_id(&table->parent_id, ptr)) {
+ return(FALSE);
+ }
+
+ /* Skip the table id. */
+ ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+ if (ptr == NULL) {
+ return(FALSE);
+ }
+
+ /* Skip the underscore. */
+ ++ptr;
+ ut_a(end > ptr);
+ len = end - ptr;
+
+ /* First search the common table suffix array. */
+ for (i = 0; fts_common_tables[i] != NULL; ++i) {
+
+ if (strncmp(ptr, fts_common_tables[i], len) == 0) {
+ return(TRUE);
+ }
+ }
+
+ /* Could be obsolete common tables. */
+ if (strncmp(ptr, "ADDED", len) == 0
+ || strncmp(ptr, "STOPWORDS", len) == 0) {
+			return(TRUE);
+ }
+
+ /* Try and read the index id. */
+ if (!fts_read_object_id(&table->index_id, ptr)) {
+ return(FALSE);
+ }
+
+		/* Skip the index id. */
+ ptr = static_cast<const char*>(memchr(ptr, '_', len));
+
+ if (ptr == NULL) {
+ return(FALSE);
+ }
+
+ /* Skip the underscore. */
+ ++ptr;
+ ut_a(end > ptr);
+ len = end - ptr;
+
+ /* Search the FT index specific array. */
+ for (i = 0; fts_index_selector[i].value; ++i) {
+
+ if (strncmp(ptr, fts_get_suffix(i), len) == 0) {
+ return(TRUE);
+ }
+ }
+
+ /* Other FT index specific table(s). */
+ if (strncmp(ptr, "DOC_ID", len) == 0) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
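+
+/* Naming examples (ids illustrative): the patterns accepted above are
+
+	db/FTS_0000000000000123_CONFIG			common table
+	db/FTS_0000000000000123_00000000000000c8_INDEX_1	index table
+
+i.e. "FTS_" + parent table id + [index id +] suffix, where the ids are
+16 hex digits (or, historically on Windows, 16 decimal digits; see the
+hex-format rename functions below). */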
+
+/**********************************************************************//**
+Callback function to read a single table ID column.
+@return always TRUE */
+static
+ibool
+fts_read_tables(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ int i;
+ fts_aux_table_t*table;
+ mem_heap_t* heap;
+ ibool done = FALSE;
+ ib_vector_t* tables = static_cast<ib_vector_t*>(user_arg);
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ que_node_t* exp = sel_node->select_list;
+
+ /* Must be a heap allocated vector. */
+ ut_a(tables->allocator->arg != NULL);
+
+ /* We will use this heap for allocating strings. */
+ heap = static_cast<mem_heap_t*>(tables->allocator->arg);
+ table = static_cast<fts_aux_table_t*>(ib_vector_push(tables, NULL));
+
+ memset(table, 0x0, sizeof(*table));
+
+ /* Iterate over the columns and read the values. */
+ for (i = 0; exp && !done; exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT */
+ switch (i) {
+ case 0: /* NAME */
+
+ if (!fts_is_aux_table_name(
+ table, static_cast<const char*>(data), len)) {
+ ib_vector_pop(tables);
+ done = TRUE;
+ break;
+ }
+
+ table->name = static_cast<char*>(
+ mem_heap_alloc(heap, len + 1));
+ memcpy(table->name, data, len);
+ table->name[len] = 0;
+ break;
+
+ case 1: /* ID */
+ ut_a(len == 8);
+ table->id = mach_read_from_8(
+ static_cast<const byte*>(data));
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Callback that sets a hex formatted FTS table's flags2 in
+SYS_TABLES. The flags are stored in the MIX_LEN column.
+@return FALSE if all OK */
+static
+ibool
+fts_set_hex_format(
+/*===============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: bool set/unset flag */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+
+ ut_ad(dtype_get_mtype(dfield_get_type(dfield)) == DATA_INT);
+ ut_ad(dfield_get_len(dfield) == sizeof(ib_uint32_t));
+ /* There should be at most one matching record. So the value
+ must be the default value. */
+ ut_ad(mach_read_from_4(static_cast<byte*>(user_arg))
+ == ULINT32_UNDEFINED);
+
+ ulint flags2 = mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+
+ flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+
+ mach_write_to_4(static_cast<byte*>(user_arg), flags2);
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Update the DICT_TF2_FTS_AUX_HEX_NAME flag in SYS_TABLES.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+fts_update_hex_format_flag(
+/*=======================*/
+ trx_t* trx, /*!< in/out: transaction that
+ covers the update */
+ table_id_t table_id, /*!< in: Table for which we want
+ to set the root table->flags2 */
+ bool dict_locked) /*!< in: set to true if the
+ caller already owns the
+ dict_sys_t::mutex. */
+{
+ pars_info_t* info;
+ ib_uint32_t flags2;
+
+ static const char sql[] =
+ "PROCEDURE UPDATE_HEX_FORMAT_FLAG() IS\n"
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS\n"
+ " SELECT MIX_LEN "
+ " FROM SYS_TABLES "
+ " WHERE ID = :table_id FOR UPDATE;"
+ "\n"
+ "BEGIN\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_TABLES"
+ " SET MIX_LEN = :flags2"
+ " WHERE ID = :table_id;\n"
+ "CLOSE c;\n"
+ "END;\n";
+
+ flags2 = ULINT32_UNDEFINED;
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "table_id", table_id);
+ pars_info_bind_int4_literal(info, "flags2", &flags2);
+
+ pars_info_bind_function(
+ info, "my_func", fts_set_hex_format, &flags2);
+
+ if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ }
+
+ dberr_t err = que_eval_sql(info, sql, !dict_locked, trx);
+
+ ut_a(flags2 != ULINT32_UNDEFINED);
+
+ return (err);
+}
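+
+/* The procedure above is a read-modify-write on SYS_TABLES.MIX_LEN:
+the FETCH feeds the current MIX_LEN value into fts_set_hex_format(),
+which ORs in DICT_TF2_FTS_AUX_HEX_NAME and stores the result in the
+bound :flags2 literal, and the trailing UPDATE writes that value back
+for the same table id. Effectively:
+
+	MIX_LEN = MIX_LEN | DICT_TF2_FTS_AUX_HEX_NAME
+*/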
+
+/*********************************************************************//**
+Rename an aux table to HEX format. It's called when "%016llu" is used
+to format an object id in a table name, which only happens on Windows. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_rename_one_aux_table_to_hex_format(
+/*===================================*/
+ trx_t* trx, /*!< in: transaction */
+ const fts_aux_table_t* aux_table, /*!< in: table info */
+	const dict_table_t*	parent_table)	/*!< in: parent table */
+{
+ const char* ptr;
+ fts_table_t fts_table;
+ char* new_name;
+ dberr_t error;
+
+ ptr = strchr(aux_table->name, '/');
+ ut_a(ptr != NULL);
+ ++ptr;
+ /* Skip "FTS_", table id and underscore */
+ for (ulint i = 0; i < 2; ++i) {
+ ptr = strchr(ptr, '_');
+ ut_a(ptr != NULL);
+ ++ptr;
+ }
+
+ fts_table.suffix = NULL;
+ if (aux_table->index_id == 0) {
+ fts_table.type = FTS_COMMON_TABLE;
+
+ for (ulint i = 0; fts_common_tables[i] != NULL; ++i) {
+ if (strcmp(ptr, fts_common_tables[i]) == 0) {
+ fts_table.suffix = fts_common_tables[i];
+ break;
+ }
+ }
+ } else {
+ fts_table.type = FTS_INDEX_TABLE;
+
+ /* Skip index id and underscore */
+ ptr = strchr(ptr, '_');
+ ut_a(ptr != NULL);
+ ++ptr;
+
+ for (ulint i = 0; fts_index_selector[i].value; ++i) {
+ if (strcmp(ptr, fts_get_suffix(i)) == 0) {
+ fts_table.suffix = fts_get_suffix(i);
+ break;
+ }
+ }
+ }
+
+ ut_a(fts_table.suffix != NULL);
+
+ fts_table.parent = parent_table->name;
+ fts_table.table_id = aux_table->parent_id;
+ fts_table.index_id = aux_table->index_id;
+ fts_table.table = parent_table;
+
+ new_name = fts_get_table_name(&fts_table);
+ ut_ad(strcmp(new_name, aux_table->name) != 0);
+
+ if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ }
+
+ error = row_rename_table_for_mysql(aux_table->name, new_name, trx,
+ FALSE);
+
+ if (error != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to rename aux table \'%s\' to "
+ "new format \'%s\'. ",
+ aux_table->name, new_name);
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Renamed aux table \'%s\' to \'%s\'.",
+ aux_table->name, new_name);
+ }
+
+ mem_free(new_name);
+
+ return (error);
+}
+
+/**********************************************************************//**
+Rename all aux tables of a parent table to HEX format. Also set aux tables'
+flags2 and parent table's flags2 with DICT_TF2_FTS_AUX_HEX_NAME.
+It's called when "%016llu" is used to format an object id in a table
+name, which only happens on Windows.
+Note the ids in tables are correct but the names are old ambiguous ones.
+
+This function should make sure that either the parent table and all its
+aux tables have DICT_TF2_FTS_AUX_HEX_NAME set in flags2, or none of
+them do. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_rename_aux_tables_to_hex_format_low(
+/*====================================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* parent_table, /*!< in: parent table */
+ ib_vector_t* tables) /*!< in: aux tables to rename. */
+{
+ dberr_t error;
+ ulint count;
+
+ ut_ad(!DICT_TF2_FLAG_IS_SET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME));
+ ut_ad(!ib_vector_is_empty(tables));
+
+ error = fts_update_hex_format_flag(trx, parent_table->id, true);
+
+ if (error != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Setting parent table %s to hex format failed.",
+ parent_table->name);
+
+ fts_sql_rollback(trx);
+ return (error);
+ }
+
+ DICT_TF2_FLAG_SET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+
+ for (count = 0; count < ib_vector_size(tables); ++count) {
+ dict_table_t* table;
+ fts_aux_table_t* aux_table;
+
+ aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, count));
+
+ table = dict_table_open_on_id(aux_table->id, TRUE,
+ DICT_TABLE_OP_NORMAL);
+
+ ut_ad(table != NULL);
+ ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_AUX_HEX_NAME));
+
+ /* Set HEX_NAME flag here to make sure we can get correct
+ new table name in following function */
+ DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+ error = fts_rename_one_aux_table_to_hex_format(trx,
+ aux_table, parent_table);
+ /* We will rollback the trx if the error != DB_SUCCESS,
+ so setting the flag here is the same with setting it in
+ row_rename_table_for_mysql */
+ DBUG_EXECUTE_IF("rename_aux_table_fail", error = DB_ERROR;);
+
+ if (error != DB_SUCCESS) {
+ dict_table_close(table, TRUE, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to rename one aux table %s "
+ "Will revert all successful rename "
+ "operations.", aux_table->name);
+
+ fts_sql_rollback(trx);
+ break;
+ }
+
+ error = fts_update_hex_format_flag(trx, aux_table->id, true);
+ dict_table_close(table, TRUE, FALSE);
+
+ if (error != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Setting aux table %s to hex format failed.",
+ aux_table->name);
+
+ fts_sql_rollback(trx);
+ break;
+ }
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_ad(count != ib_vector_size(tables));
+		/* If a rename fails, the trx will have been rolled back
+		and we can't use it any more; we start a new background
+		trx to do the reverting. */
+ ut_a(trx->state == TRX_STATE_NOT_STARTED);
+ bool not_rename = false;
+
+		/* Try to revert those successful rename operations
+ in order to revert the ibd file rename. */
+ for (ulint i = 0; i <= count; ++i) {
+ dict_table_t* table;
+ fts_aux_table_t* aux_table;
+ trx_t* trx_bg;
+ dberr_t err;
+
+ aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, i));
+
+ table = dict_table_open_on_id(aux_table->id, TRUE,
+ DICT_TABLE_OP_NORMAL);
+ ut_ad(table != NULL);
+
+ if (not_rename) {
+ DICT_TF2_FLAG_UNSET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME);
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME)) {
+ dict_table_close(table, TRUE, FALSE);
+ continue;
+ }
+
+ trx_bg = trx_allocate_for_background();
+ trx_bg->op_info = "Revert half done rename";
+ trx_bg->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE);
+
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS_AUX_HEX_NAME);
+ err = row_rename_table_for_mysql(table->name,
+ aux_table->name,
+ trx_bg, FALSE);
+
+ trx_bg->dict_operation_lock_mode = 0;
+ dict_table_close(table, TRUE, FALSE);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN, "Failed to revert "
+ "table %s. Please revert manually.",
+ table->name);
+ fts_sql_rollback(trx_bg);
+ trx_free_for_background(trx_bg);
+ /* Continue to clear aux tables' flags2 */
+ not_rename = true;
+ continue;
+ }
+
+ fts_sql_commit(trx_bg);
+ trx_free_for_background(trx_bg);
+ }
+
+ DICT_TF2_FLAG_UNSET(parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+ }
+
+ return (error);
+}
+
+/**********************************************************************//**
+Convert an id that is actually a decimal number, but was read from a
+string as if it were hex, back to its real value. */
+static
+ib_id_t
+fts_fake_hex_to_dec(
+/*================*/
+ ib_id_t id) /*!< in: number to convert */
+{
+ ib_id_t dec_id = 0;
+ char tmp_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+ int ret;
+
+ ret = sprintf(tmp_id, UINT64PFx, id);
+ ut_ad(ret == 16);
+#ifdef _WIN32
+ ret = sscanf(tmp_id, "%016llu", &dec_id);
+#else
+	ret = sscanf(tmp_id, "%016" PRIu64, &dec_id);
+#endif /* _WIN32 */
+ ut_ad(ret == 1);
+
+ return dec_id;
+}
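+
+/* Worked example: suppose an aux table name created on Windows
+contained the decimal id string "0000000000000123" (decimal 123), but
+the generic reader parsed it as hex, yielding id = 0x123 = 291. Then
+
+	sprintf(tmp_id, UINT64PFx, 291)	writes "0000000000000123"
+	sscanf(tmp_id, "%016llu", ...)	reads back decimal 123
+
+so fts_fake_hex_to_dec(291) == 123, recovering the intended value. */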
+
+/*********************************************************************//**
+Compare two fts_aux_table_t parent_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_check_aux_table_parent_id_cmp(
+/*==============================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_aux_table_t* fa1 = static_cast<const fts_aux_table_t*>(p1);
+ const fts_aux_table_t* fa2 = static_cast<const fts_aux_table_t*>(p2);
+
+ return static_cast<int>(fa1->parent_id - fa2->parent_id);
+}
+
+/** Mark all the FTS indexes associated with the parent table as corrupted.
+@param[in] trx transaction
+@param[in, out] parent_table fts index associated with this parent table
+ will be marked as corrupted. */
+static
+void
+fts_parent_all_index_set_corrupt(
+ trx_t* trx,
+ dict_table_t* parent_table)
+{
+ fts_t* fts = parent_table->fts;
+
+ if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ }
+
+ for (ulint j = 0; j < ib_vector_size(fts->indexes); j++) {
+ dict_index_t* index = static_cast<dict_index_t*>(
+ ib_vector_getp_const(fts->indexes, j));
+ dict_set_corrupted(index,
+ trx, "DROP ORPHANED TABLE");
+ }
+}
+
+/** Mark the FTS index whose index id matches the given id as corrupted.
+@param[in]	trx		transaction
+@param[in]	id		index id to search for
+@param[in, out]	table		table whose FTS indexes are checked */
+static
+void
+fts_set_index_corrupt(
+ trx_t* trx,
+ index_id_t id,
+ dict_table_t* table)
+{
+ fts_t* fts = table->fts;
+
+ if (trx_get_dict_operation(trx) == TRX_DICT_OP_NONE) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ }
+
+ for (ulint j = 0; j < ib_vector_size(fts->indexes); j++) {
+ dict_index_t* index = static_cast<dict_index_t*>(
+ ib_vector_getp_const(fts->indexes, j));
+ if (index->id == id) {
+ dict_set_corrupted(index, trx,
+ "DROP ORPHANED TABLE");
+ break;
+ }
+ }
+}
+
+/** Check whether the index for the aux table is corrupted.
+@param[in] aux_table auxiliary table
+@retval nonzero if index is corrupted, zero for valid index */
+static
+ulint
+fts_check_corrupt_index(
+ fts_aux_table_t* aux_table)
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ table = dict_table_open_on_id(
+ aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+ if (table == NULL) {
+ return(0);
+ }
+
+ for (index = UT_LIST_GET_FIRST(table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ if (index->id == aux_table->index_id) {
+ ut_ad(index->type & DICT_FTS);
+ dict_table_close(table, true, false);
+ return(dict_index_is_corrupted(index));
+ }
+ }
+
+ dict_table_close(table, true, false);
+ return(0);
+}
+
+/** Check the validity of the parent table.
+@param[in] aux_table auxiliary table
+@return true if it is a valid table or false if it is not */
+static
+bool
+fts_valid_parent_table(
+ const fts_aux_table_t* aux_table)
+{
+ dict_table_t* parent_table;
+ bool valid = false;
+
+ parent_table = dict_table_open_on_id(
+ aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+ if (parent_table != NULL && parent_table->fts != NULL) {
+ if (aux_table->index_id == 0) {
+ valid = true;
+ } else {
+ index_id_t id = aux_table->index_id;
+ dict_index_t* index;
+
+ /* Search for the FT index in the table's list. */
+ for (index = UT_LIST_GET_FIRST(parent_table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ if (index->id == id) {
+ valid = true;
+ break;
+ }
+
+ }
+ }
+ }
+
+ if (parent_table) {
+ dict_table_close(parent_table, TRUE, FALSE);
+ }
+
+ return(valid);
+}
+
+/** Try to rename all aux tables of the specified parent table.
+@param[in] aux_tables aux_tables to be renamed
+@param[in] parent_table parent table of all aux
+				tables stored in aux_tables. */
+static
+void
+fts_rename_aux_tables_to_hex_format(
+ ib_vector_t* aux_tables,
+ dict_table_t* parent_table)
+{
+ dberr_t err;
+ trx_t* trx_rename = trx_allocate_for_background();
+ trx_rename->op_info = "Rename aux tables to hex format";
+ trx_rename->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_rename, TRX_DICT_OP_TABLE);
+
+ err = fts_rename_aux_tables_to_hex_format_low(trx_rename,
+ parent_table, aux_tables);
+
+ trx_rename->dict_operation_lock_mode = 0;
+
+ if (err != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Rolling back operations on all aux tables of table %s. "
+ "All the FTS indexes associated with the table are "
+ "marked as corrupted. Please rebuild the "
+ "indexes.", parent_table->name);
+ fts_sql_rollback(trx_rename);
+
+ /* Mark the FTS indexes of the parent table as corrupted. */
+ trx_t* trx_corrupt;
+ trx_corrupt = trx_allocate_for_background();
+ trx_corrupt->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_corrupt, TRX_DICT_OP_TABLE);
+ fts_parent_all_index_set_corrupt(trx_corrupt, parent_table);
+ trx_corrupt->dict_operation_lock_mode = 0;
+ fts_sql_commit(trx_corrupt);
+ trx_free_for_background(trx_corrupt);
+ } else {
+ fts_sql_commit(trx_rename);
+ }
+
+ trx_free_for_background(trx_rename);
+ ib_vector_reset(aux_tables);
+}
+
+/** Set the hex format flag for the parent table.
+@param[in, out] parent_table parent table
+@param[in] trx transaction */
+static
+void
+fts_set_parent_hex_format_flag(
+ dict_table_t* parent_table,
+ trx_t* trx)
+{
+ if (!DICT_TF2_FLAG_IS_SET(parent_table,
+ DICT_TF2_FTS_AUX_HEX_NAME)) {
+ DBUG_EXECUTE_IF("parent_table_flag_fail",
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Setting parent table %s to hex format "
+ "failed. Please try restarting the server; "
+ "if that does not work, the system "
+ "tables might be corrupted.",
+ parent_table->name);
+ return;);
+
+ dberr_t err = fts_update_hex_format_flag(
+ trx, parent_table->id, true);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Setting parent table %s to hex format "
+ "failed. Please try restarting the server; "
+ "if that does not work, the system "
+ "tables might be corrupted.",
+ parent_table->name);
+ } else {
+ DICT_TF2_FLAG_SET(
+ parent_table, DICT_TF2_FTS_AUX_HEX_NAME);
+ }
+ }
+}
+
+/** Drop the obsolete auxiliary tables.
+@param[in] tables tables to be dropped. */
+static
+void
+fts_drop_obsolete_aux_table_from_vector(
+ ib_vector_t* tables)
+{
+ dberr_t err;
+
+ for (ulint count = 0; count < ib_vector_size(tables);
+ ++count) {
+
+ fts_aux_table_t* aux_drop_table;
+ aux_drop_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, count));
+ trx_t* trx_drop = trx_allocate_for_background();
+ trx_drop->op_info = "Drop obsolete aux tables";
+ trx_drop->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_drop, TRX_DICT_OP_TABLE);
+
+ err = row_drop_table_for_mysql(
+ aux_drop_table->name, trx_drop, false, true);
+
+ trx_drop->dict_operation_lock_mode = 0;
+
+ if (err != DB_SUCCESS) {
+ /* We do not need to worry about this
+ failure, since the server will try to
+ drop the table again on the next
+ restart, even if it is broken. */
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to drop obsolete aux table '%s'. "
+ "This is harmless; it will be dropped "
+ "again on the next restart.",
+ aux_drop_table->name);
+ fts_sql_rollback(trx_drop);
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Dropped obsolete aux table '%s'.",
+ aux_drop_table->name);
+
+ fts_sql_commit(trx_drop);
+ }
+
+ trx_free_for_background(trx_drop);
+ }
+}
+
+/** Drop all the auxiliary tables present in the vector.
+@param[in] trx transaction
+@param[in] tables tables to be dropped */
+static
+void
+fts_drop_aux_table_from_vector(
+ trx_t* trx,
+ ib_vector_t* tables)
+{
+ for (ulint count = 0; count < ib_vector_size(tables);
+ ++count) {
+ fts_aux_table_t* aux_drop_table;
+ aux_drop_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, count));
+
+ /* Check for the validity of the parent table */
+ if (!fts_valid_parent_table(aux_drop_table)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Parent table of FTS auxiliary table %s not "
+ "found.", aux_drop_table->name);
+ dberr_t err = fts_drop_table(trx, aux_drop_table->name);
+ if (err == DB_FAIL) {
+ char* path = fil_make_ibd_name(
+ aux_drop_table->name, false);
+ os_file_delete_if_exists(innodb_file_data_key,
+ path);
+ mem_free(path);
+ }
+ }
+ }
+}
+
+/**********************************************************************//**
+Check and drop all orphaned FTS auxiliary tables, those that don't have
+a parent table or FTS index defined on them. */
+static __attribute__((nonnull))
+void
+fts_check_and_drop_orphaned_tables(
+/*===============================*/
+ trx_t* trx, /*!< in: transaction */
+ ib_vector_t* tables) /*!< in: tables to check */
+{
+ mem_heap_t* heap;
+ ib_vector_t* aux_tables_to_rename;
+ ib_vector_t* invalid_aux_tables;
+ ib_vector_t* valid_aux_tables;
+ ib_vector_t* drop_aux_tables;
+ ib_vector_t* obsolete_aux_tables;
+ ib_alloc_t* heap_alloc;
+
+ heap = mem_heap_create(1024);
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ /* We store all aux tables belonging to the same parent table here,
+ and rename all these tables in a batch mode. */
+ aux_tables_to_rename = ib_vector_create(heap_alloc,
+ sizeof(fts_aux_table_t), 128);
+
+ /* We store all fake auxiliary tables and orphaned tables here. */
+ invalid_aux_tables = ib_vector_create(heap_alloc,
+ sizeof(fts_aux_table_t), 128);
+
+ /* We store all valid aux tables. We use this to filter the
+ fake auxiliary tables from the invalid auxiliary tables. */
+ valid_aux_tables = ib_vector_create(heap_alloc,
+ sizeof(fts_aux_table_t), 128);
+
+ /* We store all auxiliary tables to be dropped. */
+ drop_aux_tables = ib_vector_create(heap_alloc,
+ sizeof(fts_aux_table_t), 128);
+
+ /* We store all obsolete auxiliary tables to be dropped. */
+ obsolete_aux_tables = ib_vector_create(heap_alloc,
+ sizeof(fts_aux_table_t), 128);
+
+ /* Sort by parent_id first, in case a rename fails */
+ ib_vector_sort(tables, fts_check_aux_table_parent_id_cmp);
+
+ for (ulint i = 0; i < ib_vector_size(tables); ++i) {
+ dict_table_t* parent_table;
+ fts_aux_table_t* aux_table;
+ bool drop = false;
+ dict_table_t* table;
+ fts_aux_table_t* next_aux_table = NULL;
+ ib_id_t orig_parent_id = 0;
+ ib_id_t orig_index_id = 0;
+ bool rename = false;
+
+ aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, i));
+
+ table = dict_table_open_on_id(
+ aux_table->id, TRUE, DICT_TABLE_OP_NORMAL);
+ orig_parent_id = aux_table->parent_id;
+ orig_index_id = aux_table->index_id;
+
+ if (table == NULL || strcmp(table->name, aux_table->name)) {
+
+ bool fake_aux = false;
+
+ if (table != NULL) {
+ dict_table_close(table, TRUE, FALSE);
+ }
+
+ if (i + 1 < ib_vector_size(tables)) {
+ next_aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, i + 1));
+ }
+
+ /* Determine whether the aux table is a fake FTS
+ table or an orphaned FTS table. */
+ for (ulint count = 0;
+ count < ib_vector_size(valid_aux_tables);
+ count++) {
+ fts_aux_table_t* valid_aux;
+ valid_aux = static_cast<fts_aux_table_t*>(
+ ib_vector_get(valid_aux_tables, count));
+ if (strcmp(valid_aux->name,
+ aux_table->name) == 0) {
+ fake_aux = true;
+ break;
+ }
+ }
+
+ /* All aux tables of the parent table whose id is
+ last_parent_id have been checked; try to rename
+ them if necessary. */
+ if ((next_aux_table == NULL
+ || orig_parent_id != next_aux_table->parent_id)
+ && (!ib_vector_is_empty(aux_tables_to_rename))) {
+
+ ulint parent_id = fts_fake_hex_to_dec(
+ aux_table->parent_id);
+
+ parent_table = dict_table_open_on_id(
+ parent_id, TRUE,
+ DICT_TABLE_OP_NORMAL);
+
+ fts_rename_aux_tables_to_hex_format(
+ aux_tables_to_rename, parent_table);
+
+ dict_table_close(parent_table, TRUE,
+ FALSE);
+ }
+
+ /* If the aux table is a fake aux table, skip it. */
+ if (!fake_aux) {
+ ib_vector_push(invalid_aux_tables, aux_table);
+ }
+
+ continue;
+ } else if (!DICT_TF2_FLAG_IS_SET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME)) {
+
+ aux_table->parent_id = fts_fake_hex_to_dec(
+ aux_table->parent_id);
+
+ if (aux_table->index_id != 0) {
+ aux_table->index_id = fts_fake_hex_to_dec(
+ aux_table->index_id);
+ }
+
+ ut_ad(aux_table->id > aux_table->parent_id);
+
+ /* Check whether the parent table id and index id
+ are stored in decimal format. */
+ if (fts_valid_parent_table(aux_table)) {
+
+ parent_table = dict_table_open_on_id(
+ aux_table->parent_id, true,
+ DICT_TABLE_OP_NORMAL);
+
+ ut_ad(parent_table != NULL);
+ ut_ad(parent_table->fts != NULL);
+
+ if (!DICT_TF2_FLAG_IS_SET(
+ parent_table,
+ DICT_TF2_FTS_AUX_HEX_NAME)) {
+ rename = true;
+ }
+
+ dict_table_close(parent_table, TRUE, FALSE);
+ }
+
+ if (!rename) {
+ /* Restore the original values of the
+ aux table if they are not in decimal format */
+ aux_table->parent_id = orig_parent_id;
+ aux_table->index_id = orig_index_id;
+ }
+ }
+
+ if (table != NULL) {
+ dict_table_close(table, true, false);
+ }
+
+ if (!rename) {
+ /* Check the validity of the parent table. */
+ if (!fts_valid_parent_table(aux_table)) {
+ drop = true;
+ }
+ }
+
+ /* Filter out the fake aux tables by comparing with the
+ current valid auxiliary table names. */
+ for (ulint count = 0;
+ count < ib_vector_size(invalid_aux_tables); count++) {
+ fts_aux_table_t* invalid_aux;
+ invalid_aux = static_cast<fts_aux_table_t*>(
+ ib_vector_get(invalid_aux_tables, count));
+ if (strcmp(invalid_aux->name, aux_table->name) == 0) {
+ ib_vector_remove(
+ invalid_aux_tables,
+ *reinterpret_cast<void**>(invalid_aux));
+ break;
+ }
+ }
+
+ ib_vector_push(valid_aux_tables, aux_table);
+
+ /* If the index associated with the aux table is
+ corrupted, skip it. */
+ if (fts_check_corrupt_index(aux_table) > 0) {
+
+ if (i + 1 < ib_vector_size(tables)) {
+ next_aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, i + 1));
+ }
+
+ if (next_aux_table == NULL
+ || orig_parent_id != next_aux_table->parent_id) {
+
+ parent_table = dict_table_open_on_id(
+ aux_table->parent_id, TRUE,
+ DICT_TABLE_OP_NORMAL);
+
+ if (!ib_vector_is_empty(aux_tables_to_rename)) {
+ fts_rename_aux_tables_to_hex_format(
+ aux_tables_to_rename, parent_table);
+
+ } else {
+ fts_set_parent_hex_format_flag(
+ parent_table, trx);
+ }
+
+ dict_table_close(parent_table, TRUE, FALSE);
+ }
+
+ continue;
+ }
+
+ parent_table = dict_table_open_on_id(
+ aux_table->parent_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+ if (drop) {
+ ib_vector_push(drop_aux_tables, aux_table);
+ } else {
+ if (FTS_IS_OBSOLETE_AUX_TABLE(aux_table->name)) {
+
+ /* The current table could be one of the three
+ obsolete tables; in this case, we should
+ always try to drop it rather than rename it.
+ This can happen when we upgrade from an
+ older server to a later one, which no
+ longer contains these obsolete tables. */
+ ib_vector_push(obsolete_aux_tables, aux_table);
+ continue;
+ }
+ }
+
+ /* If the aux table is in decimal format, we should
+ rename it, so push it to aux_tables_to_rename */
+ if (!drop && rename) {
+ ib_vector_push(aux_tables_to_rename, aux_table);
+ }
+
+ if (i + 1 < ib_vector_size(tables)) {
+ next_aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_get(tables, i + 1));
+ }
+
+ if ((next_aux_table == NULL
+ || orig_parent_id != next_aux_table->parent_id)
+ && !ib_vector_is_empty(aux_tables_to_rename)) {
+ /* All aux tables of the parent table whose id is
+ last_parent_id have been checked; try to rename
+ them if necessary. We use a new background trx
+ for the rename rather than the original trx, so
+ that a failure cannot cause a complete rollback. */
+ ut_ad(rename);
+ ut_ad(!DICT_TF2_FLAG_IS_SET(
+ parent_table, DICT_TF2_FTS_AUX_HEX_NAME));
+
+ fts_rename_aux_tables_to_hex_format(
+ aux_tables_to_rename, parent_table);
+ }
+
+ /* The IDs are already in correct hex format. */
+ if (!drop && !rename) {
+ dict_table_t* table;
+
+ table = dict_table_open_on_id(
+ aux_table->id, TRUE, DICT_TABLE_OP_NORMAL);
+ if (table != NULL
+ && strcmp(table->name, aux_table->name)) {
+ dict_table_close(table, TRUE, FALSE);
+ table = NULL;
+ }
+
+ if (table != NULL
+ && !DICT_TF2_FLAG_IS_SET(
+ table,
+ DICT_TF2_FTS_AUX_HEX_NAME)) {
+
+ DBUG_EXECUTE_IF("aux_table_flag_fail",
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Setting aux table %s to hex "
+ "format failed.", table->name);
+ fts_set_index_corrupt(
+ trx, aux_table->index_id,
+ parent_table);
+ goto table_exit;);
+
+ dberr_t err = fts_update_hex_format_flag(
+ trx, table->id, true);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Setting aux table %s to hex "
+ "format failed.", table->name);
+
+ fts_set_index_corrupt(
+ trx, aux_table->index_id,
+ parent_table);
+ } else {
+ DICT_TF2_FLAG_SET(table,
+ DICT_TF2_FTS_AUX_HEX_NAME);
+ }
+ }
+#ifndef DBUG_OFF
+table_exit:
+#endif /* !DBUG_OFF */
+
+ if (table != NULL) {
+ dict_table_close(table, TRUE, FALSE);
+ }
+
+ ut_ad(parent_table != NULL);
+
+ fts_set_parent_hex_format_flag(
+ parent_table, trx);
+ }
+
+ if (parent_table != NULL) {
+ dict_table_close(parent_table, TRUE, FALSE);
+ }
+ }
+
+ fts_drop_aux_table_from_vector(trx, invalid_aux_tables);
+ fts_drop_aux_table_from_vector(trx, drop_aux_tables);
+ fts_sql_commit(trx);
+
+ fts_drop_obsolete_aux_table_from_vector(obsolete_aux_tables);
+
+ /* Free the memory allocated at the beginning */
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+}
+
+/**********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void)
+/*==========================*/
+{
+ trx_t* trx;
+ pars_info_t* info;
+ mem_heap_t* heap;
+ que_t* graph;
+ ib_vector_t* tables;
+ ib_alloc_t* heap_alloc;
+ space_name_list_t space_name_list;
+ dberr_t error = DB_SUCCESS;
+
+ /* Note: We have to free the memory after we are done with the list. */
+ error = fil_get_space_names(space_name_list);
+
+ if (error == DB_OUT_OF_MEMORY) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Out of memory");
+ ut_error;
+ }
+
+ heap = mem_heap_create(1024);
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ /* We store info on all the FTS auxiliary tables that are found. */
+ tables = ib_vector_create(heap_alloc, sizeof(fts_aux_table_t), 128);
+
+ /* Get the list of all known .ibd files and check for orphaned
+ FTS auxiliary files in that list. We need to remove them because
+ users can't map them back to table names and this will create
+ unnecessary clutter. */
+
+ for (space_name_list_t::iterator it = space_name_list.begin();
+ it != space_name_list.end();
+ ++it) {
+
+ fts_aux_table_t* fts_aux_table;
+
+ fts_aux_table = static_cast<fts_aux_table_t*>(
+ ib_vector_push(tables, NULL));
+
+ memset(fts_aux_table, 0x0, sizeof(*fts_aux_table));
+
+ if (!fts_is_aux_table_name(fts_aux_table, *it, strlen(*it))) {
+ ib_vector_pop(tables);
+ } else {
+ ulint len = strlen(*it);
+
+ fts_aux_table->id = fil_get_space_id_for_table(*it);
+
+ /* We got this list from fil0fil.cc. The tablespace
+ with this name must exist. */
+ ut_a(fts_aux_table->id != ULINT_UNDEFINED);
+
+ fts_aux_table->name = static_cast<char*>(
+ mem_heap_dup(heap, *it, len + 1));
+
+ fts_aux_table->name[len] = 0;
+ }
+ }
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping orphaned FTS tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_read_tables, tables);
+
+ graph = fts_parse_sql_no_dict_lock(
+ NULL,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT NAME, ID "
+ " FROM SYS_TABLES;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for (;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_check_and_drop_orphaned_tables(trx, tables);
+ break; /* Exit the loop. */
+ } else {
+ ib_vector_reset(tables);
+
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "lock wait timeout reading SYS_TABLES. "
+ "Retrying!");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "(%s) while reading SYS_TABLES.",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ que_graph_free(graph);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_background(trx);
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ /* Free the memory allocated to store the .ibd names. */
+ for (space_name_list_t::iterator it = space_name_list.begin();
+ it != space_name_list.end();
+ ++it) {
+
+ delete[] *it;
+ }
+}
+
+/**********************************************************************//**
+Check whether a user-supplied stopword table is of the right format.
+The caller is responsible for holding the dictionary locks.
+@return the stopword column charset if the table qualifies, else NULL */
+UNIV_INTERN
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+ const char* stopword_table_name) /*!< in: Stopword table
+ name */
+{
+ dict_table_t* table;
+ dict_col_t* col = NULL;
+
+ if (!stopword_table_name) {
+ return(NULL);
+ }
+
+ table = dict_table_get_low(stopword_table_name);
+
+ if (!table) {
+ fprintf(stderr,
+ "InnoDB: user stopword table %s does not exist.\n",
+ stopword_table_name);
+
+ return(NULL);
+ } else {
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, 0);
+
+ if (ut_strcmp(col_name, "value")) {
+ fprintf(stderr,
+ "InnoDB: invalid column name for stopword "
+ "table %s. Its first column must be named "
+ "'value'.\n", stopword_table_name);
+
+ return(NULL);
+ }
+
+ col = dict_table_get_nth_col(table, 0);
+
+ if (col->mtype != DATA_VARCHAR
+ && col->mtype != DATA_VARMYSQL) {
+ fprintf(stderr,
+ "InnoDB: invalid column type for stopword "
+ "table %s. Its first column must be of "
+ "varchar type.\n", stopword_table_name);
+
+ return(NULL);
+ }
+ }
+
+ ut_ad(col);
+
+ return(innobase_get_fts_charset(
+ static_cast<int>(col->prtype & DATA_MYSQL_TYPE_MASK),
+ static_cast<uint>(dtype_get_charset_coll(col->prtype))));
+}
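+
+/* For reference, a stopword table that passes the checks above could
+be created like this (a sketch; "my_stopwords" is an example name, and
+any InnoDB table whose first column is a VARCHAR named "value" works):
+
+ CREATE TABLE my_stopwords (value VARCHAR(30)) ENGINE = InnoDB;
+ SET GLOBAL innodb_ft_server_stopword_table = 'test/my_stopwords';
+*/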
+
+/**********************************************************************//**
+This function loads the stopwords into the FTS cache. It also
+records/fetches the stopword configuration to/from the FTS config
+table, depending on whether we are creating or reloading the
+FTS index.
+@return TRUE if load operation is successful */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+ const dict_table_t*
+ table, /*!< in: Table with FTS */
+ trx_t* trx, /*!< in: Transactions */
+ const char* global_stopword_table, /*!< in: Global stopword table
+ name */
+ const char* session_stopword_table, /*!< in: Session stopword table
+ name */
+ ibool stopword_is_on, /*!< in: Whether stopword
+ option is turned on/off */
+ ibool reload) /*!< in: Whether it is
+ for reloading FTS table */
+{
+ fts_table_t fts_table;
+ fts_string_t str;
+ dberr_t error = DB_SUCCESS;
+ ulint use_stopword;
+ fts_cache_t* cache;
+ const char* stopword_to_use = NULL;
+ ibool new_trx = FALSE;
+ byte str_buffer[MAX_FULL_NAME_LEN + 1];
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+ cache = table->fts->cache;
+
+ if (!reload && !(cache->stopword_info.status
+ & STOPWORD_NOT_INIT)) {
+ return(TRUE);
+ }
+
+ if (!trx) {
+ trx = trx_allocate_for_background();
+ trx->op_info = "upload FTS stopword";
+ new_trx = TRUE;
+ }
+
+ /* First check whether stopword filtering is turned off */
+ if (reload) {
+ error = fts_config_get_ulint(
+ trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+ } else {
+ use_stopword = (ulint) stopword_is_on;
+
+ error = fts_config_set_ulint(
+ trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+ }
+
+ if (error != DB_SUCCESS) {
+ goto cleanup;
+ }
+
+ /* If stopwords are turned off, there is no need to load them
+ into the cache, but we still need to do the initialization */
+ if (!use_stopword) {
+ cache->stopword_info.status = STOPWORD_OFF;
+ goto cleanup;
+ }
+
+ if (reload) {
+ /* Fetch the stopword table name from FTS config
+ table */
+ str.f_n_char = 0;
+ str.f_str = str_buffer;
+ str.f_len = sizeof(str_buffer) - 1;
+
+ error = fts_config_get_value(
+ trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+ if (error != DB_SUCCESS) {
+ goto cleanup;
+ }
+
+ if (strlen((char*) str.f_str) > 0) {
+ stopword_to_use = (const char*) str.f_str;
+ }
+ } else {
+ stopword_to_use = (session_stopword_table)
+ ? session_stopword_table : global_stopword_table;
+ }
+
+ if (stopword_to_use
+ && fts_load_user_stopword(table->fts, stopword_to_use,
+ &cache->stopword_info)) {
+ /* Save the stopword table name to the config
+ table */
+ if (!reload) {
+ str.f_n_char = 0;
+ str.f_str = (byte*) stopword_to_use;
+ str.f_len = ut_strlen(stopword_to_use);
+
+ error = fts_config_set_value(
+ trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+ }
+ } else {
+ /* Load system default stopword list */
+ fts_load_default_stopword(&cache->stopword_info);
+ }
+
+cleanup:
+ if (new_trx) {
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ fts_sql_rollback(trx);
+ }
+
+ trx_free_for_background(trx);
+ }
+
+ if (!cache->stopword_info.cached_stopword) {
+ cache->stopword_info.cached_stopword = rbt_create(
+ sizeof(fts_tokenizer_word_t), fts_utf8_string_cmp);
+ }
+
+ return(error == DB_SUCCESS);
+}
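+
+/* Usage note: at CREATE time this is called with the session/global
+stopword table names and reload == FALSE, recording the choice in the
+CONFIG table; on recovery, fts_init_index() below instead calls
+fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE) to re-read the
+recorded configuration. */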
+
+/**********************************************************************//**
+Callback function invoked when we initialize the FTS at startup.
+It recovers the maximum Doc ID present in the current table.
+@return always returns TRUE */
+static
+ibool
+fts_init_get_doc_id(
+/*================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts cache */
+{
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ que_node_t* exp = node->select_list;
+ fts_cache_t* cache = static_cast<fts_cache_t*>(user_arg);
+
+ ut_ad(ib_vector_is_empty(cache->get_docs));
+
+ /* Read the doc id from the select list. */
+ if (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ dtype_t* type = dfield_get_type(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+
+ doc_id = static_cast<doc_id_t>(mach_read_from_8(
+ static_cast<const byte*>(data)));
+
+ if (doc_id >= cache->next_doc_id) {
+ cache->next_doc_id = doc_id + 1;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Callback function invoked when we initialize the FTS at startup.
+It recovers Doc IDs that have not been synced to the auxiliary
+tables and need to be brought back into the FTS index.
+@return always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts cache */
+{
+
+ fts_doc_t doc;
+ ulint doc_len = 0;
+ ulint field_no = 0;
+ fts_get_doc_t* get_doc = static_cast<fts_get_doc_t*>(user_arg);
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ que_node_t* exp = node->select_list;
+ fts_cache_t* cache = get_doc->cache;
+
+ fts_doc_init(&doc);
+ doc.found = TRUE;
+
+ ut_ad(cache);
+
+ /* Copy each indexed column content into doc->text.f_str */
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ ulint len = dfield_get_len(dfield);
+
+ if (field_no == 0) {
+ dtype_t* type = dfield_get_type(dfield);
+ void* data = dfield_get_data(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+
+ doc_id = static_cast<doc_id_t>(mach_read_from_8(
+ static_cast<const byte*>(data)));
+
+ field_no++;
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ exp = que_node_get_next(exp);
+ continue;
+ }
+
+ ut_ad(get_doc);
+
+ if (!get_doc->index_cache->charset) {
+ ulint prtype = dfield->type.prtype;
+
+ get_doc->index_cache->charset =
+ innobase_get_fts_charset(
+ (int)(prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype));
+ }
+
+ doc.charset = get_doc->index_cache->charset;
+
+ if (dfield_is_ext(dfield)) {
+ dict_table_t* table = cache->sync->table;
+ ulint zip_size = dict_table_zip_size(table);
+
+ doc.text.f_str = btr_copy_externally_stored_field(
+ &doc.text.f_len,
+ static_cast<byte*>(dfield_get_data(dfield)),
+ zip_size, len,
+ static_cast<mem_heap_t*>(doc.self_heap->arg));
+ } else {
+ doc.text.f_str = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ doc.text.f_len = len;
+ }
+
+ if (field_no == 1) {
+ fts_tokenize_document(&doc, NULL);
+ } else {
+ fts_tokenize_document_next(&doc, doc_len, NULL);
+ }
+
+ exp = que_node_get_next(exp);
+
+ doc_len += (exp) ? len + 1 : len;
+
+ field_no++;
+ }
+
+ fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+ fts_doc_free(&doc);
+
+ cache->added++;
+
+ if (doc_id >= cache->next_doc_id) {
+ cache->next_doc_id = doc_id + 1;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+This function brings the FTS index into sync when the FTS index is
+first used. Documents that had not yet been synced to the auxiliary
+tables when the server last shut down abnormally must be brought
+back into the FTS cache before any further operations.
+@return TRUE if all OK */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table with FTS */
+ ibool has_cache_lock) /*!< in: Whether we already have
+ cache lock */
+{
+ dict_index_t* index;
+ doc_id_t start_doc;
+ fts_get_doc_t* get_doc = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ bool need_init = false;
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ /* First check cache->get_docs is initialized */
+ if (!has_cache_lock) {
+ rw_lock_x_lock(&cache->lock);
+ }
+
+ rw_lock_x_lock(&cache->init_lock);
+ if (cache->get_docs == NULL) {
+ cache->get_docs = fts_get_docs_create(cache);
+ }
+ rw_lock_x_unlock(&cache->init_lock);
+
+ if (table->fts->fts_status & ADDED_TABLE_SYNCED) {
+ goto func_exit;
+ }
+
+ need_init = true;
+
+ start_doc = cache->synced_doc_id;
+
+ if (!start_doc) {
+ fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+ cache->synced_doc_id = start_doc;
+ }
+
+ /* No FTS index: this is the case when the previous FTS index
+ was dropped, and we re-initialize the Doc ID system for
+ subsequent insertions */
+ if (ib_vector_is_empty(cache->get_docs)) {
+ index = dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+
+ ut_a(index);
+
+ fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+ FTS_FETCH_DOC_BY_ID_LARGE,
+ fts_init_get_doc_id, cache);
+ } else {
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL, NULL, TRUE, TRUE);
+ }
+
+ for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+
+ index = get_doc->index_cache->index;
+
+ fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+ FTS_FETCH_DOC_BY_ID_LARGE,
+ fts_init_recover_doc, get_doc);
+ }
+ }
+
+ table->fts->fts_status |= ADDED_TABLE_SYNCED;
+
+ fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+ if (!has_cache_lock) {
+ rw_lock_x_unlock(&cache->lock);
+ }
+
+ if (need_init) {
+ mutex_enter(&dict_sys->mutex);
+ /* Register the table with the optimize thread. */
+ fts_optimize_add_table(table);
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ return(TRUE);
+}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
new file mode 100644
index 00000000000..5891b53a6e2
--- /dev/null
+++ b/storage/innobase/fts/fts0opt.cc
@@ -0,0 +1,3201 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "zlib.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** The FTS optimize thread's work queue. */
+static ib_wqueue_t* fts_optimize_wq;
+
+/** The number of document ids to delete in one statement. */
+static const ulint FTS_MAX_DELETE_DOC_IDS = 1000;
+
+/** Time to wait for a message. */
+static const ulint FTS_QUEUE_WAIT_IN_USECS = 5000000;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Server is shutting down, so the optimize thread exits too */
+static bool fts_opt_start_shutdown = false;
+
+/** Initial size of nodes in fts_word_t. */
+static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
+
+/** Last time we checked whether the system needs a sync */
+static ib_time_t last_check_sync_time;
+
+#if 0
+/** Check each table in round robin to see whether they'd
+need to be "optimized" */
+static ulint fts_optimize_sync_iterator = 0;
+#endif
+
+/** State of a table within the optimization sub system. */
+enum fts_state_t {
+ FTS_STATE_LOADED,
+ FTS_STATE_RUNNING,
+ FTS_STATE_SUSPENDED,
+ FTS_STATE_DONE,
+ FTS_STATE_EMPTY
+};
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+ FTS_MSG_START, /*!< Start optimizing thread */
+
+ FTS_MSG_PAUSE, /*!< Pause optimizing thread */
+
+ FTS_MSG_STOP, /*!< Stop optimizing and exit thread */
+
+ FTS_MSG_ADD_TABLE, /*!< Add table to the optimize thread's
+ work queue */
+
+ FTS_MSG_OPTIMIZE_TABLE, /*!< Optimize a table */
+
+ FTS_MSG_DEL_TABLE, /*!< Remove a table from the optimize
+ threads work queue */
+};
+
+/** Compressed list of words that have been read from the FTS INDEX
+and need to be optimized. */
+struct fts_zip_t {
+ lint status; /*!< Status of (un)/zip operation */
+
+ ulint n_words; /*!< Number of words compressed */
+
+ ulint block_sz; /*!< Size of a block in bytes */
+
+ ib_vector_t* blocks; /*!< Vector of compressed blocks */
+
+ ib_alloc_t* heap_alloc; /*!< Heap to use for allocations */
+
+ ulint pos; /*!< Offset into blocks */
+
+ ulint last_big_block; /*!< Offset of last block in the
+ blocks array that is of size
+ block_sz. Blocks beyond this offset
+ are of size FTS_MAX_WORD_LEN */
+
+ z_streamp zp; /*!< ZLib state */
+
+ fts_string_t word; /*!< UTF-8 string; the value of the
+ last word read from the FTS INDEX
+ table, used to discard duplicates */
+
+ ulint max_words; /*!< maximum number of words to read
+ in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+ /*!< Delete a word from FTS INDEX */
+ que_t* delete_nodes_graph;
+ /*!< Insert a word into FTS INDEX */
+ que_t* write_nodes_graph;
+ /*!< COMMIT a transaction */
+ que_t* commit_graph;
+ /*!< Read the nodes from FTS_INDEX */
+ que_t* read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+ trx_t* trx; /*!< The transaction used for all SQL */
+
+ ib_alloc_t* self_heap; /*!< Heap to use for allocations */
+
+ char* name_prefix; /*!< FTS table name prefix */
+
+ fts_table_t fts_index_table;/*!< Common table definition */
+
+ /*!< Common table definition */
+ fts_table_t fts_common_table;
+
+ dict_table_t* table; /*!< Table that has to be queried */
+
+ dict_index_t* index; /*!< The FTS index to be optimized */
+
+ fts_doc_ids_t* to_delete; /*!< doc ids to delete, we check against
+ this vector and purge the matching
+ entries during the optimizing
+ process. The vector entries are
+ sorted on doc id */
+
+ ulint del_pos; /*!< Offset within to_delete vector,
+ this is used to keep track of where
+ we are up to in the vector */
+
+ ibool done; /*!< TRUE when optimize finishes */
+
+ ib_vector_t* words; /*!< Word + Nodes read from FTS_INDEX,
+ it contains instances of fts_word_t */
+
+ fts_zip_t* zip; /*!< Words read from the FTS_INDEX */
+
+ fts_optimize_graph_t graph; /*!< Prepared statements
+ used during optimize */
+
+ ulint n_completed; /*!< Number of FTS indexes that have
+ been optimized */
+ ibool del_list_regenerated;
+ /*!< BEING_DELETED list regenerated */
+};
+
+/** Used by the optimize, to keep state during compacting nodes. */
+struct fts_encode_t {
+ doc_id_t src_last_doc_id;/*!< Last doc id read from src node */
+ byte* src_ilist_ptr; /*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+ dict_table_t* table; /*!< Table to optimize */
+
+ table_id_t table_id; /*!< Table id */
+
+ fts_state_t state; /*!< State of this slot */
+
+ ulint added; /*!< Number of doc ids added since the
+ last time this table was optimized */
+
+ ulint deleted; /*!< Number of doc ids deleted since the
+ last time this table was optimized */
+
+ ib_time_t last_run; /*!< Time last run completed */
+
+ ib_time_t completed; /*!< Optimize finish time */
+
+ ib_time_t interval_time; /*!< Minimum time to wait before
+ optimizing the table again. */
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t {
+ dict_table_t* table; /*!< The table to remove */
+
+ os_event_t event; /*!< Event to synchronize acknowledgement
+ of receipt and processing of
+ this message by the consumer */
+};
+
+/** An optimize table request message for the FTS optimize thread. */
+struct fts_msg_optimize_t {
+ dict_table_t* table; /*!< Table to optimize */
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+ fts_msg_type_t type; /*!< Message type */
+
+ void* ptr; /*!< The message contents */
+
+ mem_heap_t* heap; /*!< The heap used to allocate this
+ message, the message consumer will
+ free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass. */
+UNIV_INTERN ulong fts_num_word_optimize;
+
+// FIXME
+UNIV_INTERN char fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE = 1024;
+
+/** The amount of time optimizing in a single pass, in milliseconds. */
+static ib_time_t fts_optimize_time_limit = 0;
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static const char* fts_init_delete_sql =
+ "BEGIN\n"
+ "\n"
+ "INSERT INTO \"%s_BEING_DELETED\"\n"
+ "SELECT doc_id FROM \"%s_DELETED\";\n"
+ "\n"
+ "INSERT INTO \"%s_BEING_DELETED_CACHE\"\n"
+ "SELECT doc_id FROM \"%s_DELETED_CACHE\";\n";
+
+static const char* fts_delete_doc_ids_sql =
+ "BEGIN\n"
+ "\n"
+ "DELETE FROM \"%s_DELETED\" WHERE doc_id = :doc_id1;\n"
+ "DELETE FROM \"%s_DELETED_CACHE\" WHERE doc_id = :doc_id2;\n";
+
+static const char* fts_end_delete_sql =
+ "BEGIN\n"
+ "\n"
+ "DELETE FROM \"%s_BEING_DELETED\";\n"
+ "DELETE FROM \"%s_BEING_DELETED_CACHE\";\n";
+
+/**********************************************************************//**
+Initialize fts_zip_t. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+ fts_zip_t* zip) /*!< out: zip instance to initialize */
+{
+ zip->pos = 0;
+ zip->n_words = 0;
+
+ zip->status = Z_OK;
+
+ zip->last_big_block = 0;
+
+ zip->word.f_len = 0;
+ memset(zip->word.f_str, 0, FTS_MAX_WORD_LEN);
+
+ ib_vector_reset(zip->blocks);
+
+ memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Create an instance of fts_zip_t.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+ mem_heap_t* heap, /*!< in: heap */
+ ulint block_sz, /*!< in: size of a zip block.*/
+ ulint max_words) /*!< in: max words to read */
+{
+ fts_zip_t* zip;
+
+ zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip)));
+
+ zip->word.f_str = static_cast<byte*>(
+ mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+ zip->block_sz = block_sz;
+
+ zip->heap_alloc = ib_heap_allocator_create(heap);
+
+ zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128);
+
+ zip->max_words = max_words;
+
+ zip->zp = static_cast<z_stream*>(
+ mem_heap_zalloc(heap, sizeof(*zip->zp)));
+
+ return(zip);
+}
+
+/**********************************************************************//**
+Initialize an instance of fts_zip_t. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+ fts_zip_t* zip) /*!< in: zip instance to init */
+{
+ memset(zip->zp, 0, sizeof(*zip->zp));
+
+ zip->word.f_len = 0;
+ *zip->word.f_str = '\0';
+}
+
+/**********************************************************************//**
+Initialize an fts_word_t instance.
+@return the initialized instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+ fts_word_t* word, /*!< in: word to initialize */
+ byte* utf8, /*!< in: UTF-8 string */
+ ulint len) /*!< in: length of string in bytes */
+{
+ mem_heap_t* heap = mem_heap_create(sizeof(fts_node_t));
+
+ memset(word, 0, sizeof(*word));
+
+ word->text.f_len = len;
+ word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+
+ /* Need to copy the NUL character too. */
+ memcpy(word->text.f_str, utf8, word->text.f_len);
+ word->text.f_str[word->text.f_len] = 0;
+
+ word->heap_alloc = ib_heap_allocator_create(heap);
+
+ word->nodes = ib_vector_create(
+ word->heap_alloc, sizeof(fts_node_t), FTS_WORD_NODES_INIT_SIZE);
+
+ return(word);
+}
+
+/**********************************************************************//**
+Read the FTS INDEX row.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+ fts_word_t* word, /*!< in: word to fill with nodes */
+ que_node_t* exp) /*!< in: select list of the row */
+{
+ int i;
+ fts_node_t* node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ /* Start from 1 since the first column has been read by the caller */
+ for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT */
+ switch (i) {
+ case 1: /* DOC_COUNT */
+ node->doc_count = mach_read_from_4(data);
+ break;
+
+ case 2: /* FIRST_DOC_ID */
+ node->first_doc_id = fts_read_doc_id(data);
+ break;
+
+ case 3: /* LAST_DOC_ID */
+ node->last_doc_id = fts_read_doc_id(data);
+ break;
+
+ case 4: /* ILIST */
+ node->ilist_size_alloc = node->ilist_size = len;
+ node->ilist = static_cast<byte*>(ut_malloc(len));
+ memcpy(node->ilist, data, len);
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ /* Make sure all columns were read. */
+ ut_a(i == 5);
+
+ return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to fetch more rows, FALSE when the cache limit is hit */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ fts_word_t* word;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
+ ib_vector_t* words = static_cast<ib_vector_t*>(fetch->read_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint dfield_len = dfield_get_len(dfield);
+ fts_node_t* node;
+ bool is_word_init = false;
+
+ ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+ if (ib_vector_size(words) == 0) {
+
+ word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+ fts_word_init(word, (byte*) data, dfield_len);
+ is_word_init = true;
+ }
+
+ word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+ if (dfield_len != word->text.f_len
+ || memcmp(word->text.f_str, data, dfield_len)) {
+
+ word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+ fts_word_init(word, (byte*) data, dfield_len);
+ is_word_init = true;
+ }
+
+ node = fts_optimize_read_node(word, que_node_get_next(exp));
+
+ fetch->total_memory += node->ilist_size;
+ if (is_word_init) {
+ fetch->total_memory += sizeof(fts_word_t)
+ + sizeof(ib_alloc_t) + sizeof(ib_vector_t) + dfield_len
+ + sizeof(fts_node_t) * FTS_WORD_NODES_INIT_SIZE;
+ } else if (ib_vector_size(words) > FTS_WORD_NODES_INIT_SIZE) {
+ fetch->total_memory += sizeof(fts_node_t);
+ }
+
+ if (fetch->total_memory >= fts_result_cache_limit) {
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: table of the FTS INDEX */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+{
+ pars_info_t* info;
+ dberr_t error;
+
+ trx->op_info = "fetching FTS index nodes";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ if (!*graph) {
+ ulint selected;
+
+ ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+ selected = fts_select_index(fts_table->charset,
+ word->f_str, word->f_len);
+
+ fts_table->suffix = fts_get_suffix(selected);
+
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word, doc_count, first_doc_id, last_doc_id, "
+ "ilist\n"
+ " FROM \"%s\"\n"
+ " WHERE word LIKE :word\n"
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for (;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+
+ break; /* Exit the loop. */
+ } else {
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS index. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while reading FTS index.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Read a word from the zip state.
+@return pointer to the uncompressed word, or NULL on error/end of stream */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+ fts_zip_t* zip, /*!< in: Zip state + data */
+ fts_string_t* word) /*!< out: uncompressed word */
+{
+#ifdef UNIV_DEBUG
+ ulint i;
+#endif
+ byte len = 0;
+ void* null = NULL;
+ byte* ptr = word->f_str;
+ int flush = Z_NO_FLUSH;
+
+ /* Either there was an error or we are at the Z_STREAM_END. */
+ if (zip->status != Z_OK) {
+ return(NULL);
+ }
+
+ zip->zp->next_out = &len;
+ zip->zp->avail_out = sizeof(len);
+
+ while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+ /* Finished decompressing block. */
+ if (zip->zp->avail_in == 0) {
+
+ /* Free the block that has been decompressed. */
+ if (zip->pos > 0) {
+ ulint prev = zip->pos - 1;
+
+ ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+ ut_free(ib_vector_getp(zip->blocks, prev));
+ ib_vector_set(zip->blocks, prev, &null);
+ }
+
+ /* Any more blocks to decompress. */
+ if (zip->pos < ib_vector_size(zip->blocks)) {
+
+ zip->zp->next_in = static_cast<byte*>(
+ ib_vector_getp(
+ zip->blocks, zip->pos));
+
+ if (zip->pos > zip->last_big_block) {
+ zip->zp->avail_in =
+ FTS_MAX_WORD_LEN;
+ } else {
+ zip->zp->avail_in = static_cast<uInt>(zip->block_sz);
+ }
+
+ ++zip->pos;
+ } else {
+ flush = Z_FINISH;
+ }
+ }
+
+ switch (zip->status = inflate(zip->zp, flush)) {
+ case Z_OK:
+ if (zip->zp->avail_out == 0 && len > 0) {
+
+ ut_a(len <= FTS_MAX_WORD_LEN);
+ ptr[len] = 0;
+
+ zip->zp->next_out = ptr;
+ zip->zp->avail_out = len;
+
+ word->f_len = len;
+ len = 0;
+ }
+ break;
+
+ case Z_BUF_ERROR: /* No progress possible. */
+ case Z_STREAM_END:
+ inflateEnd(zip->zp);
+ break;
+
+ case Z_STREAM_ERROR:
+ default:
+ ut_error;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ /* All blocks must be freed at end of inflate. */
+ if (zip->status != Z_OK) {
+ for (i = 0; i < ib_vector_size(zip->blocks); ++i) {
+ if (ib_vector_getp(zip->blocks, i)) {
+ ut_free(ib_vector_getp(zip->blocks, i));
+ ib_vector_set(zip->blocks, i, &null);
+ }
+ }
+ }
+
+ if (ptr != NULL) {
+ ut_ad(word->f_len == strlen((char*) ptr));
+ }
+#endif /* UNIV_DEBUG */
+
+ return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL);
+}
+
+/**********************************************************************//**
+Callback function to fetch and compress the word in an FTS
+INDEX record.
+@return FALSE on EOF */
+static
+ibool
+fts_fetch_index_words(
+/*==================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_zip_t* zip = static_cast<fts_zip_t*>(user_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ byte len = (byte) dfield_get_len(dfield);
+ void* data = dfield_get_data(dfield);
+
+ /* Skip the duplicate words. */
+ if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) {
+
+ return(TRUE);
+ }
+
+ ut_a(len <= FTS_MAX_WORD_LEN);
+
+ memcpy(zip->word.f_str, data, len);
+ zip->word.f_len = len;
+
+ ut_a(zip->zp->avail_in == 0);
+ ut_a(zip->zp->next_in == NULL);
+
+ /* The string is prefixed by len. */
+ zip->zp->next_in = &len;
+ zip->zp->avail_in = sizeof(len);
+
+ /* Compress the word, create output blocks as necessary. */
+ while (zip->zp->avail_in > 0) {
+
+ /* No space left in output buffer, create a new one. */
+ if (zip->zp->avail_out == 0) {
+ byte* block;
+
+ block = static_cast<byte*>(ut_malloc(zip->block_sz));
+ ib_vector_push(zip->blocks, &block);
+
+ zip->zp->next_out = block;
+ zip->zp->avail_out = static_cast<uInt>(zip->block_sz);
+ }
+
+ switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) {
+ case Z_OK:
+ if (zip->zp->avail_in == 0) {
+ zip->zp->next_in = static_cast<byte*>(data);
+ zip->zp->avail_in = len;
+ ut_a(len <= FTS_MAX_WORD_LEN);
+ len = 0;
+ }
+ break;
+
+ case Z_STREAM_END:
+ case Z_BUF_ERROR:
+ case Z_STREAM_ERROR:
+ default:
+ ut_error;
+ break;
+ }
+ }
+
+ /* All data should have been compressed. */
+ ut_a(zip->zp->avail_in == 0);
+ zip->zp->next_in = NULL;
+
+ ++zip->n_words;
+
+ return(zip->n_words >= zip->max_words ? FALSE : TRUE);
+}
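+
+/* Sketch of the uncompressed stream fed to deflate() above: each word
+is written as a single length byte followed by the word bytes, so the
+words "cat" and "dog" produce
+ 0x03 'c' 'a' 't' 0x03 'd' 'o' 'g'
+fts_zip_read_word() reverses this, inflating one length byte and then
+that many word bytes per call. */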
+
+/**********************************************************************//**
+Finish Zip deflate. */
+static
+void
+fts_zip_deflate_end(
+/*================*/
+ fts_zip_t* zip) /*!< in: instance that should be closed*/
+{
+ ut_a(zip->zp->avail_in == 0);
+ ut_a(zip->zp->next_in == NULL);
+
+ zip->status = deflate(zip->zp, Z_FINISH);
+
+ ut_a(ib_vector_size(zip->blocks) > 0);
+ zip->last_big_block = ib_vector_size(zip->blocks) - 1;
+
+ /* Allocate smaller block(s), since this is trailing data. */
+ while (zip->status == Z_OK) {
+ byte* block;
+
+ ut_a(zip->zp->avail_out == 0);
+
+ block = static_cast<byte*>(ut_malloc(FTS_MAX_WORD_LEN + 1));
+ ib_vector_push(zip->blocks, &block);
+
+ zip->zp->next_out = block;
+ zip->zp->avail_out = FTS_MAX_WORD_LEN;
+
+ zip->status = deflate(zip->zp, Z_FINISH);
+ }
+
+ ut_a(zip->status == Z_STREAM_END);
+
+ zip->status = deflateEnd(zip->zp);
+ ut_a(zip->status == Z_OK);
+
+ /* Reset the ZLib data structure. */
+ memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Read the words from the FTS INDEX.
+@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if there are no more
+ indexes to search, else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_index_fetch_words(
+/*==================*/
+ fts_optimize_t* optim, /*!< in: optimize scratch pad */
+ const fts_string_t* word, /*!< in: get words greater than this
+ word */
+ ulint n_words)/*!< in: max words to read */
+{
+ pars_info_t* info;
+ que_t* graph;
+ ulint selected;
+ fts_zip_t* zip = NULL;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+ ibool inited = FALSE;
+
+ optim->trx->op_info = "fetching FTS index words";
+
+ if (optim->zip == NULL) {
+ optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words);
+ } else {
+ fts_zip_initialize(optim->zip);
+ }
+
+ for (selected = fts_select_index(
+ optim->fts_index_table.charset, word->f_str, word->f_len);
+ fts_index_selector[selected].value;
+ selected++) {
+
+ optim->fts_index_table.suffix = fts_get_suffix(selected);
+
+ /* We have searched all indexes. */
+ if (optim->fts_index_table.suffix == NULL) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ info = pars_info_create();
+
+ pars_info_bind_function(
+ info, "my_func", fts_fetch_index_words, optim->zip);
+
+ pars_info_bind_varchar_literal(
+ info, "word", word->f_str, word->f_len);
+
+ graph = fts_parse_sql(
+ &optim->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word\n"
+ " FROM \"%s\"\n"
+ " WHERE word > :word\n"
+ " ORDER BY word;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ zip = optim->zip;
+
+ for (;;) {
+ int err;
+
+ if (!inited && ((err = deflateInit(zip->zp, 9))
+ != Z_OK)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: ZLib deflateInit() "
+ "failed: %d\n", err);
+
+ error = DB_ERROR;
+ break;
+ } else {
+ inited = TRUE;
+ error = fts_eval_sql(optim->trx, graph);
+ }
+
+ if (error == DB_SUCCESS) {
+ //FIXME fts_sql_commit(optim->trx);
+ break;
+ } else {
+ //FIXME fts_sql_rollback(optim->trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: "
+ "Warning: lock wait "
+ "timeout reading document. "
+ "Retrying!\n");
+
+ /* We need to reset the ZLib state. */
+ inited = FALSE;
+ deflateEnd(zip->zp);
+ fts_zip_init(zip);
+
+ optim->trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "while reading document.\n",
+ ut_strerr(error));
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ /* Check if the max number of words to fetch is exceeded */
+ if (optim->zip->n_words >= n_words) {
+ break;
+ }
+ }
+
+ if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) {
+
+ /* All data should have been read. */
+ ut_a(zip->zp->avail_in == 0);
+
+ fts_zip_deflate_end(zip);
+ } else {
+ deflateEnd(zip->zp);
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Callback function to fetch the doc id from the record.
+@return always returns TRUE */
+static
+ibool
+fts_fetch_doc_ids(
+/*==============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to ib_vector_t */
+{
+ que_node_t* exp;
+ int i = 0;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_doc_ids_t* fts_doc_ids = static_cast<fts_doc_ids_t*>(user_arg);
+ fts_update_t* update = static_cast<fts_update_t*>(
+ ib_vector_push(fts_doc_ids->doc_ids, NULL));
+
+ for (exp = sel_node->select_list;
+ exp;
+ exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT. */
+ switch (i) {
+ case 0: /* DOC_ID */
+ update->fts_indexes = NULL;
+ update->doc_id = fts_read_doc_id(
+ static_cast<byte*>(data));
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Read the rows from a FTS common auxiliary table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: table */
+ fts_doc_ids_t* doc_ids) /*!< in: For collecting doc ids */
+{
+ dberr_t error;
+ que_t* graph;
+ pars_info_t* info = pars_info_create();
+ ibool alloc_bk_trx = FALSE;
+
+ ut_a(fts_table->suffix != NULL);
+ ut_a(fts_table->type == FTS_COMMON_TABLE);
+
+ if (!trx) {
+ trx = trx_allocate_for_background();
+ alloc_bk_trx = TRUE;
+ }
+
+ trx->op_info = "fetching FTS doc ids";
+
+ pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_id FROM \"%s\";\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ error = fts_eval_sql(trx, graph);
+
+ mutex_enter(&dict_sys->mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys->mutex);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+
+ ib_vector_sort(doc_ids->doc_ids, fts_update_doc_id_cmp);
+ } else {
+ fts_sql_rollback(trx);
+ }
+
+ if (alloc_bk_trx) {
+ trx_free_for_background(trx);
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Do a binary search for a doc id in the array.
+@return +ve index if found, -ve of the index where it should be
+ inserted if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+ fts_update_t* array, /*!< in: array to sort */
+ int lower, /*!< in: the array lower bound */
+ int upper, /*!< in: the array upper bound */
+ doc_id_t doc_id) /*!< in: the doc id to search for */
+{
+ int orig_size = upper;
+
+ if (upper == 0) {
+ /* Nothing to search */
+ return(-1);
+ } else {
+ while (lower < upper) {
+ int i = (lower + upper) >> 1;
+
+ if (doc_id > array[i].doc_id) {
+ lower = i + 1;
+ } else if (doc_id < array[i].doc_id) {
+ upper = i - 1;
+ } else {
+ return(i); /* Found. */
+ }
+ }
+ }
+
+ if (lower == upper && lower < orig_size) {
+ if (doc_id == array[lower].doc_id) {
+ return(lower);
+ } else if (lower == 0) {
+ return(-1);
+ }
+ }
+
+ /* Not found. */
+ return( (lower == 0) ? -1 : -lower);
+}
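+
+/* Worked example (illustrative): for the sorted doc id array
+{10, 20, 30} with lower == 0 and upper == 3:
+ fts_bsearch(array, 0, 3, 20) == 1 (found at index 1)
+ fts_bsearch(array, 0, 3, 25) == -2 (insert before index 2)
+ fts_bsearch(array, 0, 3, 5) == -1 (smaller than all entries) */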
+
+/**********************************************************************//**
+Search the to-delete array for any doc ids within
+the [first, last] range that are to be deleted.
+@return +ve index if found, -ve of the index where it should be
+ inserted if not found */
+static
+int
+fts_optimize_lookup(
+/*================*/
+ ib_vector_t* doc_ids, /*!< in: array to search */
+ ulint lower, /*!< in: lower limit of array */
+ doc_id_t first_doc_id, /*!< in: doc id to lookup */
+ doc_id_t last_doc_id) /*!< in: doc id to lookup */
+{
+ int pos;
+ int upper = static_cast<int>(ib_vector_size(doc_ids));
+ fts_update_t* array = (fts_update_t*) doc_ids->data;
+
+ pos = fts_bsearch(array, static_cast<int>(lower), upper, first_doc_id);
+
+ ut_a(abs(pos) <= upper + 1);
+
+ if (pos < 0) {
+
+ int i = abs(pos);
+
+ /* If i is 1, first_doc_id could be less than
+ either the first or the second array item; do a
+ double check */
+ if (i == 1 && array[0].doc_id <= last_doc_id
+ && first_doc_id < array[0].doc_id) {
+ pos = 0;
+ } else if (i < upper && array[i].doc_id <= last_doc_id) {
+
+ /* Check if the "next" doc id is within the
+ first & last doc id of the node. */
+ pos = i;
+ }
+ }
+
+ return(pos);
+}
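+
+/* Worked example (illustrative): with a to-delete array {5, 15},
+lower == 0 and a node covering doc ids [10, 20], fts_bsearch()
+returns -1 for doc id 10, and the range check above then yields 1,
+the index of 15, the first to-delete doc id inside the node. */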
+
+/**********************************************************************//**
+Encode the word pos list into the node
+@return DB_SUCCESS or error code*/
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_encode_node(
+/*=====================*/
+ fts_node_t* node, /*!< in: node to fill*/
+ doc_id_t doc_id, /*!< in: doc id to encode */
+ fts_encode_t* enc) /*!< in: encoding state.*/
+{
+ byte* dst;
+ ulint enc_len;
+ ulint pos_enc_len;
+ doc_id_t doc_id_delta;
+ dberr_t error = DB_SUCCESS;
+ byte* src = enc->src_ilist_ptr;
+
+ if (node->first_doc_id == 0) {
+ ut_a(node->last_doc_id == 0);
+
+ node->first_doc_id = doc_id;
+ }
+
+ /* Calculate the space required to store the ilist. */
+ doc_id_delta = doc_id - node->last_doc_id;
+ enc_len = fts_get_encoded_len(static_cast<ulint>(doc_id_delta));
+
+ /* Calculate the size of the encoded pos array. */
+ while (*src) {
+ fts_decode_vlc(&src);
+ }
+
+ /* Skip the 0x00 byte at the end of the word positions list. */
+ ++src;
+
+ /* Number of encoded pos bytes to copy. */
+ pos_enc_len = src - enc->src_ilist_ptr;
+
+ /* Total number of bytes required for copy. */
+ enc_len += pos_enc_len;
+
+ /* Check we have enough space in the destination buffer for
+ copying the document word list. */
+ if (!node->ilist) {
+ ulint new_size;
+
+ ut_a(node->ilist_size == 0);
+
+ new_size = enc_len > FTS_ILIST_MAX_SIZE
+ ? enc_len : FTS_ILIST_MAX_SIZE;
+
+ node->ilist = static_cast<byte*>(ut_malloc(new_size));
+ node->ilist_size_alloc = new_size;
+
+ } else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) {
+ ulint new_size = node->ilist_size + enc_len;
+ byte* ilist = static_cast<byte*>(ut_malloc(new_size));
+
+ memcpy(ilist, node->ilist, node->ilist_size);
+
+ ut_free(node->ilist);
+
+ node->ilist = ilist;
+ node->ilist_size_alloc = new_size;
+ }
+
+ src = enc->src_ilist_ptr;
+ dst = node->ilist + node->ilist_size;
+
+	/* Encode the doc id. Cast to ulint; the delta should be small,
+	so there is no loss of precision. */
+ dst += fts_encode_int((ulint) doc_id_delta, dst);
+
+ /* Copy the encoded pos array. */
+ memcpy(dst, src, pos_enc_len);
+
+ node->last_doc_id = doc_id;
+
+	/* Data copied up to here. */
+ node->ilist_size += enc_len;
+ enc->src_ilist_ptr += pos_enc_len;
+
+ ut_a(node->ilist_size <= node->ilist_size_alloc);
+
+ return(error);
+}
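+
+/* Byte layout of the ilist built above (a sketch added for clarity,
+not part of the original source). Each document contributes one entry:
+
+	[vlc(doc id delta)][vlc(pos)]...[vlc(pos)][0x00]
+
+The doc id is stored as a variable-length-coded delta from the previous
+doc id in the node, followed by the vlc-coded word positions; each
+position list ends with the single 0x00 byte that the "Skip the 0x00
+byte" step above steps over. */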
+
+/**********************************************************************//**
+Optimize the data contained in a node.
+@return DB_SUCCESS or error code*/
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_node(
+/*==============*/
+ ib_vector_t* del_vec, /*!< in: vector of doc ids to delete*/
+ int* del_pos, /*!< in: offset into above vector */
+ fts_node_t* dst_node, /*!< in: node to fill*/
+ fts_node_t* src_node, /*!< in: source node for data*/
+ fts_encode_t* enc) /*!< in: encoding state */
+{
+ ulint copied;
+ dberr_t error = DB_SUCCESS;
+ doc_id_t doc_id = enc->src_last_doc_id;
+
+ if (!enc->src_ilist_ptr) {
+ enc->src_ilist_ptr = src_node->ilist;
+ }
+
+ copied = enc->src_ilist_ptr - src_node->ilist;
+
+	/* While there is data in the source node and space left
+	in the destination node. */
+ while (copied < src_node->ilist_size
+ && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) {
+
+ doc_id_t delta;
+ doc_id_t del_doc_id = FTS_NULL_DOC_ID;
+
+ delta = fts_decode_vlc(&enc->src_ilist_ptr);
+
+test_again:
+	/* Check whether the doc id is in the delete list; if
+	so, we skip its entries, but we still need to track the
+	delta in order to decode the entries that follow this
+	document's. */
+ if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) {
+ fts_update_t* update;
+
+ update = (fts_update_t*) ib_vector_get(
+ del_vec, *del_pos);
+
+ del_doc_id = update->doc_id;
+ }
+
+ if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) {
+ ut_a(delta == src_node->first_doc_id);
+ }
+
+ doc_id += delta;
+
+ if (del_doc_id > 0 && doc_id == del_doc_id) {
+
+ ++*del_pos;
+
+ /* Skip the entries for this document. */
+ while (*enc->src_ilist_ptr) {
+ fts_decode_vlc(&enc->src_ilist_ptr);
+ }
+
+ /* Skip the end of word position marker. */
+ ++enc->src_ilist_ptr;
+
+ } else {
+
+			/* The doc id has already grown larger than
+			del_doc_id; check the next del_doc_id. */
+ if (del_doc_id > 0 && doc_id > del_doc_id) {
+ del_doc_id = 0;
+ ++*del_pos;
+ delta = 0;
+ goto test_again;
+ }
+
+ /* Decode and copy the word positions into
+ the dest node. */
+ fts_optimize_encode_node(dst_node, doc_id, enc);
+
+ ++dst_node->doc_count;
+
+ ut_a(dst_node->last_doc_id == doc_id);
+ }
+
+		/* Bytes copied so far from the source. */
+ copied = enc->src_ilist_ptr - src_node->ilist;
+ }
+
+ if (copied >= src_node->ilist_size) {
+ ut_a(doc_id == src_node->last_doc_id);
+ }
+
+ enc->src_last_doc_id = doc_id;
+
+ return(error);
+}
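+
+/* In short (a summary added for clarity, not part of the original
+source): fts_optimize_node() walks the source ilist one document entry
+at a time. An entry whose doc id matches the next id in del_vec is
+consumed but not copied; every other entry is re-encoded into dst_node
+via fts_optimize_encode_node(), until either the source is exhausted
+or dst_node reaches FTS_ILIST_MAX_SIZE. */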
+
+/**********************************************************************//**
+Determine the starting pos within the deleted doc id vector for a word.
+@return delete position */
+static __attribute__((nonnull, warn_unused_result))
+int
+fts_optimize_deleted_pos(
+/*=====================*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ fts_word_t* word) /*!< in: the word data to check */
+{
+ int del_pos;
+ ib_vector_t* del_vec = optim->to_delete->doc_ids;
+
+	/* Get the first and last doc ids for the word; we will use
+	these values to determine which doc ids need to be removed
+	when we coalesce the nodes. This way we can reduce the number
+	of elements that need to be searched in the deleted doc ids
+	vector and, secondly, we can remove the doc ids during the
+	coalescing phase. */
+ if (ib_vector_size(del_vec) > 0) {
+ fts_node_t* node;
+ doc_id_t last_id;
+ doc_id_t first_id;
+ ulint size = ib_vector_size(word->nodes);
+
+ node = (fts_node_t*) ib_vector_get(word->nodes, 0);
+ first_id = node->first_doc_id;
+
+ node = (fts_node_t*) ib_vector_get(word->nodes, size - 1);
+ last_id = node->last_doc_id;
+
+ ut_a(first_id <= last_id);
+
+ del_pos = fts_optimize_lookup(
+ del_vec, optim->del_pos, first_id, last_id);
+ } else {
+
+ del_pos = -1; /* Note that there is nothing to delete. */
+ }
+
+ return(del_pos);
+}
+
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word; we also remove any deleted doc ids during
+the compaction pass.
+@return vector of the optimized nodes */
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ fts_word_t* word) /*!< in: the word to optimize */
+{
+ fts_encode_t enc;
+ ib_vector_t* nodes;
+ ulint i = 0;
+ int del_pos;
+ fts_node_t* dst_node = NULL;
+ ib_vector_t* del_vec = optim->to_delete->doc_ids;
+ ulint size = ib_vector_size(word->nodes);
+
+ del_pos = fts_optimize_deleted_pos(optim, word);
+ nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+ enc.src_last_doc_id = 0;
+ enc.src_ilist_ptr = NULL;
+
+ if (fts_enable_diag_print) {
+ word->text.f_str[word->text.f_len] = 0;
+ fprintf(stderr, "FTS_OPTIMIZE: optimize \"%s\"\n",
+ word->text.f_str);
+ }
+
+ while (i < size) {
+ ulint copied;
+ fts_node_t* src_node;
+
+ src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+ if (!dst_node) {
+
+ dst_node = static_cast<fts_node_t*>(
+ ib_vector_push(nodes, NULL));
+ memset(dst_node, 0, sizeof(*dst_node));
+ }
+
+ /* Copy from the src to the dst node. */
+ fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+ ut_a(enc.src_ilist_ptr != NULL);
+
+		/* Determine the number of bytes copied to dst_node. */
+ copied = enc.src_ilist_ptr - src_node->ilist;
+
+		/* Can't copy more than what's in the vlc array. */
+ ut_a(copied <= src_node->ilist_size);
+
+		/* We are done with this node; release the resources. */
+ if (copied == src_node->ilist_size) {
+
+ enc.src_last_doc_id = 0;
+ enc.src_ilist_ptr = NULL;
+
+ ut_free(src_node->ilist);
+
+ src_node->ilist = NULL;
+ src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+ src_node = NULL;
+
+ ++i; /* Get next source node to OPTIMIZE. */
+ }
+
+ if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+ dst_node = NULL;
+ }
+ }
+
+ /* All dst nodes created should have been added to the vector. */
+ ut_a(dst_node == NULL);
+
+ /* Return the OPTIMIZED nodes. */
+ return(nodes);
+}
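+
+/* A note on the loop above (added for clarity, not part of the
+original source): a source node is released, and i advanced, only once
+its ilist has been fully consumed, while a destination node is closed
+(dst_node = NULL, so a fresh one is pushed on the next iteration) once
+it fills up to FTS_ILIST_MAX_SIZE. One source node can therefore be
+split across several destination nodes, and several small source nodes
+can be coalesced into one destination node. */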
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_write_word(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: table of FTS index */
+ fts_string_t* word, /*!< in: word data to write */
+ ib_vector_t* nodes) /*!< in: the nodes to write */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ ulint selected;
+ dberr_t error = DB_SUCCESS;
+ char* table_name = fts_get_table_name(fts_table);
+
+ info = pars_info_create();
+
+ ut_ad(fts_table->charset);
+
+ if (fts_enable_diag_print) {
+ fprintf(stderr, "FTS_OPTIMIZE: processed \"%s\"\n",
+ word->f_str);
+ }
+
+ pars_info_bind_varchar_literal(
+ info, "word", word->f_str, word->f_len);
+
+ selected = fts_select_index(fts_table->charset,
+ word->f_str, word->f_len);
+
+ fts_table->suffix = fts_get_suffix(selected);
+
+ graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN DELETE FROM \"%s\" WHERE word = :word;");
+
+ error = fts_eval_sql(trx, graph);
+
+ if (error != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) during optimize, "
+ "when deleting a word from the FTS index.\n",
+ ut_strerr(error));
+ }
+
+ fts_que_graph_free(graph);
+ graph = NULL;
+
+ mem_free(table_name);
+
+ /* Even if the operation needs to be rolled back and redone,
+ we iterate over the nodes in order to free the ilist. */
+ for (i = 0; i < ib_vector_size(nodes); ++i) {
+
+ fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i);
+
+ if (error == DB_SUCCESS) {
+ error = fts_write_node(
+ trx, &graph, fts_table, word, node);
+
+ if (error != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: (%s) "
+ "during optimize, while adding a "
+ "word to the FTS index.\n",
+ ut_strerr(error));
+ }
+ }
+
+ ut_free(node->ilist);
+ node->ilist = NULL;
+ node->ilist_size = node->ilist_size_alloc = 0;
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free(graph);
+ }
+
+ return(error);
+}
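+
+/* A note on the flow above (added for clarity, not part of the
+original source): the word is first deleted from the auxiliary index
+table chosen by fts_select_index(), then the optimized nodes are
+re-inserted one by one through fts_write_node(). The node ilists are
+freed unconditionally so that nothing leaks even when the operation
+has to be rolled back and redone. */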
+
+/**********************************************************************//**
+Free an fts_word_t instance.*/
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+ fts_word_t* word) /*!< in: instance to free.*/
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(word->heap_alloc->arg);
+
+#ifdef UNIV_DEBUG
+ memset(word, 0, sizeof(*word));
+#endif /* UNIV_DEBUG */
+
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Optimize the word ilist and rewrite data to the FTS index.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_compact(
+/*=================*/
+ fts_optimize_t* optim, /*!< in: optimize state data */
+ dict_index_t* index, /*!< in: current FTS being optimized */
+ ib_time_t start_time) /*!< in: optimize start time */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ ulint size = ib_vector_size(optim->words);
+
+ for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) {
+ fts_word_t* word;
+ ib_vector_t* nodes;
+ trx_t* trx = optim->trx;
+
+ word = (fts_word_t*) ib_vector_get(optim->words, i);
+
+		/* nodes is allocated from the word heap and will be destroyed
+		when the word is freed. We, however, have to be careful about
+		the ilist, which needs to be freed explicitly. */
+ nodes = fts_optimize_word(optim, word);
+
+ /* Update the data on disk. */
+ error = fts_optimize_write_word(
+ trx, &optim->fts_index_table, &word->text, nodes);
+
+ if (error == DB_SUCCESS) {
+ /* Write the last word optimized to the config table,
+ we use this value for restarting optimize. */
+ error = fts_config_set_index_value(
+ optim->trx, index,
+ FTS_LAST_OPTIMIZED_WORD, &word->text);
+ }
+
+ /* Free the word that was optimized. */
+ fts_word_free(word);
+
+ if (fts_optimize_time_limit > 0
+ && (ut_time() - start_time) > fts_optimize_time_limit) {
+
+ optim->done = TRUE;
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Create an instance of fts_optimize_t. Also create a new
+background transaction.*/
+static
+fts_optimize_t*
+fts_optimize_create(
+/*================*/
+ dict_table_t* table) /*!< in: table with FTS indexes */
+{
+ fts_optimize_t* optim;
+ mem_heap_t* heap = mem_heap_create(128);
+
+ optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim));
+
+ optim->self_heap = ib_heap_allocator_create(heap);
+
+ optim->to_delete = fts_doc_ids_create();
+
+ optim->words = ib_vector_create(
+ optim->self_heap, sizeof(fts_word_t), 256);
+
+ optim->table = table;
+
+ optim->trx = trx_allocate_for_background();
+
+ optim->fts_common_table.parent = table->name;
+ optim->fts_common_table.table_id = table->id;
+ optim->fts_common_table.type = FTS_COMMON_TABLE;
+ optim->fts_common_table.table = table;
+
+ optim->fts_index_table.parent = table->name;
+ optim->fts_index_table.table_id = table->id;
+ optim->fts_index_table.type = FTS_INDEX_TABLE;
+ optim->fts_index_table.table = table;
+
+ /* The common prefix for all this parent table's aux tables. */
+ optim->name_prefix = fts_get_table_name_prefix(
+ &optim->fts_common_table);
+
+ return(optim);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/**********************************************************************//**
+Get optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_start_time(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ ib_time_t* start_time) /*!< out: time in secs */
+{
+ return(fts_config_get_index_ulint(
+ trx, index, FTS_OPTIMIZE_START_TIME,
+ (ulint*) start_time));
+}
+
+/**********************************************************************//**
+Set the optimize start time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_start_time(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ ib_time_t start_time) /*!< in: start time */
+{
+ return(fts_config_set_index_ulint(
+ trx, index, FTS_OPTIMIZE_START_TIME,
+ (ulint) start_time));
+}
+
+/**********************************************************************//**
+Get optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_get_index_end_time(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ ib_time_t* end_time) /*!< out: time in secs */
+{
+ return(fts_config_get_index_ulint(
+ trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time));
+}
+
+/**********************************************************************//**
+Set the optimize end time of an FTS index.
+@return DB_SUCCESS if all OK else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_set_index_end_time(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ ib_time_t end_time) /*!< in: end time */
+{
+ return(fts_config_set_index_ulint(
+ trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time));
+}
+#endif
+
+/**********************************************************************//**
+Free the optimize prepared statements.*/
+static
+void
+fts_optimize_graph_free(
+/*====================*/
+ fts_optimize_graph_t* graph) /*!< in/out: The graph instances
+ to free */
+{
+ if (graph->commit_graph) {
+ que_graph_free(graph->commit_graph);
+ graph->commit_graph = NULL;
+ }
+
+ if (graph->write_nodes_graph) {
+ que_graph_free(graph->write_nodes_graph);
+ graph->write_nodes_graph = NULL;
+ }
+
+ if (graph->delete_nodes_graph) {
+ que_graph_free(graph->delete_nodes_graph);
+ graph->delete_nodes_graph = NULL;
+ }
+
+ if (graph->read_nodes_graph) {
+ que_graph_free(graph->read_nodes_graph);
+ graph->read_nodes_graph = NULL;
+ }
+}
+
+/**********************************************************************//**
+Free all optimize resources. */
+static
+void
+fts_optimize_free(
+/*==============*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance to free */
+{
+ mem_heap_t* heap = static_cast<mem_heap_t*>(optim->self_heap->arg);
+
+ trx_free_for_background(optim->trx);
+
+ fts_doc_ids_free(optim->to_delete);
+ fts_optimize_graph_free(&optim->graph);
+
+ mem_free(optim->name_prefix);
+
+ /* This will free the heap from which optim itself was allocated. */
+ mem_heap_free(heap);
+}
+
+/**********************************************************************//**
+Get the max time optimize should run in millisecs.
+@return max optimize time limit in millisecs. */
+static
+ib_time_t
+fts_optimize_get_time_limit(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table) /*!< in: aux table */
+{
+ ib_time_t time_limit = 0;
+
+ fts_config_get_ulint(
+ trx, fts_table,
+ FTS_OPTIMIZE_LIMIT_IN_SECS, (ulint*) &time_limit);
+
+ return(time_limit * 1000);
+}
+
+
+/**********************************************************************//**
+Run OPTIMIZE on the given table. Note: this can take a very long time
+(hours). */
+static
+void
+fts_optimize_words(
+/*===============*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index, /*!< in: current FTS being optimized */
+ fts_string_t* word) /*!< in: the starting word to optimize */
+{
+ fts_fetch_t fetch;
+ ib_time_t start_time;
+ que_t* graph = NULL;
+ CHARSET_INFO* charset = optim->fts_index_table.charset;
+
+ ut_a(!optim->done);
+
+ /* Get the time limit from the config table. */
+ fts_optimize_time_limit = fts_optimize_get_time_limit(
+ optim->trx, &optim->fts_common_table);
+
+ start_time = ut_time();
+
+ /* Setup the callback to use for fetching the word ilist etc. */
+ fetch.read_arg = optim->words;
+ fetch.read_record = fts_optimize_index_fetch_node;
+
+ fprintf(stderr, "%.*s\n", (int) word->f_len, word->f_str);
+
+	while (!optim->done) {
+ dberr_t error;
+ trx_t* trx = optim->trx;
+ ulint selected;
+
+ ut_a(ib_vector_size(optim->words) == 0);
+
+ selected = fts_select_index(charset, word->f_str, word->f_len);
+
+ /* Read the index records to optimize. */
+ fetch.total_memory = 0;
+ error = fts_index_fetch_nodes(
+ trx, &graph, &optim->fts_index_table, word,
+ &fetch);
+ ut_ad(fetch.total_memory < fts_result_cache_limit);
+
+ if (error == DB_SUCCESS) {
+ /* There must be some nodes to read. */
+ ut_a(ib_vector_size(optim->words) > 0);
+
+ /* Optimize the nodes that were read and write
+ back to DB. */
+ error = fts_optimize_compact(optim, index, start_time);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+ }
+
+ ib_vector_reset(optim->words);
+
+ if (error == DB_SUCCESS) {
+ if (!optim->done) {
+ if (!fts_zip_read_word(optim->zip, word)) {
+ optim->done = TRUE;
+ } else if (selected
+ != fts_select_index(
+ charset, word->f_str,
+ word->f_len)
+ && graph) {
+ fts_que_graph_free(graph);
+ graph = NULL;
+ }
+ }
+ } else if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, "InnoDB: Warning: lock wait timeout "
+ "during optimize. Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else if (error == DB_DEADLOCK) {
+ fprintf(stderr, "InnoDB: Warning: deadlock "
+ "during optimize. Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ optim->done = TRUE; /* Exit the loop. */
+ }
+ }
+
+ if (graph != NULL) {
+ fts_que_graph_free(graph);
+ }
+}
+
+/**********************************************************************//**
+Advance the word to the first character of the next index in the
+horizontal split.
+@return TRUE if this was the last index */
+static
+ibool
+fts_optimize_set_next_word(
+/*=======================*/
+ CHARSET_INFO* charset, /*!< in: charset */
+ fts_string_t* word) /*!< in: current last word */
+{
+ ulint selected;
+ ibool last = FALSE;
+
+ selected = fts_select_next_index(charset, word->f_str, word->f_len);
+
+ /* If this was the last index then reset to start. */
+ if (fts_index_selector[selected].value == 0) {
+ /* Reset the last optimized word to '' if no
+ more words could be read from the FTS index. */
+ word->f_len = 0;
+ *word->f_str = 0;
+
+ last = TRUE;
+ } else {
+ ulint value = fts_index_selector[selected].value;
+
+ ut_a(value <= 0xff);
+
+ /* Set to the first character of the next slot. */
+ word->f_len = 1;
+ *word->f_str = (byte) value;
+ }
+
+ return(last);
+}
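+
+/* Background (added for clarity, not part of the original source):
+the FTS auxiliary index is split horizontally into several tables,
+selected by the first character of a word through fts_index_selector[].
+When one split has been exhausted, the word is reset to the single
+first character of the next split so that the scan resumes there; a
+selector value of 0 marks the end of the list, i.e. the last index. */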
+
+/**********************************************************************//**
+Optimize is complete. Set the completion time, and reset the optimize
+start string for this FTS index to "".
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_completed(
+/*=========================*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index) /*!< in: table with one FTS index */
+{
+ fts_string_t word;
+ dberr_t error;
+ byte buf[sizeof(ulint)];
+#ifdef FTS_OPTIMIZE_DEBUG
+ ib_time_t end_time = ut_time();
+
+ error = fts_optimize_set_index_end_time(optim->trx, index, end_time);
+#endif
+
+ /* If we've reached the end of the index then set the start
+ word to the empty string. */
+
+ word.f_len = 0;
+ word.f_str = buf;
+ *word.f_str = '\0';
+
+ error = fts_config_set_index_value(
+ optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word);
+
+ if (error != DB_SUCCESS) {
+
+ fprintf(stderr, "InnoDB: Error: (%s) while "
+ "updating last optimized word!\n", ut_strerr(error));
+ }
+
+ return(error);
+}
+
+
+/**********************************************************************//**
+Read the list of words from the FTS auxiliary index that will be
+optimized in this pass.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index_read_words(
+/*==========================*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index, /*!< in: table with one FTS index */
+ fts_string_t* word) /*!< in: buffer to use */
+{
+ dberr_t error = DB_SUCCESS;
+
+ if (optim->del_list_regenerated) {
+ word->f_len = 0;
+ } else {
+
+ /* Get the last word that was optimized from
+ the config table. */
+ error = fts_config_get_index_value(
+ optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word);
+ }
+
+ /* If record not found then we start from the top. */
+ if (error == DB_RECORD_NOT_FOUND) {
+ word->f_len = 0;
+ error = DB_SUCCESS;
+ }
+
+ while (error == DB_SUCCESS) {
+
+ error = fts_index_fetch_words(
+ optim, word, fts_num_word_optimize);
+
+ if (error == DB_SUCCESS) {
+
+			/* If the search returned an empty set,
+			try the next index in the horizontal split. */
+ if (optim->zip->n_words > 0) {
+ break;
+ } else {
+
+ fts_optimize_set_next_word(
+ optim->fts_index_table.charset,
+ word);
+
+ if (word->f_len == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Run OPTIMIZE on the given FTS index. Note: this can take a very long
+time (hours).
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_index(
+/*===============*/
+ fts_optimize_t* optim, /*!< in: optimize instance */
+ dict_index_t* index) /*!< in: table with one FTS index */
+{
+ fts_string_t word;
+ dberr_t error;
+ byte str[FTS_MAX_WORD_LEN + 1];
+
+ /* Set the current index that we have to optimize. */
+ optim->fts_index_table.index_id = index->id;
+ optim->fts_index_table.charset = fts_index_get_charset(index);
+
+ optim->done = FALSE; /* Optimize until !done */
+
+ /* We need to read the last word optimized so that we start from
+ the next word. */
+ word.f_str = str;
+
+	/* We set the length of word to the size of str, since we
+	need to pass the max len info to fts_config_get_index_value(). */
+ word.f_len = sizeof(str) - 1;
+
+ memset(word.f_str, 0x0, word.f_len);
+
+ /* Read the words that will be optimized in this pass. */
+ error = fts_optimize_index_read_words(optim, index, &word);
+
+ if (error == DB_SUCCESS) {
+ int zip_error;
+
+ ut_a(optim->zip->pos == 0);
+ ut_a(optim->zip->zp->total_in == 0);
+ ut_a(optim->zip->zp->total_out == 0);
+
+ zip_error = inflateInit(optim->zip->zp);
+ ut_a(zip_error == Z_OK);
+
+ word.f_len = 0;
+ word.f_str = str;
+
+ /* Read the first word to optimize from the Zip buffer. */
+ if (!fts_zip_read_word(optim->zip, &word)) {
+
+ optim->done = TRUE;
+ } else {
+ fts_optimize_words(optim, index, &word);
+ }
+
+ /* If we couldn't read any records then optimize is
+ complete. Increment the number of indexes that have
+ been optimized and set FTS index optimize state to
+ completed. */
+ if (error == DB_SUCCESS && optim->zip->n_words == 0) {
+
+ error = fts_optimize_index_completed(optim, index);
+
+ if (error == DB_SUCCESS) {
+ ++optim->n_completed;
+ }
+ }
+ }
+
+ return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the DELETED and DELETED_CACHE tables.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_ids(
+/*===============================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ ulint i;
+ pars_info_t* info;
+ que_t* graph;
+ fts_update_t* update;
+ char* sql_str;
+ doc_id_t write_doc_id;
+ dberr_t error = DB_SUCCESS;
+
+ info = pars_info_create();
+
+ ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0);
+
+ update = static_cast<fts_update_t*>(
+ ib_vector_get(optim->to_delete->doc_ids, 0));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+ /* This is required for the SQL parser to work. It must be able
+ to find the following variables. So we do it twice. */
+ fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+ fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+ /* Since we only replace the table_id and don't construct the full
+ name, we do substitution ourselves. Remember to free sql_str. */
+ sql_str = ut_strreplace(
+ fts_delete_doc_ids_sql, "%s", optim->name_prefix);
+
+ graph = fts_parse_sql(NULL, info, sql_str);
+
+ mem_free(sql_str);
+
+ /* Delete the doc ids that were copied at the start. */
+ for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) {
+
+ update = static_cast<fts_update_t*>(ib_vector_get(
+ optim->to_delete->doc_ids, i));
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, update->doc_id);
+
+ fts_bind_doc_id(info, "doc_id1", &write_doc_id);
+
+ fts_bind_doc_id(info, "doc_id2", &write_doc_id);
+
+ error = fts_eval_sql(optim->trx, graph);
+
+ // FIXME: Check whether delete actually succeeded!
+ if (error != DB_SUCCESS) {
+
+ fts_sql_rollback(optim->trx);
+ break;
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/**********************************************************************//**
+Delete the document ids in the pending-delete (BEING_DELETED) tables.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_deleted_doc_id_snapshot(
+/*=======================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+ que_t* graph;
+ char* sql_str;
+
+ /* Since we only replace the table_id and don't construct
+ the full name, we do the '%s' substitution ourselves. */
+ sql_str = ut_strreplace(fts_end_delete_sql, "%s", optim->name_prefix);
+
+ /* Delete the doc ids that were copied to delete pending state at
+ the start of optimize. */
+ graph = fts_parse_sql(NULL, NULL, sql_str);
+
+ mem_free(sql_str);
+
+ error = fts_eval_sql(optim->trx, graph);
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/**********************************************************************//**
+Count the rows that are still in the BEING_DELETED table, i.e. the doc
+ids left over from a previous optimize run.
+@return number of rows in the BEING_DELETED table */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ fts_table_t fts_table;
+
+ FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE,
+ optim->table);
+
+ return(fts_get_rows_count(&fts_table));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successful copy and rolled back on a DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+ que_t* graph;
+ char* sql_str;
+
+ /* Since we only replace the table_id and don't construct the
+ full name, we do the substitution ourselves. */
+ sql_str = ut_strreplace(fts_init_delete_sql, "%s", optim->name_prefix);
+
+ /* Move doc_ids that are to be deleted to state being deleted. */
+ graph = fts_parse_sql(NULL, NULL, sql_str);
+
+ mem_free(sql_str);
+
+ error = fts_eval_sql(optim->trx, graph);
+
+ fts_que_graph_free(graph);
+
+ if (error != DB_SUCCESS) {
+ fts_sql_rollback(optim->trx);
+ } else {
+ fts_sql_commit(optim->trx);
+ }
+
+ optim->del_list_regenerated = TRUE;
+
+ return(error);
+}
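+
+/* Rationale (added for clarity, not part of the original source): the
+doc ids queued for deletion are moved to the BEING_DELETED tables so
+that the potentially hours-long optimize run works on a stable
+snapshot; deletes that arrive while OPTIMIZE is running keep
+accumulating in the regular DELETED tables and are picked up by the
+next run. */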
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize. The
+transaction is committed upon a successful read.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+
+ optim->fts_common_table.suffix = "BEING_DELETED";
+
+ /* Read the doc_ids to delete. */
+ error = fts_table_fetch_doc_ids(
+ optim->trx, &optim->fts_common_table, optim->to_delete);
+
+ if (error == DB_SUCCESS) {
+
+ optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+ /* Read additional doc_ids to delete. */
+ error = fts_table_fetch_doc_ids(
+ optim->trx, &optim->fts_common_table, optim->to_delete);
+ }
+
+ if (error != DB_SUCCESS) {
+
+ fts_doc_ids_free(optim->to_delete);
+ optim->to_delete = NULL;
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Optimize all the FTS indexes, skipping those that have already been
+optimized, since the FTS auxiliary indexes are not guaranteed to be
+of the same cardinality.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_indexes(
+/*=================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ ulint i;
+ dberr_t error = DB_SUCCESS;
+ fts_t* fts = optim->table->fts;
+
+ /* Optimize the FTS indexes. */
+ for (i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+ ib_time_t end_time;
+ ib_time_t start_time;
+
+ /* Get the start and end optimize times for this index. */
+ error = fts_optimize_get_index_start_time(
+ optim->trx, index, &start_time);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ error = fts_optimize_get_index_end_time(
+ optim->trx, index, &end_time);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ /* Start time will be 0 only for the first time or after
+ completing the optimization of all FTS indexes. */
+ if (start_time == 0) {
+ start_time = ut_time();
+
+ error = fts_optimize_set_index_start_time(
+ optim->trx, index, start_time);
+ }
+
+ /* Check if this index needs to be optimized or not. */
+ if (ut_difftime(end_time, start_time) < 0) {
+ error = fts_optimize_index(optim, index);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ } else {
+ ++optim->n_completed;
+ }
+#endif
+		error = fts_optimize_index(optim, index);
+ }
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Cleanup the snapshot tables and the master deleted table.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_purge_snapshot(
+/*========================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error;
+
+	/* Delete from the master deleted tables the doc ids that were
+	in the snapshot taken at the start of optimize. */
+ error = fts_optimize_purge_deleted_doc_ids(optim);
+
+ if (error == DB_SUCCESS) {
+ /* Destroy the deleted doc id snapshot. */
+ error = fts_optimize_purge_deleted_doc_id_snapshot(optim);
+ }
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Reset the start time to 0 so that a new optimize can be started.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_reset_start_time(
+/*==========================*/
+ fts_optimize_t* optim) /*!< in: optimize instance */
+{
+ dberr_t error = DB_SUCCESS;
+#ifdef FTS_OPTIMIZE_DEBUG
+ fts_t* fts = optim->table->fts;
+
+ /* Optimization should have been completed for all indexes. */
+ ut_a(optim->n_completed == ib_vector_size(fts->indexes));
+
+ for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) {
+		dict_index_t*	index;
+
+		ib_time_t	start_time = 0;
+
+		index = static_cast<dict_index_t*>(
+			ib_vector_getp(fts->indexes, i));
+
+		/* Reset the start time to 0 for this index. */
+		error = fts_optimize_set_index_start_time(
+			optim->trx, index, start_time);
+ }
+#endif
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(optim->trx);
+ } else {
+ fts_sql_rollback(optim->trx);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table by a background thread.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull))
+dberr_t
+fts_optimize_table_bk(
+/*==================*/
+	fts_slot_t*	slot)	/*!< in: table to optimize */
+{
+ dberr_t error;
+ dict_table_t* table = slot->table;
+ fts_t* fts = table->fts;
+
+ /* Avoid optimizing tables that were optimized recently. */
+ if (slot->last_run > 0
+ && (ut_time() - slot->last_run) < slot->interval_time) {
+
+ return(DB_SUCCESS);
+
+ } else if (fts && fts->cache
+ && fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) {
+
+ error = fts_optimize_table(table);
+
+ if (error == DB_SUCCESS) {
+ slot->state = FTS_STATE_DONE;
+ slot->last_run = 0;
+ slot->completed = ut_time();
+ }
+ } else {
+ error = DB_SUCCESS;
+ }
+
+ /* Note time this run completed. */
+ slot->last_run = ut_time();
+
+ return(error);
+}
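+
+/* Overview (a summary added for clarity, not part of the original
+source): a full optimize run proceeds as follows: 1) snapshot the
+deleted doc ids into the BEING_DELETED tables; 2) read the snapshot
+back into memory; 3) for each FTS index, fetch the words, compact
+their node lists while filtering out the snapshot doc ids, and write
+them back; 4) once every index is done, purge the snapshot and the
+master DELETED table; 5) reset the optimize start time so that a new
+run can begin. */
+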
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table)	/*!< in: table to optimize */
+{
+ dberr_t error = DB_SUCCESS;
+ fts_optimize_t* optim = NULL;
+ fts_t* fts = table->fts;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: FTS start optimize %s\n", table->name);
+
+ optim = fts_optimize_create(table);
+
+ // FIXME: Call this only at the start of optimize, currently we
+ // rely on DB_DUPLICATE_KEY to handle corrupting the snapshot.
+
+	/* Check whether there are still records in the BEING_DELETED table. */
+ if (fts_optimize_being_deleted_count(optim) == 0) {
+ /* Take a snapshot of the deleted document ids, they are copied
+ to the BEING_ tables. */
+ error = fts_optimize_create_deleted_doc_id_snapshot(optim);
+ }
+
+ /* A duplicate error is OK, since we don't erase the
+ doc ids from the being deleted state until all FTS
+ indexes have been optimized. */
+ if (error == DB_DUPLICATE_KEY) {
+ error = DB_SUCCESS;
+ }
+
+ if (error == DB_SUCCESS) {
+
+ /* These document ids will be filtered out during the
+ index optimization phase. They are in the snapshot that we
+ took above, at the start of the optimize. */
+ error = fts_optimize_read_deleted_doc_id_snapshot(optim);
+
+ if (error == DB_SUCCESS) {
+
+			/* Commit the transaction that read the
+			being-deleted doc ids. */
+ fts_sql_commit(optim->trx);
+
+			/* We do the optimization only if there
+			are deleted records to be cleaned up. */
+ if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+ error = fts_optimize_indexes(optim);
+ }
+
+ } else {
+ ut_a(optim->to_delete == NULL);
+ }
+
+ /* Only after all indexes have been optimized can we
+ delete the (snapshot) doc ids in the pending delete,
+ and master deleted tables. */
+ if (error == DB_SUCCESS
+ && optim->n_completed == ib_vector_size(fts->indexes)) {
+
+ if (fts_enable_diag_print) {
+ fprintf(stderr, "FTS_OPTIMIZE: Completed "
+ "Optimize, cleanup DELETED "
+ "table\n");
+ }
+
+ if (ib_vector_size(optim->to_delete->doc_ids) > 0) {
+
+ /* Purge the doc ids that were in the
+ snapshot from the snapshot tables and
+ the master deleted table. */
+ error = fts_optimize_purge_snapshot(optim);
+ }
+
+ if (error == DB_SUCCESS) {
+ /* Reset the start time of all the FTS indexes
+ so that optimize can be restarted. */
+ error = fts_optimize_reset_start_time(optim);
+ }
+ }
+ }
+
+ fts_optimize_free(optim);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: FTS end optimize %s\n", table->name);
+
+ return(error);
+}
+
+/********************************************************************//**
+Create a message for the OPTIMIZER's work queue.
+@return new message instance */
+static
+fts_msg_t*
+fts_optimize_create_msg(
+/*====================*/
+ fts_msg_type_t type, /*!< in: type of message */
+ void* ptr) /*!< in: message payload */
+{
+ mem_heap_t* heap;
+ fts_msg_t* msg;
+
+ heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16);
+ msg = static_cast<fts_msg_t*>(mem_heap_alloc(heap, sizeof(*msg)));
+
+ msg->ptr = ptr;
+ msg->type = type;
+ msg->heap = heap;
+
+ return(msg);
+}
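+
+/* Note on ownership (added for clarity, not part of the original
+source): the message and its payload live on the message's own heap,
+and the consumer thread frees that heap (see mem_heap_free(msg->heap)
+in fts_optimize_thread()), so a producer must not touch a message
+after queueing it. */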
+
+/**********************************************************************//**
+Add the table to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+ dict_table_t* table) /*!< in: table to add */
+{
+ fts_msg_t* msg;
+
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+ /* Make sure table with FTS index cannot be evicted */
+ if (table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(table);
+ }
+
+ msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table);
+
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+}
+
+/**********************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+ dict_table_t* table) /*!< in: table to optimize */
+{
+ fts_msg_t* msg;
+
+	/* The optimizer thread could have been shut down already. */
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+ msg = fts_optimize_create_msg(FTS_MSG_OPTIMIZE_TABLE, table);
+
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+}
+
+/**********************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+ dict_table_t* table) /*!< in: table to remove */
+{
+ fts_msg_t* msg;
+ os_event_t event;
+ fts_msg_del_t* remove;
+
+	/* If the optimize system is not yet initialized, return. */
+ if (!fts_optimize_wq) {
+ return;
+ }
+
+	/* The FTS optimizer thread has already exited. */
+ if (fts_opt_start_shutdown) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Try to remove table %s after FTS optimize"
+ " thread exiting.", table->name);
+ return;
+ }
+
+ msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL);
+
+ /* We will wait on this event until signalled by the consumer. */
+ event = os_event_create();
+
+ remove = static_cast<fts_msg_del_t*>(
+ mem_heap_alloc(msg->heap, sizeof(*remove)));
+
+ remove->table = table;
+ remove->event = event;
+ msg->ptr = remove;
+
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+ os_event_wait(event);
+
+ os_event_free(event);
+}
+
+/**********************************************************************//**
+Find the slot for a particular table.
+@return slot if found else NULL. */
+static
+fts_slot_t*
+fts_optimize_find_slot(
+/*===================*/
+ ib_vector_t* tables, /*!< in: vector of tables */
+ const dict_table_t* table) /*!< in: table to add */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(tables); ++i) {
+ fts_slot_t* slot;
+
+ slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i));
+
+ if (slot->table->id == table->id) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Start optimizing table. */
+static
+void
+fts_optimize_start_table(
+/*=====================*/
+ ib_vector_t* tables, /*!< in/out: vector of tables */
+ dict_table_t* table) /*!< in: table to optimize */
+{
+ fts_slot_t* slot;
+
+ slot = fts_optimize_find_slot(tables, table);
+
+ if (slot == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: table %s not registered "
+ "with the optimize thread.\n", table->name);
+ } else {
+ slot->last_run = 0;
+ slot->completed = 0;
+ }
+}
+
+/**********************************************************************//**
+Add the table to the vector if it doesn't already exist. */
+static
+ibool
+fts_optimize_new_table(
+/*===================*/
+ ib_vector_t* tables, /*!< in/out: vector of tables */
+ dict_table_t* table) /*!< in: table to add */
+{
+ ulint i;
+ fts_slot_t* slot;
+ ulint empty_slot = ULINT_UNDEFINED;
+
+ /* Search for duplicates, also find a free slot if one exists. */
+ for (i = 0; i < ib_vector_size(tables); ++i) {
+
+ slot = static_cast<fts_slot_t*>(
+ ib_vector_get(tables, i));
+
+ if (slot->state == FTS_STATE_EMPTY) {
+ empty_slot = i;
+ } else if (slot->table->id == table->id) {
+ /* Already exists in our optimize queue. */
+ return(FALSE);
+ }
+ }
+
+ /* Reuse old slot. */
+ if (empty_slot != ULINT_UNDEFINED) {
+
+ slot = static_cast<fts_slot_t*>(
+ ib_vector_get(tables, empty_slot));
+
+ ut_a(slot->state == FTS_STATE_EMPTY);
+
+ } else { /* Create a new slot. */
+
+ slot = static_cast<fts_slot_t*>(ib_vector_push(tables, NULL));
+ }
+
+ memset(slot, 0x0, sizeof(*slot));
+
+ slot->table = table;
+ slot->table_id = table->id;
+ slot->state = FTS_STATE_LOADED;
+ slot->interval_time = FTS_OPTIMIZE_INTERVAL_IN_SECS;
+
+ return(TRUE);
+}
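+
+/* Slot life cycle (a summary added for clarity, not part of the
+original source):
+
+	FTS_STATE_EMPTY   -> FTS_STATE_LOADED   table added (above)
+	FTS_STATE_LOADED  -> FTS_STATE_RUNNING  picked by the thread
+	FTS_STATE_RUNNING -> FTS_STATE_DONE     optimize run completed
+	any state         -> FTS_STATE_EMPTY    table removed
+
+Empty slots are reused for newly added tables. */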
+
+/**********************************************************************//**
+Remove the table from the vector if it exists. */
+static
+ibool
+fts_optimize_del_table(
+/*===================*/
+ ib_vector_t* tables, /*!< in/out: vector of tables */
+ fts_msg_del_t* msg) /*!< in: table to delete */
+{
+ ulint i;
+ dict_table_t* table = msg->table;
+
+ for (i = 0; i < ib_vector_size(tables); ++i) {
+ fts_slot_t* slot;
+
+ slot = static_cast<fts_slot_t*>(ib_vector_get(tables, i));
+
+ /* FIXME: Should we assert on this ? */
+ if (slot->state != FTS_STATE_EMPTY
+ && slot->table->id == table->id) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: FTS Optimize Removing "
+ "table %s\n", table->name);
+
+ slot->table = NULL;
+ slot->state = FTS_STATE_EMPTY;
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Calculate how many of the registered tables need to be optimized.
+@return no. of tables to optimize */
+static
+ulint
+fts_optimize_how_many(
+/*==================*/
+ const ib_vector_t* tables) /*!< in: registered tables
+ vector*/
+{
+ ulint i;
+ ib_time_t delta;
+ ulint n_tables = 0;
+ ib_time_t current_time;
+
+ current_time = ut_time();
+
+ for (i = 0; i < ib_vector_size(tables); ++i) {
+ const fts_slot_t* slot;
+
+ slot = static_cast<const fts_slot_t*>(
+ ib_vector_get_const(tables, i));
+
+ switch (slot->state) {
+ case FTS_STATE_DONE:
+ case FTS_STATE_LOADED:
+ ut_a(slot->completed <= current_time);
+
+ delta = current_time - slot->completed;
+
+ /* Skip slots that have been optimized recently. */
+ if (delta >= slot->interval_time) {
+ ++n_tables;
+ }
+ break;
+
+ case FTS_STATE_RUNNING:
+ ut_a(slot->last_run <= current_time);
+
+ delta = current_time - slot->last_run;
+
+ if (delta > slot->interval_time) {
+ ++n_tables;
+ }
+ break;
+
+ /* Slots in a state other than the above
+ are ignored. */
+ case FTS_STATE_EMPTY:
+ case FTS_STATE_SUSPENDED:
+ break;
+ }
+
+ }
+
+ return(n_tables);
+}
+
+/**********************************************************************//**
+Check if the total memory used by all FTS tables exceeds the maximum limit.
+@return true if a sync is needed, false otherwise */
+static
+bool
+fts_is_sync_needed(
+/*===============*/
+ const ib_vector_t* tables) /*!< in: registered tables
+ vector*/
+{
+ ulint total_memory = 0;
+ double time_diff = difftime(ut_time(), last_check_sync_time);
+
+ if (fts_need_sync || time_diff < 5) {
+ return(false);
+ }
+
+ last_check_sync_time = ut_time();
+
+ for (ulint i = 0; i < ib_vector_size(tables); ++i) {
+ const fts_slot_t* slot;
+
+ slot = static_cast<const fts_slot_t*>(
+ ib_vector_get_const(tables, i));
+
+ if (slot->state != FTS_STATE_EMPTY && slot->table
+ && slot->table->fts) {
+ total_memory += slot->table->fts->cache->total_size;
+ }
+
+ if (total_memory > fts_max_total_cache_size) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+#if 0
+/*********************************************************************//**
+Check whether a table needs to be optimized. */
+static
+void
+fts_optimize_need_sync(
+/*===================*/
+ ib_vector_t* tables) /*!< in: list of tables */
+{
+ dict_table_t* table = NULL;
+ fts_slot_t* slot;
+ ulint num_table = ib_vector_size(tables);
+
+ if (!num_table) {
+ return;
+ }
+
+ if (fts_optimize_sync_iterator >= num_table) {
+ fts_optimize_sync_iterator = 0;
+ }
+
+ slot = ib_vector_get(tables, fts_optimize_sync_iterator);
+ table = slot->table;
+
+ if (!table) {
+ return;
+ }
+
+ ut_ad(table->fts);
+
+ if (table->fts->cache) {
+ ulint deleted = table->fts->cache->deleted;
+
+ if (table->fts->cache->added
+ >= fts_optimize_add_threshold) {
+ fts_sync_table(table);
+ } else if (deleted >= fts_optimize_delete_threshold) {
+ fts_optimize_do_table(table);
+
+ mutex_enter(&table->fts->cache->deleted_lock);
+ table->fts->cache->deleted -= deleted;
+ mutex_exit(&table->fts->cache->deleted_lock);
+ }
+ }
+
+ fts_optimize_sync_iterator++;
+
+ return;
+}
+#endif
+
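+/* How the loop below is organized (a summary added for clarity, not
+part of the original source): while the work queue is empty and some
+tables are due for optimizing, the thread round-robins over the slot
+vector, optimizing one table per iteration; otherwise it blocks on the
+queue with a timeout and handles ADD/OPTIMIZE/DEL/STOP messages,
+re-counting the tables due after each one. On a timeout it merely
+checks whether an FTS cache sync is needed. */
+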
+/**********************************************************************//**
+Optimize all FTS tables.
+@return Dummy return */
+UNIV_INTERN
+os_thread_ret_t
+fts_optimize_thread(
+/*================*/
+ void* arg) /*!< in: work queue*/
+{
+ mem_heap_t* heap;
+ ib_vector_t* tables;
+ ib_alloc_t* heap_alloc;
+ ulint current = 0;
+ ibool done = FALSE;
+ ulint n_tables = 0;
+ os_event_t exit_event = 0;
+ ulint n_optimize = 0;
+ ib_wqueue_t* wq = (ib_wqueue_t*) arg;
+
+ ut_ad(!srv_read_only_mode);
+ my_thread_init();
+
+ heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ tables = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	while (!done && srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ /* If there is no message in the queue and we have tables
+ to optimize then optimize the tables. */
+
+ if (!done
+ && ib_wqueue_is_empty(wq)
+ && n_tables > 0
+ && n_optimize > 0) {
+
+ fts_slot_t* slot;
+
+ ut_a(ib_vector_size(tables) > 0);
+
+ slot = static_cast<fts_slot_t*>(
+ ib_vector_get(tables, current));
+
+ /* Handle the case of empty slots. */
+ if (slot->state != FTS_STATE_EMPTY) {
+
+ slot->state = FTS_STATE_RUNNING;
+
+ fts_optimize_table_bk(slot);
+ }
+
+ ++current;
+
+ /* Wrap around the counter. */
+ if (current >= ib_vector_size(tables)) {
+ n_optimize = fts_optimize_how_many(tables);
+
+ current = 0;
+ }
+
+ } else if (n_optimize == 0 || !ib_wqueue_is_empty(wq)) {
+ fts_msg_t* msg;
+
+ msg = static_cast<fts_msg_t*>(
+ ib_wqueue_timedwait(wq,
+ FTS_QUEUE_WAIT_IN_USECS));
+
+ /* Timeout ? */
+ if (msg == NULL) {
+ if (fts_is_sync_needed(tables)) {
+ fts_need_sync = true;
+ }
+
+ continue;
+ }
+
+ switch (msg->type) {
+ case FTS_MSG_START:
+ break;
+
+ case FTS_MSG_PAUSE:
+ break;
+
+ case FTS_MSG_STOP:
+ done = TRUE;
+ exit_event = (os_event_t) msg->ptr;
+ break;
+
+ case FTS_MSG_ADD_TABLE:
+ ut_a(!done);
+ if (fts_optimize_new_table(
+ tables,
+ static_cast<dict_table_t*>(
+ msg->ptr))) {
+ ++n_tables;
+ }
+ break;
+
+ case FTS_MSG_OPTIMIZE_TABLE:
+ if (!done) {
+ fts_optimize_start_table(
+ tables,
+ static_cast<dict_table_t*>(
+ msg->ptr));
+ }
+ break;
+
+ case FTS_MSG_DEL_TABLE:
+ if (fts_optimize_del_table(
+ tables, static_cast<fts_msg_del_t*>(
+ msg->ptr))) {
+ --n_tables;
+ }
+
+ /* Signal the producer that we have
+ removed the table. */
+ os_event_set(
+ ((fts_msg_del_t*) msg->ptr)->event);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ mem_heap_free(msg->heap);
+
+ if (!done) {
+ n_optimize = fts_optimize_how_many(tables);
+ } else {
+ n_optimize = 0;
+ }
+ }
+ }
+
+	/* The server is being shut down; sync the data from the FTS cache
+	to disk if needed. */
+ if (n_tables > 0) {
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(tables); i++) {
+ fts_slot_t* slot;
+
+ slot = static_cast<fts_slot_t*>(
+ ib_vector_get(tables, i));
+
+ if (slot->state != FTS_STATE_EMPTY) {
+ dict_table_t* table = NULL;
+
+				/* slot->table may be freed, so we try to
+				open the table by slot->table_id. */
+ table = dict_table_open_on_id(
+ slot->table_id, FALSE,
+ DICT_TABLE_OP_NORMAL);
+
+ if (table) {
+
+ if (dict_table_has_fts_index(table)) {
+ fts_sync_table(table);
+ }
+
+ if (table->fts) {
+ fts_free(table);
+ }
+
+ dict_table_close(table, FALSE, FALSE);
+ }
+ }
+ }
+ }
+
+ ib_vector_free(tables);
+
+ ib_logf(IB_LOG_LEVEL_INFO, "FTS optimize thread exiting.");
+
+ os_event_set(exit_event);
+ my_thread_end();
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void)
+/*===================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* For now we only support one optimize thread. */
+ ut_a(fts_optimize_wq == NULL);
+
+ fts_optimize_wq = ib_wqueue_create();
+ ut_a(fts_optimize_wq != NULL);
+ last_check_sync_time = ut_time();
+
+ os_thread_create(fts_optimize_thread, fts_optimize_wq, NULL);
+}
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if the optimize queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void)
+/*======================*/
+{
+ return(fts_optimize_wq != NULL);
+}
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void)
+/*=============================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ fts_msg_t* msg;
+ os_event_t event;
+
+	/* If there is ongoing activity on the dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it. */
+ dict_mutex_enter_for_mysql();
+
+	/* Tell the FTS optimizer system that we are exiting from
+	the optimizer thread; messages sent hereafter will not be
+	processed. */
+ fts_opt_start_shutdown = true;
+ dict_mutex_exit_for_mysql();
+
+	/* We tell the OPTIMIZE thread to switch to state done; we
+	can't delete the work queue here because the add thread needs
+	to deregister the FTS tables. */
+ event = os_event_create();
+
+ msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+ msg->ptr = event;
+
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap);
+
+ os_event_wait(event);
+ os_event_free(event);
+
+ ib_wqueue_free(fts_optimize_wq);
+}
+
+/**********************************************************************//**
+Reset the work queue. */
+UNIV_INTERN
+void
+fts_optimize_end(void)
+/*==================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ // FIXME: Potential race condition here: We should wait for
+ // the optimize thread to confirm shutdown.
+ fts_optimize_wq = NULL;
+}
diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc
new file mode 100644
index 00000000000..7f0ba4e0c1b
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.cc
@@ -0,0 +1,2010 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison implementation for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+ simplifying the original so-called "semantic" parser. */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+ infringing on user name space. This should be done even for local
+ variables, as they might otherwise be expanded by user macros.
+ There are some unavoidable exceptions within include files to
+ define necessary library symbols; they are noted "INFRINGES ON
+ USER NAME SPACE" below. */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Bison version. */
+#define YYBISON_VERSION "2.5"
+
+/* Skeleton name. */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers. */
+#define YYPURE 1
+
+/* Push parsers. */
+#define YYPUSH 0
+
+/* Pull parsers. */
+#define YYPULL 1
+
+/* Using locations. */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names. */
+#define yyparse ftsparse
+#define yylex ftslex
+#define yyerror ftserror
+#define yylval ftslval
+#define yychar ftschar
+#define yydebug ftsdebug
+#define yynerrs ftsnerrs
+
+
+/* Copy the first part of user declarations. */
+
+/* Line 268 of yacc.c */
+#line 26 "fts0pars.y"
+
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+#define YYTOKENFREE(token) fts_ast_string_free((token))
+
+typedef int (*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef int (*fts_scanner)();
+
+struct fts_lexer_t {
+ fts_scanner scanner;
+ void* yyscanner;
+};
+
+
+
+/* Line 268 of yacc.c */
+#line 115 "fts0pars.cc"
+
+/* Enabling traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table. */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ FTS_OPER = 258,
+ FTS_TEXT = 259,
+ FTS_TERM = 260,
+ FTS_NUMB = 261
+ };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 293 of yacc.c */
+#line 61 "fts0pars.y"
+
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+
+
+
+/* Line 293 of yacc.c */
+#line 165 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations. */
+
+
+/* Line 343 of yacc.c */
+#line 177 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+# define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+# define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# else
+# define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+# if ENABLE_NLS
+# include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+# define YY_(msgid) dgettext ("bison-runtime", msgid)
+# endif
+# endif
+# ifndef YY_
+# define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E. */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions. */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int yyi)
+#else
+static int
+YYID (yyi)
+ int yyi;
+#endif
+{
+ return yyi;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# elif defined __BUILTIN_VA_ARG_INCR
+# include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+# elif defined _AIX
+# define YYSTACK_ALLOC __alloca
+# elif defined _MSC_VER
+# include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+# define alloca _alloca
+# else
+# define YYSTACK_ALLOC alloca
+# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's `empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+# ifndef YYSTACK_ALLOC_MAXIMUM
+ /* The OS might guarantee only one guard page at the bottom of the stack,
+ and a page size can be as small as 4096 bytes. So we cannot safely
+ invoke alloca (N) if N exceeds 4096. Use a slightly smaller number
+ to allow for a few compiler-allocated temporary stack slots. */
+# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+# endif
+# else
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# ifndef YYSTACK_ALLOC_MAXIMUM
+# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+# endif
+# if (defined __cplusplus && ! defined EXIT_SUCCESS \
+ && ! ((defined YYMALLOC || defined malloc) \
+ && (defined YYFREE || defined free)))
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef EXIT_SUCCESS
+# define EXIT_SUCCESS 0
+# endif
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# ifndef YYFREE
+# define YYFREE free
+# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void free (void *); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+ && (! defined __cplusplus \
+ || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ yytype_int16 yyss_alloc;
+ YYSTYPE yyvs_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements. */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
+
+# define YYCOPY_NEEDED 1
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \
+ Stack = &yyptr->Stack_alloc; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (YYID (0))
+
+#endif
+
+#if defined YYCOPY_NEEDED && YYCOPY_NEEDED
+/* Copy COUNT objects from FROM to TO. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined __GNUC__ && 1 < __GNUC__
+# define YYCOPY(To, From, Count) \
+ __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+# else
+# define YYCOPY(To, From, Count) \
+ do \
+ { \
+ YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (To)[yyi] = (From)[yyi]; \
+ } \
+ while (YYID (0))
+# endif
+# endif
+#endif /* !YYCOPY_NEEDED */
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 3
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 52
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 16
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 8
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 24
+/* YYNSTATES -- Number of states. */
+#define YYNSTATES 33
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 261
+
+#define YYTRANSLATE(YYX) \
+ ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */
+static const yytype_uint8 yytranslate[] =
+{
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 12, 13, 14, 7, 2, 8, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 10, 2, 11, 2, 15, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 9, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
+ 5, 6
+};
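As a quick orientation aid, the values in the yytranslate table above can be read directly against the YYTRANSLATE macro defined earlier. The sketch below is illustrative only and is not part of the generated file; it simply restates entries visible in the table (the symbol numbers match the yytname[] strings further below).

```cpp
#include <cassert>

/* Illustrative only: spot-check values read off the yytranslate table. */
static void
yytranslate_sketch(void)
{
	/* '+' is ASCII 43, and yytranslate[43] == 7, matching the
	"'+'" entry in yytname[] further below. */
	assert(YYTRANSLATE('+') == 7);

	/* '~' is ASCII 126; yytranslate[126] == 9. */
	assert(YYTRANSLATE('~') == 9);

	/* Named tokens translate too: FTS_NUMB (261) -> symbol 6. */
	assert(YYTRANSLATE(FTS_NUMB) == 6);
}
```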
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+ YYRHS. */
+static const yytype_uint8 yyprhs[] =
+{
+ 0, 0, 3, 5, 6, 9, 12, 16, 21, 23,
+ 25, 28, 32, 36, 39, 44, 47, 49, 51, 53,
+ 55, 57, 59, 61, 64
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS. */
+static const yytype_int8 yyrhs[] =
+{
+ 17, 0, -1, 18, -1, -1, 18, 20, -1, 18,
+ 19, -1, 12, 18, 13, -1, 21, 12, 18, 13,
+ -1, 22, -1, 23, -1, 22, 14, -1, 23, 15,
+ 6, -1, 21, 22, 14, -1, 21, 22, -1, 21,
+ 23, 15, 6, -1, 21, 23, -1, 8, -1, 7,
+ -1, 9, -1, 10, -1, 11, -1, 5, -1, 6,
+ -1, 14, 22, -1, 4, -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
+static const yytype_uint8 yyrline[] =
+{
+ 0, 79, 79, 85, 89, 99, 111, 119, 129, 133,
+ 137, 141, 146, 152, 157, 164, 170, 174, 178, 182,
+ 186, 191, 196, 202, 207
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+ First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+ "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM",
+ "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'",
+ "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix",
+ "term", "text", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+ token YYLEX-NUM. */
+static const yytype_uint16 yytoknum[] =
+{
+ 0, 256, 257, 258, 259, 260, 261, 43, 45, 126,
+ 60, 62, 40, 41, 42, 64
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
+static const yytype_uint8 yyr1[] =
+{
+ 0, 16, 17, 18, 18, 18, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 21, 21, 21, 21,
+ 21, 22, 22, 22, 23
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */
+static const yytype_uint8 yyr2[] =
+{
+ 0, 2, 1, 0, 2, 2, 3, 4, 1, 1,
+ 2, 3, 3, 2, 4, 2, 1, 1, 1, 1,
+ 1, 1, 1, 2, 1
+};
+
+/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM.
+ Performed when YYTABLE doesn't specify something else to do. Zero
+ means the default is an error. */
+static const yytype_uint8 yydefact[] =
+{
+ 3, 0, 2, 1, 24, 21, 22, 17, 16, 18,
+ 19, 20, 3, 0, 5, 4, 0, 8, 9, 0,
+ 23, 3, 13, 15, 10, 0, 6, 0, 12, 0,
+ 11, 7, 14
+};
+
+/* YYDEFGOTO[NTERM-NUM]. */
+static const yytype_int8 yydefgoto[] =
+{
+ -1, 1, 2, 14, 15, 16, 17, 18
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+ STATE-NUM. */
+#define YYPACT_NINF -5
+static const yytype_int8 yypact[] =
+{
+ -5, 38, 18, -5, -5, -5, -5, -5, -5, -5,
+ -5, -5, -5, 31, -5, -5, 29, 30, 32, -4,
+ -5, -5, 34, 35, -5, 40, -5, 7, -5, 43,
+ -5, -5, -5
+};
+
+/* YYPGOTO[NTERM-NUM]. */
+static const yytype_int8 yypgoto[] =
+{
+ -5, -5, 19, -5, -5, -5, 26, 36
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If
+ positive, shift that token. If negative, reduce the rule which
+ number is the opposite. If YYTABLE_NINF, syntax error. */
+#define YYTABLE_NINF -1
+static const yytype_uint8 yytable[] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 26,
+ 13, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 31, 13, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 19, 13, 4, 5, 6, 5, 6, 3, 20,
+ 27, 21, 22, 13, 24, 13, 30, 25, 28, 32,
+ 29, 0, 23
+};
+
+#define yypact_value_is_default(yystate) \
+ ((yystate) == (-5))
+
+#define yytable_value_is_error(yytable_value) \
+ YYID (0)
+
+static const yytype_int8 yycheck[] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 12, 14, 4, 5, 6, 5, 6, 0, 13,
+ 21, 12, 16, 14, 14, 14, 6, 15, 14, 6,
+ 15, -1, 16
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+ symbol of state STATE-NUM. */
+static const yytype_uint8 yystos[] =
+{
+ 0, 17, 18, 0, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 14, 19, 20, 21, 22, 23, 18,
+ 22, 12, 22, 23, 14, 15, 13, 18, 14, 15,
+ 6, 13, 6
+};
+
+#define yyerrok (yyerrstatus = 0)
+#define yyclearin (yychar = YYEMPTY)
+#define YYEMPTY (-2)
+#define YYEOF 0
+
+#define YYACCEPT goto yyacceptlab
+#define YYABORT goto yyabortlab
+#define YYERROR goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror. This remains here temporarily
+ to ease the transition to the new meaning of YYERROR, for GCC.
+ Once GCC version 2 has supplanted version 1, this can go. However,
+ YYFAIL appears to be in use. Nevertheless, it is formally deprecated
+ in Bison 2.4.2's NEWS entry, where a plan to phase it out is
+ discussed. */
+
+#define YYFAIL goto yyerrlab
+#if defined YYFAIL
+ /* This is here to suppress warnings from the GCC cpp's
+ -Wunused-macros. Normally we don't worry about that warning, but
+ some users do, and we want to make it easy for users to remove
+ YYFAIL uses, which will produce warnings from Bison 2.5. */
+#endif
+
+#define YYRECOVERING() (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value) \
+do \
+ if (yychar == YYEMPTY && yylen == 1) \
+ { \
+ yychar = (Token); \
+ yylval = (Value); \
+ YYPOPSTACK (1); \
+ goto yybackup; \
+ } \
+ else \
+ { \
+ yyerror (YY_("syntax error: cannot back up")); \
+ YYERROR; \
+ } \
+while (YYID (0))
+
+
+#define YYTERROR 1
+#define YYERRCODE 256
+
+#define YYERRCLEANUP \
+do \
+ switch (yylastchar) \
+ { \
+ case FTS_NUMB: \
+ case FTS_TEXT: \
+ case FTS_TERM: \
+ YYTOKENFREE(yylval.token); \
+ break; \
+ default: \
+ break; \
+ } \
+while (YYID (0))
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+ If N is 0, then set CURRENT to the empty location which ends
+ the previous symbol: RHS[0] (always defined). */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N) \
+ do \
+ if (YYID (N)) \
+ { \
+ (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \
+ (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \
+ (Current).last_line = YYRHSLOC (Rhs, N).last_line; \
+ (Current).last_column = YYRHSLOC (Rhs, N).last_column; \
+ } \
+ else \
+ { \
+ (Current).first_line = (Current).last_line = \
+ YYRHSLOC (Rhs, 0).last_line; \
+ (Current).first_column = (Current).last_column = \
+ YYRHSLOC (Rhs, 0).last_column; \
+ } \
+ while (YYID (0))
+#endif
+
+
+/* This macro is provided for backward compatibility. */
+
+#ifndef YY_LOCATION_PRINT
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments. */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (&yylval, YYLEX_PARAM)
+#else
+# define YYLEX yylex (&yylval)
+#endif
+
+/* Enable debugging if requested. */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+# include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+# define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args) \
+do { \
+ if (yydebug) \
+ YYFPRINTF Args; \
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+do { \
+ if (yydebug) \
+ { \
+ YYFPRINTF (stderr, "%s ", Title); \
+ yy_symbol_print (stderr, \
+ Type, Value); \
+ YYFPRINTF (stderr, "\n"); \
+ } \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (!yyvaluep)
+ return;
+# ifdef YYPRINT
+ if (yytype < YYNTOKENS)
+ YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+ YYUSE (yyoutput);
+# endif
+ switch (yytype)
+ {
+ default:
+ break;
+ }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (yytype < YYNTOKENS)
+ YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+ else
+ YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+ yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+ YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included). |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+#else
+static void
+yy_stack_print (yybottom, yytop)
+ yytype_int16 *yybottom;
+ yytype_int16 *yytop;
+#endif
+{
+ YYFPRINTF (stderr, "Stack now");
+ for (; yybottom <= yytop; yybottom++)
+ {
+ int yybot = *yybottom;
+ YYFPRINTF (stderr, " %d", yybot);
+ }
+ YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top) \
+do { \
+ if (yydebug) \
+ yy_stack_print ((Bottom), (Top)); \
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced. |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+ YYSTYPE *yyvsp;
+ int yyrule;
+#endif
+{
+ int yynrhs = yyr2[yyrule];
+ int yyi;
+ unsigned long int yylno = yyrline[yyrule];
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+ yyrule - 1, yylno);
+ /* The symbols being reduced. */
+ for (yyi = 0; yyi < yynrhs; yyi++)
+ {
+ YYFPRINTF (stderr, " $%d = ", yyi + 1);
+ yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+ &(yyvsp[(yyi + 1) - (yynrhs)])
+ );
+ YYFPRINTF (stderr, "\n");
+ }
+}
+
+# define YY_REDUCE_PRINT(Rule) \
+do { \
+ if (yydebug) \
+ yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+ multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+ if the built-in stack extension method is used).
+
+ Do not make this value too large; the results are undefined if
+ YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+ evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+# if defined __GLIBC__ && defined _STRING_H
+# define yystrlen strlen
+# else
+/* Return the length of YYSTR. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+ const char *yystr;
+#endif
+{
+ YYSIZE_T yylen;
+ for (yylen = 0; yystr[yylen]; yylen++)
+ continue;
+ return yylen;
+}
+# endif
+# endif
+
+# ifndef yystpcpy
+# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+# define yystpcpy stpcpy
+# else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+ YYDEST. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+ char *yydest;
+ const char *yysrc;
+#endif
+{
+ char *yyd = yydest;
+ const char *yys = yysrc;
+
+ while ((*yyd++ = *yys++) != '\0')
+ continue;
+
+ return yyd - 1;
+}
+# endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+ quotes and backslashes, so that it's suitable for yyerror. The
+ heuristic is that double-quoting is unnecessary unless the string
+ contains an apostrophe, a comma, or backslash (other than
+ backslash-backslash). YYSTR is taken from yytname. If YYRES is
+ null, do not copy; instead, return the length of what the result
+ would have been. */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+ if (*yystr == '"')
+ {
+ YYSIZE_T yyn = 0;
+ char const *yyp = yystr;
+
+ for (;;)
+ switch (*++yyp)
+ {
+ case '\'':
+ case ',':
+ goto do_not_strip_quotes;
+
+ case '\\':
+ if (*++yyp != '\\')
+ goto do_not_strip_quotes;
+ /* Fall through. */
+ default:
+ if (yyres)
+ yyres[yyn] = *yyp;
+ yyn++;
+ break;
+
+ case '"':
+ if (yyres)
+ yyres[yyn] = '\0';
+ return yyn;
+ }
+ do_not_strip_quotes: ;
+ }
+
+ if (! yyres)
+ return yystrlen (yystr);
+
+ return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
+ about the unexpected token YYTOKEN for the state stack whose top is
+ YYSSP.
+
+ Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is
+ not large enough to hold the message. In that case, also set
+ *YYMSG_ALLOC to the required number of bytes. Return 2 if the
+ required number of bytes is too large to store. */
+static int
+yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
+ yytype_int16 *yyssp, int yytoken)
+{
+ YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]);
+ YYSIZE_T yysize = yysize0;
+ YYSIZE_T yysize1;
+ enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+ /* Internationalized format string. */
+ const char *yyformat = 0;
+ /* Arguments of yyformat. */
+ char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+ /* Number of reported tokens (one for the "unexpected", one per
+ "expected"). */
+ int yycount = 0;
+
+ /* There are many possibilities here to consider:
+ - Assume YYFAIL is not used. It's too flawed to consider. See
+ <http://lists.gnu.org/archive/html/bison-patches/2009-12/msg00024.html>
+ for details. YYERROR is fine as it does not invoke this
+ function.
+ - If this state is a consistent state with a default action, then
+ the only way this function was invoked is if the default action
+ is an error action. In that case, don't check for expected
+ tokens because there are none.
+ - The only way there can be no lookahead present (in yychar) is if
+ this state is a consistent state with a default action. Thus,
+ detecting the absence of a lookahead is sufficient to determine
+ that there is no unexpected or expected token to report. In that
+ case, just report a simple "syntax error".
+ - Don't assume there isn't a lookahead just because this state is a
+ consistent state with a default action. There might have been a
+ previous inconsistent state, consistent state with a non-default
+ action, or user semantic action that manipulated yychar.
+ - Of course, the expected token list depends on states to have
+ correct lookahead information, and it depends on the parser not
+ to perform extra reductions after fetching a lookahead from the
+ scanner and before detecting a syntax error. Thus, state merging
+ (from LALR or IELR) and default reductions corrupt the expected
+ token list. However, the list is correct for canonical LR with
+ one exception: it will still contain any token that will not be
+ accepted due to an error action in a later state.
+ */
+ if (yytoken != YYEMPTY)
+ {
+ int yyn = yypact[*yyssp];
+ yyarg[yycount++] = yytname[yytoken];
+ if (!yypact_value_is_default (yyn))
+ {
+ /* Start YYX at -YYN if negative to avoid negative indexes in
+ YYCHECK. In other words, skip the first -YYN actions for
+ this state because they are default actions. */
+ int yyxbegin = yyn < 0 ? -yyn : 0;
+ /* Stay within bounds of both yycheck and yytname. */
+ int yychecklim = YYLAST - yyn + 1;
+ int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+ int yyx;
+
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
+ && !yytable_value_is_error (yytable[yyx + yyn]))
+ {
+ if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+ {
+ yycount = 1;
+ yysize = yysize0;
+ break;
+ }
+ yyarg[yycount++] = yytname[yyx];
+ yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+ if (! (yysize <= yysize1
+ && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+ return 2;
+ yysize = yysize1;
+ }
+ }
+ }
+
+ switch (yycount)
+ {
+# define YYCASE_(N, S) \
+ case N: \
+ yyformat = S; \
+ break
+ YYCASE_(0, YY_("syntax error"));
+ YYCASE_(1, YY_("syntax error, unexpected %s"));
+ YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
+ YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
+ YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
+ YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
+# undef YYCASE_
+ }
+
+ yysize1 = yysize + yystrlen (yyformat);
+ if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+ return 2;
+ yysize = yysize1;
+
+ if (*yymsg_alloc < yysize)
+ {
+ *yymsg_alloc = 2 * yysize;
+ if (! (yysize <= *yymsg_alloc
+ && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
+ *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
+ return 1;
+ }
+
+ /* Avoid sprintf, as that infringes on the user's name space.
+ Don't have undefined behavior even if the translation
+ produced a string with the wrong number of "%s"s. */
+ {
+ char *yyp = *yymsg;
+ int yyi = 0;
+ while ((*yyp = *yyformat) != '\0')
+ if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
+ {
+ yyp += yytnamerr (yyp, yyarg[yyi++]);
+ yyformat += 2;
+ }
+ else
+ {
+ yyp++;
+ yyformat++;
+ }
+ }
+ return 0;
+}
+#endif /* YYERROR_VERBOSE */
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol. |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+ const char *yymsg;
+ int yytype;
+ YYSTYPE *yyvaluep;
+#endif
+{
+ YYUSE (yyvaluep);
+
+ if (!yymsg)
+ yymsg = "Deleting";
+ YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+ switch (yytype)
+ {
+
+ default:
+ break;
+ }
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes. */
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+/*----------.
+| yyparse. |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+ void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+/* The lookahead symbol. */
+int yychar;
+/* The backup of yychar when there is an error and we're in yyerrlab. */
+int yylastchar;
+
+/* The semantic value of the lookahead symbol. */
+YYSTYPE yylval;
+
+ /* Number of syntax errors so far. */
+ int yynerrs;
+
+ int yystate;
+ /* Number of tokens to shift before error messages enabled. */
+ int yyerrstatus;
+
+ /* The stacks and their tools:
+ `yyss': related to states.
+ `yyvs': related to semantic values.
+
+ Refer to the stacks thru separate pointers, to allow yyoverflow
+ to reallocate them elsewhere. */
+
+ /* The state stack. */
+ yytype_int16 yyssa[YYINITDEPTH];
+ yytype_int16 *yyss;
+ yytype_int16 *yyssp;
+
+ /* The semantic value stack. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs;
+ YYSTYPE *yyvsp;
+
+ YYSIZE_T yystacksize;
+
+ int yyn;
+ int yyresult;
+ /* Lookahead token as an internal (translated) token number. */
+ int yytoken;
+ /* The variables used to return semantic value and location from the
+ action routines. */
+ YYSTYPE yyval;
+
+#if YYERROR_VERBOSE
+ /* Buffer for error messages, and its allocated size. */
+ char yymsgbuf[128];
+ char *yymsg = yymsgbuf;
+ YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
+
+ /* The number of symbols on the RHS of the reduced rule.
+ Keep to zero when no symbol should be popped. */
+ int yylen = 0;
+
+ yytoken = 0;
+ yyss = yyssa;
+ yyvs = yyvsa;
+ yystacksize = YYINITDEPTH;
+
+ YYDPRINTF ((stderr, "Starting parse\n"));
+
+ yystate = 0;
+ yyerrstatus = 0;
+ yynerrs = 0;
+ yychar = YYEMPTY; /* Cause a token to be read. */
+
+ /* Initialize stack pointers.
+ Waste one element of value and location stack
+ so that they stay on the same level as the state stack.
+ The wasted elements are never initialized. */
+ yyssp = yyss;
+ yyvsp = yyvs;
+
+ goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate. |
+`------------------------------------------------------------*/
+ yynewstate:
+ /* In all cases, when you get here, the value and location stacks
+ have just been pushed. So pushing a state here evens the stacks. */
+ yyssp++;
+
+ yysetstate:
+ *yyssp = yystate;
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ {
+ /* Get the current used size of the three stacks, in elements. */
+ YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+ {
+ /* Give user a chance to reallocate the stack. Use copies of
+ these so that the &'s don't force the real ones into
+ memory. */
+ YYSTYPE *yyvs1 = yyvs;
+ yytype_int16 *yyss1 = yyss;
+
+ /* Each stack pointer address is followed by the size of the
+ data in use in that stack, in bytes. This used to be a
+ conditional around just the two extra args, but that might
+ be undefined if yyoverflow is a macro. */
+ yyoverflow (YY_("memory exhausted"),
+ &yyss1, yysize * sizeof (*yyssp),
+ &yyvs1, yysize * sizeof (*yyvsp),
+ &yystacksize);
+
+ yyss = yyss1;
+ yyvs = yyvs1;
+ }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+ goto yyexhaustedlab;
+# else
+ /* Extend the stack our own way. */
+ if (YYMAXDEPTH <= yystacksize)
+ goto yyexhaustedlab;
+ yystacksize *= 2;
+ if (YYMAXDEPTH < yystacksize)
+ yystacksize = YYMAXDEPTH;
+
+ {
+ yytype_int16 *yyss1 = yyss;
+ union yyalloc *yyptr =
+ (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ if (! yyptr)
+ goto yyexhaustedlab;
+ YYSTACK_RELOCATE (yyss_alloc, yyss);
+ YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+# undef YYSTACK_RELOCATE
+ if (yyss1 != yyssa)
+ YYSTACK_FREE (yyss1);
+ }
+# endif
+#endif /* no yyoverflow */
+
+ yyssp = yyss + yysize - 1;
+ yyvsp = yyvs + yysize - 1;
+
+ YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+ (unsigned long int) yystacksize));
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ YYABORT;
+ }
+
+ YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+ if (yystate == YYFINAL)
+ YYACCEPT;
+
+ goto yybackup;
+
+/*-----------.
+| yybackup. |
+`-----------*/
+yybackup:
+
+ /* Do appropriate processing given the current state. Read a
+ lookahead token if we need one and don't already have one. */
+
+ /* First try to decide what to do without reference to lookahead token. */
+ yyn = yypact[yystate];
+ if (yypact_value_is_default (yyn))
+ goto yydefault;
+
+ /* Not known => get a lookahead token if don't already have one. */
+
+ /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */
+ if (yychar == YYEMPTY)
+ {
+ YYDPRINTF ((stderr, "Reading a token: "));
+ yychar = YYLEX;
+ }
+
+ if (yychar <= YYEOF)
+ {
+ yychar = yytoken = YYEOF;
+ YYDPRINTF ((stderr, "Now at end of input.\n"));
+ }
+ else
+ {
+ yytoken = YYTRANSLATE (yychar);
+ YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+ }
+
+ /* If the proper action on seeing token YYTOKEN is to reduce or to
+ detect an error, take that action. */
+ yyn += yytoken;
+ if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+ goto yydefault;
+ yyn = yytable[yyn];
+ if (yyn <= 0)
+ {
+ if (yytable_value_is_error (yyn))
+ goto yyerrlab;
+ yyn = -yyn;
+ goto yyreduce;
+ }
+
+ /* Count tokens shifted since error; after three, turn off error
+ status. */
+ if (yyerrstatus)
+ yyerrstatus--;
+
+ /* Shift the lookahead token. */
+ YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+ /* Discard the shifted token. */
+ yychar = YYEMPTY;
+
+ yystate = yyn;
+ *++yyvsp = yylval;
+
+ goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state. |
+`-----------------------------------------------------------*/
+yydefault:
+ yyn = yydefact[yystate];
+ if (yyn == 0)
+ goto yyerrlab;
+ goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction. |
+`-----------------------------*/
+yyreduce:
+ /* yyn is the number of a rule to reduce with. */
+ yylen = yyr2[yyn];
+
+ /* If YYLEN is nonzero, implement the default value of the action:
+ `$$ = $1'.
+
+ Otherwise, the following line sets YYVAL to garbage.
+ This behavior is undocumented and Bison
+ users should not rely upon it. Assigning to YYVAL
+ unconditionally makes the parser a bit smaller, and it avoids a
+ GCC warning that YYVAL may be used uninitialized. */
+ yyval = yyvsp[1-yylen];
+
+
+ YY_REDUCE_PRINT (yyn);
+ switch (yyn)
+ {
+ case 2:
+
+/* Line 1806 of yacc.c */
+#line 79 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ ((fts_ast_state_t*) state)->root = (yyval.node);
+ }
+ break;
+
+ case 3:
+
+/* Line 1806 of yacc.c */
+#line 85 "fts0pars.y"
+ {
+ (yyval.node) = NULL;
+ }
+ break;
+
+ case 4:
+
+/* Line 1806 of yacc.c */
+#line 89 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (2)].node);
+
+ if (!(yyval.node)) {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node));
+ } else {
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ }
+ break;
+
+ case 5:
+
+/* Line 1806 of yacc.c */
+#line 99 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (2)].node);
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+
+ if (!(yyval.node)) {
+ (yyval.node) = (yyvsp[(2) - (2)].node);
+ } else {
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ }
+ break;
+
+ case 6:
+
+/* Line 1806 of yacc.c */
+#line 111 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(2) - (3)].node);
+
+ if ((yyval.node)) {
+ (yyval.node) = fts_ast_create_node_subexp_list(state, (yyval.node));
+ }
+ }
+ break;
+
+ case 7:
+
+/* Line 1806 of yacc.c */
+#line 119 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+
+ if ((yyvsp[(3) - (4)].node)) {
+ fts_ast_add_node((yyval.node),
+ fts_ast_create_node_subexp_list(state, (yyvsp[(3) - (4)].node)));
+ }
+ }
+ break;
+
+ case 8:
+
+/* Line 1806 of yacc.c */
+#line 129 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ }
+ break;
+
+ case 9:
+
+/* Line 1806 of yacc.c */
+#line 133 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(1) - (1)].node);
+ }
+ break;
+
+ case 10:
+
+/* Line 1806 of yacc.c */
+#line 137 "fts0pars.y"
+ {
+ fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node));
+ }
+ break;
+
+ case 11:
+
+/* Line 1806 of yacc.c */
+#line 141 "fts0pars.y"
+ {
+ fts_ast_term_set_distance((yyvsp[(1) - (3)].node), fts_ast_string_to_ul((yyvsp[(3) - (3)].token), 10));
+ fts_ast_string_free((yyvsp[(3) - (3)].token));
+ }
+ break;
+
+ case 12:
+
+/* Line 1806 of yacc.c */
+#line 146 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node));
+ fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node));
+ }
+ break;
+
+ case 13:
+
+/* Line 1806 of yacc.c */
+#line 152 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ break;
+
+ case 14:
+
+/* Line 1806 of yacc.c */
+#line 157 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node));
+ fts_ast_term_set_distance((yyvsp[(2) - (4)].node), fts_ast_string_to_ul((yyvsp[(4) - (4)].token), 10));
+ fts_ast_string_free((yyvsp[(4) - (4)].token));
+ }
+ break;
+
+ case 15:
+
+/* Line 1806 of yacc.c */
+#line 164 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node));
+ fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node));
+ }
+ break;
+
+ case 16:
+
+/* Line 1806 of yacc.c */
+#line 170 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE);
+ }
+ break;
+
+ case 17:
+
+/* Line 1806 of yacc.c */
+#line 174 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST);
+ }
+ break;
+
+ case 18:
+
+/* Line 1806 of yacc.c */
+#line 178 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE);
+ }
+ break;
+
+ case 19:
+
+/* Line 1806 of yacc.c */
+#line 182 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+ }
+ break;
+
+ case 20:
+
+/* Line 1806 of yacc.c */
+#line 186 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+ }
+ break;
+
+ case 21:
+
+/* Line 1806 of yacc.c */
+#line 191 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+ case 22:
+
+/* Line 1806 of yacc.c */
+#line 196 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+ case 23:
+
+/* Line 1806 of yacc.c */
+#line 202 "fts0pars.y"
+ {
+ (yyval.node) = (yyvsp[(2) - (2)].node);
+ }
+ break;
+
+ case 24:
+
+/* Line 1806 of yacc.c */
+#line 207 "fts0pars.y"
+ {
+ (yyval.node) = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token));
+ fts_ast_string_free((yyvsp[(1) - (1)].token));
+ }
+ break;
+
+
+
+/* Line 1806 of yacc.c */
+#line 1663 "fts0pars.cc"
+ default: break;
+ }
+ /* User semantic actions sometimes alter yychar, and that requires
+ that yytoken be updated with the new translation. We take the
+ approach of translating immediately before every use of yytoken.
+ One alternative is translating here after every semantic action,
+ but that translation would be missed if the semantic action invokes
+ YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or
+ if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an
+ incorrect destructor might then be invoked immediately. In the
+ case of YYERROR or YYBACKUP, subsequent parser actions might lead
+ to an incorrect destructor call or verbose syntax error message
+ before the lookahead is translated. */
+ YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+
+ *++yyvsp = yyval;
+
+ /* Now `shift' the result of the reduction. Determine what state
+ that goes to, based on the state we popped back to and the rule
+ number reduced by. */
+
+ yyn = yyr1[yyn];
+
+ yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+ if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+ yystate = yytable[yystate];
+ else
+ yystate = yydefgoto[yyn - YYNTOKENS];
+
+ goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* Back up yychar, in case we change it. */
+ yylastchar = yychar;
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
+
+ /* If not already recovering from an error, report this error. */
+ if (!yyerrstatus)
+ {
+ ++yynerrs;
+#if ! YYERROR_VERBOSE
+ yyerror (YY_("syntax error"));
+#else
+# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
+ yyssp, yytoken)
+ {
+ char const *yymsgp = YY_("syntax error");
+ int yysyntax_error_status;
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ if (yysyntax_error_status == 0)
+ yymsgp = yymsg;
+ else if (yysyntax_error_status == 1)
+ {
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+ yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
+ if (!yymsg)
+ {
+ yymsg = yymsgbuf;
+ yymsg_alloc = sizeof yymsgbuf;
+ yysyntax_error_status = 2;
+ }
+ else
+ {
+ yysyntax_error_status = YYSYNTAX_ERROR;
+ yymsgp = yymsg;
+ }
+ }
+ yyerror (yymsgp);
+ if (yysyntax_error_status == 2)
+ goto yyexhaustedlab;
+ }
+# undef YYSYNTAX_ERROR
+#endif
+ }
+
+
+
+ if (yyerrstatus == 3)
+ {
+ /* If just tried and failed to reuse lookahead token after an
+ error, discard it. */
+
+ if (yychar <= YYEOF)
+ {
+ /* Return failure if at end of input. */
+ if (yychar == YYEOF)
+ {
+ /* Since we don't need the token, we have to free it first. */
+ YYERRCLEANUP;
+ YYABORT;
+ }
+ }
+ else
+ {
+ yydestruct ("Error: discarding",
+ yytoken, &yylval);
+ yychar = YYEMPTY;
+ }
+ }
+
+ /* Else will try to reuse lookahead token after shifting the error
+ token. */
+ goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR. |
+`---------------------------------------------------*/
+yyerrorlab:
+
+ /* Pacify compilers like GCC when the user code never invokes
+ YYERROR and the label yyerrorlab therefore never appears in user
+ code. */
+ if (/*CONSTCOND*/ 0)
+ goto yyerrorlab;
+
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYERROR. */
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+ yystate = *yyssp;
+ goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR. |
+`-------------------------------------------------------------*/
+yyerrlab1:
+ yyerrstatus = 3; /* Each real token shifted decrements this. */
+
+ for (;;)
+ {
+ yyn = yypact[yystate];
+ if (!yypact_value_is_default (yyn))
+ {
+ yyn += YYTERROR;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ {
+ yyn = yytable[yyn];
+ if (0 < yyn)
+ break;
+ }
+ }
+
+ /* Pop the current state because it cannot handle the error token. */
+ if (yyssp == yyss)
+ {
+ /* Since we don't need the error token, we have to free it first. */
+ YYERRCLEANUP;
+ YYABORT;
+ }
+
+
+ yydestruct ("Error: popping",
+ yystos[yystate], yyvsp);
+ YYPOPSTACK (1);
+ yystate = *yyssp;
+ YY_STACK_PRINT (yyss, yyssp);
+ }
+
+ *++yyvsp = yylval;
+
+
+ /* Shift the error token. */
+ YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here. |
+`-------------------------------------*/
+yyacceptlab:
+ yyresult = 0;
+ goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here. |
+`-----------------------------------*/
+yyabortlab:
+ yyresult = 1;
+ goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here. |
+`-------------------------------------------------*/
+yyexhaustedlab:
+ yyerror (YY_("memory exhausted"));
+ yyresult = 2;
+ /* Fall through. */
+#endif
+
+yyreturn:
+ if (yychar != YYEMPTY)
+ {
+ /* Make sure we have latest lookahead translation. See comments at
+ user semantic actions for why this is necessary. */
+ yytoken = YYTRANSLATE (yychar);
+ yydestruct ("Cleanup: discarding lookahead",
+ yytoken, &yylval);
+ }
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYABORT or YYACCEPT. */
+ YYPOPSTACK (yylen);
+ YY_STACK_PRINT (yyss, yyssp);
+ while (yyssp != yyss)
+ {
+ yydestruct ("Cleanup: popping",
+ yystos[*yyssp], yyvsp);
+ YYPOPSTACK (1);
+ }
+#ifndef yyoverflow
+ if (yyss != yyssa)
+ YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+#endif
+ /* Make sure YYID is used. */
+ return YYID (yyresult);
+}
+
+
+
+/* Line 2067 of yacc.c */
+#line 212 "fts0pars.y"
+
+
+/********************************************************************
+Report a query parse error.*/
+int
+ftserror(
+/*=====*/
+ const char* p)
+{
+ my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p);
+ return(0);
+}
+
+/********************************************************************
+Create an fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode,
+ const byte* query,
+ ulint query_len)
+{
+ fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>(
+ ut_malloc(sizeof(fts_lexer_t)));
+
+ if (boolean_mode) {
+ fts0blex_init(&fts_lexer->yyscanner);
+ fts0b_scan_bytes(
+ reinterpret_cast<const char*>(query),
+ static_cast<int>(query_len),
+ fts_lexer->yyscanner);
+ fts_lexer->scanner = reinterpret_cast<fts_scan>(fts_blexer);
+ /* FIXME: Debugging */
+ /* fts0bset_debug(1 , fts_lexer->yyscanner); */
+ } else {
+ fts0tlex_init(&fts_lexer->yyscanner);
+ fts0t_scan_bytes(
+ reinterpret_cast<const char*>(query),
+ static_cast<int>(query_len),
+ fts_lexer->yyscanner);
+ fts_lexer->scanner = reinterpret_cast<fts_scan>(fts_tlexer);
+ }
+
+ return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer)
+{
+ if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+ fts0blex_destroy(fts_lexer->yyscanner);
+ } else {
+ fts0tlex_destroy(fts_lexer->yyscanner);
+ }
+
+ ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+ YYSTYPE* val,
+ fts_lexer_t* fts_lexer)
+{
+ fts_scanner_alt func_ptr;
+
+ func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+ return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+ fts_ast_state_t* state)
+{
+ return(ftsparse(state));
+}
+
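To see how the pieces of this generated file fit together, here is a minimal sketch, not part of this commit, of driving the parser over a boolean-mode query. The hypothetical fts_parse_sketch() assumes only what the code above already references: the lexer and root members of fts_ast_state_t, and the fts_lexer_create(), fts_parse() and fts_lexer_free() entry points defined at the end of the file.

```cpp
/* Sketch: parse a boolean-mode FTS query into an AST.
   Illustrative use of the entry points above; not from this commit. */
static int
fts_parse_sketch(fts_ast_state_t* state, const byte* query, ulint query_len)
{
	int	ret;

	/* Attach a boolean-mode scanner; YYLEX_PARAM routes each
	token request made by ftsparse() through this lexer. */
	state->lexer = fts_lexer_create(TRUE, query, query_len);

	/* Rule 2 of the grammar stores the AST root in state->root. */
	ret = fts_parse(state);

	fts_lexer_free(state->lexer);

	return(ret);
}
```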
diff --git a/storage/innobase/fts/fts0pars.y b/storage/innobase/fts/fts0pars.y
new file mode 100644
index 00000000000..e48036e82fe
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.y
@@ -0,0 +1,294 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+typedef int (*fts_scan)();
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+typedef int (*fts_scanner_alt)(YYSTYPE* val, yyscan_t yyscanner);
+typedef int (*fts_scanner)();
+
+struct fts_lexer_struct {
+ fts_scanner scanner;
+ void* yyscanner;
+};
+
+%}
+
+%union {
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper> FTS_OPER
+%token<token> FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node> prefix term text expr sub_expr expr_lst query
+
+%nonassoc '+' '-' '~' '<' '>'
+
+%%
+
+query : expr_lst {
+ $$ = $1;
+ ((fts_ast_state_t*) state)->root = $$;
+ }
+ ;
+
+expr_lst: /* Empty */ {
+ $$ = NULL;
+ }
+
+ | expr_lst expr {
+ $$ = $1;
+
+ if (!$$) {
+ $$ = fts_ast_create_node_list(state, $2);
+ } else {
+ fts_ast_add_node($$, $2);
+ }
+ }
+
+ | expr_lst sub_expr {
+ $$ = $1;
+ $$ = fts_ast_create_node_list(state, $1);
+
+ if (!$$) {
+ $$ = $2;
+ } else {
+ fts_ast_add_node($$, $2);
+ }
+ }
+ ;
+
+sub_expr: '(' expr_lst ')' {
+ $$ = $2;
+
+ if ($$) {
+ $$ = fts_ast_create_node_subexp_list(state, $$);
+ }
+ }
+
+ | prefix '(' expr_lst ')' {
+ $$ = fts_ast_create_node_list(state, $1);
+
+ if ($3) {
+ fts_ast_add_node($$,
+ fts_ast_create_node_subexp_list(state, $3));
+ }
+ }
+ ;
+
+expr : term {
+ $$ = $1;
+ }
+
+ | text {
+ $$ = $1;
+ }
+
+ | term '*' {
+ fts_ast_term_set_wildcard($1);
+ }
+
+ | text '@' FTS_NUMB {
+ fts_ast_term_set_distance($1, fts_ast_string_to_ul($3, 10));
+ fts_ast_string_free($3);
+ }
+
+ | prefix term '*' {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ fts_ast_term_set_wildcard($2);
+ }
+
+ | prefix term {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ }
+
+ | prefix text '@' FTS_NUMB {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ fts_ast_term_set_distance($2, fts_ast_string_to_ul($4, 10));
+ fts_ast_string_free($4);
+ }
+
+ | prefix text {
+ $$ = fts_ast_create_node_list(state, $1);
+ fts_ast_add_node($$, $2);
+ }
+ ;
+
+prefix : '-' {
+ $$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+ }
+
+ | '+' {
+ $$ = fts_ast_create_node_oper(state, FTS_EXIST);
+ }
+
+ | '~' {
+ $$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+ }
+
+ | '<' {
+ $$ = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+ }
+
+ | '>' {
+ $$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+ }
+ ;
+
+term : FTS_TERM {
+ $$ = fts_ast_create_node_term(state, $1);
+ fts_ast_string_free($1);
+ }
+
+ | FTS_NUMB {
+ $$ = fts_ast_create_node_term(state, $1);
+ fts_ast_string_free($1);
+ }
+
+ /* Ignore leading '*' */
+ | '*' term {
+ $$ = $2;
+ }
+ ;
+
+text : FTS_TEXT {
+ $$ = fts_ast_create_node_text(state, $1);
+ fts_ast_string_free($1);
+ }
+ ;
+%%
+
+/********************************************************************
+Report a query parse error.*/
+int
+ftserror(
+/*=====*/
+ const char* p)
+{
+ fprintf(stderr, "%s\n", p);
+ return(0);
+}
+
+/********************************************************************
+Create an fts_lexer_t instance.*/
+
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode,
+ const byte* query,
+ ulint query_len)
+{
+ fts_lexer_t* fts_lexer = static_cast<fts_lexer_t*>(
+ ut_malloc(sizeof(fts_lexer_t)));
+
+ if (boolean_mode) {
+ fts0blex_init(&fts_lexer->yyscanner);
+ fts0b_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+ fts_lexer->scanner = (fts_scan) fts_blexer;
+ /* FIXME: Debugging */
+ /* fts0bset_debug(1 , fts_lexer->yyscanner); */
+ } else {
+ fts0tlex_init(&fts_lexer->yyscanner);
+ fts0t_scan_bytes((char*) query, query_len, fts_lexer->yyscanner);
+ fts_lexer->scanner = (fts_scan) fts_tlexer;
+ }
+
+ return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer)
+{
+ if (fts_lexer->scanner == (fts_scan) fts_blexer) {
+ fts0blex_destroy(fts_lexer->yyscanner);
+ } else {
+ fts0tlex_destroy(fts_lexer->yyscanner);
+ }
+
+ ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+
+int
+fts_lexer(
+/*======*/
+ YYSTYPE* val,
+ fts_lexer_t* fts_lexer)
+{
+ fts_scanner_alt func_ptr;
+
+ func_ptr = (fts_scanner_alt) fts_lexer->scanner;
+
+ return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+ fts_ast_state_t* state)
+{
+ return(ftsparse(state));
+}
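To make the grammar concrete, the trace below is an illustrative walk-through (not part of the file) of how a boolean query reduces through the rules above; the exact tokenization of the input is handled by the flex scanners.

```cpp
/* For a query such as:   +apple -juice "fresh fruit" @3
 *
 *   '+'          -> prefix (FTS_EXIST node)   via  prefix : '+'
 *   apple        -> term   (FTS_TERM node)    via  term   : FTS_TERM
 *   prefix term  -> expr   (a list: operator node, then term node)
 *   '-' juice    -> expr   (FTS_IGNORE node, term node) the same way
 *   "fresh fruit" '@' FTS_NUMB -> expr, with fts_ast_term_set_distance()
 *                   recording the proximity distance 3
 *
 * expr_lst collects each expr left to right, and the query rule
 * finally stores the list in ((fts_ast_state_t*) state)->root. */
```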
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
new file mode 100644
index 00000000000..4629e3b7e91
--- /dev/null
+++ b/storage/innobase/fts/fts0que.cc
@@ -0,0 +1,4473 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h" /* dict_table_get_n_rows() */
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "ha_prototypes.h"
+#include <ctype.h>
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#include <vector>
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * (n) + (j)])
+
+#define RANK_DOWNGRADE (-1.0F)
+#define RANK_UPGRADE (1.0F)
+
+/* Maximum number of words supported in a phrase or proximity search. */
+#define MAX_PROXIMITY_ITEM 128
+
+/* Memory used by the rbt itself for creation and for adding a node */
+#define SIZEOF_RBT_CREATE (sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2)
+#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t)
+
+/* Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN 4
+
+/* Coefficient used to normalize the relevance ranking. */
+static const double FTS_NORMALIZE_COEFF = 0.0115F;
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+typedef std::vector<fts_string_t> word_vector_t;
+
+struct fts_word_freq_t;
+
+/** State of an FTS query. */
+struct fts_query_t {
+ mem_heap_t* heap; /*!< Heap to use for allocations */
+
+ trx_t* trx; /*!< The query transaction */
+
+ dict_index_t* index; /*!< The FTS index to search */
+ /*!< FTS auxiliary common table def */
+ fts_table_t fts_common_table;
+
+ fts_table_t fts_index_table;/*!< FTS auxiliary index table def */
+
+ ulint total_size; /*!< total memory size used by query */
+
+ fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be
+ filtered from the output */
+
+ fts_ast_node_t* root; /*!< Abstract syntax tree */
+
+ fts_ast_node_t* cur_node; /*!< Current tree node */
+
+ ib_rbt_t* word_map; /*!< Matched word map for
+ searching by word*/
+
+ word_vector_t* word_vector; /*!< Matched word vector for
+ searching by index */
+
+ ib_rbt_t* doc_ids; /*!< The current set of matching
+ doc ids, elements are of
+ type fts_ranking_t */
+
+ ib_rbt_t* intersection; /*!< The doc ids that were found in
+ doc_ids, this tree will become
+ the new doc_ids, elements are of type
+ fts_ranking_t */
+
+ /*!< Prepared statement to read the
+ nodes from the FTS INDEX */
+ que_t* read_nodes_graph;
+
+ fts_ast_oper_t oper; /*!< Current boolean mode operator */
+
+ /*!< TRUE if we want to collect the
+ word positions within the document */
+ ibool collect_positions;
+
+ ulint flags; /*!< Specify the full text search type,
+ such as boolean search, phrase
+ search, proximity search etc. */
+
+ ulint distance; /*!< The proximity distance of a
+ phrase search. */
+
+ /*!< These doc ids are used as a
+ boundary condition when searching the
+ FTS index rows */
+
+ doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */
+
+ doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */
+
+ bool boolean_mode; /*!< TRUE if boolean mode query */
+
+ ib_vector_t* matched; /*!< Array of matching documents
+ (fts_match_t) to search for a phrase */
+
+ ib_vector_t** match_array; /*!< Used for proximity search, contains
+ position info for each matched word
+ in the word list */
+
+ ib_uint64_t total_docs; /*!< The total number of documents */
+
+ ulint total_words; /*!< The total number of words */
+
+ dberr_t error; /*!< Error code if any, that is
+ encountered during query processing */
+
+ ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per
+ document, its elements are of type
+ fts_word_freq_t */
+
+ bool multi_exist; /*!< multiple FTS_EXIST oper */
+};
+
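As the member comments above note, intersection holds the doc ids found in doc_ids while an operand is processed and then becomes the new result set. The sketch below restates that AND semantics under stated assumptions, using std::set in place of the rb-tree purely for brevity; it is not the function this file defines.

```cpp
#include <set>

typedef std::set<unsigned long long> doc_set_t;

/* Illustrative only: the intersection flow described by the
   fts_query_t comments, sketched with std::set. */
static doc_set_t
fts_and_sketch(const doc_set_t& doc_ids, const doc_set_t& word_hits)
{
	doc_set_t	intersection;

	for (doc_set_t::const_iterator it = word_hits.begin();
	     it != word_hits.end(); ++it) {
		/* Keep only documents already in the result set. */
		if (doc_ids.count(*it)) {
			intersection.insert(*it);
		}
	}

	/* The caller would now treat 'intersection' as the new doc_ids. */
	return(intersection);
}
```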
+/** For phrase matching, first we collect the documents and the positions
+then we match. */
+struct fts_match_t {
+ doc_id_t doc_id; /*!< Document id */
+
+ ulint start; /*!< Start the phrase match from
+ this offset within the positions
+ vector. */
+
+ ib_vector_t* positions; /*!< Offsets of a word in a
+ document */
+};
+
+/** For matching tokens in a phrase search. We use this data structure in
+the callback that determines whether a document should be accepted or
+rejected for a phrase search. */
+struct fts_select_t {
+ doc_id_t doc_id; /*!< The document id to match */
+
+ ulint min_pos; /*!< For found to be TRUE at least
+ one position must be greater than
+ min_pos. */
+
+ ibool found; /*!< TRUE if found */
+
+ fts_word_freq_t*
+ word_freq; /*!< Word frequency instance of the
+ current word being looked up in
+ the FTS index */
+};
+
+typedef std::vector<ulint> pos_vector_t;
+
+/** This structure defines a set of ranges in the original documents, each of
+which has a minimum position and a maximum position. Text in such a range
+should contain all the words in the proximity search. We need to count the
+words in such a range to make sure the count is within the specified distance
+of the proximity search. */
+struct fts_proximity_t {
+	ulint		n_pos;		/*!< number of position ranges; each
+					range (min to max) contains all
+					matching words */
+ pos_vector_t min_pos; /*!< the minimum position (in bytes)
+ of the range */
+ pos_vector_t max_pos; /*!< the maximum position (in bytes)
+ of the range */
+};
+
+/** The match positions and tokens to match */
+struct fts_phrase_t {
+ ibool found; /*!< Match result */
+
+ const fts_match_t*
+ match; /*!< Positions within text */
+
+ const ib_vector_t*
+ tokens; /*!< Tokens to match */
+
+ ulint distance; /*!< For matching on proximity
+ distance. Can be 0 for exact match */
+ CHARSET_INFO* charset; /*!< Phrase match charset */
+ mem_heap_t* heap; /*!< Heap for word processing */
+ ulint zip_size; /*!< row zip size */
+ fts_proximity_t*proximity_pos; /*!< position info for proximity
+ search verification. Records the min
+ and max position of words matched */
+};
+
+/** For storing the frequency of a word/term in a document */
+struct fts_doc_freq_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint freq; /*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_t {
+ fts_string_t word; /*!< Word for which we need the freq,
+ it's allocated on the query heap */
+
+ ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document
+ word frequencies. The elements are
+ of type fts_doc_freq_t */
+ ib_uint64_t doc_count; /*!< Total number of documents that
+ contain this word */
+ double idf; /*!< Inverse document frequency */
+};
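+
+/* Note (illustrative): idf is filled in by the ranking code later in this
+file; the usual inverse-document-frequency form is
+log10(total_docs / doc_count), so a word that appears in every document
+contributes almost nothing to the rank. */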
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: the current word */
+ fts_word_freq_t* word_freq, /*!< in/out: word frequency */
+ const fts_node_t* node, /*!< in: current FTS node */
+ void* data, /*!< in: doc id ilist */
+ ulint len, /*!< in: doc id ilist size */
+ ibool calc_doc_count);/*!< in: whether to remember doc
+ count */
+
+#if 0
+/*****************************************************************//***
+Find a doc_id in a word's ilist.
+@return TRUE if found. */
+static
+ibool
+fts_query_find_doc_id(
+/*==================*/
+ fts_select_t* select, /*!< in/out: search the doc id selected,
+ update the frequency if found. */
+ void* data, /*!< in: doc id ilist */
+ ulint len); /*!< in: doc id ilist size */
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the documents again, thus "expanding"
+the search result set.
+@return DB_SUCCESS if successful, otherwise the error code */
+static
+dberr_t
+fts_expand_query(
+/*=============*/
+ dict_index_t* index, /*!< in: FTS index to search */
+ fts_query_t* query) /*!< in: query result, to be freed
+ by the client */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+This function finds documents that contain all the words in a
+phrase or proximity search. For a proximity search, it also verifies
+that the words are close enough to each other, i.e. within the
+specified distance. This function is called for phrase and proximity
+search.
+@return TRUE if documents are found, FALSE otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
+ ib_vector_t* tokens); /*!< in: Tokens contain words */
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos); /*!< out: the position info
+ records ranges containing
+ all matching words. */
+#if 0
+/********************************************************************
+Get the total number of words in a document. */
+static
+ulint
+fts_query_terms_in_document(
+/*========================*/
+ /*!< out: DB_SUCCESS if all go well
+ else error code */
+ fts_query_t* query, /*!< in: FTS query state */
+	doc_id_t	doc_id,	/*!< in: the doc id to check */
+ ulint* total); /*!< out: total words in document */
+#endif
+
+/********************************************************************
+Compare two fts_doc_freq_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_freq_doc_id_cmp(
+/*================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1;
+ const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2;
+
+ return((int) (fq1->doc_id - fq2->doc_id));
+}
+
+#if 0
+/*******************************************************************//**
+Print the table used for calculating LCS. */
+static
+void
+fts_print_lcs_table(
+/*================*/
+ const ulint* table, /*!< in: array to print */
+ ulint n_rows, /*!< in: total no. of rows */
+ ulint n_cols) /*!< in: total no. of cols */
+{
+ ulint i;
+
+ for (i = 0; i < n_rows; ++i) {
+ ulint j;
+
+ printf("\n");
+
+ for (j = 0; j < n_cols; ++j) {
+
+ printf("%2lu ", FTS_ELEM(table, n_cols, i, j));
+ }
+ }
+}
+
+/********************************************************************
+Find the longest common subsequence between the query string and
+the document. */
+static
+ulint
+fts_query_lcs(
+/*==========*/
+ /*!< out: LCS (length) between
+ two ilists */
+ const ulint* p1, /*!< in: word positions of query */
+ ulint len_p1, /*!< in: no. of elements in p1 */
+ const ulint* p2, /*!< in: word positions within document */
+ ulint len_p2) /*!< in: no. of elements in p2 */
+{
+ int i;
+ ulint len = 0;
+ ulint r = len_p1;
+ ulint c = len_p2;
+ ulint size = (r + 1) * (c + 1) * sizeof(ulint);
+ ulint* table = (ulint*) ut_malloc(size);
+
+ /* Traverse the table backwards, from the last row to the first and
+ also from the last column to the first. We compute the smaller
+	common subsequences first, then use the calculated values to determine
+ the longest common subsequence. The result will be in TABLE[0][0]. */
+ for (i = r; i >= 0; --i) {
+ int j;
+
+ for (j = c; j >= 0; --j) {
+
+ if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) {
+
+ FTS_ELEM(table, c, i, j) = 0;
+
+ } else if (p1[i] == p2[j]) {
+
+ FTS_ELEM(table, c, i, j) = FTS_ELEM(
+ table, c, i + 1, j + 1) + 1;
+
+ } else {
+
+ ulint value;
+
+ value = ut_max(
+ FTS_ELEM(table, c, i + 1, j),
+ FTS_ELEM(table, c, i, j + 1));
+
+ FTS_ELEM(table, c, i, j) = value;
+ }
+ }
+ }
+
+ len = FTS_ELEM(table, c, 0, 0);
+
+ fts_print_lcs_table(table, r, c);
+ printf("\nLen=%lu\n", len);
+
+ ut_free(table);
+
+ return(len);
+}
+#endif
+
+/*******************************************************************//**
+Compare two fts_ranking_t instances on their rank value and doc ids in
+descending order on the rank and ascending order on doc id.
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+static
+int
+fts_query_compare_rank(
+/*===================*/
+ const void* p1, /*!< in: pointer to elem */
+ const void* p2) /*!< in: pointer to elem */
+{
+ const fts_ranking_t* r1 = (const fts_ranking_t*) p1;
+ const fts_ranking_t* r2 = (const fts_ranking_t*) p2;
+
+ if (r2->rank < r1->rank) {
+ return(-1);
+ } else if (r2->rank == r1->rank) {
+
+		if (r1->doc_id < r2->doc_id) {
+			return(-1);
+ } else if (r1->doc_id > r2->doc_id) {
+ return(1);
+ }
+
+ return(0);
+ }
+
+ return(1);
+}
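+
+/* Example: the (rank, doc_id) pairs (0.9, 7), (0.9, 3) and (0.5, 1)
+sort as (0.9, 3), (0.9, 7), (0.5, 1): higher ranks first, with ties
+broken by ascending doc id. */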
+
+#ifdef FTS_UTF8_DEBUG
+/*******************************************************************//**
+Convert string to lowercase.
+@return lower case string, caller's responsibility to delete using
+ut_free() */
+static
+byte*
+fts_tolower(
+/*========*/
+ const byte* src, /*!< in: src string */
+ ulint len) /*!< in: src string length */
+{
+ fts_string_t str;
+ byte* lc_str = ut_malloc(len + 1);
+
+ str.f_len = len;
+ str.f_str = lc_str;
+
+ memcpy(str.f_str, src, len);
+
+ /* Make sure the last byte is NUL terminated */
+ str.f_str[len] = '\0';
+
+ fts_utf8_tolower(&str);
+
+ return(lc_str);
+}
+
+/*******************************************************************//**
+Do a case insensitive search. Doesn't check for a NUL byte end marker;
+relies only on len. Converts str2 to lower case before comparing.
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */
+static
+int
+fts_utf8_strcmp(
+/*============*/
+ const fts_string_t*
+ str1, /*!< in: should be lower case*/
+
+ fts_string_t* str2) /*!< in: any case. We will use the length
+ of this string during compare as it
+ should be the min of the two strings */
+{
+ byte b = str2->f_str[str2->f_len];
+
+ ut_a(str2->f_len <= str1->f_len);
+
+ /* We need to write a NUL byte at the end of the string because the
+ string is converted to lowercase by a MySQL function which doesn't
+ care about the length. */
+ str2->f_str[str2->f_len] = 0;
+
+ fts_utf8_tolower(str2);
+
+ /* Restore the value we replaced above. */
+ str2->f_str[str2->f_len] = b;
+
+ return(memcmp(str1->f_str, str2->f_str, str2->f_len));
+}
+#endif
+
+/*******************************************************************//**
+Create words in ranking */
+static
+void
+fts_ranking_words_create(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking) /*!< in: ranking instance */
+{
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN));
+ ranking->words_len = RANKING_WORDS_INIT_LEN;
+}
+
+/*
+The optimization here is to use a char array (bitmap) in place of the words
+rb tree in fts_ranking_t.
+
+It can save a lot of memory, except in some cases of QUERY EXPANSION.
+
+'word_map' is used as a word dictionary, in which the key is a word and the
+value is a number. In 'fts_ranking_words_add', we first check whether the
+word is in 'word_map'. If not, we add it to 'word_map' and assign it a
+position (actually a number). Then we set the bit at that position to '1'
+in the char array 'words'.
+
+'word_vector' is a useful backup of 'word_map': we can get a word by its
+position more quickly than by searching 'word_map' by value. We use
+'word_vector' in 'fts_query_calculate_ranking' and 'fts_expand_query'. In
+those two functions, we scan the bitmap 'words', fetch the word for each
+bit that is '1', and then look up the word_freq by that word.
+*/
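+
+/* Worked example (illustrative): if "apple" and "boy" are added in that
+order, "apple" gets position 0 and "boy" position 1 in word_map and
+word_vector. A ranking whose document matched only "boy" has bit 1 set
+in 'words': byte_offset = 1 / CHAR_BIT = 0, bit_offset = 1 % CHAR_BIT = 1,
+so words[0] == 0x02. */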
+
+/*******************************************************************//**
+Add a word into ranking */
+static
+void
+fts_ranking_words_add(
+/*==================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking, /*!< in: ranking instance */
+ const fts_string_t* word) /*!< in: term/word to add */
+{
+ ulint pos;
+ ulint byte_offset;
+ ulint bit_offset;
+ ib_rbt_bound_t parent;
+
+ /* Note: we suppose the word map and vector are append-only. */
+ ut_ad(query->word_vector->size() == rbt_size(query->word_map));
+
+ /* We use ib_rbt to simulate a map, f_n_char means position. */
+ if (rbt_search(query->word_map, &parent, word) == 0) {
+ fts_string_t* result_word;
+
+ result_word = rbt_value(fts_string_t, parent.last);
+ pos = result_word->f_n_char;
+ ut_ad(pos < rbt_size(query->word_map));
+ } else {
+ /* Add the word to map. */
+ fts_string_t new_word;
+
+ pos = rbt_size(query->word_map);
+
+ new_word.f_str = static_cast<byte*>(mem_heap_alloc(query->heap,
+ word->f_len + 1));
+ memcpy(new_word.f_str, word->f_str, word->f_len);
+ new_word.f_str[word->f_len] = 0;
+ new_word.f_len = word->f_len;
+ new_word.f_n_char = pos;
+
+ rbt_add_node(query->word_map, &parent, &new_word);
+ ut_ad(rbt_validate(query->word_map));
+ query->word_vector->push_back(new_word);
+ }
+
+	/* Check whether the words bitmap is big enough; grow it if not. */
+ byte_offset = pos / CHAR_BIT;
+ if (byte_offset >= ranking->words_len) {
+ byte* words = ranking->words;
+ ulint words_len = ranking->words_len;
+
+ while (byte_offset >= words_len) {
+ words_len *= 2;
+ }
+
+ ranking->words = static_cast<byte*>(
+ mem_heap_zalloc(query->heap, words_len));
+ ut_memcpy(ranking->words, words, ranking->words_len);
+ ranking->words_len = words_len;
+ }
+
+ /* Set ranking words */
+ ut_ad(byte_offset < ranking->words_len);
+ bit_offset = pos % CHAR_BIT;
+ ranking->words[byte_offset] |= 1 << bit_offset;
+}
+
+/*******************************************************************//**
+Get the next word from a ranking
+@return true if it's successful */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+ const fts_query_t* query, /*!< in: query instance */
+ fts_ranking_t* ranking,/*!< in: ranking instance */
+ ulint* pos, /*!< in/out: word start pos */
+	fts_string_t*	word)	/*!< out: next word found */
+{
+ bool ret = false;
+ ulint max_pos = ranking->words_len * CHAR_BIT;
+
+ /* Search for next word */
+ while (*pos < max_pos) {
+ ulint byte_offset = *pos / CHAR_BIT;
+ ulint bit_offset = *pos % CHAR_BIT;
+
+ if (ranking->words[byte_offset] & (1 << bit_offset)) {
+ ret = true;
+ break;
+ }
+
+ *pos += 1;
+	}
+
+ /* Get next word from word vector */
+ if (ret) {
+ ut_ad(*pos < query->word_vector->size());
+ *word = query->word_vector->at((size_t)*pos);
+ *pos += 1;
+ }
+
+ return ret;
+}
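+
+#if 0
+/* Illustrative sketch, not compiled: the intended round trip of
+fts_ranking_words_add() and fts_ranking_words_get_next(). The function
+name below is hypothetical; 'query' and 'ranking' stand for instances
+set up by the query processing code, with ranking->words already
+initialized via fts_ranking_words_create(). */
+static
+void
+fts_ranking_words_example(
+/*======================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	fts_ranking_t*		ranking,/*!< in: ranking instance */
+	const fts_string_t*	word)	/*!< in: a matched word */
+{
+	ulint		pos = 0;
+	fts_string_t	next_word;
+
+	/* Record that this document matched 'word'; the word is assigned
+	a stable position in query->word_map and the corresponding bit is
+	set in ranking->words. */
+	fts_ranking_words_add(query, ranking, word);
+
+	/* Scan the bitmap: each set bit yields one matched word, looked
+	up by position in query->word_vector. */
+	while (fts_ranking_words_get_next(query, ranking, &pos,
+					  &next_word)) {
+		/* next_word.f_str / next_word.f_len identify the term. */
+	}
+}
+#endif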
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word) /*!< in: term/word to add */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the word in our rb tree and add if it doesn't exist. */
+ if (rbt_search(query->word_freqs, &parent, word) != 0) {
+ fts_word_freq_t word_freq;
+
+ memset(&word_freq, 0, sizeof(word_freq));
+
+ word_freq.word.f_str = static_cast<byte*>(
+ mem_heap_alloc(query->heap, word->f_len + 1));
+ memcpy(word_freq.word.f_str, word->f_str, word->f_len);
+ word_freq.word.f_str[word->f_len] = 0;
+ word_freq.word.f_len = word->f_len;
+
+ word_freq.doc_count = 0;
+
+ word_freq.doc_freqs = rbt_create(
+ sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+ parent.last = rbt_add_node(
+ query->word_freqs, &parent, &word_freq);
+
+ query->total_size += word->f_len
+ + SIZEOF_RBT_CREATE
+ + SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_word_freq_t);
+ }
+
+ return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to word */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */
+ doc_id_t doc_id) /*!< in: doc id to add */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the doc id in our rb tree and add if it doesn't exist. */
+ if (rbt_search(doc_freqs, &parent, &doc_id) != 0) {
+ fts_doc_freq_t doc_freq;
+
+ memset(&doc_freq, 0, sizeof(doc_freq));
+
+ doc_freq.freq = 0;
+ doc_freq.doc_id = doc_id;
+
+ parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_doc_freq_t);
+ }
+
+ return(rbt_value(fts_doc_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add the doc id to the query set only if it's not in the
+deleted array. */
+static
+void
+fts_query_union_doc_id(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: the doc id to add */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Add only if the doc id is not deleted and not already in our set. */
+ if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+
+ fts_ranking_t ranking;
+
+ ranking.rank = rank;
+ ranking.doc_id = doc_id;
+ fts_ranking_words_create(query, &ranking);
+
+ rbt_add_node(query->doc_ids, &parent, &ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN;
+ }
+}
+
+/*******************************************************************//**
+Remove the doc id from the query set only if it's not in the
+deleted set. */
+static
+void
+fts_query_remove_doc_id(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+	doc_id_t	doc_id)	/*!< in: the doc id to remove */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Remove only if the doc id is not deleted and is in our set. */
+ if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+ ut_free(rbt_remove_node(query->doc_ids, parent.last));
+
+ ut_ad(query->total_size >=
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+}
+
+/*******************************************************************//**
+Find the doc id in the query set but not in the deleted set, and artificially
+downgrade or upgrade its ranking by a value, which can push the ranking
+below or above its normal range of 0 to 1. This is used for boolean search
+operators such as the negation operator, which makes a word's contribution
+to the row's relevance negative. */
+static
+void
+fts_query_change_ranking(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+	doc_id_t	doc_id,	/*!< in: the doc id to adjust */
+ ibool downgrade) /*!< in: Whether to downgrade ranking */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
+
+	/* Adjust only if the doc id is not deleted and is in our set. */
+ if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE;
+
+		/* Clamp the rank to the range [-1.0, 1.0] after the
+		RANK_DOWNGRADE (-1.0) or RANK_UPGRADE (+1.0) adjustment. */
+ if (ranking->rank >= 1.0F) {
+ ranking->rank = 1.0F;
+ } else if (ranking->rank <= -1.0F) {
+ ranking->rank = -1.0F;
+ }
+ }
+}
+
+/*******************************************************************//**
+Check the doc id in the query set only if it's not in the
+deleted array. The doc ids that were found are stored in
+another rb tree (fts_query_t::intersection). */
+static
+void
+fts_query_intersect_doc_id(
+/*=======================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: the doc id to add */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ ib_rbt_bound_t parent;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array = (fts_update_t*) query->deleted->doc_ids->data;
+	fts_ranking_t*	ranking = NULL;
+
+	/* There are three types of intersect:
+	1. '+a': doc_ids is empty; add a doc into intersect if it matches 'a'.
+	2. 'a +b': docs matching 'a' are in doc_ids; add a doc into intersect
+	if it matches 'b'. If the doc is also in doc_ids, then change the
+	doc's rank, and add 'a' to the doc's words.
+	3. '+a +b': docs matching '+a' are in doc_ids; add a doc into intersect
+	if it matches 'b' and it's in doc_ids (multi_exist = true). */
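+
+	/* Example (illustrative): for the query 'a +b', a document that
+	matched 'a' is already in doc_ids; when it also matches 'b' it is
+	copied into the intersection with its rank merged and its words
+	list carried over. */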
+
+	/* Proceed only if the doc id is not deleted */
+ if (fts_bsearch(array, 0, static_cast<int>(size), doc_id) < 0) {
+ fts_ranking_t new_ranking;
+
+ if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) {
+ if (query->multi_exist) {
+ return;
+ } else {
+ new_ranking.words = NULL;
+ }
+ } else {
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ /* We've just checked the doc id before */
+ if (ranking->words == NULL) {
+ ut_ad(rbt_search(query->intersection, &parent,
+ ranking) == 0);
+ return;
+ }
+
+ /* Merge rank */
+ rank += ranking->rank;
+ if (rank >= 1.0F) {
+ rank = 1.0F;
+ } else if (rank <= -1.0F) {
+ rank = -1.0F;
+ }
+
+ /* Take words */
+ new_ranking.words = ranking->words;
+ new_ranking.words_len = ranking->words_len;
+ }
+
+ new_ranking.rank = rank;
+ new_ranking.doc_id = doc_id;
+
+ if (rbt_search(query->intersection, &parent,
+ &new_ranking) != 0) {
+ if (new_ranking.words == NULL) {
+ fts_ranking_words_create(query, &new_ranking);
+
+ query->total_size += RANKING_WORDS_INIT_LEN;
+ } else {
+ /* Note that the intersection has taken
+ ownership of the ranking data. */
+ ranking->words = NULL;
+ }
+
+ rbt_add_node(query->intersection,
+ &parent, &new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+ }
+}
+
+/*******************************************************************//**
+Free the document ranking rb tree. */
+static
+void
+fts_query_free_doc_ids(
+/*===================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_rbt_t* doc_ids) /*!< in: rb tree to free */
+{
+ const ib_rbt_node_t* node;
+
+ for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ if (ranking->words) {
+ ranking->words = NULL;
+ }
+
+ ut_free(rbt_remove_node(doc_ids, node));
+
+ ut_ad(query->total_size >=
+ SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t));
+ query->total_size -= SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+
+ rbt_free(doc_ids);
+
+ ut_ad(query->total_size >= SIZEOF_RBT_CREATE);
+ query->total_size -= SIZEOF_RBT_CREATE;
+}
+
+/*******************************************************************//**
+Add the word to the document's "list" of matching words from
+the query. We make a copy of the word from the query heap. */
+static
+void
+fts_query_add_word_to_document(
+/*===========================*/
+ fts_query_t* query, /*!< in: query to update */
+ doc_id_t doc_id, /*!< in: the document to update */
+ const fts_string_t* word) /*!< in: the token to add */
+{
+ ib_rbt_bound_t parent;
+ fts_ranking_t* ranking = NULL;
+
+ if (query->flags == FTS_OPT_RANKING) {
+ return;
+ }
+
+ /* First we search the intersection RB tree as it could have
+ taken ownership of the words rb tree instance. */
+ if (query->intersection
+ && rbt_search(query->intersection, &parent, &doc_id) == 0) {
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+ }
+
+ if (ranking == NULL
+ && rbt_search(query->doc_ids, &parent, &doc_id) == 0) {
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+ }
+
+ if (ranking != NULL) {
+ fts_ranking_words_add(query, ranking, word);
+ }
+}
+
+/*******************************************************************//**
+Check the node ilist. */
+static
+void
+fts_query_check_node(
+/*=================*/
+ fts_query_t* query, /*!< in: query to update */
+ const fts_string_t* token, /*!< in: the token to search */
+ const fts_node_t* node) /*!< in: node to check */
+{
+	/* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && ((query->upper_doc_id > 0
+ && node->first_doc_id > query->upper_doc_id)
+ || (query->lower_doc_id > 0
+ && node->last_doc_id < query->lower_doc_id))) {
+
+ /* Ignore */
+
+ } else {
+ int ret;
+ ib_rbt_bound_t parent;
+ ulint ilist_size = node->ilist_size;
+ fts_word_freq_t*word_freqs;
+
+ /* The word must exist. */
+ ret = rbt_search(query->word_freqs, &parent, token);
+ ut_a(ret == 0);
+
+ word_freqs = rbt_value(fts_word_freq_t, parent.last);
+
+ query->error = fts_query_filter_doc_ids(
+ query, token, word_freqs, node,
+ node->ilist, ilist_size, TRUE);
+ }
+}
+
+/*****************************************************************//**
+Search index cache for word with wildcard match.
+@return number of words matched */
+static
+ulint
+fts_cache_find_wildcard(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_index_cache_t*index_cache, /*!< in: cache to search */
+ const fts_string_t* token) /*!< in: token to search */
+{
+ ib_rbt_bound_t parent;
+ const ib_vector_t* nodes = NULL;
+ fts_string_t srch_text;
+ byte term[FTS_MAX_WORD_LEN + 1];
+ ulint num_word = 0;
+
+ srch_text.f_len = (token->f_str[token->f_len - 1] == '%')
+ ? token->f_len - 1
+ : token->f_len;
+
+ strncpy((char*) term, (char*) token->f_str, srch_text.f_len);
+ term[srch_text.f_len] = '\0';
+ srch_text.f_str = term;
+
+ /* Lookup the word in the rb tree */
+ if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL,
+ innobase_fts_text_cmp_prefix) == 0) {
+ const fts_tokenizer_word_t* word;
+ ulint i;
+ const ib_rbt_node_t* cur_node;
+ ibool forward = FALSE;
+
+ word = rbt_value(fts_tokenizer_word_t, parent.last);
+ cur_node = parent.last;
+
+ while (innobase_fts_text_cmp_prefix(
+ index_cache->charset, &srch_text, &word->text) == 0) {
+
+ nodes = word->nodes;
+
+ for (i = 0; nodes && i < ib_vector_size(nodes); ++i) {
+ int ret;
+ const fts_node_t* node;
+ ib_rbt_bound_t freq_parent;
+ fts_word_freq_t* word_freqs;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ ret = rbt_search(query->word_freqs,
+ &freq_parent,
+ &srch_text);
+
+ ut_a(ret == 0);
+
+ word_freqs = rbt_value(
+ fts_word_freq_t,
+ freq_parent.last);
+
+ query->error = fts_query_filter_doc_ids(
+ query, &srch_text,
+ word_freqs, node,
+ node->ilist, node->ilist_size, TRUE);
+
+ if (query->error != DB_SUCCESS) {
+ return(0);
+ }
+ }
+
+ num_word++;
+
+ if (!forward) {
+ cur_node = rbt_prev(
+ index_cache->words, cur_node);
+ } else {
+cont_search:
+ cur_node = rbt_next(
+ index_cache->words, cur_node);
+ }
+
+ if (!cur_node) {
+ break;
+ }
+
+ word = rbt_value(fts_tokenizer_word_t, cur_node);
+ }
+
+ if (!forward) {
+ forward = TRUE;
+ cur_node = parent.last;
+ goto cont_search;
+ }
+ }
+
+ return(num_word);
+}
+
+/*****************************************************************//**
+Set difference.
+@return DB_SUCCESS if all goes well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_difference(
+/*=================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* token) /*!< in: token to search */
+{
+	ulint		n_doc_ids = 0;
+ trx_t* trx = query->trx;
+ dict_table_t* table = query->index->table;
+
+ ut_a(query->oper == FTS_IGNORE);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ fprintf(stderr, "DIFFERENCE: Searching: '%.*s'\n",
+ (int) token->f_len, token->f_str);
+#endif
+
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids);
+ }
+
+	/* There is nothing we can subtract from an empty set. */
+ if (query->doc_ids && !rbt_empty(query->doc_ids)) {
+ ulint i;
+ fts_fetch_t fetch;
+ const ib_vector_t* nodes;
+ const fts_index_cache_t*index_cache;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
+
+ rw_lock_x_lock(&cache->lock);
+
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache */
+ ut_a(index_cache != NULL);
+
+ /* Search the cache for a matching word first. */
+ if (query->cur_node->term.wildcard
+ && query->flags != FTS_PROXIMITY
+ && query->flags != FTS_PHRASE) {
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+ }
+
+ /* The size can't increase. */
+ ut_a(rbt_size(query->doc_ids) <= n_doc_ids);
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Intersect the token doc ids with the current set.
+@return DB_SUCCESS if all goes well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_intersect(
+/*================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* token) /*!< in: the token to search */
+{
+ trx_t* trx = query->trx;
+ dict_table_t* table = query->index->table;
+
+ ut_a(query->oper == FTS_EXIST);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ fprintf(stderr, "INTERSECT: Searching: '%.*s'\n",
+ (int) token->f_len, token->f_str);
+#endif
+
+	/* If the current doc id set is empty and multi_exist is true,
+	we know in advance that the intersection set will be empty. */
+ if (!(rbt_empty(query->doc_ids) && query->multi_exist)) {
+ ulint n_doc_ids = 0;
+ ulint i;
+ fts_fetch_t fetch;
+ const ib_vector_t* nodes;
+ const fts_index_cache_t*index_cache;
+ que_t* graph = NULL;
+ fts_cache_t* cache = table->fts->cache;
+ dberr_t error;
+
+ ut_a(!query->intersection);
+
+ n_doc_ids = rbt_size(query->doc_ids);
+
+ /* Create the rb tree that will hold the doc ids of
+ the intersection. */
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+
+ /* This is to avoid decompressing the ilist if the
+ node's ilist doc ids are out of range. */
+ if (!rbt_empty(query->doc_ids) && query->multi_exist) {
+ const ib_rbt_node_t* node;
+ doc_id_t* doc_id;
+
+ node = rbt_first(query->doc_ids);
+ doc_id = rbt_value(doc_id_t, node);
+ query->lower_doc_id = *doc_id;
+
+ node = rbt_last(query->doc_ids);
+ doc_id = rbt_value(doc_id_t, node);
+ query->upper_doc_id = *doc_id;
+
+ } else {
+ query->lower_doc_id = 0;
+ query->upper_doc_id = 0;
+ }
+
+ /* Search the cache for a matching word first. */
+
+ rw_lock_x_lock(&cache->lock);
+
+ /* Search for the index specific cache. */
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache. */
+ ut_a(index_cache != NULL);
+
+ if (query->cur_node->term.wildcard) {
+ /* Wildcard search the index cache */
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ /* error is passed by 'query->error' */
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(query->error);
+ }
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+
+ if (query->error == DB_SUCCESS) {
+			/* Make the intersection (rb tree) the current doc id
+ set and free the old set. */
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+
+ ut_a(!query->multi_exist || (query->multi_exist
+ && rbt_size(query->doc_ids) <= n_doc_ids));
+ }
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Query index cache.
+@return DB_SUCCESS if all goes well */
+static
+dberr_t
+fts_query_cache(
+/*============*/
+ fts_query_t* query, /*!< in/out: query instance */
+ const fts_string_t* token) /*!< in: token to search */
+{
+ const fts_index_cache_t*index_cache;
+ dict_table_t* table = query->index->table;
+ fts_cache_t* cache = table->fts->cache;
+
+ /* Search the cache for a matching word first. */
+ rw_lock_x_lock(&cache->lock);
+
+ /* Search for the index specific cache. */
+ index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache. */
+ ut_a(index_cache != NULL);
+
+ if (query->cur_node->term.wildcard
+ && query->flags != FTS_PROXIMITY
+ && query->flags != FTS_PHRASE) {
+ /* Wildcard search the index cache */
+ fts_cache_find_wildcard(query, index_cache, token);
+ } else {
+ const ib_vector_t* nodes;
+ ulint i;
+
+ nodes = fts_cache_find_word(index_cache, token);
+
+ for (i = 0; nodes && i < ib_vector_size(nodes)
+ && query->error == DB_SUCCESS; ++i) {
+ const fts_node_t* node;
+
+ node = static_cast<const fts_node_t*>(
+ ib_vector_get_const(nodes, i));
+
+ fts_query_check_node(query, token, node);
+ }
+ }
+
+ rw_lock_x_unlock(&cache->lock);
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Set union.
+@return DB_SUCCESS if all goes well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_union(
+/*============*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_string_t* token) /*!< in: token to search */
+{
+ fts_fetch_t fetch;
+ ulint n_doc_ids = 0;
+ trx_t* trx = query->trx;
+ que_t* graph = NULL;
+ dberr_t error;
+
+ ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING ||
+ query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ fprintf(stderr, "UNION: Searching: '%.*s'\n",
+ (int) token->f_len, token->f_str);
+#endif
+
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids);
+ }
+
+ if (token->f_len == 0) {
+ return(query->error);
+ }
+
+	/* A single '%' would confuse the parser in pars_like_rebind(). In
+	addition, our wildcard search only supports prefix search. */
+ ut_ad(*token->f_str != '%');
+
+ fts_query_cache(query, token);
+
+ /* Setup the callback args for filtering and
+ consolidating the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ /* Read the nodes from disk. */
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table, token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+
+ if (query->error == DB_SUCCESS) {
+
+ /* The size can't decrease. */
+ ut_a(rbt_size(query->doc_ids) >= n_doc_ids);
+
+		/* Calculate the number of doc ids that were added to
+		the current doc id set. */
+ if (query->doc_ids) {
+ n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids;
+ }
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Process the doc id, depending upon the current query operator.
+@return DB_SUCCESS if all goes well,
+or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
+static
+dberr_t
+fts_query_process_doc_id(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ doc_id_t doc_id, /*!< in: doc id to process */
+ fts_rank_t rank) /*!< in: if non-zero, it is the
+ rank associated with the doc_id */
+{
+ if (query->flags == FTS_OPT_RANKING) {
+ return(DB_SUCCESS);
+ }
+
+ switch (query->oper) {
+ case FTS_NONE:
+ fts_query_union_doc_id(query, doc_id, rank);
+ break;
+
+ case FTS_EXIST:
+ fts_query_intersect_doc_id(query, doc_id, rank);
+ break;
+
+ case FTS_IGNORE:
+ fts_query_remove_doc_id(query, doc_id);
+ break;
+
+ case FTS_NEGATE:
+ fts_query_change_ranking(query, doc_id, TRUE);
+ break;
+
+ case FTS_DECR_RATING:
+ fts_query_union_doc_id(query, doc_id, rank);
+ fts_query_change_ranking(query, doc_id, TRUE);
+ break;
+
+ case FTS_INCR_RATING:
+ fts_query_union_doc_id(query, doc_id, rank);
+ fts_query_change_ranking(query, doc_id, FALSE);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
+}
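+
+/* Note (illustrative): in the MySQL boolean mode syntax these operators
+typically correspond to: (no operator) -> FTS_NONE (union);
+'+' -> FTS_EXIST (intersect); '-' -> FTS_IGNORE (subtract);
+'~' -> FTS_NEGATE (downgrade rank); '<' -> FTS_DECR_RATING and
+'>' -> FTS_INCR_RATING (union, then rank adjusted down/up). */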
+
+/*****************************************************************//**
+Merge two result sets.
+@return DB_SUCCESS if all goes well, or an error code */
+static
+dberr_t
+fts_merge_doc_ids(
+/*==============*/
+ fts_query_t* query, /*!< in,out: query instance */
+ const ib_rbt_t* doc_ids) /*!< in: result set to merge */
+{
+ const ib_rbt_node_t* node;
+
+ DBUG_ENTER("fts_merge_doc_ids");
+
+ ut_a(!query->intersection);
+
+ /* To process FTS_EXIST operation (intersection), we need
+ to create a new result set for fts_query_intersect(). */
+ if (query->oper == FTS_EXIST) {
+
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+ }
+
+ /* Merge the elements to the result set. */
+ for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) {
+ fts_ranking_t* ranking;
+ ulint pos = 0;
+ fts_string_t word;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ query->error = fts_query_process_doc_id(
+ query, ranking->doc_id, ranking->rank);
+
+ if (query->error != DB_SUCCESS) {
+ DBUG_RETURN(query->error);
+ }
+
+ /* Merge words. Don't need to take operator into account. */
+ ut_a(ranking->words);
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ fts_query_add_word_to_document(query, ranking->doc_id,
+ &word);
+ }
+ }
+
+ /* If it is an intersection operation, reset query->doc_ids
+ to query->intersection and free the old result list. */
+ if (query->oper == FTS_EXIST && query->intersection != NULL) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+ }
+
+ DBUG_RETURN(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Skip non-whitespace in a string. Move ptr to the next word boundary.
+@return pointer to first whitespace character or end */
+UNIV_INLINE
+byte*
+fts_query_skip_word(
+/*================*/
+ byte* ptr, /*!< in: start of scan */
+ const byte* end) /*!< in: pointer to end of string */
+{
+ /* TODO: Does this have to be UTF-8 too ? */
+ while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) {
+ ++ptr;
+ }
+
+ return(ptr);
+}
+
+/*****************************************************************//**
+Check whether the remaining terms in the phrase match the text.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase_terms(
+/*=========================*/
+ fts_phrase_t* phrase, /*!< in: phrase to match */
+ byte** start, /*!< in/out: text to search, we can't
+ make this const becase we need to
+				make this const because we need to
+ lowercase */
+ const byte* end, /*!< in: pointer to the end of
+ the string to search */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint i;
+ byte* ptr = *start;
+ const ib_vector_t* tokens = phrase->tokens;
+ ulint distance = phrase->distance;
+
+ /* We check only from the second term onwards, since the first
+ must have matched otherwise we wouldn't be here. */
+ for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) {
+ fts_string_t match;
+ fts_string_t cmp_str;
+ const fts_string_t* token;
+ int result;
+ ulint ret;
+ ulint offset;
+
+ ret = innobase_mysql_fts_get_token(
+ phrase->charset, ptr, (byte*) end,
+ &match, &offset);
+
+ if (match.f_len > 0) {
+ /* Get next token to match. */
+ token = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, i));
+
+ fts_utf8_string_dup(&cmp_str, &match, heap);
+
+ result = innobase_fts_text_case_cmp(
+ phrase->charset, token, &cmp_str);
+
+ /* Skip the rest of the tokens if this one doesn't
+ match and the proximity distance is exceeded. */
+ if (result
+ && (distance == ULINT_UNDEFINED
+ || distance == 0)) {
+
+ break;
+ }
+
+			/* This token matched; move to the next token. */
+ if (result == 0) {
+ /* Advance the text to search by the length
+ of the last token. */
+ ptr += ret;
+
+ /* Advance to the next token. */
+ ++i;
+ } else {
+
+ ut_a(distance != ULINT_UNDEFINED);
+
+ ptr = fts_query_skip_word(ptr, end);
+ }
+
+ /* Distance can be 0 for exact matches. */
+ if (distance != ULINT_UNDEFINED && distance > 0) {
+ --distance;
+ }
+ } else {
+ ptr += ret;
+ }
+ }
+
+ *start = ptr;
+
+ /* Can't be greater than the number of elements. */
+ ut_a(i <= ib_vector_size(tokens));
+
+ /* This is the case for multiple words. */
+ if (i == ib_vector_size(tokens)) {
+ phrase->found = TRUE;
+ }
+
+ return(phrase->found);
+}
+
+/*****************************************************************//**
+Callback function to count the number of words in position ranges,
+and check whether the word count is within the specified "phrase->distance".
+@return true if the number of words is within the "distance" */
+static
+bool
+fts_proximity_is_word_in_range(
+/*===========================*/
+ const fts_phrase_t*
+ phrase, /*!< in: phrase with the search info */
+ byte* start, /*!< in: text to search */
+ ulint total_len) /*!< in: length of text */
+{
+ fts_proximity_t* proximity_pos = phrase->proximity_pos;
+
+ ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size());
+ ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size());
+
+ /* Search each matched position pair (with min and max positions)
+ and count the number of words in the range */
+ for (ulint i = 0; i < proximity_pos->n_pos; i++) {
+ ulint cur_pos = proximity_pos->min_pos[i];
+ ulint n_word = 0;
+
+ ut_ad(proximity_pos->max_pos[i] <= total_len);
+
+ /* Walk through words in the range and count them */
+ while (cur_pos <= proximity_pos->max_pos[i]) {
+ ulint len;
+ fts_string_t str;
+ ulint offset = 0;
+
+ len = innobase_mysql_fts_get_token(
+ phrase->charset,
+ start + cur_pos,
+ start + total_len, &str, &offset);
+
+ if (len == 0) {
+ break;
+ }
+
+			/* Advance the position by "len" bytes */
+ cur_pos += len;
+
+ /* Record the number of words */
+ if (str.f_n_char > 0) {
+ n_word++;
+ }
+
+ if (n_word > phrase->distance) {
+ break;
+ }
+ }
+
+		/* Check whether the number of words is within the
+		specified "distance" */
+ if (n_word && n_word <= phrase->distance) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return TRUE if matched else FALSE */
+static
+ibool
+fts_query_match_phrase(
+/*===================*/
+ fts_phrase_t* phrase, /*!< in: phrase to match */
+ byte* start, /*!< in: text to search, we can't make
+					this const because we need to first
+ convert the string to lowercase */
+ ulint cur_len, /*!< in: length of text */
+ ulint prev_len, /*!< in: total length for searched
+ doc fields*/
+	mem_heap_t*	heap)		/*!< in: heap */
+{
+ ulint i;
+ const fts_string_t* first;
+ const byte* end = start + cur_len;
+ const ib_vector_t* tokens = phrase->tokens;
+ const ib_vector_t* positions = phrase->match->positions;
+
+ ut_a(!phrase->found);
+ ut_a(phrase->match->doc_id > 0);
+ ut_a(ib_vector_size(tokens) > 0);
+ ut_a(ib_vector_size(positions) > 0);
+
+ first = static_cast<const fts_string_t*>(
+ ib_vector_get_const(tokens, 0));
+
+ ut_a(phrase->match->start < ib_vector_size(positions));
+
+ for (i = phrase->match->start; i < ib_vector_size(positions); ++i) {
+ ulint pos;
+ fts_string_t match;
+ fts_string_t cmp_str;
+ byte* ptr = start;
+ ulint ret;
+ ulint offset;
+
+ pos = *(ulint*) ib_vector_get_const(positions, i);
+
+ if (pos == ULINT_UNDEFINED) {
+ break;
+ }
+
+ if (pos < prev_len) {
+ continue;
+ }
+
+		/* Document positions are calculated from the beginning
+		of the first field, so we need to save the length of
+		each searched field to adjust the doc position when
+		searching phrases. */
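+		/* Example (illustrative): if the first field is 100
+		bytes, prev_len for the second field is 101 (the field
+		length plus one separator byte), so a stored position
+		of 107 maps to byte offset 107 - 101 = 6 within the
+		second field's text. */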
+ pos -= prev_len;
+ ptr = match.f_str = start + pos;
+
+ /* Within limits ? */
+ if (ptr >= end) {
+ break;
+ }
+
+ ret = innobase_mysql_fts_get_token(
+ phrase->charset, start + pos, (byte*) end,
+ &match, &offset);
+
+ if (match.f_len == 0) {
+ break;
+ }
+
+ fts_utf8_string_dup(&cmp_str, &match, heap);
+
+ if (innobase_fts_text_case_cmp(
+ phrase->charset, first, &cmp_str) == 0) {
+
+ /* This is the case for the single word
+ in the phrase. */
+ if (ib_vector_size(phrase->tokens) == 1) {
+ phrase->found = TRUE;
+ break;
+ }
+
+ ptr += ret;
+
+ /* Match the remaining terms in the phrase. */
+ if (fts_query_match_phrase_terms(phrase, &ptr,
+ end, heap)) {
+ break;
+ }
+ }
+ }
+
+ return(phrase->found);
+}
+
+/*****************************************************************//**
+Callback function to fetch and search the document.
+@return whether the phrase is found */
+static
+ibool
+fts_query_fetch_document(
+/*=====================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg);
+ ulint prev_len = 0;
+ ulint total_len = 0;
+ byte* document_text = NULL;
+
+ exp = node->select_list;
+
+ phrase->found = FALSE;
+
+ /* For proximity search, we will need to get the whole document
+ from all fields, so first count the total length of the document
+ from all the fields */
+ if (phrase->proximity_pos) {
+ while (exp) {
+ ulint field_len;
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+
+ if (dfield_is_ext(dfield)) {
+ ulint local_len = dfield_get_len(dfield);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ field_len = mach_read_from_4(
+ data + local_len + BTR_EXTERN_LEN + 4);
+ } else {
+ field_len = dfield_get_len(dfield);
+ }
+
+ if (field_len != UNIV_SQL_NULL) {
+ total_len += field_len + 1;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ document_text = static_cast<byte*>(mem_heap_zalloc(
+ phrase->heap, total_len));
+
+ if (!document_text) {
+ return(FALSE);
+ }
+ }
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint cur_len;
+
+ if (dfield_is_ext(dfield)) {
+ data = btr_copy_externally_stored_field(
+ &cur_len, data, phrase->zip_size,
+ dfield_get_len(dfield), phrase->heap);
+ } else {
+ cur_len = dfield_get_len(dfield);
+ }
+
+ if (cur_len != UNIV_SQL_NULL && cur_len != 0) {
+ if (phrase->proximity_pos) {
+ ut_ad(prev_len + cur_len <= total_len);
+ memcpy(document_text + prev_len, data, cur_len);
+ } else {
+ /* For phrase search */
+ phrase->found =
+ fts_query_match_phrase(
+ phrase,
+ static_cast<byte*>(data),
+ cur_len, prev_len,
+ phrase->heap);
+ }
+
+			/* Document positions are calculated from the
+			beginning of the first field; we need to save
+			the length of each searched field to adjust the
+			doc position when searching phrases. */
+ prev_len += cur_len + 1;
+ }
+
+ if (phrase->found) {
+ break;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ if (phrase->proximity_pos) {
+ ut_ad(prev_len <= total_len);
+
+ phrase->found = fts_proximity_is_word_in_range(
+ phrase, document_text, total_len);
+ }
+
+ return(phrase->found);
+}
+
+#if 0
+/********************************************************************
+Callback function to check whether a record was found or not. */
+static
+ibool
+fts_query_select(
+/*=============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+{
+ int i;
+ que_node_t* exp;
+ sel_node_t* node = row;
+ fts_select_t* select = user_arg;
+
+ ut_a(select->word_freq);
+ ut_a(select->word_freq->doc_freqs);
+
+ exp = node->select_list;
+
+ for (i = 0; exp && !select->found; ++i) {
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ switch (i) {
+ case 0: /* DOC_COUNT */
+ if (len != UNIV_SQL_NULL && len != 0) {
+
+ select->word_freq->doc_count +=
+ mach_read_from_4(data);
+ }
+ break;
+
+ case 1: /* ILIST */
+ if (len != UNIV_SQL_NULL && len != 0) {
+
+ fts_query_find_doc_id(select, data, len);
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************
+Read the rows from the FTS index that match the word and where the
+doc id is between the first and last doc id.
+@return DB_SUCCESS if all goes well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_find_term(
+/*================*/
+ fts_query_t* query, /*!< in: FTS query state */
+ que_t** graph, /*!< in: prepared statement */
+ const fts_string_t* word, /*!< in: the word to fetch */
+ doc_id_t doc_id, /*!< in: doc id to match */
+ ulint* min_pos,/*!< in/out: pos found must be
+ greater than this minimum value. */
+ ibool* found) /*!< out: TRUE if found else FALSE */
+{
+ pars_info_t* info;
+ dberr_t error;
+ fts_select_t select;
+ doc_id_t match_doc_id;
+ trx_t* trx = query->trx;
+
+ trx->op_info = "fetching FTS index matching nodes";
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ select.found = FALSE;
+ select.doc_id = doc_id;
+ select.min_pos = *min_pos;
+	select.word_freq = fts_query_add_word_freq(query, word);
+
+ pars_info_bind_function(info, "my_func", fts_query_select, &select);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &match_doc_id, doc_id);
+
+ fts_bind_doc_id(info, "min_doc_id", &match_doc_id);
+
+ fts_bind_doc_id(info, "max_doc_id", &match_doc_id);
+
+ if (!*graph) {
+ ulint selected;
+
+ selected = fts_select_index(*word->f_str);
+
+ query->fts_index_table.suffix = fts_get_suffix(selected);
+
+ *graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count, ilist\n"
+ " FROM \"%s\"\n"
+ " WHERE word LIKE :word AND "
+ " first_doc_id <= :min_doc_id AND "
+ " last_doc_id >= :max_doc_id\n"
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+ }
+
+ for(;;) {
+ error = fts_eval_sql(trx, *graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS index. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: %lu "
+ "while reading FTS index.\n", error);
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ /* Value to return */
+ *found = select.found;
+
+ if (*found) {
+ *min_pos = select.min_pos;
+ }
+
+ return(error);
+}
+
+/********************************************************************
+Callback aggregator for int columns. */
+static
+ibool
+fts_query_sum(
+/*==========*/
+ /*!< out: always returns TRUE */
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: ulint* */
+{
+
+ que_node_t* exp;
+ sel_node_t* node = row;
+ ulint* total = user_arg;
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ if (len != UNIV_SQL_NULL && len != 0) {
+ *total += mach_read_from_4(data);
+ }
+
+ exp = que_node_get_next(exp);
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************
+Calculate the total number of documents that contain a particular word (term).
+@return DB_SUCCESS if all goes well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_total_docs_containing_term(
+/*=================================*/
+ fts_query_t* query, /*!< in: FTS query state */
+ const fts_string_t* word, /*!< in: the word to check */
+ ulint* total) /*!< out: documents containing word */
+{
+ pars_info_t* info;
+ dberr_t error;
+ que_t* graph;
+ ulint selected;
+ trx_t* trx = query->trx;
+
+ trx->op_info = "fetching FTS index document count";
+
+ *total = 0;
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_query_sum, total);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ selected = fts_select_index(*word->f_str);
+
+ query->fts_index_table.suffix = fts_get_suffix(selected);
+
+ graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT doc_count\n"
+ " FROM %s\n"
+ " WHERE word = :word "
+ " ORDER BY first_doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for(;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS index. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: %lu "
+ "while reading FTS index.\n", error);
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+
+/********************************************************************
+Get the total number of words in a document.
+@return DB_SUCCESS if all goes well else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_terms_in_document(
+/*========================*/
+ fts_query_t* query, /*!< in: FTS query state */
+	doc_id_t	doc_id,	/*!< in: the doc id to check */
+ ulint* total) /*!< out: total words in document */
+{
+ pars_info_t* info;
+ dberr_t error;
+ que_t* graph;
+ doc_id_t read_doc_id;
+ trx_t* trx = query->trx;
+
+ trx->op_info = "fetching FTS document term count";
+
+ *total = 0;
+
+ info = pars_info_create();
+
+ pars_info_bind_function(info, "my_func", fts_query_sum, total);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &read_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &read_doc_id);
+
+ query->fts_index_table.suffix = "DOC_ID";
+
+ graph = fts_parse_sql(
+ &query->fts_index_table,
+ info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT count\n"
+ " FROM \"%s\"\n"
+		" WHERE doc_id = :doc_id;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for(;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+
+ break; /* Exit the loop. */
+ } else {
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: lock wait "
+ "timeout reading FTS doc id table. "
+ "Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: %lu "
+ "while reading FTS doc id table.\n",
+ error);
+
+ break; /* Exit the loop. */
+ }
+ }
+ }
+
+ fts_que_graph_free(graph);
+
+ return(error);
+}
+#endif
+
+/*****************************************************************//**
+Retrieve the document and match the phrase tokens.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_match_document(
+/*=====================*/
+ ib_vector_t* tokens, /*!< in: phrase tokens */
+ fts_get_doc_t* get_doc, /*!< in: table and prepared statements */
+ fts_match_t* match, /*!< in: doc id and positions */
+ ulint distance, /*!< in: proximity distance */
+ ibool* found) /*!< out: TRUE if phrase found */
+{
+ dberr_t error;
+ fts_phrase_t phrase;
+
+ memset(&phrase, 0x0, sizeof(phrase));
+
+ phrase.match = match; /* Positions to match */
+ phrase.tokens = tokens; /* Tokens to match */
+ phrase.distance = distance;
+ phrase.charset = get_doc->index_cache->charset;
+ phrase.zip_size = dict_table_zip_size(
+ get_doc->index_cache->index->table);
+ phrase.heap = mem_heap_create(512);
+
+ *found = phrase.found = FALSE;
+
+ error = fts_doc_fetch_by_doc_id(
+ get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_fetch_document, &phrase);
+
+ if (error != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: Error: (%s) matching document.\n",
+ ut_strerr(error));
+ } else {
+ *found = phrase.found;
+ }
+
+ mem_heap_free(phrase.heap);
+
+ return(error);
+}
+
+/*****************************************************************//**
+This function fetches the original documents and counts the words
+between matching words to check that they are within the specified distance.
+@return true if the words are within the specified proximity distance */
+static __attribute__((nonnull, warn_unused_result))
+bool
+fts_query_is_in_proximity_range(
+/*============================*/
+ const fts_query_t* query, /*!< in: query instance */
+	fts_match_t**		match,	/*!< in: matching doc ids and
+					positions */
+ fts_proximity_t* qualified_pos) /*!< in: position info for
+ qualified ranges */
+{
+ fts_get_doc_t get_doc;
+ fts_cache_t* cache = query->index->table->fts->cache;
+ dberr_t err;
+ fts_phrase_t phrase;
+
+ memset(&get_doc, 0x0, sizeof(get_doc));
+ memset(&phrase, 0x0, sizeof(phrase));
+
+ rw_lock_x_lock(&cache->lock);
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
+ rw_lock_x_unlock(&cache->lock);
+ ut_a(get_doc.index_cache != NULL);
+
+ phrase.distance = query->distance;
+ phrase.charset = get_doc.index_cache->charset;
+ phrase.zip_size = dict_table_zip_size(
+ get_doc.index_cache->index->table);
+ phrase.heap = mem_heap_create(512);
+ phrase.proximity_pos = qualified_pos;
+ phrase.found = FALSE;
+
+ err = fts_doc_fetch_by_doc_id(
+ &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_fetch_document, &phrase);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error: (%s) in verification phase of proximity "
+ "search", ut_strerr(err));
+ }
+
+ /* Free the prepared statement. */
+ if (get_doc.get_document_graph) {
+ fts_que_graph_free(get_doc.get_document_graph);
+ get_doc.get_document_graph = NULL;
+ }
+
+ mem_heap_free(phrase.heap);
+
+ return(err == DB_SUCCESS && phrase.found);
+}
+
+/*****************************************************************//**
+Iterate over the matched document ids and search for the
+actual phrase in the text.
+@return DB_SUCCESS if all OK */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_search_phrase(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ ib_vector_t* orig_tokens, /*!< in: tokens to search,
+ with any stopwords in the
+ original phrase */
+	ib_vector_t*	tokens)		/*!< in: tokens that do
+ not include stopwords and
+ can be used to calculate
+ ranking */
+{
+ ulint i;
+ fts_get_doc_t get_doc;
+ ulint n_matched;
+ fts_cache_t* cache = query->index->table->fts->cache;
+
+ n_matched = ib_vector_size(query->matched);
+
+ /* Setup the doc retrieval infrastructure. */
+ memset(&get_doc, 0x0, sizeof(get_doc));
+
+ rw_lock_x_lock(&cache->lock);
+
+ get_doc.index_cache = fts_find_index_cache(cache, query->index);
+
+ /* Must find the index cache */
+ ut_a(get_doc.index_cache != NULL);
+
+ rw_lock_x_unlock(&cache->lock);
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " Start phrase search\n");
+#endif
+
+	/* Read the document from disk and do the actual
+	match; matching documents will be added to the current
+	doc id set. */
+ for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) {
+ fts_match_t* match;
+ ibool found = FALSE;
+
+ match = static_cast<fts_match_t*>(
+ ib_vector_get(query->matched, i));
+
+ /* Skip the document ids that were filtered out by
+ an earlier pass. */
+ if (match->doc_id != 0) {
+
+ query->error = fts_query_match_document(
+ orig_tokens, &get_doc,
+ match, query->distance, &found);
+
+ if (query->error == DB_SUCCESS && found) {
+ ulint z;
+
+ query->error = fts_query_process_doc_id(query,
+ match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ for (z = 0; z < ib_vector_size(tokens); z++) {
+ fts_string_t* token;
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, z));
+ fts_query_add_word_to_document(
+ query, match->doc_id, token);
+ }
+ }
+ }
+ }
+
+func_exit:
+ /* Free the prepared statement. */
+ if (get_doc.get_document_graph) {
+ fts_que_graph_free(get_doc.get_document_graph);
+ get_doc.get_document_graph = NULL;
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Text/Phrase search.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_phrase_search(
+/*====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* phrase) /*!< in: token to search */
+{
+ ib_vector_t* tokens;
+ ib_vector_t* orig_tokens;
+ mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t));
+ ulint len = phrase->f_len;
+ ulint cur_pos = 0;
+ ib_alloc_t* heap_alloc;
+ ulint num_token;
+ CHARSET_INFO* charset;
+
+ charset = query->fts_index_table.charset;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+ orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4);
+
+ if (query->distance != ULINT_UNDEFINED && query->distance > 0) {
+ query->flags = FTS_PROXIMITY;
+ } else {
+ query->flags = FTS_PHRASE;
+ }
+
+ /* Split the phrase into tokens. */
+ while (cur_pos < len) {
+ fts_cache_t* cache = query->index->table->fts->cache;
+ ib_rbt_bound_t parent;
+ ulint offset;
+ ulint cur_len;
+ fts_string_t result_str;
+
+ cur_len = innobase_mysql_fts_get_token(
+ charset,
+ reinterpret_cast<const byte*>(phrase->f_str) + cur_pos,
+ reinterpret_cast<const byte*>(phrase->f_str) + len,
+ &result_str, &offset);
+
+ if (cur_len == 0) {
+ break;
+ }
+
+ cur_pos += cur_len;
+
+ if (result_str.f_n_char == 0) {
+ continue;
+ }
+
+ fts_string_t* token = static_cast<fts_string_t*>(
+ ib_vector_push(tokens, NULL));
+
+ token->f_str = static_cast<byte*>(
+ mem_heap_alloc(heap, result_str.f_len + 1));
+ ut_memcpy(token->f_str, result_str.f_str, result_str.f_len);
+
+ token->f_len = result_str.f_len;
+ token->f_str[token->f_len] = 0;
+
+ if (cache->stopword_info.cached_stopword
+ && rbt_search(cache->stopword_info.cached_stopword,
+ &parent, token) != 0
+ && result_str.f_n_char >= fts_min_token_size
+ && result_str.f_n_char <= fts_max_token_size) {
+ /* Add the word to the RB tree so that we can
+			calculate its frequency within a document. */
+ fts_query_add_word_freq(query, token);
+ } else {
+ ib_vector_pop(tokens);
+ }
+
+		/* We store all words, including stopwords, in the
+		"orig_tokens" vector, but skip any leading words that
+		are stopwords. */
+ if (!ib_vector_is_empty(tokens)) {
+ fts_string_t* orig_token = static_cast<fts_string_t*>(
+ ib_vector_push(orig_tokens, NULL));
+
+ orig_token->f_str = token->f_str;
+ orig_token->f_len = token->f_len;
+ }
+ }
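+
+	/* A sketch of the intended outcome, assuming "the" is a stopword:
+	for the phrase "ice the cream", tokens becomes ["ice", "cream"]
+	(the stopword is popped above) while orig_tokens becomes
+	["ice", "the", "cream"], keeping the interior stopword for exact
+	phrase matching; a leading stopword, as in "the ice cream", is
+	skipped entirely because the tokens vector is still empty then. */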
+
+ num_token = ib_vector_size(tokens);
+ if (num_token > MAX_PROXIMITY_ITEM) {
+ query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE;
+ goto func_exit;
+ }
+
+ ut_ad(ib_vector_size(orig_tokens) >= num_token);
+
+ /* Ignore empty strings. */
+ if (num_token > 0) {
+ fts_string_t* token;
+ fts_fetch_t fetch;
+ trx_t* trx = query->trx;
+ fts_ast_oper_t oper = query->oper;
+ que_t* graph = NULL;
+ ulint i;
+ dberr_t error;
+
+ /* Create the vector for storing matching document ids
+ and the positions of the first token of the phrase. */
+ if (!query->matched) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ if (!(query->flags & FTS_PROXIMITY)
+ && !(query->flags & FTS_PHRASE)) {
+ query->matched = ib_vector_create(
+ heap_alloc, sizeof(fts_match_t),
+ 64);
+ } else {
+ ut_a(num_token <= MAX_PROXIMITY_ITEM);
+ query->match_array =
+ (ib_vector_t**) mem_heap_alloc(
+ heap,
+ num_token *
+ sizeof(query->matched));
+
+ for (i = 0; i < num_token; i++) {
+ query->match_array[i] =
+ ib_vector_create(
+ heap_alloc, sizeof(fts_match_t),
+ 64);
+ }
+
+ query->matched = query->match_array[0];
+ }
+ }
+
+ /* Setup the callback args for filtering and consolidating
+ the ilist. */
+ fetch.read_arg = query;
+ fetch.read_record = fts_query_index_fetch_nodes;
+
+ for (i = 0; i < num_token; i++) {
+ /* Search for the first word from the phrase. */
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, i));
+
+ if (query->flags & FTS_PROXIMITY
+ || query->flags & FTS_PHRASE) {
+ query->matched = query->match_array[i];
+ }
+
+ error = fts_index_fetch_nodes(
+ trx, &graph, &query->fts_index_table,
+ token, &fetch);
+
+ /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */
+ ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS));
+ if (error != DB_SUCCESS) {
+ query->error = error;
+ }
+
+ fts_que_graph_free(graph);
+ graph = NULL;
+
+ fts_query_cache(query, token);
+
+ if (!(query->flags & FTS_PHRASE)
+ && !(query->flags & FTS_PROXIMITY)) {
+ break;
+ }
+
+			/* If any of the tokens can't be found,
+			there is no need to continue the match. */
+ if (ib_vector_is_empty(query->match_array[i])
+ || query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+ /* Just a single word, no need to fetch the original
+ documents to do phrase matching */
+ if (ib_vector_size(orig_tokens) == 1
+ && !ib_vector_is_empty(query->match_array[0])) {
+ fts_match_t* match;
+ ulint n_matched;
+
+ n_matched = ib_vector_size(query->match_array[0]);
+
+ for (i = 0; i < n_matched; i++) {
+ match = static_cast<fts_match_t*>(
+ ib_vector_get(
+ query->match_array[0], i));
+
+ query->error = fts_query_process_doc_id(
+ query, match->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ fts_query_add_word_to_document(
+ query, match->doc_id, token);
+ }
+ query->oper = oper;
+ goto func_exit;
+ }
+
+		/* If we are doing a proximity search, verify that all
+		the words are within the specified distance of each other. */
+ if (query->flags & FTS_PROXIMITY) {
+ fts_phrase_or_proximity_search(query, tokens);
+ } else {
+ ibool matched;
+
+ /* Phrase Search case:
+ We filter out the doc ids that don't contain
+ all the tokens in the phrase. It's cheaper to
+ search the ilist than bringing the documents in
+ and then doing a search through the text. Isolated
+ testing shows this also helps in mitigating disruption
+ of the buffer cache. */
+ matched = fts_phrase_or_proximity_search(query, tokens);
+ query->matched = query->match_array[0];
+
+ /* Read the actual text in and search for the phrase. */
+ if (matched) {
+ ut_ad(query->error == DB_SUCCESS);
+ query->error = fts_query_search_phrase(
+ query, orig_tokens, tokens);
+ }
+ }
+
+ /* Restore original operation. */
+ query->oper = oper;
+
+ if (query->error != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ mem_heap_free(heap);
+
+ /* Don't need it anymore. */
+ query->matched = NULL;
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Find the word and evaluate.
+@return DB_SUCCESS if all goes well */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_query_execute(
+/*==============*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_string_t* token) /*!< in: token to search */
+{
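+	/* A sketch of how the boolean operators presumably map onto the
+	set operations below: a bare term (FTS_NONE) is unioned into the
+	current doc id set, a '+' term (FTS_EXIST) intersects with it,
+	and a '-' term (FTS_IGNORE) is subtracted from it. */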
+ switch (query->oper) {
+ case FTS_NONE:
+ case FTS_NEGATE:
+ case FTS_INCR_RATING:
+ case FTS_DECR_RATING:
+ query->error = fts_query_union(query, token);
+ break;
+
+ case FTS_EXIST:
+ query->error = fts_query_intersect(query, token);
+ break;
+
+ case FTS_IGNORE:
+ query->error = fts_query_difference(query, token);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(query->error);
+}
+
+/*****************************************************************//**
+Create a wildcard string. It's the responsibility of the caller to
+free the byte* pointer. It's allocated using ut_malloc().
+@return ptr to allocated memory */
+static
+byte*
+fts_query_get_token(
+/*================*/
+ fts_ast_node_t* node, /*!< in: the current sub tree */
+ fts_string_t* token) /*!< in: token to create */
+{
+ ulint str_len;
+ byte* new_ptr = NULL;
+
+ str_len = node->term.ptr->len;
+
+ ut_a(node->type == FTS_AST_TERM);
+
+ token->f_len = str_len;
+ token->f_str = node->term.ptr->str;
+
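+	/* For example (a sketch): a wildcard term "auto*", stored as
+	"auto" with the wildcard flag set, is turned into the pattern
+	"auto%" below, which presumably matches all words with that
+	prefix in the auxiliary index tables via LIKE. */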
+ if (node->term.wildcard) {
+
+ token->f_str = static_cast<byte*>(ut_malloc(str_len + 2));
+ token->f_len = str_len + 1;
+
+ memcpy(token->f_str, node->term.ptr->str, str_len);
+
+ token->f_str[str_len] = '%';
+ token->f_str[token->f_len] = 0;
+
+ new_ptr = token->f_str;
+ }
+
+ return(new_ptr);
+}
+
+/*****************************************************************//**
+Visit every node of the AST. */
+static
+dberr_t
+fts_query_visitor(
+/*==============*/
+ fts_ast_oper_t oper, /*!< in: current operator */
+ fts_ast_node_t* node, /*!< in: The root of the current subtree*/
+ void* arg) /*!< in: callback arg*/
+{
+ byte* ptr;
+ fts_string_t token;
+ fts_query_t* query = static_cast<fts_query_t*>(arg);
+
+ ut_a(node);
+ DBUG_ENTER("fts_query_visitor");
+ DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type)));
+
+ token.f_n_char = 0;
+ query->oper = oper;
+ query->cur_node = node;
+
+ switch (node->type) {
+ case FTS_AST_TEXT:
+ token.f_str = node->text.ptr->str;
+ token.f_len = node->text.ptr->len;
+
+ if (query->oper == FTS_EXIST) {
+ ut_ad(query->intersection == NULL);
+ query->intersection = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+ }
+
+ /* Set the current proximity distance. */
+ query->distance = node->text.distance;
+
+ /* Force collection of doc ids and the positions. */
+ query->collect_positions = TRUE;
+
+ query->error = fts_query_phrase_search(query, &token);
+
+ query->collect_positions = FALSE;
+
+ if (query->oper == FTS_EXIST) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ query->doc_ids = query->intersection;
+ query->intersection = NULL;
+ }
+
+ break;
+
+ case FTS_AST_TERM:
+ token.f_str = node->term.ptr->str;
+ token.f_len = node->term.ptr->len;
+
+ /* Add the word to our RB tree that will be used to
+		calculate this term's per-document frequency. */
+ fts_query_add_word_freq(query, &token);
+
+ ptr = fts_query_get_token(node, &token);
+ query->error = fts_query_execute(query, &token);
+
+ if (ptr) {
+ ut_free(ptr);
+ }
+ break;
+
+ case FTS_AST_SUBEXP_LIST:
+ query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (query->oper == FTS_EXIST) {
+ query->multi_exist = true;
+ }
+
+ DBUG_RETURN(query->error);
+}
+
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@return DB_SUCCESS if all goes well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit_sub_exp(
+/*==================*/
+ fts_ast_node_t* node, /*!< in,out: current root node */
+ fts_ast_callback visitor, /*!< in: callback function */
+ void* arg) /*!< in,out: arg for callback */
+{
+ fts_ast_oper_t cur_oper;
+ fts_query_t* query = static_cast<fts_query_t*>(arg);
+ ib_rbt_t* parent_doc_ids;
+ ib_rbt_t* subexpr_doc_ids;
+ dberr_t error = DB_SUCCESS;
+ bool will_be_ignored = false;
+ bool multi_exist;
+
+ DBUG_ENTER("fts_ast_visit_sub_exp");
+
+ ut_a(node->type == FTS_AST_SUBEXP_LIST);
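+
+	/* Worked example (illustrative): for a query such as
+	"+apple +(banana cherry)", the parenthesised sub-expression is
+	evaluated below into its own doc id set, starting from FTS_NONE,
+	and that set is then merged into the parent's result set via
+	fts_merge_doc_ids(). */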
+
+ cur_oper = query->oper;
+
+ /* Save current result set */
+ parent_doc_ids = query->doc_ids;
+
+ /* Create new result set to store the sub-expression result. We
+ will merge this result set with the parent after processing. */
+ query->doc_ids = rbt_create(sizeof(fts_ranking_t),
+ fts_ranking_doc_id_cmp);
+
+ query->total_size += SIZEOF_RBT_CREATE;
+
+ multi_exist = query->multi_exist;
+ query->multi_exist = false;
+ /* Process nodes in current sub-expression and store its
+ result set in query->doc_ids we created above. */
+ error = fts_ast_visit(FTS_NONE, node, visitor,
+ arg, &will_be_ignored);
+
+ /* Reinstate parent node state */
+ query->multi_exist = multi_exist;
+ query->oper = cur_oper;
+
+ /* Merge the sub-expression result with the parent result set. */
+ subexpr_doc_ids = query->doc_ids;
+ query->doc_ids = parent_doc_ids;
+ if (error == DB_SUCCESS) {
+ error = fts_merge_doc_ids(query, subexpr_doc_ids);
+ }
+
+ /* Free current result set. Result already merged into parent. */
+ fts_query_free_doc_ids(query, subexpr_doc_ids);
+
+ DBUG_RETURN(error);
+}
+
+#if 0
+/*****************************************************************//**
+Check if the doc id exists in the ilist.
+@return TRUE if doc id found */
+static
+ulint
+fts_query_find_doc_id(
+/*==================*/
+ fts_select_t* select, /*!< in/out: contains the doc id to
+ find, we update the word freq if
+ document found */
+ void* data, /*!< in: doc id ilist */
+ ulint len) /*!< in: doc id ilist size */
+{
+ byte* ptr = data;
+ doc_id_t doc_id = 0;
+ ulint decoded = 0;
+
+ /* Decode the ilist and search for selected doc_id. We also
+ calculate the frequency of the word in the document if found. */
+ while (decoded < len && !select->found) {
+ ulint freq = 0;
+ ulint min_pos = 0;
+ ulint last_pos = 0;
+ ulint pos = fts_decode_vlc(&ptr);
+
+ /* Add the delta. */
+ doc_id += pos;
+
+ while (*ptr) {
+ ++freq;
+ last_pos += fts_decode_vlc(&ptr);
+
+ /* Only if min_pos is not set and the current
+ term exists in a position greater than the
+ min_pos of the previous term. */
+ if (min_pos == 0 && last_pos > select->min_pos) {
+ min_pos = last_pos;
+ }
+ }
+
+ /* Skip the end of word position marker. */
+ ++ptr;
+
+ /* Bytes decoded so far. */
+ decoded = ptr - (byte*) data;
+
+ /* A word may exist in the document but we only consider a
+ match if it exists in a position that is greater than the
+ position of the previous term. */
+ if (doc_id == select->doc_id && min_pos > 0) {
+ fts_doc_freq_t* doc_freq;
+
+ /* Add the doc id to the doc freq rb tree, if
+ the doc id doesn't exist it will be created. */
+ doc_freq = fts_query_add_doc_freq(
+ select->word_freq->doc_freqs, doc_id);
+
+ /* Avoid duplicating the frequency tally */
+ if (doc_freq->freq == 0) {
+ doc_freq->freq = freq;
+ }
+
+ select->found = TRUE;
+ select->min_pos = min_pos;
+ }
+ }
+
+ return(select->found);
+}
+#endif
+
+/*****************************************************************//**
+Read and filter nodes.
+@return DB_SUCCESS if all goes well,
+or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */
+static
+dberr_t
+fts_query_filter_doc_ids(
+/*=====================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: the current word */
+ fts_word_freq_t* word_freq, /*!< in/out: word frequency */
+ const fts_node_t* node, /*!< in: current FTS node */
+ void* data, /*!< in: doc id ilist */
+ ulint len, /*!< in: doc id ilist size */
+ ibool calc_doc_count) /*!< in: whether to remember doc count */
+{
+ byte* ptr = static_cast<byte*>(data);
+ doc_id_t doc_id = 0;
+ ulint decoded = 0;
+ ib_rbt_t* doc_freqs = word_freq->doc_freqs;
+
+ /* Decode the ilist and add the doc ids to the query doc_id set. */
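+	/* A sketch of the ilist layout assumed by the decoding below:
+	each entry is a variable-length-coded (VLC) doc id delta followed
+	by VLC position deltas and a 0 terminator. E.g. doc id 5 with
+	positions {3, 7} followed by doc id 9 with position {2} decodes
+	from the byte stream: 5, 3, 4, 0, 4, 2, 0. */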
+ while (decoded < len) {
+ ulint freq = 0;
+ fts_doc_freq_t* doc_freq;
+ fts_match_t* match = NULL;
+ ulint last_pos = 0;
+ ulint pos = fts_decode_vlc(&ptr);
+
+ /* Some sanity checks. */
+ if (doc_id == 0) {
+ ut_a(pos == node->first_doc_id);
+ }
+
+ /* Add the delta. */
+ doc_id += pos;
+
+ if (calc_doc_count) {
+ word_freq->doc_count++;
+ }
+
+ /* We simply collect the matching instances here. */
+ if (query->collect_positions) {
+ ib_alloc_t* heap_alloc;
+
+ /* Create a new fts_match_t instance. */
+ match = static_cast<fts_match_t*>(
+ ib_vector_push(query->matched, NULL));
+
+ match->start = 0;
+ match->doc_id = doc_id;
+ heap_alloc = ib_vector_allocator(query->matched);
+
+ /* Allocate from the same heap as the
+ parent container. */
+ match->positions = ib_vector_create(
+ heap_alloc, sizeof(ulint), 64);
+
+ query->total_size += sizeof(fts_match_t)
+ + sizeof(ib_vector_t)
+ + sizeof(ulint) * 64;
+ }
+
+ /* Unpack the positions within the document. */
+ while (*ptr) {
+ last_pos += fts_decode_vlc(&ptr);
+
+ /* Collect the matching word positions, for phrase
+ matching later. */
+ if (query->collect_positions) {
+ ib_vector_push(match->positions, &last_pos);
+ }
+
+ ++freq;
+ }
+
+ /* End of list marker. */
+ last_pos = (ulint) -1;
+
+ if (query->collect_positions) {
+ ut_a(match != NULL);
+ ib_vector_push(match->positions, &last_pos);
+ }
+
+ /* Add the doc id to the doc freq rb tree, if the doc id
+ doesn't exist it will be created. */
+ doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id);
+
+ /* Avoid duplicating frequency tally. */
+ if (doc_freq->freq == 0) {
+ doc_freq->freq = freq;
+ }
+
+ /* Skip the end of word position marker. */
+ ++ptr;
+
+ /* Bytes decoded so far */
+ decoded = ptr - (byte*) data;
+
+ /* We simply collect the matching documents and the
+ positions here and match later. */
+ if (!query->collect_positions) {
+ /* We ignore error here and will check it later */
+ fts_query_process_doc_id(query, doc_id, 0);
+
+ /* Add the word to the document's matched RB tree. */
+ fts_query_add_word_to_document(query, doc_id, word);
+ }
+ }
+
+ /* Some sanity checks. */
+ ut_a(doc_id == node->last_doc_id);
+
+ if (query->total_size > fts_result_cache_limit) {
+ return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ } else {
+ return(DB_SUCCESS);
+ }
+}
+
+/*****************************************************************//**
+Read the FTS INDEX row.
+@return DB_SUCCESS if all goes well. */
+static
+dberr_t
+fts_query_read_node(
+/*================*/
+ fts_query_t* query, /*!< in: query instance */
+ const fts_string_t* word, /*!< in: current word */
+ que_node_t* exp) /*!< in: query graph node */
+{
+ int i;
+ int ret;
+ fts_node_t node;
+ ib_rbt_bound_t parent;
+ fts_word_freq_t* word_freq;
+ ibool skip = FALSE;
+ fts_string_t term;
+ byte buf[FTS_MAX_WORD_LEN + 1];
+ dberr_t error = DB_SUCCESS;
+
+ ut_a(query->cur_node->type == FTS_AST_TERM ||
+ query->cur_node->type == FTS_AST_TEXT);
+
+ memset(&node, 0, sizeof(node));
+ term.f_str = buf;
+
+	/* We need to consider the wildcard search case: the word frequency
+	is tracked for the search string, not the actual matched word, so
+	we need to record the frequency on the search string's behalf. */
+ if (query->cur_node->type == FTS_AST_TERM
+ && query->cur_node->term.wildcard) {
+ term.f_len = query->cur_node->term.ptr->len;
+ ut_ad(FTS_MAX_WORD_LEN >= term.f_len);
+ memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len);
+ } else {
+ term.f_len = word->f_len;
+ ut_ad(FTS_MAX_WORD_LEN >= word->f_len);
+ memcpy(term.f_str, word->f_str, word->f_len);
+ }
+
+ /* Lookup the word in our rb tree, it must exist. */
+ ret = rbt_search(query->word_freqs, &parent, &term);
+
+ ut_a(ret == 0);
+
+ word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+	/* Start from 1 since the first column has been read by the caller.
+	Also, we rely on the order of the projected columns to filter
+	out ilists that are out of range, and we always want to read
+	the doc_count irrespective of the suitability of the row. */
+
+ for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ byte* data = static_cast<byte*>(
+ dfield_get_data(dfield));
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ /* Note: The column numbers below must match the SELECT. */
+
+ switch (i) {
+ case 1: /* DOC_COUNT */
+ word_freq->doc_count += mach_read_from_4(data);
+ break;
+
+ case 2: /* FIRST_DOC_ID */
+ node.first_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && query->upper_doc_id > 0
+ && node.first_doc_id > query->upper_doc_id) {
+ skip = TRUE;
+ }
+ break;
+
+ case 3: /* LAST_DOC_ID */
+ node.last_doc_id = fts_read_doc_id(data);
+
+			/* Skip nodes whose doc ids are out of range. */
+ if (query->oper == FTS_EXIST
+ && query->lower_doc_id > 0
+ && node.last_doc_id < query->lower_doc_id) {
+ skip = TRUE;
+ }
+ break;
+
+ case 4: /* ILIST */
+
+ error = fts_query_filter_doc_ids(
+ query, &word_freq->word, word_freq,
+ &node, data, len, FALSE);
+
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ if (!skip) {
+ /* Make sure all columns were read. */
+
+ ut_a(i == 5);
+ }
+
+ return error;
+}
+
+/*****************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE if all went well, FALSE if the result cache limit was exceeded */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: pointer to fts_fetch_t */
+{
+ fts_string_t key;
+ sel_node_t* sel_node = static_cast<sel_node_t*>(row);
+ fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg);
+ fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg);
+ que_node_t* exp = sel_node->select_list;
+ dfield_t* dfield = que_node_get_val(exp);
+ void* data = dfield_get_data(dfield);
+ ulint dfield_len = dfield_get_len(dfield);
+
+ key.f_str = static_cast<byte*>(data);
+ key.f_len = dfield_len;
+
+ ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+ /* Note: we pass error out by 'query->error' */
+ query->error = fts_query_read_node(query, &key, que_node_get_next(exp));
+
+ if (query->error != DB_SUCCESS) {
+ ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ return(FALSE);
+ } else {
+ return(TRUE);
+ }
+}
+
+/*****************************************************************//**
+Calculate the inverse document frequency (IDF) for all the terms. */
+static
+void
+fts_query_calculate_idf(
+/*====================*/
+ fts_query_t* query) /*!< in: Query state */
+{
+ const ib_rbt_node_t* node;
+ ib_uint64_t total_docs = query->total_docs;
+
+	/* Walk through the terms and calculate each term's
+	inverse document frequency (IDF). */
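+	/* Worked example (illustrative): with total_docs = 1000 and a
+	term that appears in doc_count = 10 documents,
+	idf = log10(1000 / 10) = 2; a term that appears in every document
+	gets the small constant log10(1.0001) instead, so rankings stay
+	strictly positive. */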
+ for (node = rbt_first(query->word_freqs);
+ node;
+ node = rbt_next(query->word_freqs, node)) {
+
+ fts_word_freq_t* word_freq;
+
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ if (word_freq->doc_count > 0) {
+ if (total_docs == word_freq->doc_count) {
+				/* The query processor assumes ranking > 0
+				if we find a match. Since log10(1) = 0, we
+				cannot let the IDF be zero when a word is
+				found in all documents, so we make it an
+				arbitrary, very small number instead. */
+ word_freq->idf = log10(1.0001);
+ } else {
+ word_freq->idf = log10(
+ total_docs
+ / (double) word_freq->doc_count);
+ }
+ }
+
+ if (fts_enable_diag_print) {
+ fprintf(stderr,"'%s' -> " UINT64PF "/" UINT64PF
+ " %6.5lf\n",
+ word_freq->word.f_str,
+ query->total_docs, word_freq->doc_count,
+ word_freq->idf);
+ }
+ }
+}
+
+/*****************************************************************//**
+Calculate the ranking of the document. */
+static
+void
+fts_query_calculate_ranking(
+/*========================*/
+ const fts_query_t* query, /*!< in: query state */
+ fts_ranking_t* ranking) /*!< in: Document to rank */
+{
+ ulint pos = 0;
+ fts_string_t word;
+
+ /* At this stage, ranking->rank should not exceed the 1.0
+ bound */
+ ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0);
+ ut_ad(rbt_size(query->word_map) == query->word_vector->size());
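+
+	/* Worked example (illustrative): for a term with idf = 2 that
+	occurs freq = 3 times in this document, the loop below adds
+	weight * idf = (3 * 2) * 2 = 12 to the rank, i.e. each matched
+	term contributes freq * idf^2. */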
+
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ int ret;
+ ib_rbt_bound_t parent;
+ double weight;
+ fts_doc_freq_t* doc_freq;
+ fts_word_freq_t* word_freq;
+
+ ret = rbt_search(query->word_freqs, &parent, &word);
+
+ /* It must exist. */
+ ut_a(ret == 0);
+
+ word_freq = rbt_value(fts_word_freq_t, parent.last);
+
+ ret = rbt_search(
+ word_freq->doc_freqs, &parent, &ranking->doc_id);
+
+ /* It must exist. */
+ ut_a(ret == 0);
+
+ doc_freq = rbt_value(fts_doc_freq_t, parent.last);
+
+ weight = (double) doc_freq->freq * word_freq->idf;
+
+ ranking->rank += (fts_rank_t) (weight * word_freq->idf);
+ }
+}
+
+/*****************************************************************//**
+Add ranking to the result set. */
+static
+void
+fts_query_add_ranking(
+/*==================*/
+ fts_query_t* query, /*!< in: query state */
+ ib_rbt_t* ranking_tree, /*!< in: ranking tree */
+ const fts_ranking_t* new_ranking) /*!< in: ranking of a document */
+{
+ ib_rbt_bound_t parent;
+
+ /* Lookup the ranking in our rb tree and add if it doesn't exist. */
+ if (rbt_search(ranking_tree, &parent, new_ranking) == 0) {
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ ranking->rank += new_ranking->rank;
+
+ ut_a(ranking->words == NULL);
+ } else {
+ rbt_add_node(ranking_tree, &parent, new_ranking);
+
+ query->total_size += SIZEOF_RBT_NODE_ADD
+ + sizeof(fts_ranking_t);
+ }
+}
+
+/*****************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value, 0 if no ranking value
+present. */
+float
+fts_retrieve_ranking(
+/*=================*/
+ fts_result_t* result, /*!< in: FTS result structure */
+ doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */
+{
+ ib_rbt_bound_t parent;
+ fts_ranking_t new_ranking;
+
+ DBUG_ENTER("fts_retrieve_ranking");
+
+ if (!result || !result->rankings_by_id) {
+ DBUG_RETURN(0);
+ }
+
+ new_ranking.doc_id = doc_id;
+
+ /* Lookup the ranking in our rb tree */
+ if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) {
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, parent.last);
+
+ DBUG_RETURN(ranking->rank);
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Create the result and copy the data to it. */
+static
+fts_result_t*
+fts_query_prepare_result(
+/*=====================*/
+ fts_query_t* query, /*!< in: Query state */
+	fts_result_t*	result)	/*!< in: result; this can contain
+ data from a previous search on
+ another FTS index */
+{
+ const ib_rbt_node_t* node;
+ bool result_is_null = false;
+
+ DBUG_ENTER("fts_query_prepare_result");
+
+ if (result == NULL) {
+ result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
+
+ memset(result, 0x0, sizeof(*result));
+
+ result->rankings_by_id = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE;
+ result_is_null = true;
+ }
+
+ if (query->flags == FTS_OPT_RANKING) {
+ fts_word_freq_t* word_freq;
+ ulint size = ib_vector_size(query->deleted->doc_ids);
+ fts_update_t* array =
+ (fts_update_t*) query->deleted->doc_ids->data;
+
+ node = rbt_first(query->word_freqs);
+ ut_ad(node);
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ for (node = rbt_first(word_freq->doc_freqs);
+ node;
+ node = rbt_next(word_freq->doc_freqs, node)) {
+ fts_doc_freq_t* doc_freq;
+ fts_ranking_t ranking;
+
+ doc_freq = rbt_value(fts_doc_freq_t, node);
+
+ /* Don't put deleted docs into result */
+ if (fts_bsearch(array, 0, static_cast<int>(size),
+ doc_freq->doc_id) >= 0) {
+ /* one less matching doc count */
+ --word_freq->doc_count;
+ continue;
+ }
+
+ ranking.doc_id = doc_freq->doc_id;
+ ranking.rank = static_cast<fts_rank_t>(doc_freq->freq);
+ ranking.words = NULL;
+
+ fts_query_add_ranking(query, result->rankings_by_id,
+ &ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ DBUG_RETURN(NULL);
+ }
+ }
+
+ /* Calculate IDF only after we exclude the deleted items */
+ fts_query_calculate_idf(query);
+
+ node = rbt_first(query->word_freqs);
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ /* Calculate the ranking for each doc */
+ for (node = rbt_first(result->rankings_by_id);
+ node != NULL;
+ node = rbt_next(result->rankings_by_id, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ranking->rank = static_cast<fts_rank_t>(
+ ranking->rank * word_freq->idf * word_freq->idf);
+ }
+
+ DBUG_RETURN(result);
+ }
+
+ ut_a(rbt_size(query->doc_ids) > 0);
+
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+ fts_query_calculate_ranking(query, ranking);
+
+		// FIXME: I think we may require this information to improve the
+ // ranking of doc ids which have more word matches from
+ // different FTS indexes.
+
+ /* We don't need these anymore free the resources. */
+ ranking->words = NULL;
+
+ if (!result_is_null) {
+ fts_query_add_ranking(query, result->rankings_by_id, ranking);
+
+ if (query->total_size > fts_result_cache_limit) {
+ query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ fts_query_free_result(result);
+ DBUG_RETURN(NULL);
+ }
+ }
+ }
+
+ if (result_is_null) {
+ /* Use doc_ids directly */
+ rbt_free(result->rankings_by_id);
+ result->rankings_by_id = query->doc_ids;
+ query->doc_ids = NULL;
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+Get the result of the query. Calculate the similarity coefficient. */
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+ fts_query_t* query, /*!< in: query instance */
+ fts_result_t* result) /*!< in: result */
+{
+ DBUG_ENTER("fts_query_get_result");
+
+ if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
+ /* Copy the doc ids to the result. */
+ result = fts_query_prepare_result(query, result);
+ } else {
+ /* Create an empty result instance. */
+ result = static_cast<fts_result_t*>(ut_malloc(sizeof(*result)));
+ memset(result, 0, sizeof(*result));
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+FTS Query free resources and reset. */
+static
+void
+fts_query_free(
+/*===========*/
+ fts_query_t* query) /*!< in: query instance to free*/
+{
+
+ if (query->read_nodes_graph) {
+ fts_que_graph_free(query->read_nodes_graph);
+ }
+
+ if (query->root) {
+ fts_ast_free_node(query->root);
+ }
+
+ if (query->deleted) {
+ fts_doc_ids_free(query->deleted);
+ }
+
+ if (query->doc_ids) {
+ fts_query_free_doc_ids(query, query->doc_ids);
+ }
+
+ if (query->word_freqs) {
+ const ib_rbt_node_t* node;
+
+ /* We need to free any instances of fts_doc_freq_t that we
+ may have allocated. */
+ for (node = rbt_first(query->word_freqs);
+ node;
+ node = rbt_next(query->word_freqs, node)) {
+
+ fts_word_freq_t* word_freq;
+
+ word_freq = rbt_value(fts_word_freq_t, node);
+
+ /* We need to cast away the const. */
+ rbt_free(word_freq->doc_freqs);
+ }
+
+ rbt_free(query->word_freqs);
+ }
+
+ ut_a(!query->intersection);
+
+ if (query->word_map) {
+ rbt_free(query->word_map);
+ }
+
+ if (query->word_vector) {
+ delete query->word_vector;
+ }
+
+ if (query->heap) {
+ mem_heap_free(query->heap);
+ }
+
+ memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Parse the query using flex/bison. */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+ fts_query_t* query, /*!< in: query instance */
+ byte* query_str, /*!< in: query string */
+ ulint query_len) /*!< in: query string length */
+{
+ int error;
+ fts_ast_state_t state;
+ bool mode = query->boolean_mode;
+ DBUG_ENTER("fts_query_parse");
+
+ memset(&state, 0x0, sizeof(state));
+
+ /* Setup the scanner to use, this depends on the mode flag. */
+ state.lexer = fts_lexer_create(mode, query_str, query_len);
+ state.charset = query->fts_index_table.charset;
+ error = fts_parse(&state);
+ fts_lexer_free(state.lexer);
+ state.lexer = NULL;
+
+	/* Error during parsing? */
+ if (error) {
+ /* Free the nodes that were allocated during parsing. */
+ fts_ast_state_free(&state);
+ } else {
+ query->root = state.root;
+ }
+
+ DBUG_RETURN(state.root);
+}
+
+/*******************************************************************//**
+FTS Query optimization
+Set FTS_OPT_RANKING if it is a simple term query */
+static
+void
+fts_query_can_optimize(
+/*===================*/
+ fts_query_t* query, /*!< in/out: query instance */
+ uint flags) /*!< In: FTS search mode */
+{
+ fts_ast_node_t* node = query->root;
+
+ if (flags & FTS_EXPAND) {
+ return;
+ }
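+
+	/* For example (a sketch of the assumed AST shapes): the query
+	"apple" parses to a list whose head is a single FTS_AST_TERM with
+	no sibling, so it qualifies below; "apple banana" has a second
+	node, and "+apple" presumably puts an operator node first, so
+	neither qualifies. */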
+
+ /* Check if it has only a term without oper */
+ ut_ad(node->type == FTS_AST_LIST);
+ node = node->list.head;
+ if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) {
+ query->flags = FTS_OPT_RANKING;
+ }
+}
+
+/*******************************************************************//**
+Pre-process the query string
+1) make it lower case
+2) in boolean mode, if there is a '-' or '+' that is immediately preceded
+and followed by a valid word, make it a space
+@return the processed string */
+static
+byte*
+fts_query_str_preprocess(
+/*=====================*/
+ const byte* query_str, /*!< in: FTS query */
+ ulint query_len, /*!< in: FTS query string len */
+ ulint *result_len, /*!< out: result string length */
+ CHARSET_INFO* charset, /*!< in: string charset */
+ bool boolean_mode) /*!< in: is boolean mode */
+{
+ ulint cur_pos = 0;
+ ulint str_len;
+ byte* str_ptr;
+ bool in_phrase = false;
+
+ /* Convert the query string to lower case before parsing. We own
+ the ut_malloc'ed result and so remember to free it before return. */
+
+ str_len = query_len * charset->casedn_multiply + 1;
+ str_ptr = static_cast<byte*>(ut_malloc(str_len));
+
+ *result_len = innobase_fts_casedn_str(
+ charset, const_cast<char*>(reinterpret_cast<const char*>(
+ query_str)), query_len,
+ reinterpret_cast<char*>(str_ptr), str_len);
+
+ ut_ad(*result_len < str_len);
+
+ str_ptr[*result_len] = 0;
+
+	/* If it is not boolean mode, there is no need to check for '-'/'+' */
+ if (!boolean_mode) {
+ return(str_ptr);
+ }
+
+	/* Otherwise, we traverse the string to find any '-'/'+' that is
+	immediately preceded and followed by a valid search word.
+ NOTE: we should not do so for CJK languages, this should
+ be taken care of in our CJK implementation */
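+
+	/* Worked example (illustrative): in boolean mode the raw query
+	"one+two -three" becomes "one two -three": the '+' sits exactly
+	one byte before "two" and is blanked out below, while the '-'
+	before "three" is preceded by a space (two bytes before the
+	token), so it is kept as a genuine operator. */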
+ while (cur_pos < *result_len) {
+ fts_string_t str;
+ ulint offset;
+ ulint cur_len;
+
+ cur_len = innobase_mysql_fts_get_token(
+ charset, str_ptr + cur_pos, str_ptr + *result_len,
+ &str, &offset);
+
+ if (cur_len == 0 || str.f_str == NULL) {
+ /* No valid word found */
+ break;
+ }
+
+ /* Check if we are in a phrase, if so, no need to do
+ replacement of '-/+'. */
+ for (byte* ptr = str_ptr + cur_pos; ptr < str.f_str; ptr++) {
+ if ((char) (*ptr) == '"' ) {
+ in_phrase = !in_phrase;
+ }
+ }
+
+		/* Find the '-'/'+' that are not leading and are not in a phrase */
+ if (cur_pos > 0 && str.f_str - str_ptr - cur_pos == 1
+ && !in_phrase) {
+ char* last_op = reinterpret_cast<char*>(
+ str_ptr + cur_pos);
+
+ if (*last_op == '-' || *last_op == '+') {
+ *last_op = ' ';
+ }
+ }
+
+ cur_pos += cur_len;
+ }
+
+ return(str_ptr);
+}
+
+/*******************************************************************//**
+FTS Query entry point.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+dberr_t
+fts_query(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: The FTS index to search */
+ uint flags, /*!< in: FTS search mode */
+ const byte* query_str, /*!< in: FTS query */
+ ulint query_len, /*!< in: FTS query string len
+ in bytes */
+ fts_result_t** result) /*!< in/out: result doc ids */
+{
+ fts_query_t query;
+ dberr_t error = DB_SUCCESS;
+ byte* lc_query_str;
+ ulint result_len;
+ bool boolean_mode;
+ trx_t* query_trx;
+ CHARSET_INFO* charset;
+ ulint start_time_ms;
+ bool will_be_ignored = false;
+
+ boolean_mode = flags & FTS_BOOL;
+
+ *result = NULL;
+ memset(&query, 0x0, sizeof(query));
+ query_trx = trx_allocate_for_background();
+ query_trx->op_info = "FTS query";
+
+ start_time_ms = ut_time_ms();
+
+ query.trx = query_trx;
+ query.index = index;
+ query.boolean_mode = boolean_mode;
+ query.deleted = fts_doc_ids_create();
+ query.cur_node = NULL;
+
+ query.fts_common_table.type = FTS_COMMON_TABLE;
+ query.fts_common_table.table_id = index->table->id;
+ query.fts_common_table.parent = index->table->name;
+ query.fts_common_table.table = index->table;
+
+ charset = fts_index_get_charset(index);
+
+ query.fts_index_table.type = FTS_INDEX_TABLE;
+ query.fts_index_table.index_id = index->id;
+ query.fts_index_table.table_id = index->table->id;
+ query.fts_index_table.parent = index->table->name;
+ query.fts_index_table.charset = charset;
+ query.fts_index_table.table = index->table;
+
+ query.word_map = rbt_create_arg_cmp(
+ sizeof(fts_string_t), innobase_fts_text_cmp, charset);
+ query.word_vector = new word_vector_t;
+ query.error = DB_SUCCESS;
+
+ /* Setup the RB tree that will be used to collect per term
+ statistics. */
+ query.word_freqs = rbt_create_arg_cmp(
+ sizeof(fts_word_freq_t), innobase_fts_text_cmp, charset);
+
+ query.total_size += SIZEOF_RBT_CREATE;
+
+ query.total_docs = dict_table_get_n_rows(index->table);
+
+#ifdef FTS_DOC_STATS_DEBUG
+	if (fts_enable_diag_print) {
+ error = fts_get_total_word_count(
+ trx, query.index, &query.total_words);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ fprintf(stderr, "Total docs: " UINT64PF " Total words: %lu\n",
+ query.total_docs, query.total_words);
+ }
+#endif /* FTS_DOC_STATS_DEBUG */
+
+ query.fts_common_table.suffix = "DELETED";
+
+ /* Read the deleted doc_ids, we need these for filtering. */
+ error = fts_table_fetch_doc_ids(
+ NULL, &query.fts_common_table, query.deleted);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ query.fts_common_table.suffix = "DELETED_CACHE";
+
+ error = fts_table_fetch_doc_ids(
+ NULL, &query.fts_common_table, query.deleted);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* Get the deleted doc ids that are in the cache. */
+ fts_cache_append_deleted_doc_ids(
+ index->table->fts->cache, query.deleted->doc_ids);
+ DEBUG_SYNC_C("fts_deleted_doc_ids_append");
+
+ /* Sort the vector so that we can do a binary search over the ids. */
+ ib_vector_sort(query.deleted->doc_ids, fts_update_doc_id_cmp);
+
+#if 0
+ /* Convert the query string to lower case before parsing. We own
+ the ut_malloc'ed result and so remember to free it before return. */
+
+ lc_query_str_len = query_len * charset->casedn_multiply + 1;
+ lc_query_str = static_cast<byte*>(ut_malloc(lc_query_str_len));
+
+ result_len = innobase_fts_casedn_str(
+ charset, (char*) query_str, query_len,
+ (char*) lc_query_str, lc_query_str_len);
+
+ ut_ad(result_len < lc_query_str_len);
+
+ lc_query_str[result_len] = 0;
+
+#endif
+
+ lc_query_str = fts_query_str_preprocess(
+ query_str, query_len, &result_len, charset, boolean_mode);
+
+ query.heap = mem_heap_create(128);
+
+ /* Create the rb tree for the doc id (current) set. */
+ query.doc_ids = rbt_create(
+ sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+
+ query.total_size += SIZEOF_RBT_CREATE;
+
+ /* Parse the input query string. */
+ if (fts_query_parse(&query, lc_query_str, result_len)) {
+ fts_ast_node_t* ast = query.root;
+
+ /* Optimize query to check if it's a single term */
+ fts_query_can_optimize(&query, flags);
+
+ DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+ fts_result_cache_limit = 2048;
+ );
+
+ /* Traverse the Abstract Syntax Tree (AST) and execute
+ the query. */
+ query.error = fts_ast_visit(
+ FTS_NONE, ast, fts_query_visitor,
+ &query, &will_be_ignored);
+
+ /* If query expansion is requested, extend the search
+ with first search pass result */
+ if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+ query.error = fts_expand_query(index, &query);
+ }
+
+ /* Calculate the inverse document frequency of the terms. */
+ if (query.error == DB_SUCCESS
+ && query.flags != FTS_OPT_RANKING) {
+ fts_query_calculate_idf(&query);
+ }
+
+ /* Copy the result from the query state, so that we can
+ return it to the caller. */
+ if (query.error == DB_SUCCESS) {
+ *result = fts_query_get_result(&query, *result);
+ }
+
+ error = query.error;
+ } else {
+ /* still return an empty result set */
+ *result = static_cast<fts_result_t*>(
+ ut_malloc(sizeof(**result)));
+ memset(*result, 0, sizeof(**result));
+ }
+
+ ut_free(lc_query_str);
+
+ if (fts_enable_diag_print && (*result)) {
+ ulint diff_time = ut_time_ms() - start_time_ms;
+ fprintf(stderr, "FTS Search Processing time: %ld secs:"
+ " %ld millisec: row(s) %d \n",
+ diff_time / 1000, diff_time % 1000,
+ (*result)->rankings_by_id
+ ? (int) rbt_size((*result)->rankings_by_id)
+ : -1);
+
+ /* Log memory consumption & result size */
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Full Search Memory: "
+ "%lu (bytes), Row: %lu .",
+ query.total_size,
+ (*result)->rankings_by_id
+ ? rbt_size((*result)->rankings_by_id)
+ : 0);
+ }
+
+func_exit:
+ fts_query_free(&query);
+
+ trx_free_for_background(query_trx);
+
+ return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query(). */
+
+void
+fts_query_free_result(
+/*==================*/
+ fts_result_t* result) /*!< in: result instance to free.*/
+{
+ if (result) {
+ if (result->rankings_by_id != NULL) {
+ rbt_free(result->rankings_by_id);
+ result->rankings_by_id = NULL;
+ }
+ if (result->rankings_by_rank != NULL) {
+ rbt_free(result->rankings_by_rank);
+ result->rankings_by_rank = NULL;
+ }
+
+ ut_free(result);
+ result = NULL;
+ }
+}
+
+/*****************************************************************//**
+FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */
+
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+ fts_result_t* result) /*!< out: result instance to sort.*/
+{
+ const ib_rbt_node_t* node;
+ ib_rbt_t* ranked;
+
+ ut_a(result->rankings_by_id != NULL);
+ if (result->rankings_by_rank) {
+ rbt_free(result->rankings_by_rank);
+ }
+
+ ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank);
+
+	/* Copy the rankings, ordered by rank, into the new
+	RB tree. */
+ for (node = rbt_first(result->rankings_by_id);
+ node;
+ node = rbt_next(result->rankings_by_id, node)) {
+
+ fts_ranking_t* ranking;
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ut_a(ranking->words == NULL);
+
+ rbt_insert(ranked, ranking, ranking);
+ }
+
+ /* Reset the current node too. */
+ result->current = NULL;
+ result->rankings_by_rank = ranked;
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+A debug function to print result doc_id set. */
+static
+void
+fts_print_doc_id(
+/*=============*/
+ fts_query_t* query) /*!< in : tree that stores doc_ids.*/
+{
+ const ib_rbt_node_t* node;
+
+ /* Iterate each member of the doc_id set */
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+ fts_ranking_t* ranking;
+ ranking = rbt_value(fts_ranking_t, node);
+
+ ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, doc_id: %ld \n",
+ (ulint) ranking->doc_id);
+
+ ulint pos = 0;
+ fts_string_t word;
+
+ while (fts_ranking_words_get_next(query, ranking, &pos, &word)) {
+ ib_logf(IB_LOG_LEVEL_INFO, "doc_ids info, value: %s \n", word.f_str);
+ }
+ }
+}
+#endif
+
+/*************************************************************//**
+This function implements a simple "blind" query expansion search:
+words in documents found in the first search pass will be used as
+search arguments to search the documents again, thus "expanding"
+the search result set.
+@return DB_SUCCESS on success, otherwise the error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+fts_expand_query(
+/*=============*/
+ dict_index_t* index, /*!< in: FTS index to search */
+ fts_query_t* query) /*!< in: FTS query instance */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* token_node;
+ fts_doc_t result_doc;
+ dberr_t error = DB_SUCCESS;
+ const fts_index_cache_t*index_cache;
+
+ /* If no doc is found in first search pass, return */
+ if (!rbt_size(query->doc_ids)) {
+ return(error);
+ }
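+
+	/* A sketch of the blind expansion below: if the first pass for
+	"database" matched a document that also contains "index" and
+	"btree", those words (minus the ones already searched) are
+	collected from the document and unioned into the result in a
+	second search pass. */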
+
+ /* Init "result_doc", to hold words from the first search pass */
+ fts_doc_init(&result_doc);
+
+ rw_lock_x_lock(&index->table->fts->cache->lock);
+ index_cache = fts_find_index_cache(index->table->fts->cache, index);
+ rw_lock_x_unlock(&index->table->fts->cache->lock);
+
+ ut_a(index_cache);
+
+ result_doc.tokens = rbt_create_arg_cmp(
+ sizeof(fts_token_t), innobase_fts_text_cmp,
+ index_cache->charset);
+
+ result_doc.charset = index_cache->charset;
+
+ query->total_size += SIZEOF_RBT_CREATE;
+#ifdef UNIV_DEBUG
+ fts_print_doc_id(query);
+#endif
+
+ for (node = rbt_first(query->doc_ids);
+ node;
+ node = rbt_next(query->doc_ids, node)) {
+
+ fts_ranking_t* ranking;
+ ulint pos;
+ fts_string_t word;
+ ulint prev_token_size;
+ ulint estimate_size;
+
+ prev_token_size = rbt_size(result_doc.tokens);
+
+ ranking = rbt_value(fts_ranking_t, node);
+
+		/* Fetch the documents with the doc_id from the
+		result of the first search pass. Since we do not
+		store a document-to-word mapping, we need to
+		fetch the original documents and parse them.
+		Future optimization could be done here if we
+		supported some form of document-to-word mapping. */
+ fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index,
+ FTS_FETCH_DOC_BY_ID_EQUAL,
+ fts_query_expansion_fetch_doc,
+ &result_doc);
+
+ /* Remove words that have already been searched in the
+ first pass */
+ pos = 0;
+ while (fts_ranking_words_get_next(query, ranking, &pos,
+ &word)) {
+ ibool ret;
+
+ ret = rbt_delete(result_doc.tokens, &word);
+
+ /* The word must exist in the doc we found */
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Did not "
+ "find word %s in doc %ld for query "
+ "expansion search.\n", word.f_str,
+ (ulint) ranking->doc_id);
+ }
+ }
+
+ /* Estimate memory used, see fts_process_token and fts_token_t.
+ We ignore token size here. */
+ estimate_size = (rbt_size(result_doc.tokens) - prev_token_size)
+ * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t)
+ + sizeof(ib_vector_t) + sizeof(ulint) * 32);
+ query->total_size += estimate_size;
+
+ if (query->total_size > fts_result_cache_limit) {
+ error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ goto func_exit;
+ }
+ }
+
+ /* Search the table the second time with expanded search list */
+ for (token_node = rbt_first(result_doc.tokens);
+ token_node;
+ token_node = rbt_next(result_doc.tokens, token_node)) {
+ fts_token_t* mytoken;
+ mytoken = rbt_value(fts_token_t, token_node);
+
+ ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0);
+ fts_query_add_word_freq(query, &mytoken->text);
+ error = fts_query_union(query, &mytoken->text);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+func_exit:
+ fts_doc_free(&result_doc);
+
+ return(error);
+}
+/*************************************************************//**
+This function finds documents that contain all the words in a
+phrase or proximity search. For a proximity search it also verifies
+that the words are close enough to each other, i.e. within the
+specified distance. This function is called for both phrase and
+proximity search.
+@return TRUE if documents are found, FALSE otherwise */
+static
+ibool
+fts_phrase_or_proximity_search(
+/*===========================*/
+ fts_query_t* query, /*!< in/out: query instance.
+ query->doc_ids might be instantiated
+ with qualified doc IDs */
+ ib_vector_t* tokens) /*!< in: Tokens contain words */
+{
+ ulint n_matched;
+ ulint i;
+ ibool matched = FALSE;
+ ulint num_token = ib_vector_size(tokens);
+ fts_match_t* match[MAX_PROXIMITY_ITEM];
+ ibool end_list = FALSE;
+
+ /* Number of matched documents for the first token */
+ n_matched = ib_vector_size(query->match_array[0]);
+
+	/* We have a match list for each word; we walk through
+	the lists and find the common documents that contain
+	all the matching words. */
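+	/* Worked example (illustrative): with match lists of doc ids
+	{1, 3, 5} for the first word and {3, 5} for the second, only
+	doc ids 3 and 5 survive the walk below and go on to the phrase
+	or proximity verification. */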
+ for (i = 0; i < n_matched; i++) {
+ ulint j;
+ ulint k = 0;
+ fts_proximity_t qualified_pos;
+
+ match[0] = static_cast<fts_match_t*>(
+ ib_vector_get(query->match_array[0], i));
+
+		/* For the remaining match lists of the tokens (words),
+		we check whether there is a document with the same
+		doc id. */
+ for (j = 1; j < num_token; j++) {
+ match[j] = static_cast<fts_match_t*>(
+ ib_vector_get(query->match_array[j], k));
+
+ while (match[j]->doc_id < match[0]->doc_id
+ && k < ib_vector_size(query->match_array[j])) {
+ match[j] = static_cast<fts_match_t*>(
+ ib_vector_get(
+ query->match_array[j], k));
+ k++;
+ }
+
+ if (match[j]->doc_id > match[0]->doc_id) {
+ /* no match */
+ if (query->flags & FTS_PHRASE) {
+ match[0]->doc_id = 0;
+ }
+ break;
+ }
+
+ if (k == ib_vector_size(query->match_array[j])) {
+ end_list = TRUE;
+
+ if (match[j]->doc_id != match[0]->doc_id) {
+ /* no match */
+ if (query->flags & FTS_PHRASE) {
+ ulint s;
+
+ match[0]->doc_id = 0;
+
+ for (s = i + 1; s < n_matched;
+ s++) {
+ match[0] = static_cast<
+ fts_match_t*>(
+ ib_vector_get(
+ query->match_array[0],
+ s));
+ match[0]->doc_id = 0;
+ }
+ }
+
+ goto func_exit;
+ }
+ }
+
+			/* FIXME: A better solution would be a counter array
+			that remembers each run's last position, so we don't
+			reset it here every time. */
+ k = 0;
+ }
+
+ if (j != num_token) {
+ continue;
+ }
+
+ /* For this matching doc, we need to further
+ verify whether the words in the doc are close
+ to each other, and within the distance specified
+ in the proximity search */
+ if (query->flags & FTS_PHRASE) {
+ matched = TRUE;
+ } else if (fts_proximity_get_positions(
+ match, num_token, ULINT_MAX, &qualified_pos)) {
+
+			/* Fetch the original documents and count the
+			words between matching words to check that they
+			are within the specified distance. */
+ if (fts_query_is_in_proximity_range(
+ query, match, &qualified_pos)) {
+				/* If so, mark that we found a matching doc */
+ query->error = fts_query_process_doc_id(
+ query, match[0]->doc_id, 0);
+ if (query->error != DB_SUCCESS) {
+ matched = FALSE;
+ goto func_exit;
+ }
+
+ matched = TRUE;
+ for (ulint z = 0; z < num_token; z++) {
+ fts_string_t* token;
+ token = static_cast<fts_string_t*>(
+ ib_vector_get(tokens, z));
+ fts_query_add_word_to_document(
+ query, match[0]->doc_id, token);
+ }
+ }
+ }
+
+ if (end_list) {
+ break;
+ }
+ }
+
+func_exit:
+ return(matched);
+}
+
+/*************************************************************//**
+This function checks whether words in result documents are close to
+each other (within proximity range as specified by "distance").
+If "distance" is MAX_ULINT, then it will find all combinations of
+positions of matching words and store min and max positions
+in the "qualified_pos" for later verification.
+@return true if words are close to each other, false if otherwise */
+static
+bool
+fts_proximity_get_positions(
+/*========================*/
+ fts_match_t** match, /*!< in: query instance */
+ ulint num_match, /*!< in: number of matching
+ items */
+ ulint distance, /*!< in: distance value
+ for proximity search */
+ fts_proximity_t* qualified_pos) /*!< out: the position info
+ records ranges containing
+ all matching words. */
+{
+ ulint i;
+ ulint idx[MAX_PROXIMITY_ITEM];
+ ulint num_pos[MAX_PROXIMITY_ITEM];
+ ulint min_idx;
+
+ qualified_pos->n_pos = 0;
+
+ ut_a(num_match <= MAX_PROXIMITY_ITEM);
+
+	/* Each word could appear multiple times in a doc. So
+	we need to walk through each word's position list and find
+	the closest distance between different words to see if
+	they are within the proximity distance. */
+
+	/* Since each word's position list is sorted, we just
+	walk through all the words' lists, similar to the
+	merge phase of a merge sort. */
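+	/* Worked example (illustrative): with two words at positions
+	{1, 9} and {4, 12} and distance = ULINT_MAX, the walk visits the
+	candidate windows (1,4), (4,9) and (9,12), each time advancing
+	the list that contributed the minimum position, and records
+	every window whose width is within the distance. */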
+ for (i = 0; i < num_match; i++) {
+ /* idx is the current position we are checking
+ for a particular word */
+ idx[i] = 0;
+
+ /* Number of positions for this word */
+ num_pos[i] = ib_vector_size(match[i]->positions);
+ }
+
+ /* Start with the first word */
+ min_idx = 0;
+
+ while (idx[min_idx] < num_pos[min_idx]) {
+ ulint position[MAX_PROXIMITY_ITEM];
+ ulint min_pos = ULINT_MAX;
+ ulint max_pos = 0;
+
+ /* Check positions in each word position list, and
+ record the max/min position */
+ for (i = 0; i < num_match; i++) {
+ position[i] = *(ulint*) ib_vector_get_const(
+ match[i]->positions, idx[i]);
+
+ if (position[i] == ULINT_UNDEFINED) {
+ break;
+ }
+
+ if (position[i] < min_pos) {
+ min_pos = position[i];
+ min_idx = i;
+ }
+
+ if (position[i] > max_pos) {
+ max_pos = position[i];
+ }
+ }
+
+		/* If the max and min positions are within range, we
+		have found a good match */
+ if (max_pos - min_pos <= distance
+ && (i >= num_match || position[i] != ULINT_UNDEFINED)) {
+			/* The charset may use a variable-length
+			character encoding; record min_pos and
+			max_pos, since we will need to verify the
+			actual number of characters */
+ qualified_pos->min_pos.push_back(min_pos);
+ qualified_pos->max_pos.push_back(max_pos);
+ qualified_pos->n_pos++;
+ }
+
+		/* Otherwise, move to the next position in the
+		list for the word with the smallest position */
+ idx[min_idx]++;
+ }
+
+ return(qualified_pos->n_pos != 0);
+}
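+
+/* Editorial sketch (not part of the original source): a minimal,
+self-contained illustration of the merge-style walk performed by
+fts_proximity_get_positions() above.  It assumes every word has at
+least one position and that each position list is sorted ascending;
+all names are hypothetical.  Fenced out of the build with #if 0. */
+#if 0
+#include <climits>
+#include <vector>
+
+/* Returns true if one position can be chosen per word such that all
+chosen positions fit inside a window of size `distance`. */
+static bool
+proximity_window_exists(
+	const std::vector<std::vector<unsigned> >&	positions,
+	unsigned					distance)
+{
+	std::vector<size_t>	idx(positions.size(), 0);
+
+	for (;;) {
+		unsigned	min_pos = UINT_MAX;
+		unsigned	max_pos = 0;
+		size_t		min_word = 0;
+
+		/* Probe the current candidate position of every word
+		and track the smallest and largest. */
+		for (size_t i = 0; i < positions.size(); i++) {
+			unsigned	pos = positions[i][idx[i]];
+
+			if (pos < min_pos) {
+				min_pos = pos;
+				min_word = i;
+			}
+			if (pos > max_pos) {
+				max_pos = pos;
+			}
+		}
+
+		if (max_pos - min_pos <= distance) {
+			return(true);
+		}
+
+		/* Advance the list holding the smallest position;
+		moving any other pointer could only widen the window. */
+		if (++idx[min_word] == positions[min_word].size()) {
+			return(false);
+		}
+	}
+}
+#endif /* 0 */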
diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc
new file mode 100644
index 00000000000..cb8eff3cacc
--- /dev/null
+++ b/storage/innobase/fts/fts0sql.cc
@@ -0,0 +1,363 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0sql.cc
+Full Text Search functionality.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#include "que0que.h"
+#include "trx0roll.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+/** SQL statements for creating the ancillary FTS tables. %s must be replaced
+with the indexed table's id. */
+
+/** Preamble to all SQL statements. */
+static const char* fts_sql_begin=
+ "PROCEDURE P() IS\n";
+
+/** Postamble to non-committing SQL statements. */
+static const char* fts_sql_end=
+ "\n"
+ "END;\n";
+
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+{
+ int len;
+ bool hex_name = DICT_TF2_FLAG_IS_SET(fts_table->table,
+ DICT_TF2_FTS_AUX_HEX_NAME);
+
+ ut_a(fts_table->table != NULL);
+
+ switch (fts_table->type) {
+ case FTS_COMMON_TABLE:
+ len = fts_write_object_id(fts_table->table_id, table_id,
+ hex_name);
+ break;
+
+ case FTS_INDEX_TABLE:
+
+ len = fts_write_object_id(fts_table->table_id, table_id,
+ hex_name);
+
+ table_id[len] = '_';
+ ++len;
+ table_id += len;
+
+ len += fts_write_object_id(fts_table->index_id, table_id,
+ hex_name);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ ut_a(len >= 16);
+ ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH);
+
+ return(len);
+}
+
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+ const fts_table_t*
+ fts_table) /*!< in: Auxiliary table type */
+{
+ int len;
+ const char* slash;
+ char* prefix_name;
+ int dbname_len = 0;
+ int prefix_name_len;
+ char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+
+ slash = static_cast<const char*>(
+ memchr(fts_table->parent, '/', strlen(fts_table->parent)));
+
+ if (slash) {
+ /* Print up to and including the separator. */
+ dbname_len = static_cast<int>(slash - fts_table->parent) + 1;
+ }
+
+ len = fts_get_table_id(fts_table, table_id);
+
+ prefix_name_len = dbname_len + 4 + len + 1;
+
+ prefix_name = static_cast<char*>(mem_alloc(prefix_name_len));
+
+ len = sprintf(prefix_name, "%.*sFTS_%s",
+ dbname_len, fts_table->parent, table_id);
+
+ ut_a(len > 0);
+ ut_a(len == prefix_name_len - 1);
+
+ return(prefix_name);
+}
+
+/******************************************************************//**
+Construct the name of an ancillary FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+ const fts_table_t* fts_table)
+ /*!< in: Auxiliary table type */
+{
+ int len;
+ char* name;
+ int name_len;
+ char* prefix_name;
+
+ prefix_name = fts_get_table_name_prefix(fts_table);
+
+ name_len = static_cast<int>(
+ strlen(prefix_name) + 1 + strlen(fts_table->suffix) + 1);
+
+ name = static_cast<char*>(mem_alloc(name_len));
+
+ len = sprintf(name, "%s_%s", prefix_name, fts_table->suffix);
+
+ ut_a(len > 0);
+ ut_a(len == name_len - 1);
+
+ mem_free(prefix_name);
+
+ return(name);
+}
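+
+/* Editorial sketch (not part of the original source): the two helpers
+above compose auxiliary table names as "<db>/FTS_<object id(s)>_<suffix>".
+A minimal standalone illustration with hypothetical values; the actual
+id formatting is done by fts_write_object_id().  Fenced with #if 0. */
+#if 0
+#include <cstdio>
+
+int
+main()
+{
+	char	name[64];
+
+	/* A common table of a parent table "test/t1" whose (hex)
+	table id is 0xe, with suffix "CONFIG": */
+	snprintf(name, sizeof(name), "%s/FTS_%016llx_%s",
+		 "test", 14ULL, "CONFIG");
+
+	printf("%s\n", name);	/* test/FTS_000000000000000e_CONFIG */
+
+	return(0);
+}
+#endif /* 0 */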
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the FTS table's name.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS auxiliary table info */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+{
+ char* str;
+ que_t* graph;
+ char* str_tmp;
+ ibool dict_locked;
+
+ if (fts_table != NULL) {
+ char* table_name;
+
+ table_name = fts_get_table_name(fts_table);
+ str_tmp = ut_strreplace(sql, "%s", table_name);
+ mem_free(table_name);
+ } else {
+ ulint sql_len = strlen(sql) + 1;
+
+ str_tmp = static_cast<char*>(mem_alloc(sql_len));
+ strcpy(str_tmp, sql);
+ }
+
+ str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+ mem_free(str_tmp);
+
+ dict_locked = (fts_table && fts_table->table->fts
+ && (fts_table->table->fts->fts_status
+ & TABLE_DICT_LOCKED));
+
+ if (!dict_locked) {
+ ut_ad(!mutex_own(&(dict_sys->mutex)));
+
+ /* The InnoDB SQL parser is not re-entrant. */
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ graph = pars_sql(info, str);
+ ut_a(graph);
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ mem_free(str);
+
+ return(graph);
+}
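+
+/* Editorial sketch (not part of the original source): the string that
+fts_parse_sql() finally hands to pars_sql() is the statement wrapped in
+the procedure preamble/postamble defined above.  A standalone
+illustration with a hypothetical statement and table name, fenced with
+#if 0. */
+#if 0
+#include <cstdio>
+#include <string>
+
+int
+main()
+{
+	std::string	sql =
+		"BEGIN INSERT INTO \"test/FTS_x_CONFIG\""
+		" VALUES (:key, :value);";
+
+	/* fts_sql_begin + statement + fts_sql_end: */
+	std::string	wrapped =
+		std::string("PROCEDURE P() IS\n") + sql + "\nEND;\n";
+
+	printf("%s", wrapped.c_str());
+
+	return(0);
+}
+#endif /* 0 */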
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the FTS table's name.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+ fts_table_t* fts_table, /*!< in: FTS aux table info */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+{
+ char* str;
+ que_t* graph;
+ char* str_tmp = NULL;
+
+#ifdef UNIV_DEBUG
+ ut_ad(mutex_own(&dict_sys->mutex));
+#endif
+
+ if (fts_table != NULL) {
+ char* table_name;
+
+ table_name = fts_get_table_name(fts_table);
+ str_tmp = ut_strreplace(sql, "%s", table_name);
+ mem_free(table_name);
+ }
+
+ if (str_tmp != NULL) {
+ str = ut_str3cat(fts_sql_begin, str_tmp, fts_sql_end);
+ mem_free(str_tmp);
+ } else {
+ str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
+ }
+
+ graph = pars_sql(info, str);
+ ut_a(graph);
+
+ mem_free(str);
+
+ return(graph);
+}
+
+/******************************************************************//**
+Evaluate an SQL query graph.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_eval_sql(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t* graph) /*!< in: Query graph to evaluate */
+{
+ que_thr_t* thr;
+
+ graph->trx = trx;
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ return(trx->error_state);
+}
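+
+/* Editorial sketch (not part of the original source): how these helpers
+are typically combined elsewhere in the FTS code.  An outline under
+stated assumptions (the caller owns `trx`, `fts_table` and the
+fts_string_t `word`), not a compilable unit on its own; fenced with
+#if 0. */
+#if 0
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_bind_varchar_literal(info, "word", word->f_str,
+				       word->f_len);
+
+	que_t*	graph = fts_parse_sql(
+		fts_table, info,
+		"BEGIN DELETE FROM \"%s\" WHERE word = :word;");
+
+	dberr_t	error = fts_eval_sql(trx, graph);
+
+	if (error == DB_SUCCESS) {
+		fts_sql_commit(trx);
+	} else {
+		fts_sql_rollback(trx);
+	}
+
+	que_graph_free(graph);
+#endif /* 0 */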
+
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column list string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+ dict_index_t* index, /*!< in: index */
+ pars_info_t* info, /*!< in/out: parser info */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint i;
+ const char* str = "";
+
+ for (i = 0; i < index->n_user_defined_cols; i++) {
+ char* sel_str;
+
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+
+ sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i);
+
+ /* Set copy_name to TRUE since it's dynamic. */
+ pars_info_bind_id(info, TRUE, sel_str, field->name);
+
+ str = mem_heap_printf(
+ heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str);
+ }
+
+ return(str);
+}
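+
+/* Editorial note (not part of the original source): for an FTS index on
+columns "subject" and "content", the string built above is
+"$sel0, $sel1", so a caller can embed it in a statement such as the
+(hypothetical)
+
+	DECLARE CURSOR c IS SELECT $sel0, $sel1 FROM $table_name;
+
+with sel0/sel1 bound to the actual column names through
+pars_info_bind_id(), as done in the loop above. */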
+
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_sql_commit(
+/*===========*/
+ trx_t* trx) /*!< in: transaction */
+{
+ dberr_t error;
+
+ error = trx_commit_for_mysql(trx);
+
+ /* Commit should always succeed */
+ ut_a(error == DB_SUCCESS);
+
+ return(DB_SUCCESS);
+}
+
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_sql_rollback(
+/*=============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ return(trx_rollback_to_savepoint(trx, NULL));
+}
diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc
new file mode 100644
index 00000000000..b744fbf0763
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.cc
@@ -0,0 +1,1952 @@
+#include "univ.i"
+#line 2 "fts0tlex.cc"
+
+#line 4 "fts0tlex.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif /* defined (__STDC__) */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index. If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN yyg->yy_start = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START ((yyg->yy_start - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE fts0trestart(yyin ,yyscanner )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = yyg->yy_hold_char; \
+ YY_RESTORE_YY_MORE_OFFSET \
+ yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+
+#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner )
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via fts0trestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \
+ ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \
+ : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top]
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+static void fts0tensure_buffer_stack (yyscan_t yyscanner );
+static void fts0t_load_buffer_state (yyscan_t yyscanner );
+static void fts0t_init_buffer (YY_BUFFER_STATE b,FILE *file ,yyscan_t yyscanner );
+
+#define YY_FLUSH_BUFFER fts0t_flush_buffer(YY_CURRENT_BUFFER ,yyscanner)
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void *fts0trealloc (void *,yy_size_t , yyscan_t yyscanner __attribute__((unused)) );
+void fts0tfree (void * , yyscan_t yyscanner __attribute__((unused)) );
+
+#define yy_new_buffer fts0t_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ fts0tensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ fts0tensure_buffer_stack (yyscanner); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+typedef int yy_state_type;
+
+#define yytext_ptr yytext_r
+
+static yy_state_type yy_get_previous_state (yyscan_t yyscanner );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state ,yyscan_t yyscanner);
+static int yy_get_next_buffer (yyscan_t yyscanner );
+static void yy_fatal_error (yyconst char msg[] , yyscan_t yyscanner __attribute__((unused)) );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ yyg->yytext_ptr = yy_bp; \
+ yyleng = static_cast<int>(yy_cp - yy_bp); \
+ yyg->yy_hold_char = *yy_cp; \
+ *yy_cp = '\0'; \
+ yyg->yy_c_buf_p = yy_cp;
+
+#define YY_NUM_RULES 7
+#define YY_END_OF_BUFFER 8
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static yyconst flex_int16_t yy_accept[17] =
+ { 0,
+ 4, 4, 8, 4, 1, 6, 1, 5, 5, 2,
+ 4, 1, 1, 0, 3, 0
+ } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 4, 1, 5, 1, 1, 6, 1, 1, 1,
+ 1, 7, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static yyconst flex_int32_t yy_meta[8] =
+ { 0,
+ 1, 2, 3, 4, 5, 5, 1
+ } ;
+
+static yyconst flex_int16_t yy_base[20] =
+ { 0,
+ 0, 0, 18, 0, 6, 21, 0, 9, 21, 0,
+ 0, 0, 0, 4, 21, 21, 10, 11, 15
+ } ;
+
+static yyconst flex_int16_t yy_def[20] =
+ { 0,
+ 16, 1, 16, 17, 17, 16, 18, 19, 16, 17,
+ 17, 5, 18, 19, 16, 0, 16, 16, 16
+ } ;
+
+static yyconst flex_int16_t yy_nxt[29] =
+ { 0,
+ 4, 5, 6, 7, 8, 9, 10, 12, 15, 13,
+ 11, 11, 13, 15, 13, 14, 14, 16, 14, 14,
+ 3, 16, 16, 16, 16, 16, 16, 16
+ } ;
+
+static yyconst flex_int16_t yy_chk[29] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 5, 14, 5,
+ 17, 17, 18, 8, 18, 19, 19, 3, 19, 19,
+ 16, 16, 16, 16, 16, 16, 16, 16
+ } ;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+#line 1 "fts0tlex.l"
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+#line 27 "fts0tlex.l"
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+#define YY_NO_INPUT 1
+#line 480 "fts0tlex.cc"
+
+#define INITIAL 0
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Holds the entire state of the reentrant scanner. */
+struct yyguts_t
+ {
+
+ /* User-defined. Not touched by flex. */
+ YY_EXTRA_TYPE yyextra_r;
+
+ /* The rest are the same as the globals declared in the non-reentrant scanner. */
+ FILE *yyin_r, *yyout_r;
+ size_t yy_buffer_stack_top; /**< index of top of stack. */
+ size_t yy_buffer_stack_max; /**< capacity of stack. */
+ YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */
+ char yy_hold_char;
+ int yy_n_chars;
+ int yyleng_r;
+ char *yy_c_buf_p;
+ int yy_init;
+ int yy_start;
+ int yy_did_buffer_switch_on_eof;
+ int yy_start_stack_ptr;
+ int yy_start_stack_depth;
+ int *yy_start_stack;
+ yy_state_type yy_last_accepting_state;
+ char* yy_last_accepting_cpos;
+
+ int yylineno_r;
+ int yy_flex_debug_r;
+
+ char *yytext_r;
+ int yy_more_flag;
+ int yy_more_len;
+
+ }; /* end struct yyguts_t */
+
+static int yy_init_globals (yyscan_t yyscanner );
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * , yyscan_t yyscanner __attribute__((unused)));
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (yyscan_t yyscanner );
+#else
+static int input (yyscan_t yyscanner );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ int n; \
+ for ( n = 0; n < static_cast<int>(max_size) && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = static_cast<int>(fread(buf, 1, max_size, yyin)))==0 \
+ && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner)
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp, *yy_bp;
+ register int yy_act;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+#line 44 "fts0tlex.l"
+
+
+#line 707 "fts0tlex.cc"
+
+ if ( !yyg->yy_init )
+ {
+ yyg->yy_init = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! yyg->yy_start )
+ yyg->yy_start = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ fts0tensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+ }
+
+ fts0t_load_buffer_state(yyscanner );
+ }
+
+ while ( 1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = yyg->yy_c_buf_p;
+
+ /* Support of yytext. */
+ *yy_cp = yyg->yy_hold_char;
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = yyg->yy_start;
+yy_match:
+ do
+ {
+ register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 16 );
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = yyg->yy_hold_char;
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 46 "fts0tlex.l"
+/* Ignore whitespace */ ;
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 48 "fts0tlex.l"
+{
+ val->oper = fts0tget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 54 "fts0tlex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 60 "fts0tlex.l"
+{
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 65 "fts0tlex.l"
+;
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 66 "fts0tlex.l"
+
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 68 "fts0tlex.l"
+ECHO;
+ YY_BREAK
+#line 834 "fts0tlex.cc"
+case YY_STATE_EOF(INITIAL):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = yyg->yy_hold_char;
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * fts0tlex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner);
+
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++yyg->yy_c_buf_p;
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = yyg->yy_last_accepting_cpos;
+ yy_current_state = yyg->yy_last_accepting_state;
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ yyg->yy_did_buffer_switch_on_eof = 0;
+
+ if ( fts0twrap(yyscanner ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p =
+ yyg->yytext_ptr + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ yyg->yy_c_buf_p =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars];
+
+ yy_current_state = yy_get_previous_state( yyscanner );
+
+ yy_cp = yyg->yy_c_buf_p;
+ yy_bp = yyg->yytext_ptr + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+} /* end of fts0tlex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ * EOB_ACT_LAST_MATCH -
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ register char *source = yyg->yytext_ptr;
+ register int number_to_move, i;
+ int ret_val;
+
+ if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr) - 1;
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0;
+
+ else
+ {
+ int num_to_read =static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1);
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+ int yy_c_buf_p_offset =
+ (int) (yyg->yy_c_buf_p - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = static_cast<int>(b->yy_buf_size * 2);
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+ /* Include room in for 2 EOB chars. */
+ fts0trealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ,yyscanner );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = 0;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1);
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ yyg->yy_n_chars, num_to_read);
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ if ( yyg->yy_n_chars == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ fts0trestart(yyin ,yyscanner);
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if ((yy_size_t) (yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ yy_size_t new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) fts0trealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ,yyscanner );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ }
+
+ yyg->yy_n_chars += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR;
+
+ yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ static yy_state_type yy_get_previous_state (yyscan_t yyscanner)
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ yy_current_state = yyg->yy_start;
+
+ for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp )
+ {
+ register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner)
+{
+ register int yy_is_jam;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */
+ register char *yy_cp = yyg->yy_c_buf_p;
+
+ register YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ yyg->yy_last_accepting_state = yy_current_state;
+ yyg->yy_last_accepting_cpos = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 17 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ yy_is_jam = (yy_current_state == 16);
+
+ return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (yyscan_t yyscanner)
+#else
+ static int input (yyscan_t yyscanner)
+#endif
+
+{
+ int c;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+
+ if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] )
+ /* This was really a NUL. */
+ *yyg->yy_c_buf_p = '\0';
+
+ else
+ { /* need more input */
+ int offset = yyg->yy_c_buf_p - yyg->yytext_ptr;
+ ++yyg->yy_c_buf_p;
+
+ switch ( yy_get_next_buffer( yyscanner ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+ /* This happens because yy_g_n_b()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ fts0trestart(yyin ,yyscanner);
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( fts0twrap(yyscanner ) )
+ return EOF;
+
+ if ( ! yyg->yy_did_buffer_switch_on_eof )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput(yyscanner);
+#else
+ return input(yyscanner);
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ yyg->yy_c_buf_p = yyg->yytext_ptr + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */
+ *yyg->yy_c_buf_p = '\0'; /* preserve yytext */
+ yyg->yy_hold_char = *++yyg->yy_c_buf_p;
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * @param yyscanner The scanner object.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+ void fts0trestart (FILE * input_file , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! YY_CURRENT_BUFFER ){
+ fts0tensure_buffer_stack (yyscanner);
+ YY_CURRENT_BUFFER_LVALUE =
+ fts0t_create_buffer(yyin,YY_BUF_SIZE ,yyscanner);
+ }
+
+ fts0t_init_buffer(YY_CURRENT_BUFFER,input_file ,yyscanner);
+ fts0t_load_buffer_state(yyscanner );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * @param yyscanner The scanner object.
+ */
+ void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * fts0tpop_buffer_state();
+ * fts0tpush_buffer_state(new_buffer);
+ */
+ fts0tensure_buffer_stack (yyscanner);
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ fts0t_load_buffer_state(yyscanner );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (fts0twrap()) processing, but the only time this flag
+ * is looked at is after fts0twrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+static void fts0t_load_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ yyg->yy_hold_char = *yyg->yy_c_buf_p;
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * @param yyscanner The scanner object.
+ * @return the allocated buffer state.
+ */
+ YY_BUFFER_STATE fts0t_create_buffer (FILE * file, int size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) fts0talloc(b->yy_buf_size + 2 ,yyscanner );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0t_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ fts0t_init_buffer(b,file ,yyscanner);
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with fts0t_create_buffer()
+ * @param yyscanner The scanner object.
+ */
+ void fts0t_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ fts0tfree((void *) b->yy_ch_buf ,yyscanner );
+
+ fts0tfree((void *) b ,yyscanner );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a fts0trestart() or at EOF.
+ */
+ static void fts0t_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner)
+
+{
+ int oerrno = errno;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ fts0t_flush_buffer(b ,yyscanner);
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then fts0t_init_buffer was _probably_
+ * called from fts0trestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * @param yyscanner The scanner object.
+ */
+ void fts0t_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ fts0t_load_buffer_state(yyscanner );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ * @param yyscanner The scanner object.
+ */
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (new_buffer == NULL)
+ return;
+
+ fts0tensure_buffer_stack(yyscanner);
+
+ /* This block is copied from fts0t_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *yyg->yy_c_buf_p = yyg->yy_hold_char;
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p;
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars;
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ yyg->yy_buffer_stack_top++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from fts0t_switch_to_buffer. */
+ fts0t_load_buffer_state(yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ * @param yyscanner The scanner object.
+ */
+void fts0tpop_buffer_state (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner);
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if (yyg->yy_buffer_stack_top > 0)
+ --yyg->yy_buffer_stack_top;
+
+ if (YY_CURRENT_BUFFER) {
+ fts0t_load_buffer_state(yyscanner );
+ yyg->yy_did_buffer_switch_on_eof = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void fts0tensure_buffer_stack (yyscan_t yyscanner)
+{
+ int num_to_alloc;
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (!yyg->yy_buffer_stack) {
+
+		/* The first allocation is for a single element, since we
+		 * don't know if this scanner will even need a stack; the
+		 * stack is grown below when a push needs more room.
+		 */
+		num_to_alloc = 1;
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0talloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+
+ memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ yyg->yy_buffer_stack_top = 0;
+ return;
+ }
+
+ if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ int grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = static_cast<int>(yyg->yy_buffer_stack_max + grow_size);
+ yyg->yy_buffer_stack = (struct yy_buffer_state**)fts0trealloc
+ (yyg->yy_buffer_stack,
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ , yyscanner);
+ if ( ! yyg->yy_buffer_stack )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0tensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*));
+ yyg->yy_buffer_stack_max = num_to_alloc;
+ }
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+
+ if ( size < 2 ||
+ base[size-2] != YY_END_OF_BUFFER_CHAR ||
+ base[size-1] != YY_END_OF_BUFFER_CHAR )
+ /* They forgot to leave room for the EOB's. */
+ return 0;
+
+ b = (YY_BUFFER_STATE) fts0talloc(sizeof( struct yy_buffer_state ) ,yyscanner );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_buffer()" );
+
+ b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */
+ b->yy_buf_pos = b->yy_ch_buf = base;
+ b->yy_is_our_buffer = 0;
+ b->yy_input_file = 0;
+ b->yy_n_chars = static_cast<int>(b->yy_buf_size);
+ b->yy_is_interactive = 0;
+ b->yy_at_bol = 1;
+ b->yy_fill_buffer = 0;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ fts0t_switch_to_buffer(b ,yyscanner );
+
+ return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to fts0tlex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ * fts0t_scan_bytes() instead.
+ */
+YY_BUFFER_STATE fts0t_scan_string (yyconst char * yystr , yyscan_t yyscanner)
+{
+
+ return fts0t_scan_bytes(yystr,static_cast<int>(strlen(yystr)) ,yyscanner);
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to fts0tlex() will
+ * scan from a @e copy of @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yyscanner The scanner object.
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char * yybytes, int _yybytes_len , yyscan_t yyscanner)
+{
+ YY_BUFFER_STATE b;
+ char *buf;
+ yy_size_t n;
+ int i;
+
+ /* Get memory for full buffer, including space for trailing EOB's. */
+ n = _yybytes_len + 2;
+ buf = (char *) fts0talloc(n ,yyscanner );
+ if ( ! buf )
+ YY_FATAL_ERROR( "out of dynamic memory in fts0t_scan_bytes()" );
+
+ for ( i = 0; i < _yybytes_len; ++i )
+ buf[i] = yybytes[i];
+
+ buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+ b = fts0t_scan_buffer(buf,n ,yyscanner);
+ if ( ! b )
+ YY_FATAL_ERROR( "bad buffer in fts0t_scan_bytes()" );
+
+ /* It's okay to grow etc. this buffer, and we should throw it
+ * away when we're done.
+ */
+ b->yy_is_our_buffer = 1;
+
+ return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg , yyscan_t yyscanner __attribute__((unused)))
+{
+ (void) fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = yyg->yy_hold_char; \
+ yyg->yy_c_buf_p = yytext + yyless_macro_arg; \
+ yyg->yy_hold_char = *yyg->yy_c_buf_p; \
+ *yyg->yy_c_buf_p = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the user-defined data for this scanner.
+ * @param yyscanner The scanner object.
+ */
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyextra;
+}
+
+/** Get the current line number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_lineno (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yylineno;
+}
+
+/** Get the current column number.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_column (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ if (! YY_CURRENT_BUFFER)
+ return 0;
+
+ return yycolumn;
+}
+
+/** Get the input stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_in (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyin;
+}
+
+/** Get the output stream.
+ * @param yyscanner The scanner object.
+ */
+FILE *fts0tget_out (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyout;
+}
+
+/** Get the length of the current token.
+ * @param yyscanner The scanner object.
+ */
+int fts0tget_leng (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yyleng;
+}
+
+/** Get the current token.
+ * @param yyscanner The scanner object.
+ */
+char *fts0tget_text (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yytext;
+}
+
+/** Set the user-defined data. This data is never touched by the scanner.
+ * @param user_defined The data to be associated with this scanner.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyextra = user_defined ;
+}
+
+/** Set the current line number.
+ * @param line_number The line number to set.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_lineno (int line_number , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* lineno is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ yy_fatal_error( "fts0tset_lineno called with no buffer" , yyscanner);
+
+ yylineno = line_number;
+}
+
+/** Set the current column.
+ * @param column_no The column number to set.
+ * @param yyscanner The scanner object.
+ */
+void fts0tset_column (int column_no , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* column is only valid if an input buffer exists. */
+ if (! YY_CURRENT_BUFFER )
+ yy_fatal_error( "fts0tset_column called with no buffer" , yyscanner);
+
+ yycolumn = column_no;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ * @param yyscanner The scanner object.
+ * @see fts0t_switch_to_buffer
+ */
+void fts0tset_in (FILE * in_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyin = in_str ;
+}
+
+void fts0tset_out (FILE * out_str , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yyout = out_str ;
+}
+
+int fts0tget_debug (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ return yy_flex_debug;
+}
+
+void fts0tset_debug (int bdebug , yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ yy_flex_debug = bdebug ;
+}
+
+/* Accessor methods for yylval and yylloc */
+
+/* User-visible API */
+
+/* fts0tlex_init is special because it creates the scanner itself, so it is
+ * the ONLY reentrant function that doesn't take the scanner as the last argument.
+ * That's why we explicitly handle the declaration, instead of using our macros.
+ */
+
+int fts0tlex_init(yyscan_t* ptr_yy_globals)
+{
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), NULL );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+/* fts0tlex_init_extra has the same functionality as fts0tlex_init, but follows the
+ * convention of taking the scanner as the last argument. Note however, that
+ * this is a *pointer* to a scanner, as it will be allocated by this call (and
+ * is the reason, too, why this function also must handle its own declaration).
+ * The user defined value in the first argument will be available to fts0talloc in
+ * the yyextra field.
+ */
+
+int fts0tlex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_globals )
+{
+ struct yyguts_t dummy_yyguts;
+
+ fts0tset_extra (yy_user_defined, &dummy_yyguts);
+
+ if (ptr_yy_globals == NULL){
+ errno = EINVAL;
+ return 1;
+ }
+
+ *ptr_yy_globals = (yyscan_t) fts0talloc ( sizeof( struct yyguts_t ), &dummy_yyguts );
+
+ if (*ptr_yy_globals == NULL){
+ errno = ENOMEM;
+ return 1;
+ }
+
+ /* By setting to 0xAA, we expose bugs in
+ yy_init_globals. Leave at 0x00 for releases. */
+ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t));
+
+ fts0tset_extra (yy_user_defined, *ptr_yy_globals);
+
+ return yy_init_globals ( *ptr_yy_globals );
+}
+
+static int yy_init_globals (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from fts0tlex_destroy(), so don't allocate here.
+ */
+
+ yyg->yy_buffer_stack = 0;
+ yyg->yy_buffer_stack_top = 0;
+ yyg->yy_buffer_stack_max = 0;
+ yyg->yy_c_buf_p = (char *) 0;
+ yyg->yy_init = 0;
+ yyg->yy_start = 0;
+
+ yyg->yy_start_stack_ptr = 0;
+ yyg->yy_start_stack_depth = 0;
+ yyg->yy_start_stack = NULL;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = (FILE *) 0;
+ yyout = (FILE *) 0;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * fts0tlex_init()
+ */
+ return 0;
+}
+
+/* fts0tlex_destroy is for both reentrant and non-reentrant scanners. */
+int fts0tlex_destroy (yyscan_t yyscanner)
+{
+ struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ fts0t_delete_buffer(YY_CURRENT_BUFFER ,yyscanner );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ fts0tpop_buffer_state(yyscanner);
+ }
+
+ /* Destroy the stack itself. */
+ fts0tfree(yyg->yy_buffer_stack ,yyscanner);
+ yyg->yy_buffer_stack = NULL;
+
+ /* Destroy the start condition stack. */
+ fts0tfree(yyg->yy_start_stack ,yyscanner );
+ yyg->yy_start_stack = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * fts0tlex() is called, initialization will occur. */
+ yy_init_globals( yyscanner);
+
+ /* Destroy the main struct (reentrant only). */
+ fts0tfree ( yyscanner , yyscanner );
+ yyscanner = NULL;
+ return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n , yyscan_t yyscanner __attribute__((unused)))
+{
+ register int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s , yyscan_t yyscanner __attribute__((unused)))
+{
+ register int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+void *fts0talloc (yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+ return (void *) malloc( size );
+}
+
+void *fts0trealloc (void * ptr, yy_size_t size , yyscan_t yyscanner __attribute__((unused)))
+{
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return (void *) realloc( (char *) ptr, size );
+}
+
+void fts0tfree (void * ptr , yyscan_t yyscanner __attribute__((unused)))
+{
+ free( (char *) ptr ); /* see fts0trealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 68 "fts0tlex.l"
+
+
+
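[Editor's note: for orientation, the reentrant scanner above is typically driven as
sketched below. This is not code from the patch: the query text and loop structure are
illustrative, and the loop relies on flex's default behaviour of returning 0 at end of
input. fts_tlexer is the entry point declared via YY_DECL in fts0tlex.l, which follows.]

    /* Minimal sketch: tokenizing a query with the reentrant scanner. */
    yyscan_t        scanner;
    YYSTYPE         val;            /* token value union from fts0pars.h */

    if (fts0tlex_init(&scanner) == 0) {
            YY_BUFFER_STATE buf;
            int             token;

            buf = fts0t_scan_string("\"ice cream\" cone", scanner);

            while ((token = fts_tlexer(&val, scanner)) != 0) {
                    /* token is FTS_TEXT, FTS_TERM or an operator char */
            }

            fts0t_delete_buffer(buf, scanner);
            fts0tlex_destroy(scanner);
    }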
diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l
new file mode 100644
index 00000000000..4f55a83afe5
--- /dev/null
+++ b/storage/innobase/fts/fts0tlex.l
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0tlex.l
+ * FTS parser lexical analyzer
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+
+#include "fts0ast.h"
+#include "fts0pars.h"
+
+/* Required for reentrant parser */
+#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner)
+
+%}
+
+%option noinput
+%option nounput
+%option noyywrap
+%option nostdinit
+%option reentrant
+%option never-interactive
+
+
+%%
+
+[\t ]+ /* Ignore whitespace */ ;
+
+[*] {
+ val->oper = fts0tget_text(yyscanner)[0];
+
+ return(val->oper);
+}
+
+\"[^\"\n]*\" {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TEXT);
+}
+
+[^" \n\%]* {
+ val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner));
+
+ return(FTS_TERM);
+}
+. ;
+\n
+
+%%
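[Editor's note: a worked example of the rules above; the input is invented, and note
that flex's longest-match rule decides ties between the patterns.]

    /* Sketch of the token stream for the input:  "ice cream" cone
     *
     *   "ice cream"  ->  FTS_TEXT   (the quotes are part of the matched text)
     *   cone         ->  FTS_TERM
     *
     * A lone '*' is returned as the operator character itself; whitespace
     * and newlines are discarded, and '%' never appears inside an FTS_TERM
     * because it is excluded by the [^" \n\%]* character class.
     */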
diff --git a/storage/innobase/fts/make_parser.sh b/storage/innobase/fts/make_parser.sh
new file mode 100755
index 00000000000..2c072914c8b
--- /dev/null
+++ b/storage/innobase/fts/make_parser.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+
+TMPF=t.$$
+
+make -f Makefile.query
+
+echo '#include "univ.i"' > $TMPF
+
+# This is to avoid compiler warnings about unused parameters.
+# FIXME: the gcc extension "__attribute__" causes compilation errors on the
+# Windows platform. Quote them out for now.
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+' < fts0blex.cc >> $TMPF
+
+mv $TMPF fts0blex.cc
+
+echo '#include "univ.i"' > $TMPF
+
+sed -e '
+s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 __attribute__((unused))/;
+' < fts0tlex.cc >> $TMPF
+
+mv $TMPF fts0tlex.cc
diff --git a/storage/innobase/fut/fut0fut.cc b/storage/innobase/fut/fut0fut.cc
new file mode 100644
index 00000000000..9bb1c512182
--- /dev/null
+++ b/storage/innobase/fut/fut0fut.cc
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0fut.cc
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+
+#ifdef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
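[Editor's note: fut0fut.h/.ic provide little more than fut_get_ptr(), which
buffer-fixes the page named by a fil_addr_t and returns a pointer to the byte offset
within it; the list code below leans on it heavily. A hedged sketch of the call shape
follows — the address values are invented, and space, zip_size and mtr are assumed to
be set up by the caller.]

    /* Sketch: resolving a file address to an in-buffer pointer. */
    fil_addr_t      addr;           /* addr.page, addr.boffset */
    byte*           ptr;

    addr.page    = 3;               /* illustrative page number */
    addr.boffset = 38;              /* illustrative byte offset */

    ptr = fut_get_ptr(space, zip_size, addr, RW_X_LATCH, &mtr);
    /* the page stays latched until mtr_commit(&mtr) */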
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
new file mode 100644
index 00000000000..8f96a6426d2
--- /dev/null
+++ b/storage/innobase/fut/fut0lst.cc
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0lst.cc
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0lst.h"
+
+#ifdef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+
+/********************************************************************//**
+Adds a node to an empty list. */
+static
+void
+flst_add_to_empty(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of
+ empty list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ ut_a(len == 0);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* Update first and last fields of base node */
+ flst_write_addr(base + FLST_FIRST, node_addr, mtr);
+ flst_write_addr(base + FLST_LAST, node_addr, mtr);
+
+ /* Set prev and next fields of node to add */
+ flst_write_addr(node + FLST_PREV, fil_addr_null, mtr);
+ flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr);
+
+ /* Update len of base node */
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+ fil_addr_t last_addr;
+ flst_node_t* last_node;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ last_addr = flst_get_last(base, mtr);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* If the list is not empty, call flst_insert_after */
+ if (len != 0) {
+ if (last_addr.page == node_addr.page) {
+ last_node = page_align(node) + last_addr.boffset;
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ last_node = fut_get_ptr(space, zip_size, last_addr,
+ RW_X_LATCH, mtr);
+ }
+
+ flst_insert_after(base, last_node, node, mtr);
+ } else {
+ /* else call flst_add_to_empty */
+ flst_add_to_empty(base, node, mtr);
+ }
+}
+
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+ fil_addr_t first_addr;
+ flst_node_t* first_node;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ first_addr = flst_get_first(base, mtr);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* If the list is not empty, call flst_insert_before */
+ if (len != 0) {
+ if (first_addr.page == node_addr.page) {
+ first_node = page_align(node) + first_addr.boffset;
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ first_node = fut_get_ptr(space, zip_size, first_addr,
+ RW_X_LATCH, mtr);
+ }
+
+ flst_insert_before(base, node, first_node, mtr);
+ } else {
+ /* else call flst_add_to_empty */
+ flst_add_to_empty(base, node, mtr);
+ }
+}
+
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node1, /*!< in: node to insert after */
+ flst_node_t* node2, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ flst_node_t* node3;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node1 && node2 && base);
+ ut_ad(base != node1);
+ ut_ad(base != node2);
+ ut_ad(node2 != node1);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node1, &space, &node1_addr);
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ node3_addr = flst_get_next_addr(node1, mtr);
+
+ /* Set prev and next fields of node2 */
+ flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+ flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+ if (!fil_addr_is_null(node3_addr)) {
+ /* Update prev field of node3 */
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ node3 = fut_get_ptr(space, zip_size,
+ node3_addr, RW_X_LATCH, mtr);
+ flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+ } else {
+ /* node1 was last in list: update last field in base */
+ flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+ }
+
+ /* Set next field of node1 */
+ flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to insert */
+ flst_node_t* node3, /*!< in: node to insert before */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && node3 && base);
+ ut_ad(base != node2);
+ ut_ad(base != node3);
+ ut_ad(node2 != node3);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node3, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+ buf_ptr_get_fsp_addr(node3, &space, &node3_addr);
+
+ node1_addr = flst_get_prev_addr(node3, mtr);
+
+ /* Set prev and next fields of node2 */
+ flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+ flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+ ulint zip_size = fil_space_get_zip_size(space);
+ /* Update next field of node1 */
+ node1 = fut_get_ptr(space, zip_size, node1_addr,
+ RW_X_LATCH, mtr);
+ flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+ } else {
+ /* node3 was first in list: update first field in base */
+ flst_write_addr(base + FLST_FIRST, node2_addr, mtr);
+ }
+
+ /* Set prev field of node3 */
+ flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to remove */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ ulint zip_size;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ flst_node_t* node3;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+ zip_size = fil_space_get_zip_size(space);
+
+ node1_addr = flst_get_prev_addr(node2, mtr);
+ node3_addr = flst_get_next_addr(node2, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+
+ /* Update next field of node1 */
+
+ if (node1_addr.page == node2_addr.page) {
+
+ node1 = page_align(node2) + node1_addr.boffset;
+ } else {
+ node1 = fut_get_ptr(space, zip_size,
+ node1_addr, RW_X_LATCH, mtr);
+ }
+
+ ut_ad(node1 != node2);
+
+ flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr);
+ } else {
+ /* node2 was first in list: update first field in base */
+ flst_write_addr(base + FLST_FIRST, node3_addr, mtr);
+ }
+
+ if (!fil_addr_is_null(node3_addr)) {
+ /* Update prev field of node3 */
+
+ if (node3_addr.page == node2_addr.page) {
+
+ node3 = page_align(node2) + node3_addr.boffset;
+ } else {
+ node3 = fut_get_ptr(space, zip_size,
+ node3_addr, RW_X_LATCH, mtr);
+ }
+
+ ut_ad(node2 != node3);
+
+ flst_write_addr(node3 + FLST_PREV, node1_addr, mtr);
+ } else {
+ /* node2 was last in list: update last field in base */
+ flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+ }
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len > 0);
+
+ mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(n_nodes > 0);
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ node1_addr = flst_get_prev_addr(node2, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+
+ /* Update next field of node1 */
+
+ if (node1_addr.page == node2_addr.page) {
+
+ node1 = page_align(node2) + node1_addr.boffset;
+ } else {
+ node1 = fut_get_ptr(space,
+ fil_space_get_zip_size(space),
+ node1_addr, RW_X_LATCH, mtr);
+ }
+
+ flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr);
+ } else {
+ /* node2 was first in list: update the field in base */
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ }
+
+ flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len >= n_nodes);
+
+ mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node not to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fil_addr_t node2_addr;
+ ulint len;
+ ulint space;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ if (n_nodes == 0) {
+
+ ut_ad(fil_addr_is_null(flst_get_next_addr(node2, mtr)));
+
+ return;
+ }
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ /* Update next field of node2 */
+ flst_write_addr(node2 + FLST_NEXT, fil_addr_null, mtr);
+
+ flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len >= n_nodes);
+
+ mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Validates a file-based list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr1) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ const flst_node_t* node;
+ fil_addr_t node_addr;
+ fil_addr_t base_addr;
+ ulint len;
+ ulint i;
+ mtr_t mtr2;
+
+ ut_ad(base);
+ ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX));
+
+ /* We use two mini-transaction handles: the first is used to
+ lock the base node, and prevent other threads from modifying the
+ list. The second is used to traverse the list. We cannot run the
+ second mtr without committing it at times, because if the list
+	is long, then the x-locked pages could fill the buffer pool,
+	resulting in a deadlock. */
+
+ /* Find out the space id */
+ buf_ptr_get_fsp_addr(base, &space, &base_addr);
+ zip_size = fil_space_get_zip_size(space);
+
+ len = flst_get_len(base, mtr1);
+ node_addr = flst_get_first(base, mtr1);
+
+ for (i = 0; i < len; i++) {
+ mtr_start(&mtr2);
+
+ node = fut_get_ptr(space, zip_size,
+ node_addr, RW_X_LATCH, &mtr2);
+ node_addr = flst_get_next_addr(node, &mtr2);
+
+		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent the
+				   buffer pool from becoming full */
+ }
+
+ ut_a(fil_addr_is_null(node_addr));
+
+ node_addr = flst_get_last(base, mtr1);
+
+ for (i = 0; i < len; i++) {
+ mtr_start(&mtr2);
+
+ node = fut_get_ptr(space, zip_size,
+ node_addr, RW_X_LATCH, &mtr2);
+ node_addr = flst_get_prev_addr(node, &mtr2);
+
+		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent the
+				   buffer pool from becoming full */
+ }
+
+ ut_a(fil_addr_is_null(node_addr));
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const buf_frame_t* frame;
+ ulint len;
+
+ ut_ad(base && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ frame = page_align((byte*) base);
+
+ len = flst_get_len(base, mtr);
+
+ fprintf(stderr,
+ "FILE-BASED LIST:\n"
+ "Base node in space %lu page %lu byte offset %lu; len %lu\n",
+ (ulong) page_get_space_id(frame),
+ (ulong) page_get_page_no(frame),
+ (ulong) page_offset(base), (ulong) len);
+}
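[Editor's note: to make the layout above concrete: a base node holds FLST_LEN,
FLST_FIRST and FLST_LAST, and each list node holds FLST_PREV and FLST_NEXT, all
updated through flst_write_addr()/mlog_write_ulint() so every change is redo-logged.
A hedged usage sketch follows; the page numbers and byte offsets (base_page_no,
base_boffset, etc.) are invented for illustration.]

    /* Sketch: appending a node to a file-based list inside one mtr. */
    mtr_t           mtr;
    buf_block_t*    base_block;
    buf_block_t*    node_block;

    mtr_start(&mtr);

    /* Both pages must be x-latched by this mtr (see the ut_ad checks). */
    base_block = buf_page_get(space, zip_size, base_page_no,
                              RW_X_LATCH, &mtr);
    node_block = buf_page_get(space, zip_size, node_page_no,
                              RW_X_LATCH, &mtr);

    flst_add_last(buf_block_get_frame(base_block) + base_boffset,
                  buf_block_get_frame(node_block) + node_boffset,
                  &mtr);

    ut_ad(flst_validate(buf_block_get_frame(base_block) + base_boffset,
                        &mtr));

    mtr_commit(&mtr);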
diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc
new file mode 100644
index 00000000000..9b4c837ef85
--- /dev/null
+++ b/storage/innobase/ha/ha0ha.cc
@@ -0,0 +1,498 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ha/ha0ha.cc
+The hash table with external chains
+
+Created 8/22/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ha0ha.h"
+#ifdef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+# include "buf0buf.h"
+#endif /* UNIV_DEBUG */
+# include "btr0sea.h"
+#include "page0page.h"
+
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+ ulint n, /*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level, /*!< in: level of the mutexes or rw_locks
+ in the latching order: this is used in the
+ debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_sync_obj, /*!< in: number of mutexes or rw_locks
+ to protect the hash table: must be a
+ power of 2, or 0 */
+ ulint type) /*!< in: type of datastructure for which
+ the memory heap is going to be used e.g.:
+ MEM_HEAP_FOR_BTR_SEARCH or
+ MEM_HEAP_FOR_PAGE_HASH */
+{
+ hash_table_t* table;
+ ulint i;
+
+ ut_a(type == MEM_HEAP_FOR_BTR_SEARCH
+ || type == MEM_HEAP_FOR_PAGE_HASH);
+
+ ut_ad(ut_is_2pow(n_sync_obj));
+ table = hash_create(n);
+
+ /* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
+	but in practice it never should in this case, hence the asserts. */
+
+ if (n_sync_obj == 0) {
+ table->heap = mem_heap_create_typed(
+ ut_min(4096, MEM_MAX_ALLOC_IN_BUF), type);
+ ut_a(table->heap);
+
+ return(table);
+ }
+
+ if (type == MEM_HEAP_FOR_PAGE_HASH) {
+ /* We create a hash table protected by rw_locks for
+ buf_pool->page_hash. */
+ hash_create_sync_obj(table, HASH_TABLE_SYNC_RW_LOCK,
+ n_sync_obj, sync_level);
+ } else {
+ hash_create_sync_obj(table, HASH_TABLE_SYNC_MUTEX,
+ n_sync_obj, sync_level);
+ }
+
+ table->heaps = static_cast<mem_heap_t**>(
+ mem_alloc(n_sync_obj * sizeof(void*)));
+
+ for (i = 0; i < n_sync_obj; i++) {
+ table->heaps[i] = mem_heap_create_typed(4096, type);
+ ut_a(table->heaps[i]);
+ }
+
+ return(table);
+}
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+ hash_table_t* table) /*!< in, own: hash table */
+{
+ ulint i;
+ ulint n;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!table->adaptive
+ || rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Free the memory heaps. */
+ n = table->n_sync_obj;
+
+ for (i = 0; i < n; i++) {
+ mem_heap_free(table->heaps[i]);
+ }
+
+ if (table->heaps) {
+ mem_free(table->heaps);
+ }
+
+ switch (table->type) {
+ case HASH_TABLE_SYNC_MUTEX:
+ mem_free(table->sync_obj.mutexes);
+ table->sync_obj.mutexes = NULL;
+ break;
+
+ case HASH_TABLE_SYNC_RW_LOCK:
+ mem_free(table->sync_obj.rw_locks);
+ table->sync_obj.rw_locks = NULL;
+ break;
+
+ case HASH_TABLE_SYNC_NONE:
+ /* do nothing */
+ break;
+ }
+
+ table->n_sync_obj = 0;
+ table->type = HASH_TABLE_SYNC_NONE;
+
+
+ /* Clear the hash table. */
+ n = hash_get_n_cells(table);
+
+ for (i = 0; i < n; i++) {
+ hash_get_nth_cell(table, i)->node = NULL;
+ }
+}
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted. If btr_search_enabled is set to FALSE, we will only allow
+updating existing nodes, but no new node is allowed to be added.
+@return TRUE if succeed, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data) /*!< in: data, must not be NULL */
+{
+ hash_cell_t* cell;
+ ha_node_t* node;
+ ha_node_t* prev_node;
+ ulint hash;
+
+ ut_ad(data);
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ hash_assert_can_modify(table, fold);
+ ut_ad(btr_search_enabled);
+
+ hash = hash_calc_hash(fold, table);
+
+ cell = hash_get_nth_cell(table, hash);
+
+ prev_node = static_cast<ha_node_t*>(cell->node);
+
+ while (prev_node != NULL) {
+ if (prev_node->fold == fold) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (table->adaptive) {
+ buf_block_t* prev_block = prev_node->block;
+ ut_a(prev_block->frame
+ == page_align(prev_node->data));
+ ut_a(prev_block->n_pointers > 0);
+ prev_block->n_pointers--;
+ block->n_pointers++;
+ }
+
+ prev_node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ prev_node->data = data;
+
+ return(TRUE);
+ }
+
+ prev_node = prev_node->next;
+ }
+
+ /* We have to allocate a new chain node */
+
+ node = static_cast<ha_node_t*>(
+ mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)));
+
+ if (node == NULL) {
+ /* It was a btr search type memory heap and at the moment
+ no more memory could be allocated: return */
+
+ ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH);
+
+ return(FALSE);
+ }
+
+ ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (table->adaptive) {
+ block->n_pointers++;
+ }
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ node->fold = fold;
+
+ node->next = NULL;
+
+ prev_node = static_cast<ha_node_t*>(cell->node);
+
+ if (prev_node == NULL) {
+
+ cell->node = node;
+
+ return(TRUE);
+ }
+
+ while (prev_node->next != NULL) {
+
+ prev_node = prev_node->next;
+ }
+
+ prev_node->next = node;
+
+ return(TRUE);
+}
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ha_node_t* del_node) /*!< in: node to be deleted */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (table->adaptive) {
+		ut_a(del_node->block->frame == page_align(del_node->data));
+ ut_a(del_node->block->n_pointers > 0);
+ del_node->block->n_pointers--;
+ }
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and updates
+the pointer to data, if found.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+ha_search_and_update_if_found_func(
+/*===============================*/
+ hash_table_t* table, /*!< in/out: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data, /*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* new_data)/*!< in: new pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ hash_assert_can_modify(table, fold);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!btr_search_enabled) {
+ return(FALSE);
+ }
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (table->adaptive) {
+ ut_a(node->block->n_pointers > 0);
+ node->block->n_pointers--;
+ new_block->n_pointers++;
+ }
+
+ node->block = new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = new_data;
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: fold value */
+ const page_t* page) /*!< in: buffer page */
+{
+ ha_node_t* node;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ hash_assert_can_modify(table, fold);
+ ut_ad(btr_search_enabled);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (page_align(ha_node_get_data(node)) == page) {
+
+ /* Remove the hash node */
+
+ ha_delete_hash_node(table, node);
+
+ /* Start again from the first node in the chain
+ because the deletion may compact the heap of
+ nodes and move other nodes! */
+
+ node = ha_chain_get_first(table, fold);
+ } else {
+ node = ha_chain_get_next(node);
+ }
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ ut_a(page_align(ha_node_get_data(node)) != page);
+
+ node = ha_chain_get_next(node);
+ }
+#endif
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint start_index, /*!< in: start index */
+ ulint end_index) /*!< in: end index */
+{
+ ibool ok = TRUE;
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_a(start_index <= end_index);
+ ut_a(start_index < hash_get_n_cells(table));
+ ut_a(end_index < hash_get_n_cells(table));
+
+ for (i = start_index; i <= end_index; i++) {
+ ha_node_t* node;
+ hash_cell_t* cell;
+
+ cell = hash_get_nth_cell(table, i);
+
+ for (node = static_cast<ha_node_t*>(cell->node);
+ node != 0;
+ node = node->next) {
+
+ if (hash_calc_hash(node->fold, table) != i) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: Error: hash table node"
+ " fold value %lu does not\n"
+ "InnoDB: match the cell number %lu.\n",
+ (ulong) node->fold, (ulong) i);
+
+ ok = FALSE;
+ }
+ }
+ }
+
+ return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ hash_table_t* table) /*!< in: hash table */
+{
+#ifdef UNIV_DEBUG
+/* Some of the code here is disabled for performance reasons in production
+builds, see http://bugs.mysql.com/36941 */
+#define PRINT_USED_CELLS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_USED_CELLS
+ hash_cell_t* cell;
+ ulint cells = 0;
+ ulint i;
+#endif /* PRINT_USED_CELLS */
+ ulint n_bufs;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef PRINT_USED_CELLS
+ for (i = 0; i < hash_get_n_cells(table); i++) {
+
+ cell = hash_get_nth_cell(table, i);
+
+ if (cell->node) {
+
+ cells++;
+ }
+ }
+#endif /* PRINT_USED_CELLS */
+
+ fprintf(file, "Hash table size %lu",
+ (ulong) hash_get_n_cells(table));
+
+#ifdef PRINT_USED_CELLS
+ fprintf(file, ", used cells %lu", (ulong) cells);
+#endif /* PRINT_USED_CELLS */
+
+ if (table->heaps == NULL && table->heap != NULL) {
+
+ /* This calculation is intended for the adaptive hash
+	index: how many buffer frames have we reserved? */
+
+ n_bufs = UT_LIST_GET_LEN(table->heap->base) - 1;
+
+ if (table->heap->free_block) {
+ n_bufs++;
+ }
+
+ fprintf(file, ", node heap has %lu buffer(s)\n",
+ (ulong) n_bufs);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
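[Editor's note: the update-in-place behaviour of ha_insert_for_fold_func() is worth
spelling out: a second insert with the same fold value does not grow the chain, it
redirects the existing node. A hedged sketch follows, assuming the ha_insert_for_fold()
convenience macro from ha0ha.h; id, block, rec1 and rec2 are illustrative names for a
key, an x-latched buffer block, and two records on it.]

    /* Sketch: same fold, two inserts -> one node, pointer updated. */
    ulint   fold = ut_fold_ull(id);             /* any fold function */

    ha_insert_for_fold(table, fold, block, rec1);  /* creates a node */
    ha_insert_for_fold(table, fold, block, rec2);  /* updates that node */

    /* If the MEM_HEAP_BTR_SEARCH heap is exhausted, a call can instead
    return FALSE; callers treat that as a soft failure. */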
diff --git a/storage/innobase/ha/ha0storage.cc b/storage/innobase/ha/ha0storage.cc
new file mode 100644
index 00000000000..6820591f316
--- /dev/null
+++ b/storage/innobase/ha/ha0storage.cc
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/ha0storage.cc
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+/*******************************************************************//**
+Retrieves data from the storage. If it is present, a pointer to the
+stored copy of the data is returned; otherwise NULL is returned. */
+static
+const void*
+ha_storage_get(
+/*===========*/
+ ha_storage_t* storage, /*!< in: hash storage */
+ const void* data, /*!< in: data to check for */
+ ulint data_len) /*!< in: data length */
+{
+ ha_storage_node_t* node;
+ ulint fold;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
+ macro */
+ fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+#define IS_FOUND \
+ node->data_len == data_len && memcmp(node->data, data, data_len) == 0
+
+ HASH_SEARCH(
+ next, /* node->"next" */
+ storage->hash, /* the hash table */
+ fold, /* key */
+ ha_storage_node_t*, /* type of node->next */
+ node, /* auxiliary variable */
+ , /* assertion */
+ IS_FOUND); /* search criteria */
+
+ if (node == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(node->data);
+}
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit". */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim) /*!< in: memory limit to obey */
+{
+ void* raw;
+ ha_storage_node_t* node;
+ const void* data_copy;
+ ulint fold;
+
+ /* check if data chunk is already present */
+ data_copy = ha_storage_get(storage, data, data_len);
+ if (data_copy != NULL) {
+
+ return(data_copy);
+ }
+
+ /* not present */
+
+ /* check if we are allowed to allocate data_len bytes */
+ if (memlim > 0
+ && ha_storage_get_size(storage) + data_len > memlim) {
+
+ return(NULL);
+ }
+
+ /* we put the auxiliary node struct and the data itself in one
+	contiguous block */
+ raw = mem_heap_alloc(storage->heap,
+ sizeof(ha_storage_node_t) + data_len);
+
+ node = (ha_storage_node_t*) raw;
+ data_copy = (byte*) raw + sizeof(*node);
+
+ memcpy((byte*) raw + sizeof(*node), data, data_len);
+
+ node->data_len = data_len;
+ node->data = data_copy;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+ macro */
+ fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+ HASH_INSERT(
+ ha_storage_node_t, /* type used in the hash chain */
+ next, /* node->"next" */
+ storage->hash, /* the hash table */
+ fold, /* key */
+ node); /* add this data to the hash */
+
+ /* the output should not be changed because it will spoil the
+ hash table */
+ return(data_copy);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ha_storage()
+{
+ ha_storage_t* storage;
+ char buf[1024];
+ int i;
+ const void* stored[256];
+ const void* p;
+
+ storage = ha_storage_create(0, 0);
+
+ for (i = 0; i < 256; i++) {
+
+ memset(buf, i, sizeof(buf));
+ stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+ }
+
+ //ha_storage_empty(&storage);
+
+ for (i = 255; i >= 0; i--) {
+
+ memset(buf, i, sizeof(buf));
+ p = ha_storage_put(storage, buf, sizeof(buf));
+
+ if (p != stored[i]) {
+
+ fprintf(stderr, "ha_storage_put() returned %p "
+ "instead of %p, i=%d\n", p, stored[i], i);
+ return;
+ }
+ }
+
+ fprintf(stderr, "all ok\n");
+
+ ha_storage_free(storage);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
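[Editor's note: beyond the self-test above, the memlim parameter gives callers a hard
cap on the storage's growth. A hedged sketch of interning a byte chunk under a 64 KiB
budget follows; name and len are illustrative.]

    /* Sketch: deduplicating interning with a memory cap. */
    ha_storage_t*   storage = ha_storage_create(0, 0);   /* defaults */
    const void*     copy;

    copy = ha_storage_put_memlim(storage, name, len, 64 * 1024);

    if (copy == NULL) {
            /* adding 'len' bytes would exceed the cap: not stored */
    }

    ha_storage_free(storage);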
diff --git a/storage/innobase/ha/hash0hash.cc b/storage/innobase/ha/hash0hash.cc
new file mode 100644
index 00000000000..174b6bcb57e
--- /dev/null
+++ b/storage/innobase/ha/hash0hash.cc
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/hash0hash.cc
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "hash0hash.h"
+#ifdef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#include "mem0mem.h"
+
+#ifndef UNIV_HOTBACKUP
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t hash_table_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t hash_table_rw_lock_key;
+# endif /* UNIV_PFS_RWLOCK */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ mutex_enter(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ mutex_exit(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ mutex_enter(table->sync_obj.mutexes + i);
+ }
+}
+
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ mutex_exit(table->sync_obj.mutexes + i);
+ }
+}
+
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ib_mutex_t* keep_mutex) /*!< in: mutex to keep */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ ib_mutex_t* mutex = table->sync_obj.mutexes + i;
+ if (UNIV_LIKELY(keep_mutex != mutex)) {
+ mutex_exit(mutex);
+ }
+ }
+
+ ut_ad(mutex_own(keep_mutex));
+}
+
+/************************************************************//**
+s-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+
+ rw_lock_t* lock = hash_get_lock(table, fold);
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(lock);
+}
+
+/************************************************************//**
+x-lock a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+
+ rw_lock_t* lock = hash_get_lock(table, fold);
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_x_lock(lock);
+}
+
+/************************************************************//**
+unlock an s-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+
+ rw_lock_t* lock = hash_get_lock(table, fold);
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_unlock(lock);
+}
+
+/************************************************************//**
+unlock x-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ rw_lock_t* lock = hash_get_lock(table, fold);
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(lock);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_x_unlock(lock);
+}
+
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_x_lock(lock);
+ }
+}
+
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_x_unlock(lock);
+ }
+}
+
+/************************************************************//**
+Releases all but the passed-in lock of a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+ hash_table_t* table, /*!< in: hash table */
+ rw_lock_t* keep_lock) /*!< in: lock to keep */
+{
+ ulint i;
+
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ for (i = 0; i < table->n_sync_obj; i++) {
+
+ rw_lock_t* lock = table->sync_obj.rw_locks + i;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (UNIV_LIKELY(keep_lock != lock)) {
+ rw_lock_x_unlock(lock);
+ }
+ }
+}
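+
+/* Illustrative sketch (not compiled in): a caller that must keep one
+latch while releasing the rest can combine hash_lock_x_all() with
+hash_unlock_x_all_but(); "table" and "fold" are assumed inputs.
+
+	rw_lock_t*	keep = hash_get_lock(table, fold);
+
+	hash_lock_x_all(table);
+	... operate on the whole table ...
+	hash_unlock_x_all_but(table, keep);
+*/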
+
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+ ulint n) /*!< in: number of array cells */
+{
+ hash_cell_t* array;
+ ulint prime;
+ hash_table_t* table;
+
+ prime = ut_find_prime(n);
+
+ table = static_cast<hash_table_t*>(mem_alloc(sizeof(hash_table_t)));
+
+ array = static_cast<hash_cell_t*>(
+ ut_malloc(sizeof(hash_cell_t) * prime));
+
+ /* The default type of hash_table is HASH_TABLE_SYNC_NONE i.e.:
+ the caller is responsible for access control to the table. */
+ table->type = HASH_TABLE_SYNC_NONE;
+ table->array = array;
+ table->n_cells = prime;
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ table->n_sync_obj = 0;
+ table->sync_obj.mutexes = NULL;
+ table->heaps = NULL;
+#endif /* !UNIV_HOTBACKUP */
+ table->heap = NULL;
+ ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+ /* Initialize the cell array */
+ hash_table_clear(table);
+
+ return(table);
+}
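+
+/* Illustrative sketch (not compiled in): create a table with at least
+1000 cells (the actual count is the next prime above 1000) and free it
+when done; under the default HASH_TABLE_SYNC_NONE the caller owns all
+access control.
+
+	hash_table_t*	table = hash_create(1000);
+	... insert into and search the table ...
+	hash_table_free(table);
+*/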
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table) /*!< in, own: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ ut_free(table->array);
+ mem_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depending on the type of
+hash table. */
+UNIV_INTERN
+void
+hash_create_sync_obj_func(
+/*======================*/
+ hash_table_t* table, /*!< in: hash table */
+ enum hash_table_sync_t type, /*!< in: HASH_TABLE_SYNC_MUTEX
+ or HASH_TABLE_SYNC_RW_LOCK */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level,/*!< in: latching order level
+ of the mutexes: used in the
+ debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_sync_obj)/*!< in: number of sync objects,
+ must be a power of 2 */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_a(n_sync_obj > 0);
+ ut_a(ut_is_2pow(n_sync_obj));
+
+ table->type = type;
+
+ switch (type) {
+ case HASH_TABLE_SYNC_MUTEX:
+ table->sync_obj.mutexes = static_cast<ib_mutex_t*>(
+ mem_alloc(n_sync_obj * sizeof(ib_mutex_t)));
+
+ for (i = 0; i < n_sync_obj; i++) {
+ mutex_create(hash_table_mutex_key,
+ table->sync_obj.mutexes + i, sync_level);
+ }
+
+ break;
+
+ case HASH_TABLE_SYNC_RW_LOCK:
+ table->sync_obj.rw_locks = static_cast<rw_lock_t*>(
+ mem_alloc(n_sync_obj * sizeof(rw_lock_t)));
+
+ for (i = 0; i < n_sync_obj; i++) {
+ rw_lock_create(hash_table_rw_lock_key,
+ table->sync_obj.rw_locks + i, sync_level);
+ }
+
+ break;
+
+ case HASH_TABLE_SYNC_NONE:
+ ut_error;
+ }
+
+ table->n_sync_obj = n_sync_obj;
+}
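+
+/* Illustrative sketch (not compiled in), assuming the usual
+hash_create_sync_obj() wrapper around this function and a suitable
+latching-order level (here called "some_sync_level", a placeholder):
+protect a table with 4 rw-locks (a power of 2), so that each fold value
+maps to one of the 4 locks.
+
+	hash_create_sync_obj(table, HASH_TABLE_SYNC_RW_LOCK,
+			     some_sync_level, 4);
+*/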
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/ha_innodb.def b/storage/innobase/ha_innodb.def
new file mode 100644
index 00000000000..e0faa62deb1
--- /dev/null
+++ b/storage/innobase/ha_innodb.def
@@ -0,0 +1,4 @@
+EXPORTS
+ _mysql_plugin_interface_version_
+ _mysql_sizeof_struct_st_plugin_
+ _mysql_plugin_declarations_
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
new file mode 100644
index 00000000000..b8f6351ae27
--- /dev/null
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -0,0 +1,17176 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2012, Facebook Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include <sql_table.h> // explain_filename, nz2, EXPLAIN_PARTITIONS_AS_COMMENT,
+ // EXPLAIN_FILENAME_MAX_EXTRA_LENGTH
+
+#include <sql_acl.h> // PROCESS_ACL
+#include <debug_sync.h> // DEBUG_SYNC
+#include <my_base.h> // HA_OPTION_*
+#include <mysys_err.h>
+#include <mysql/innodb_priv.h>
+#include <my_check_opt.h>
+/** @file ha_innodb.cc */
+
+/* Include necessary InnoDB headers */
+#include "univ.i"
+#include "buf0dump.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "buf0dblwr.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "trx0roll.h"
+#include "trx0trx.h"
+
+#include "trx0sys.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "log0log.h"
+#include "lock0lock.h"
+#include "dict0crea.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "fsp0fsp.h"
+#include "sync0sync.h"
+#include "fil0fil.h"
+#include "trx0xa.h"
+#include "row0merge.h"
+#include "dict0boot.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "ha_prototypes.h"
+#include "ut0mem.h"
+#include "ibuf0ibuf.h"
+#include "dict0dict.h"
+#include "srv0mon.h"
+#include "api0api.h"
+#include "api0misc.h"
+#include "pars0pars.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "row0import.h"
+#include "row0quiesce.h"
+#ifdef UNIV_DEBUG
+#include "trx0purge.h"
+#endif /* UNIV_DEBUG */
+#include "fts0priv.h"
+#include "page0zip.h"
+
+enum_tx_isolation thd_get_trx_isolation(const THD* thd);
+
+#include "ha_innodb.h"
+#include "i_s.h"
+
+# ifndef MYSQL_PLUGIN_IMPORT
+# define MYSQL_PLUGIN_IMPORT /* nothing */
+# endif /* MYSQL_PLUGIN_IMPORT */
+
+/** to protect innobase_open_files */
+static mysql_mutex_t innobase_share_mutex;
+/** to force correct commit order in binlog */
+static ulong commit_threads = 0;
+static mysql_cond_t commit_cond;
+static mysql_mutex_t commit_cond_m;
+static bool innodb_inited = 0;
+
+#define INSIDE_HA_INNOBASE_CC
+
+#define EQ_CURRENT_THD(thd) ((thd) == current_thd)
+
+static struct handlerton* innodb_hton_ptr;
+
+static const long AUTOINC_OLD_STYLE_LOCKING = 0;
+static const long AUTOINC_NEW_STYLE_LOCKING = 1;
+static const long AUTOINC_NO_LOCKING = 2;
+
+static long innobase_mirrored_log_groups;
+static long innobase_log_buffer_size;
+static long innobase_additional_mem_pool_size;
+static long innobase_file_io_threads;
+static long innobase_open_files;
+static long innobase_autoinc_lock_mode;
+static ulong innobase_commit_concurrency = 0;
+static ulong innobase_read_io_threads;
+static ulong innobase_write_io_threads;
+static long innobase_buffer_pool_instances = 1;
+
+static long long innobase_buffer_pool_size, innobase_log_file_size;
+
+/** Percentage of the buffer pool to reserve for 'old' blocks.
+Connected to buf_LRU_old_ratio. */
+static uint innobase_old_blocks_pct;
+
+/** Maximum on-disk size of change buffer in terms of percentage
+of the buffer pool. */
+static uint innobase_change_buffer_max_size = CHANGE_BUFFER_DEFAULT_SIZE;
+
+/* The default values for the following char* start-up parameters
+are determined in innobase_init below: */
+
+static char* innobase_data_home_dir = NULL;
+static char* innobase_data_file_path = NULL;
+static char* innobase_file_format_name = NULL;
+static char* innobase_change_buffering = NULL;
+static char* innobase_enable_monitor_counter = NULL;
+static char* innobase_disable_monitor_counter = NULL;
+static char* innobase_reset_monitor_counter = NULL;
+static char* innobase_reset_all_monitor_counter = NULL;
+
+/* The highest file format being used in the database. The value can be
+set by the user; however, it will be adjusted to the newer file format if
+a table of such a format is created/opened. */
+static char* innobase_file_format_max = NULL;
+
+static char* innobase_file_flush_method = NULL;
+
+/* This variable can be set in the server configuration file, specifying
+the stopword table to be used */
+static char* innobase_server_stopword_table = NULL;
+
+/* Below we have boolean-valued start-up parameters, and their default
+values */
+
+static ulong innobase_fast_shutdown = 1;
+static my_bool innobase_file_format_check = TRUE;
+#ifdef UNIV_LOG_ARCHIVE
+static my_bool innobase_log_archive = FALSE;
+static char* innobase_log_arch_dir = NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+static my_bool innobase_use_doublewrite = TRUE;
+static my_bool innobase_use_checksums = TRUE;
+static my_bool innobase_locks_unsafe_for_binlog = FALSE;
+static my_bool innobase_rollback_on_timeout = FALSE;
+static my_bool innobase_create_status_file = FALSE;
+static my_bool innobase_stats_on_metadata = TRUE;
+static my_bool innobase_large_prefix = FALSE;
+static my_bool innodb_optimize_fulltext_only = FALSE;
+
+static char* internal_innobase_data_file_path = NULL;
+
+static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+
+/** Possible values for system variable "innodb_stats_method". The values
+are defined the same as those of the corresponding MyISAM system variable
+"myisam_stats_method" (see "myisam_stats_method_names"), for better usability */
+static const char* innodb_stats_method_names[] = {
+ "nulls_equal",
+ "nulls_unequal",
+ "nulls_ignored",
+ NullS
+};
+
+/** Used to define an enumerated type of the system variable innodb_stats_method.
+This is the same as "myisam_stats_method_typelib" */
+static TYPELIB innodb_stats_method_typelib = {
+ array_elements(innodb_stats_method_names) - 1,
+ "innodb_stats_method_typelib",
+ innodb_stats_method_names,
+ NULL
+};
+
+/** Possible values for system variable "innodb_checksum_algorithm". */
+static const char* innodb_checksum_algorithm_names[] = {
+ "crc32",
+ "strict_crc32",
+ "innodb",
+ "strict_innodb",
+ "none",
+ "strict_none",
+ NullS
+};
+
+/** Used to define an enumerated type of the system variable
+innodb_checksum_algorithm. */
+static TYPELIB innodb_checksum_algorithm_typelib = {
+ array_elements(innodb_checksum_algorithm_names) - 1,
+ "innodb_checksum_algorithm_typelib",
+ innodb_checksum_algorithm_names,
+ NULL
+};
+
+/* The following counter is used to convey information to InnoDB
+about server activity: in case of normal DML ops it is not
+sensible to call srv_active_wake_master_thread after each
+operation; we only do it every INNOBASE_WAKE_INTERVAL'th step. */
+
+#define INNOBASE_WAKE_INTERVAL 32
+static ulong innobase_active_counter = 0;
+
+static hash_table_t* innobase_open_tables;
+
+/** Allowed values of innodb_change_buffering */
+static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
+ "none", /* IBUF_USE_NONE */
+ "inserts", /* IBUF_USE_INSERT */
+ "deletes", /* IBUF_USE_DELETE_MARK */
+ "changes", /* IBUF_USE_INSERT_DELETE_MARK */
+ "purges", /* IBUF_USE_DELETE */
+ "all" /* IBUF_USE_ALL */
+};
+
+/* Callback function array defined by MySQL and used to
+retrieve FTS results. */
+const struct _ft_vft ft_vft_result = {NULL,
+ innobase_fts_find_ranking,
+ innobase_fts_close_ranking,
+ innobase_fts_retrieve_ranking,
+ NULL};
+
+const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version,
+ innobase_fts_flags,
+ innobase_fts_retrieve_docid,
+ innobase_fts_count_matches};
+
+#ifdef HAVE_PSI_INTERFACE
+/* Keys to register pthread mutexes/cond in the current file with
+performance schema */
+static mysql_pfs_key_t innobase_share_mutex_key;
+static mysql_pfs_key_t commit_cond_mutex_key;
+static mysql_pfs_key_t commit_cond_key;
+
+static PSI_mutex_info all_pthread_mutexes[] = {
+ {&commit_cond_mutex_key, "commit_cond_mutex", 0},
+ {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+};
+
+static PSI_cond_info all_innodb_conds[] = {
+ {&commit_cond_key, "commit_cond", 0}
+};
+
+# ifdef UNIV_PFS_MUTEX
+/* all_innodb_mutexes array contains mutexes that are
+performance schema instrumented if "UNIV_PFS_MUTEX"
+is defined */
+static PSI_mutex_info all_innodb_mutexes[] = {
+ {&autoinc_mutex_key, "autoinc_mutex", 0},
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+ {&buffer_block_mutex_key, "buffer_block_mutex", 0},
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+ {&buf_pool_mutex_key, "buf_pool_mutex", 0},
+ {&buf_pool_zip_mutex_key, "buf_pool_zip_mutex", 0},
+ {&cache_last_read_mutex_key, "cache_last_read_mutex", 0},
+ {&dict_foreign_err_mutex_key, "dict_foreign_err_mutex", 0},
+ {&dict_sys_mutex_key, "dict_sys_mutex", 0},
+ {&file_format_max_mutex_key, "file_format_max_mutex", 0},
+ {&fil_system_mutex_key, "fil_system_mutex", 0},
+ {&flush_list_mutex_key, "flush_list_mutex", 0},
+ {&fts_bg_threads_mutex_key, "fts_bg_threads_mutex", 0},
+ {&fts_delete_mutex_key, "fts_delete_mutex", 0},
+ {&fts_optimize_mutex_key, "fts_optimize_mutex", 0},
+ {&fts_doc_id_mutex_key, "fts_doc_id_mutex", 0},
+ {&fts_pll_tokenize_mutex_key, "fts_pll_tokenize_mutex", 0},
+ {&log_flush_order_mutex_key, "log_flush_order_mutex", 0},
+ {&hash_table_mutex_key, "hash_table_mutex", 0},
+ {&ibuf_bitmap_mutex_key, "ibuf_bitmap_mutex", 0},
+ {&ibuf_mutex_key, "ibuf_mutex", 0},
+ {&ibuf_pessimistic_insert_mutex_key,
+ "ibuf_pessimistic_insert_mutex", 0},
+# ifndef HAVE_ATOMIC_BUILTINS
+ {&server_mutex_key, "server_mutex", 0},
+# endif /* !HAVE_ATOMIC_BUILTINS */
+ {&log_sys_mutex_key, "log_sys_mutex", 0},
+# ifdef UNIV_MEM_DEBUG
+ {&mem_hash_mutex_key, "mem_hash_mutex", 0},
+# endif /* UNIV_MEM_DEBUG */
+ {&mem_pool_mutex_key, "mem_pool_mutex", 0},
+ {&mutex_list_mutex_key, "mutex_list_mutex", 0},
+ {&page_zip_stat_per_index_mutex_key, "page_zip_stat_per_index_mutex", 0},
+ {&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0},
+ {&recv_sys_mutex_key, "recv_sys_mutex", 0},
+ {&recv_writer_mutex_key, "recv_writer_mutex", 0},
+ {&rseg_mutex_key, "rseg_mutex", 0},
+# ifdef UNIV_SYNC_DEBUG
+ {&rw_lock_debug_mutex_key, "rw_lock_debug_mutex", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&rw_lock_list_mutex_key, "rw_lock_list_mutex", 0},
+ {&rw_lock_mutex_key, "rw_lock_mutex", 0},
+ {&srv_dict_tmpfile_mutex_key, "srv_dict_tmpfile_mutex", 0},
+ {&srv_innodb_monitor_mutex_key, "srv_innodb_monitor_mutex", 0},
+ {&srv_misc_tmpfile_mutex_key, "srv_misc_tmpfile_mutex", 0},
+ {&srv_monitor_file_mutex_key, "srv_monitor_file_mutex", 0},
+# ifdef UNIV_SYNC_DEBUG
+ {&sync_thread_mutex_key, "sync_thread_mutex", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&buf_dblwr_mutex_key, "buf_dblwr_mutex", 0},
+ {&trx_undo_mutex_key, "trx_undo_mutex", 0},
+ {&srv_sys_mutex_key, "srv_sys_mutex", 0},
+ {&lock_sys_mutex_key, "lock_mutex", 0},
+ {&lock_sys_wait_mutex_key, "lock_wait_mutex", 0},
+ {&trx_mutex_key, "trx_mutex", 0},
+ {&srv_sys_tasks_mutex_key, "srv_threads_mutex", 0},
+ /* mutex with os_fast_mutex_ interfaces */
+# ifndef PFS_SKIP_EVENT_MUTEX
+ {&event_os_mutex_key, "event_os_mutex", 0},
+# endif /* PFS_SKIP_EVENT_MUTEX */
+ {&os_mutex_key, "os_mutex", 0},
+#ifndef HAVE_ATOMIC_BUILTINS
+ {&srv_conc_mutex_key, "srv_conc_mutex", 0},
+#endif /* !HAVE_ATOMIC_BUILTINS */
+#ifndef HAVE_ATOMIC_BUILTINS_64
+ {&monitor_mutex_key, "monitor_mutex", 0},
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+ {&ut_list_mutex_key, "ut_list_mutex", 0},
+ {&trx_sys_mutex_key, "trx_sys_mutex", 0},
+ {&zip_pad_mutex_key, "zip_pad_mutex", 0},
+};
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+/* all_innodb_rwlocks array contains rwlocks that are
+performance schema instrumented if "UNIV_PFS_RWLOCK"
+is defined */
+static PSI_rwlock_info all_innodb_rwlocks[] = {
+# ifdef UNIV_LOG_ARCHIVE
+ {&archive_lock_key, "archive_lock", 0},
+# endif /* UNIV_LOG_ARCHIVE */
+ {&btr_search_latch_key, "btr_search_latch", 0},
+# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK
+ {&buf_block_lock_key, "buf_block_lock", 0},
+# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
+# ifdef UNIV_SYNC_DEBUG
+ {&buf_block_debug_latch_key, "buf_block_debug_latch", 0},
+# endif /* UNIV_SYNC_DEBUG */
+ {&dict_operation_lock_key, "dict_operation_lock", 0},
+ {&fil_space_latch_key, "fil_space_latch", 0},
+ {&checkpoint_lock_key, "checkpoint_lock", 0},
+ {&fts_cache_rw_lock_key, "fts_cache_rw_lock", 0},
+ {&fts_cache_init_rw_lock_key, "fts_cache_init_rw_lock", 0},
+ {&trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0},
+ {&trx_purge_latch_key, "trx_purge_latch", 0},
+ {&index_tree_rw_lock_key, "index_tree_rw_lock", 0},
+ {&index_online_log_key, "index_online_log", 0},
+ {&dict_table_stats_key, "dict_table_stats", 0},
+ {&hash_table_rw_lock_key, "hash_table_locks", 0}
+};
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+/* all_innodb_threads array contains threads that are
+performance schema instrumented if "UNIV_PFS_THREAD"
+is defined */
+static PSI_thread_info all_innodb_threads[] = {
+ {&trx_rollback_clean_thread_key, "trx_rollback_clean_thread", 0},
+ {&io_handler_thread_key, "io_handler_thread", 0},
+ {&srv_lock_timeout_thread_key, "srv_lock_timeout_thread", 0},
+ {&srv_error_monitor_thread_key, "srv_error_monitor_thread", 0},
+ {&srv_monitor_thread_key, "srv_monitor_thread", 0},
+ {&srv_master_thread_key, "srv_master_thread", 0},
+ {&srv_purge_thread_key, "srv_purge_thread", 0},
+ {&buf_page_cleaner_thread_key, "page_cleaner_thread", 0},
+ {&recv_writer_thread_key, "recv_writer_thread", 0}
+};
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+/* all_innodb_files array contains the type of files that are
+performance schema instrumented if "UNIV_PFS_IO" is defined */
+static PSI_file_info all_innodb_files[] = {
+ {&innodb_file_data_key, "innodb_data_file", 0},
+ {&innodb_file_log_key, "innodb_log_file", 0},
+ {&innodb_file_temp_key, "innodb_temp_file", 0}
+};
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
+/** Always normalize table name to lower case on Windows */
+#ifdef __WIN__
+#define normalize_table_name(norm_name, name) \
+ normalize_table_name_low(norm_name, name, TRUE)
+#else
+#define normalize_table_name(norm_name, name) \
+ normalize_table_name_low(norm_name, name, FALSE)
+#endif /* __WIN__ */
+
+/** Set up InnoDB API callback function array */
+ib_cb_t innodb_api_cb[] = {
+ (ib_cb_t) ib_cursor_open_table,
+ (ib_cb_t) ib_cursor_read_row,
+ (ib_cb_t) ib_cursor_insert_row,
+ (ib_cb_t) ib_cursor_delete_row,
+ (ib_cb_t) ib_cursor_update_row,
+ (ib_cb_t) ib_cursor_moveto,
+ (ib_cb_t) ib_cursor_first,
+ (ib_cb_t) ib_cursor_next,
+ (ib_cb_t) ib_cursor_last,
+ (ib_cb_t) ib_cursor_set_match_mode,
+ (ib_cb_t) ib_sec_search_tuple_create,
+ (ib_cb_t) ib_clust_read_tuple_create,
+ (ib_cb_t) ib_tuple_delete,
+ (ib_cb_t) ib_tuple_copy,
+ (ib_cb_t) ib_tuple_read_u8,
+ (ib_cb_t) ib_tuple_write_u8,
+ (ib_cb_t) ib_tuple_read_u16,
+ (ib_cb_t) ib_tuple_write_u16,
+ (ib_cb_t) ib_tuple_read_u32,
+ (ib_cb_t) ib_tuple_write_u32,
+ (ib_cb_t) ib_tuple_read_u64,
+ (ib_cb_t) ib_tuple_write_u64,
+ (ib_cb_t) ib_tuple_read_i8,
+ (ib_cb_t) ib_tuple_write_i8,
+ (ib_cb_t) ib_tuple_read_i16,
+ (ib_cb_t) ib_tuple_write_i16,
+ (ib_cb_t) ib_tuple_read_i32,
+ (ib_cb_t) ib_tuple_write_i32,
+ (ib_cb_t) ib_tuple_read_i64,
+ (ib_cb_t) ib_tuple_write_i64,
+ (ib_cb_t) ib_tuple_get_n_cols,
+ (ib_cb_t) ib_col_set_value,
+ (ib_cb_t) ib_col_get_value,
+ (ib_cb_t) ib_col_get_meta,
+ (ib_cb_t) ib_trx_begin,
+ (ib_cb_t) ib_trx_commit,
+ (ib_cb_t) ib_trx_rollback,
+ (ib_cb_t) ib_trx_start,
+ (ib_cb_t) ib_trx_release,
+ (ib_cb_t) ib_trx_state,
+ (ib_cb_t) ib_cursor_lock,
+ (ib_cb_t) ib_cursor_close,
+ (ib_cb_t) ib_cursor_new_trx,
+ (ib_cb_t) ib_cursor_reset,
+ (ib_cb_t) ib_open_table_by_name,
+ (ib_cb_t) ib_col_get_name,
+ (ib_cb_t) ib_table_truncate,
+ (ib_cb_t) ib_cursor_open_index_using_name,
+ (ib_cb_t) ib_close_thd,
+ (ib_cb_t) ib_cfg_get_cfg,
+ (ib_cb_t) ib_cursor_set_memcached_sync,
+ (ib_cb_t) ib_cursor_set_cluster_access,
+ (ib_cb_t) ib_cursor_commit_trx,
+ (ib_cb_t) ib_cfg_trx_level,
+ (ib_cb_t) ib_tuple_get_n_user_cols,
+ (ib_cb_t) ib_cursor_set_lock_mode,
+ (ib_cb_t) ib_cursor_clear_trx,
+ (ib_cb_t) ib_get_idx_field_name,
+ (ib_cb_t) ib_trx_get_start_time,
+ (ib_cb_t) ib_cfg_bk_commit_interval,
+ (ib_cb_t) ib_cursor_stmt_begin
+};
+
+/*************************************************************//**
+Check whether a valid argument was given to innodb_ft_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value); /*!< in: incoming string */
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/******************************************************************//**
+Maps a MySQL trx isolation level code to the InnoDB isolation level code.
+@return InnoDB isolation level */
+static inline
+ulint
+innobase_map_isolation_level(
+/*=========================*/
+ enum_tx_isolation iso); /*!< in: MySQL isolation level code */
+
+static const char innobase_hton_name[]= "InnoDB";
+
+static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB support for the XA two-phase commit",
+ /* check_func */ NULL, /* update_func */ NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB locking in LOCK TABLES",
+ /* check_func */ NULL, /* update_func */ NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
+ "Use strict mode when evaluating create options.",
+ NULL, NULL, FALSE);
+
+static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG,
+ "Create FTS index with stopword.",
+ NULL, NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
+ NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
+
+static MYSQL_THDVAR_STR(ft_user_stopword_table,
+ PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC,
+ "User supplied stopword table name, effective in the session level.",
+ innodb_stopword_table_validate, NULL, NULL);
+
+static SHOW_VAR innodb_status_variables[]= {
+ {"buffer_pool_dump_status",
+ (char*) &export_vars.innodb_buffer_pool_dump_status, SHOW_CHAR},
+ {"buffer_pool_load_status",
+ (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR},
+ {"buffer_pool_pages_data",
+ (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG},
+ {"buffer_pool_bytes_data",
+ (char*) &export_vars.innodb_buffer_pool_bytes_data, SHOW_LONG},
+ {"buffer_pool_pages_dirty",
+ (char*) &export_vars.innodb_buffer_pool_pages_dirty, SHOW_LONG},
+ {"buffer_pool_bytes_dirty",
+ (char*) &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_LONG},
+ {"buffer_pool_pages_flushed",
+ (char*) &export_vars.innodb_buffer_pool_pages_flushed, SHOW_LONG},
+ {"buffer_pool_pages_free",
+ (char*) &export_vars.innodb_buffer_pool_pages_free, SHOW_LONG},
+#ifdef UNIV_DEBUG
+ {"buffer_pool_pages_latched",
+ (char*) &export_vars.innodb_buffer_pool_pages_latched, SHOW_LONG},
+#endif /* UNIV_DEBUG */
+ {"buffer_pool_pages_misc",
+ (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG},
+ {"buffer_pool_pages_total",
+ (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG},
+ {"buffer_pool_read_ahead_rnd",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_LONG},
+ {"buffer_pool_read_ahead",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG},
+ {"buffer_pool_read_ahead_evicted",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
+ {"buffer_pool_read_requests",
+ (char*) &export_vars.innodb_buffer_pool_read_requests, SHOW_LONG},
+ {"buffer_pool_reads",
+ (char*) &export_vars.innodb_buffer_pool_reads, SHOW_LONG},
+ {"buffer_pool_wait_free",
+ (char*) &export_vars.innodb_buffer_pool_wait_free, SHOW_LONG},
+ {"buffer_pool_write_requests",
+ (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
+ {"data_fsyncs",
+ (char*) &export_vars.innodb_data_fsyncs, SHOW_LONG},
+ {"data_pending_fsyncs",
+ (char*) &export_vars.innodb_data_pending_fsyncs, SHOW_LONG},
+ {"data_pending_reads",
+ (char*) &export_vars.innodb_data_pending_reads, SHOW_LONG},
+ {"data_pending_writes",
+ (char*) &export_vars.innodb_data_pending_writes, SHOW_LONG},
+ {"data_read",
+ (char*) &export_vars.innodb_data_read, SHOW_LONG},
+ {"data_reads",
+ (char*) &export_vars.innodb_data_reads, SHOW_LONG},
+ {"data_writes",
+ (char*) &export_vars.innodb_data_writes, SHOW_LONG},
+ {"data_written",
+ (char*) &export_vars.innodb_data_written, SHOW_LONG},
+ {"dblwr_pages_written",
+ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"dblwr_writes",
+ (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
+ {"have_atomic_builtins",
+ (char*) &export_vars.innodb_have_atomic_builtins, SHOW_BOOL},
+ {"log_waits",
+ (char*) &export_vars.innodb_log_waits, SHOW_LONG},
+ {"log_write_requests",
+ (char*) &export_vars.innodb_log_write_requests, SHOW_LONG},
+ {"log_writes",
+ (char*) &export_vars.innodb_log_writes, SHOW_LONG},
+ {"os_log_fsyncs",
+ (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG},
+ {"os_log_pending_fsyncs",
+ (char*) &export_vars.innodb_os_log_pending_fsyncs, SHOW_LONG},
+ {"os_log_pending_writes",
+ (char*) &export_vars.innodb_os_log_pending_writes, SHOW_LONG},
+ {"os_log_written",
+ (char*) &export_vars.innodb_os_log_written, SHOW_LONGLONG},
+ {"page_size",
+ (char*) &export_vars.innodb_page_size, SHOW_LONG},
+ {"pages_created",
+ (char*) &export_vars.innodb_pages_created, SHOW_LONG},
+ {"pages_read",
+ (char*) &export_vars.innodb_pages_read, SHOW_LONG},
+ {"pages_written",
+ (char*) &export_vars.innodb_pages_written, SHOW_LONG},
+ {"row_lock_current_waits",
+ (char*) &export_vars.innodb_row_lock_current_waits, SHOW_LONG},
+ {"row_lock_time",
+ (char*) &export_vars.innodb_row_lock_time, SHOW_LONGLONG},
+ {"row_lock_time_avg",
+ (char*) &export_vars.innodb_row_lock_time_avg, SHOW_LONG},
+ {"row_lock_time_max",
+ (char*) &export_vars.innodb_row_lock_time_max, SHOW_LONG},
+ {"row_lock_waits",
+ (char*) &export_vars.innodb_row_lock_waits, SHOW_LONG},
+ {"rows_deleted",
+ (char*) &export_vars.innodb_rows_deleted, SHOW_LONG},
+ {"rows_inserted",
+ (char*) &export_vars.innodb_rows_inserted, SHOW_LONG},
+ {"rows_read",
+ (char*) &export_vars.innodb_rows_read, SHOW_LONG},
+ {"rows_updated",
+ (char*) &export_vars.innodb_rows_updated, SHOW_LONG},
+ {"num_open_files",
+ (char*) &export_vars.innodb_num_open_files, SHOW_LONG},
+ {"truncated_status_writes",
+ (char*) &export_vars.innodb_truncated_status_writes, SHOW_LONG},
+ {"available_undo_logs",
+ (char*) &export_vars.innodb_available_undo_logs, SHOW_LONG},
+#ifdef UNIV_DEBUG
+ {"purge_trx_id_age",
+ (char*) &export_vars.innodb_purge_trx_id_age, SHOW_LONG},
+ {"purge_view_trx_id_age",
+ (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG},
+#endif /* UNIV_DEBUG */
+ {NullS, NullS, SHOW_LONG}
+};
+
+/************************************************************************//**
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+ const char* table_name); /*!< in: table to lookup */
+
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+ INNOBASE_SHARE* share); /*!< in/own: share to free */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd); /*!< in: MySQL thread handle for
+ which to close the connection */
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx); /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in/out: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx); /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
+ void* savepoint); /*!< in: savepoint data */
+
+/*****************************************************************//**
+Check whether the InnoDB state allows MDL locks to be safely released
+after rollback to savepoint.
+@return true if it is safe, false if it is not. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd); /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be rolled back to savepoint */
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user's XA transaction for which
+ we need to take a savepoint */
+ void* savepoint); /*!< in: savepoint data */
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in/out: handlerton for Innodb */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint); /*!< in: savepoint data */
+
+/************************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+ handlerton* hton, /*!< in/out: handlerton for Innodb */
+ TABLE_SHARE* table,
+ MEM_ROOT* mem_root);
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default();
+/*=======================================*/
+
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max();
+/*==================================*/
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+ const char* format_name); /*!< in: pointer to file format
+ name */
+/************************************************************//**
+Validate the file format check config parameters, as a side effect it
+sets the srv_max_file_format_at_startup variable.
+@return the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*==================================*/
+ const char* format_max); /*!< in: parameter value */
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool all); /*!< in: true - prepare transaction
+ false - the current SQL statement
+ ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list, /*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd); /*!< in: user thread handle */
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL, the global read view of the transaction
+is restored to the transaction's own read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ THD* thd, /*!< in: user thread handle */
+ void* curview); /*!< in: Consistent cursor view to
+ be set */
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore the
+global read view to the transaction's own read view. The transaction is
+created if the corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ THD* thd, /*!< in: user thread handle */
+ void* curview); /*!< in: Consistent read view to be
+ closed */
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ char* path); /*!< in: database path; inside InnoDB
+ the name of the last directory in
+ the path is used as the database name:
+ for example, in 'mysql/data/test' the
+ database name is 'test' */
+/*******************************************************************//**
+Closes an InnoDB database. */
+static
+int
+innobase_end(
+/*=========*/
+ handlerton* hton, /* in: Innodb handlerton */
+ ha_panic_function type);
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd); /* in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton); /*!< in: InnoDB handlerton */
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print);
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here; it should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of
+ the caller */
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx); /*!< in: transaction handle */
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+User can enable monitor counters/groups by specifying
+"loose-innodb_monitor_enable = monitor_name1;monitor_name2..."
+in server configuration file or at the command line. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+ char* str); /*!< in: monitor counter enable list */
+
+/*********************************************************************
+Normalizes a table name string. A normalized name consists of the
+database name concatenated with '/' and the table name. An example:
+test/mytable. On Windows, normalization puts both the database name and
+the table name to lower case if "set_lower_case" is set to TRUE. */
+static
+void
+normalize_table_name_low(
+/*=====================*/
+ char* norm_name, /* out: normalized name as a
+ null-terminated string */
+ const char* name, /* in: table name string */
+ ibool set_lower_case); /* in: TRUE if we want to set
+ name to lower case */
+
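+/* Illustrative sketch (not compiled in): normalizing the MySQL path
+"./test/mytable" is expected to yield "test/mytable"; FN_REFLEN is the
+usual buffer size used for such names.
+
+	char	norm_name[FN_REFLEN];
+
+	normalize_table_name(norm_name, "./test/mytable");
+*/
+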
+/*************************************************************//**
+Check for a valid value of innobase_commit_concurrency.
+@return 0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ long long intbuf;
+ ulong commit_concurrency;
+
+ DBUG_ENTER("innobase_commit_concurrency_validate");
+
+ if (value->val_int(value, &intbuf)) {
+ /* The value is NULL. That is invalid. */
+ DBUG_RETURN(1);
+ }
+
+ *reinterpret_cast<ulong*>(save) = commit_concurrency
+ = static_cast<ulong>(intbuf);
+
+ /* Allow the value to be updated, as long as it remains zero
+ or nonzero. */
+ DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+}
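+
+/* Illustrative truth table for the check above (not compiled in):
+0 -> 0 and nonzero -> nonzero pass (DBUG_RETURN(0)); 0 -> nonzero and
+nonzero -> 0 fail (DBUG_RETURN(1)), which enforces the Bug #42101 rule
+that the setting may not cross zero at runtime. */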
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance. */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ TABLE_SHARE* table,
+ MEM_ROOT* mem_root)
+{
+ return(new (mem_root) ha_innobase(hton, table));
+}
+
+/* General functions */
+
+/*************************************************************//**
+Check that a page_size is correct for InnoDB. If correct, return the
+associated page_size_shift, which is the power of 2 for this page size.
+@return an associated page_size_shift if valid, 0 if invalid. */
+inline
+int
+innodb_page_size_validate(
+/*======================*/
+ ulong page_size) /*!< in: Page Size to evaluate */
+{
+ ulong n;
+
+ DBUG_ENTER("innodb_page_size_validate");
+
+ for (n = UNIV_PAGE_SIZE_SHIFT_MIN;
+ n <= UNIV_PAGE_SIZE_SHIFT_MAX;
+ n++) {
+ if (page_size == (ulong) (1 << n)) {
+ DBUG_RETURN(n);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
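+
+/* Illustrative values (not compiled in): with the usual shift bounds,
+innodb_page_size_validate(16384) returns 14 (1 << 14 == 16384), while a
+value that is not a supported power of 2, such as 10000, returns 0. */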
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return((ibool) thd_slave_thread(thd));
+}
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing either a prepare or commit record to the log
+buffer.
+@return the durability property */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+ const THD* thd) /*!< in: thread handle */
+{
+ return(thd_get_durability_property(thd));
+}
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return(thd != 0 && thd_tx_is_read_only(thd));
+}
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd) /*!< in: thread handle, can be NULL */
+{
+ return(thd != NULL
+ && !thd_test_options(
+ thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+ && thd_is_select(thd));
+}
+
+/******************************************************************//**
+Save some CPU by testing the value of srv_thread_concurrency in inline
+functions. */
+static inline
+void
+innobase_srv_conc_enter_innodb(
+/*===========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (srv_thread_concurrency) {
+ if (trx->n_tickets_to_enter_innodb > 0) {
+
+ /* If trx has 'free tickets' to enter the engine left,
+ then use one such ticket */
+
+ --trx->n_tickets_to_enter_innodb;
+
+ } else if (trx->mysql_thd != NULL
+ && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+ UT_WAIT_FOR(
+ srv_conc_get_active_threads()
+ < srv_thread_concurrency,
+ srv_replication_delay * 1000);
+
+ } else {
+ srv_conc_enter_innodb(trx);
+ }
+ }
+}
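+
+/* Illustrative note (not compiled in): a transaction granted N tickets
+re-enters InnoDB up to N times without consulting the concurrency gate;
+only when its tickets run out does it queue in srv_conc_enter_innodb()
+again. Replication slave threads instead wait until the number of
+active threads drops below srv_thread_concurrency. */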
+
+/******************************************************************//**
+Note that the thread wants to leave InnoDB only if it doesn't have
+any spare tickets. */
+static inline
+void
+innobase_srv_conc_exit_innodb(
+/*==========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* This is to avoid making an unnecessary function call. */
+ if (trx->declared_to_be_inside_innodb
+ && trx->n_tickets_to_enter_innodb == 0) {
+
+ srv_conc_force_exit_innodb(trx);
+ }
+}
+
+/******************************************************************//**
+Force a thread to leave InnoDB even if it has spare tickets. */
+static inline
+void
+innobase_srv_conc_force_exit_innodb(
+/*================================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* This is to avoid making an unnecessary function call. */
+ if (trx->declared_to_be_inside_innodb) {
+ srv_conc_force_exit_innodb(trx);
+ }
+}
+
+/******************************************************************//**
+Returns the NUL-terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname()
+/*=================*/
+{
+ return(glob_hostname);
+}
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd) /*!< in: thread handle */
+{
+ return((ibool) thd_non_transactional_update(thd));
+}
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+ const THD* thd) /*!< in: thread handle */
+{
+ return(thd_sql_command(thd) == SQLCOM_SELECT);
+}
+
+/******************************************************************//**
+Returns true if the thread supports XA, or the global value of
+innodb_supports_xa if thd is NULL.
+@return true if thd has XA support */
+UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_supports_xa */
+{
+ return(THDVAR(thd, support_xa));
+}
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd) /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+{
+ /* According to <mysql/plugin.h>, passing thd == NULL
+ returns the global value of the session variable. */
+ return(THDVAR(thd, lock_wait_timeout));
+}
+
+/******************************************************************//**
+Set the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+ THD* thd, /*!< in/out: thread handle */
+ ulint value) /*!< in: time waited for the lock */
+{
+ if (thd) {
+ thd_storage_lock_wait(thd, value);
+ }
+}
+
+/********************************************************************//**
+Obtain the InnoDB transaction of a MySQL thread.
+@return reference to transaction pointer */
+__attribute__((warn_unused_result, nonnull))
+static inline
+trx_t*&
+thd_to_trx(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
+ return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+}
+
+/********************************************************************//**
+Call this function when mysqld passes control to the client. That is to
+avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
+documentation, see handler.cc.
+@return 0 */
+static
+int
+innobase_release_temporary_latches(
+/*===============================*/
+ handlerton* hton, /*!< in: handlerton */
+ THD* thd) /*!< in: MySQL thread */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!innodb_inited) {
+
+ return(0);
+ }
+
+ trx_t* trx = thd_to_trx(thd);
+
+ if (trx != NULL) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and, every INNOBASE_WAKE_INTERVALth
+time, calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+ innobase_active_counter++;
+
+ if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
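+
+/* Illustrative arithmetic (not compiled in): with INNOBASE_WAKE_INTERVAL
+== 32, the 32nd, 64th, 96th, ... calls of innobase_active_small() wake
+the master thread; the other 31 calls in each interval only increment
+the counter. */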
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+static
+int
+convert_error_code_to_mysql(
+/*========================*/
+ dberr_t error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ THD* thd) /*!< in: user thread handle or NULL */
+{
+ switch (error) {
+ case DB_SUCCESS:
+ return(0);
+
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ return(-1);
+
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ ut_ad(thd);
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_ROW_IS_REFERENCED,
+ "InnoDB: Cannot delete/update "
+ "rows with cascading foreign key "
+ "constraints that exceed max "
+ "depth of %d. Please "
+ "drop extra constraints and try "
+ "again", DICT_FK_MAX_RECURSIVE_LOAD);
+
+ /* fall through */
+
+ case DB_ERROR:
+ default:
+ return(-1); /* unspecified error */
+
+ case DB_DUPLICATE_KEY:
+ /* Be cautious with returning this error, since
+ mysql could re-enter the storage layer to get
+ duplicated key info, the operation requires a
+ valid table handle and/or transaction information,
+ which might not always be available in the error
+ handling stage. */
+ return(HA_ERR_FOUND_DUPP_KEY);
+
+ case DB_READ_ONLY:
+		if (srv_force_recovery) {
+ return(HA_ERR_INNODB_FORCED_RECOVERY);
+ }
+ return(HA_ERR_TABLE_READONLY);
+
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+ case DB_MISSING_HISTORY:
+ return(HA_ERR_TABLE_DEF_CHANGED);
+
+ case DB_RECORD_NOT_FOUND:
+ return(HA_ERR_NO_ACTIVE_RECORD);
+
+ case DB_DEADLOCK:
+		/* Since we rolled back the whole transaction, we must
+		also tell MySQL so that it knows to empty the cached
+		binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_DEADLOCK);
+
+ case DB_LOCK_WAIT_TIMEOUT:
+ /* Starting from 5.0.13, we let MySQL just roll back the
+ latest SQL statement in a lock wait timeout. Previously, we
+ rolled back the whole transaction. */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(
+ thd, (bool) row_rollback_on_timeout);
+ }
+
+ return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+ case DB_NO_REFERENCED_ROW:
+ return(HA_ERR_NO_REFERENCED_ROW);
+
+ case DB_ROW_IS_REFERENCED:
+ return(HA_ERR_ROW_IS_REFERENCED);
+
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_CHILD_NO_INDEX:
+ case DB_PARENT_NO_INDEX:
+ return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+ case DB_CANNOT_DROP_CONSTRAINT:
+
+ return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+ misleading, a new MySQL error
+ code should be introduced */
+
+ case DB_CORRUPTION:
+ return(HA_ERR_CRASHED);
+
+ case DB_OUT_OF_FILE_SPACE:
+ return(HA_ERR_RECORD_FILE_FULL);
+
+ case DB_TEMP_FILE_WRITE_FAILURE:
+ return(HA_ERR_TEMP_FILE_WRITE_FAILURE);
+
+ case DB_TABLE_IN_FK_CHECK:
+ return(HA_ERR_TABLE_IN_FK_CHECK);
+
+ case DB_TABLE_IS_BEING_USED:
+ return(HA_ERR_WRONG_COMMAND);
+
+ case DB_TABLESPACE_DELETED:
+ case DB_TABLE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TABLESPACE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TOO_BIG_RECORD: {
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format() */
+ bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A);
+ my_printf_error(ER_TOO_BIG_ROWSIZE,
+ "Row size too large (> %lu). Changing some columns "
+ "to TEXT or BLOB %smay help. In current row "
+ "format, BLOB prefix of %d bytes is stored inline.",
+ MYF(0),
+ page_get_free_space_of_empty(flags &
+ DICT_TF_COMPACT) / 2,
+ prefix ? "or using ROW_FORMAT=DYNAMIC "
+ "or ROW_FORMAT=COMPRESSED ": "",
+ prefix ? DICT_MAX_FIXED_COL_LEN : 0);
+ return(HA_ERR_TO_BIG_ROW);
+ }
+
+ case DB_TOO_BIG_FOR_REDO:
+		my_printf_error(ER_TOO_BIG_ROWSIZE, "%s", MYF(0),
+ "The size of BLOB/TEXT data inserted"
+ " in one transaction is greater than"
+ " 10% of redo log size. Increase the"
+ " redo log size using innodb_log_file_size.");
+ return(HA_ERR_TO_BIG_ROW);
+
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ return(HA_ERR_INDEX_COL_TOO_LONG);
+
+ case DB_NO_SAVEPOINT:
+ return(HA_ERR_NO_SAVEPOINT);
+
+ case DB_LOCK_TABLE_FULL:
+		/* Since we rolled back the whole transaction, we must
+		also tell MySQL so that it knows to empty the cached
+		binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_TABLE_FULL);
+
+ case DB_FTS_INVALID_DOCID:
+ return(HA_FTS_INVALID_DOCID);
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return(HA_ERR_FTS_EXCEED_RESULT_CACHE_LIMIT);
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+ case DB_UNSUPPORTED:
+ return(HA_ERR_UNSUPPORTED);
+ case DB_INDEX_CORRUPT:
+ return(HA_ERR_INDEX_CORRUPT);
+ case DB_UNDO_RECORD_TOO_BIG:
+ return(HA_ERR_UNDO_REC_TOO_BIG);
+ case DB_OUT_OF_MEMORY:
+ return(HA_ERR_OUT_OF_MEM);
+ case DB_TABLESPACE_EXISTS:
+ return(HA_ERR_TABLESPACE_EXISTS);
+ case DB_IDENTIFIER_TOO_LONG:
+ return(HA_ERR_INTERNAL_ERROR);
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE);
+ }
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: MySQL THD object */
+ uint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ char buffer[1024];
+
+ fputs(thd_security_context(thd, buffer, sizeof buffer,
+ max_query_len), f);
+ putc('\n', f);
+}
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+ int error_code) /*!< in: MySQL error code */
+{
+ return(my_get_err_msg(error_code));
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+{
+ CHARSET_INFO* cs;
+ ut_ad(cset <= MAX_CHAR_COLL_NUM);
+ ut_ad(mbminlen);
+ ut_ad(mbmaxlen);
+
+ cs = all_charsets[cset];
+ if (cs) {
+ *mbminlen = cs->mbminlen;
+ *mbmaxlen = cs->mbmaxlen;
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ THD* thd = current_thd;
+
+ if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+ /* Fix bug#46256: allow tables to be dropped if the
+ collation is not found, but issue a warning. */
+			if (log_warnings && (cset != 0)) {
+
+ sql_print_warning(
+ "Unknown collation #%lu.", cset);
+ }
+ } else {
+
+ ut_a(cset == 0);
+ }
+
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/**********************************************************************//**
+Check if the length of the identifier exceeds the maximum allowed.
+@return true when length of identifier is too long. */
+UNIV_INTERN
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+	const char*	id)	/*!< in: FK identifier to check excluding
+				the database portion. */
+{
+ int well_formed_error = 0;
+ CHARSET_INFO *cs = system_charset_info;
+ DBUG_ENTER("innobase_check_identifier_length");
+
+ size_t len = cs->cset->well_formed_len(
+ cs, id, id + strlen(id),
+ NAME_CHAR_LEN, &well_formed_error);
+
+ if (well_formed_error || len == NAME_CHAR_LEN) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), id);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b) /*!< in: second string to compare */
+{
+ if (!a) {
+ if (!b) {
+ return(0);
+ } else {
+ return(-1);
+ }
+ } else if (!b) {
+ return(1);
+ }
+
+ return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+ const char* a, /*!< in: string to compare */
+ const char* b) /*!< in: wildcard string to compare */
+{
+ return(wild_case_compare(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Strip dir name from a full path name and return only the file name
+@return file name or "null" if no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+ const char* path_name) /*!< in: full path name */
+{
+ const char* name = base_name(path_name);
+
+ return((name) ? name : "null");
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ THD* mysql_thd) /*!< in: MySQL thread handle */
+{
+ return(thd_charset(mysql_thd));
+}
+
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+ THD* thd, /*!< in: MySQL thread handle */
+ size_t* length) /*!< out: length of the SQL statement */
+{
+ LEX_STRING* stmt;
+
+ stmt = thd_query_string(thd);
+ *length = stmt->length;
+ return(stmt->str);
+}
+
+/**********************************************************************//**
+Get the current setting of the table_def_size global parameter. We do
+a dirty read because, for one, there is no synchronization object and,
+for another, there is little harm in doing so even if we get a torn read.
+@return value of table_def_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void)
+/*===============================*/
+{
+ return(table_def_size);
+}
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. We do a dirty read because, for one, there is no synchronization
+object and, for another, there is little harm in doing so even if we get a
+torn read.
+@return value of lower_case_table_names */
+UNIV_INTERN
+ulint
+innobase_get_lower_case_table_names(void)
+/*=====================================*/
+{
+ return(lower_case_table_names);
+}
+
+/*********************************************************************//**
+Creates a temporary file.
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void)
+/*========================*/
+{
+ int fd2 = -1;
+ File fd;
+
+ DBUG_EXECUTE_IF(
+ "innobase_tmpfile_creation_failure",
+ return(-1);
+ );
+
+ fd = mysql_tmpfile("ib");
+
+ if (fd >= 0) {
+ /* Copy the file descriptor, so that the additional resources
+ allocated by create_temp_file() can be freed by invoking
+ my_close().
+
+ Because the file descriptor returned by this function
+ will be passed to fdopen(), it will be closed by invoking
+ fclose(), which in turn will invoke close() instead of
+ my_close(). */
+
+#ifdef _WIN32
+ /* Note that on Windows, the integer returned by mysql_tmpfile
+ has no relation to C runtime file descriptor. Here, we need
+ to call my_get_osfhandle to get the HANDLE and then convert it
+ to C runtime filedescriptor. */
+ {
+ HANDLE hFile = my_get_osfhandle(fd);
+ HANDLE hDup;
+ BOOL bOK = DuplicateHandle(
+ GetCurrentProcess(),
+ hFile, GetCurrentProcess(),
+ &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS);
+ if (bOK) {
+ fd2 = _open_osfhandle((intptr_t) hDup, 0);
+ } else {
+ my_osmaperr(GetLastError());
+ fd2 = -1;
+ }
+ }
+#else
+ fd2 = dup(fd);
+#endif
+ if (fd2 < 0) {
+ char errbuf[MYSYS_STRERROR_SIZE];
+			char errbuf[MYSYS_STRERROR_SIZE];
+			DBUG_PRINT("error", ("Got error %d on dup", fd2));
+			my_errno = errno;
+ my_error(EE_OUT_OF_FILERESOURCES,
+ MYF(ME_BELL+ME_WAITTANG),
+ "ib*", my_errno,
+ my_strerror(errbuf, sizeof(errbuf), my_errno));
+ }
+ my_close(fd, MYF(MY_WME));
+ }
+ return(fd2);
+}
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert
+ from */
+ uint* errors) /*!< out: number of errors encountered
+ during the conversion */
+{
+ return(copy_and_convert(
+ (char*) to, (uint32) to_length, to_cs,
+ (const char*) from, (uint32) from_length, from_cs,
+ errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+	/* XXX we use a hard limit instead of allocating
+	buf_size bytes from the heap */
+ CHARSET_INFO* data_cs;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+
+ data_cs = all_charsets[charset_coll];
+
+ buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
+ system_charset_info,
+ data, data_len, data_cs,
+ &num_errors);
+
+ return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to 3 where
+autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
+the multi-value INSERT above.
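+
+For example, with current = 5, need = 3, step = 2, offset = 1 and
+max_value = 100 the function reserves the values 7, 9 and 11 and
+returns 11.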
+@return the next value */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+ ulonglong block = need * step;
+
+ /* Should never be 0. */
+ ut_a(need > 0);
+ ut_a(block > 0);
+ ut_a(max_value > 0);
+
+ /* According to MySQL documentation, if the offset is greater than
+ the step then the offset is ignored. */
+ if (offset > block) {
+ offset = 0;
+ }
+
+	/* Check for overflow. Current can be > max_value if the value
+	is in reality negative: the Visual Studio compiler automatically
+	converts large double values into the maximum value of the
+	unsigned long long datatype. */
+
+ if (block >= max_value
+ || offset > max_value
+ || current >= max_value
+ || max_value - offset <= offset) {
+
+ next_value = max_value;
+ } else {
+ ut_a(max_value > current);
+
+ ulonglong free = max_value - current;
+
+ if (free < offset || free - offset <= block) {
+ next_value = max_value;
+ } else {
+ next_value = 0;
+ }
+ }
+
+ if (next_value == 0) {
+ ulonglong next;
+
+ if (current > offset) {
+ next = (current - offset) / step;
+ } else {
+ next = (offset - current) / step;
+ }
+
+ ut_a(max_value > next);
+ next_value = next * step;
+ /* Check for multiplication overflow. */
+ ut_a(next_value >= next);
+ ut_a(max_value > next_value);
+
+ /* Check for overflow */
+ if (max_value - next_value >= block) {
+
+ next_value += block;
+
+ if (max_value - next_value >= offset) {
+ next_value += offset;
+ } else {
+ next_value = max_value;
+ }
+ } else {
+ next_value = max_value;
+ }
+ }
+
+ ut_a(next_value != 0);
+ ut_a(next_value <= max_value);
+
+ return(next_value);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object. */
+static
+void
+innobase_trx_init(
+/*==============*/
+ THD* thd, /*!< in: user thread handle */
+ trx_t* trx) /*!< in/out: InnoDB transaction handle */
+{
+ DBUG_ENTER("innobase_trx_init");
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ DBUG_ASSERT(thd == trx->mysql_thd);
+
+ trx->check_foreigns = !thd_test_options(
+ thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+
+ trx->check_unique_secondary = !thd_test_options(
+ thd, OPTION_RELAXED_UNIQUE_CHECKS);
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object for DML.
+@return InnoDB transaction handle */
+UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_trx_allocate");
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+ trx = trx_allocate_for_mysql();
+
+ trx->mysql_thd = thd;
+
+ innobase_trx_init(thd, trx);
+
+ DBUG_RETURN(trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object, creates
+an InnoDB transaction struct if the corresponding MySQL thread struct still
+lacks one.
+@return InnoDB transaction handle */
+static inline
+trx_t*
+check_trx_exists(
+/*=============*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t*& trx = thd_to_trx(thd);
+
+ ut_ad(EQ_CURRENT_THD(thd));
+
+ if (trx == NULL) {
+ trx = innobase_trx_allocate(thd);
+ } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
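+		/* The trx object is corrupted in memory: dump
+		diagnostics and abort. */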
+ mem_analyze_corruption(trx);
+ ut_error;
+ }
+
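+	/* Re-read the session options, in case they have changed
+	since the trx object was created. */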
+ innobase_trx_init(thd, trx);
+
+ return(trx);
+}
+
+/*********************************************************************//**
+Check whether a transaction has been registered with the MySQL 2PC
+coordinator.
+@return true if transaction is registered with MySQL 2PC coordinator */
+static inline
+bool
+trx_is_registered_for_2pc(
+/*=========================*/
+ const trx_t* trx) /* in: transaction */
+{
+ return(trx->is_registered == 1);
+}
+
+/*********************************************************************//**
+Note that a transaction has been registered with MySQL 2PC coordinator. */
+static inline
+void
+trx_register_for_2pc(
+/*==================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 1;
+ ut_ad(trx->owns_prepare_mutex == 0);
+}
+
+/*********************************************************************//**
+Note that a transaction has been deregistered. */
+static inline
+void
+trx_deregister_from_2pc(
+/*====================*/
+ trx_t* trx) /* in: transaction */
+{
+ trx->is_registered = 0;
+ trx->owns_prepare_mutex = 0;
+}
+
+
+/*********************************************************************//**
+Check if a transaction is started.
+@return true if the transaction is in the started state */
+static
+bool
+trx_is_started(
+/*===========*/
+ trx_t* trx) /* in: transaction */
+{
+ return(trx->state != TRX_STATE_NOT_STARTED);
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table
+object, but they are frequently used inside InnoDB, so we keep copies of
+them in the InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const HA_CREATE_INFO* create_info) /*!< in: create info */
+{
+ ibool ps_on;
+ ibool ps_off;
+
+ if (dict_table_is_temporary(innodb_table)) {
+ /* Temp tables do not use persistent stats. */
+ ps_on = FALSE;
+ ps_off = TRUE;
+ } else {
+ ps_on = create_info->table_options
+ & HA_OPTION_STATS_PERSISTENT;
+ ps_off = create_info->table_options
+ & HA_OPTION_NO_STATS_PERSISTENT;
+ }
+
+ dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+ dict_stats_auto_recalc_set(
+ innodb_table,
+ create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+ create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+ innodb_table->stats_sample_pages = create_info->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table
+object, but they are frequently used inside InnoDB, so we keep copies of
+them in the InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share) /*!< in: table share */
+{
+ ibool ps_on;
+ ibool ps_off;
+
+ if (dict_table_is_temporary(innodb_table)) {
+ /* Temp tables do not use persistent stats */
+ ps_on = FALSE;
+ ps_off = TRUE;
+ } else {
+ ps_on = table_share->db_create_options
+ & HA_OPTION_STATS_PERSISTENT;
+ ps_off = table_share->db_create_options
+ & HA_OPTION_NO_STATS_PERSISTENT;
+ }
+
+ dict_stats_set_persistent(innodb_table, ps_on, ps_off);
+
+ dict_stats_auto_recalc_set(
+ innodb_table,
+ table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON,
+ table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF);
+
+ innodb_table->stats_sample_pages = table_share->stats_sample_pages;
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::ha_innobase(
+/*=====================*/
+ handlerton* hton,
+ TABLE_SHARE* table_arg)
+ :handler(hton, table_arg),
+ int_table_flags(HA_REC_NOT_IN_SEQ |
+ HA_NULL_IN_KEY |
+ HA_CAN_INDEX_BLOBS |
+ HA_CAN_SQL_HANDLER |
+ HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
+ HA_PRIMARY_KEY_IN_READ_INDEX |
+ HA_BINLOG_ROW_CAPABLE |
+ HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
+ HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT |
+ HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT),
+ start_of_scan(0),
+ num_write_row(0)
+{}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::~ha_innobase()
+/*======================*/
+{
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN inline
+void
+ha_innobase::update_thd(
+/*====================*/
+ THD* thd) /*!< in: thd to use the handle */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("ha_innobase::update_thd");
+ DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p",
+ user_thd, thd));
+
+ /* The table should have been opened in ha_innobase::open(). */
+ DBUG_ASSERT(prebuilt->table->n_ref_count > 0);
+
+ trx = check_trx_exists(thd);
+
+ if (prebuilt->trx != trx) {
+
+ row_update_prebuilt_trx(prebuilt, trx);
+ }
+
+ user_thd = thd;
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+ THD* thd = ha_thd();
+
+ ut_ad(EQ_CURRENT_THD(thd));
+ update_thd(thd);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction with the MySQL 2PC coordinator, so that
+the MySQL XA code knows to call the InnoDB prepare and commit, or rollback
+for the transaction. This MUST be called for every transaction for which
+the user may call commit or rollback. Calling this several times to register
+the same transaction is allowed, too. This function also registers the
+current SQL statement. */
+static inline
+void
+innobase_register_trx(
+/*==================*/
+ handlerton* hton, /* in: Innobase handlerton */
+ THD* thd, /* in: MySQL thd (connection) object */
+ trx_t* trx) /* in: transaction to register */
+{
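+	/* Register the statement-level transaction with MySQL
+	(all = FALSE). If we are inside a multi-statement transaction
+	(autocommit off or an explicit BEGIN) and the transaction is
+	not yet registered, also register the whole transaction
+	(all = TRUE). */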
+ trans_register_ha(thd, FALSE, hton);
+
+ if (!trx_is_registered_for_2pc(trx)
+ && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ trans_register_ha(thd, TRUE, hton);
+ }
+
+ trx_register_for_2pc(trx);
+}
+
+/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+ ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and only allows transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query
+cache
+of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID in the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd. Also the full_name which is used as key to search for the table
+object. The full_name is a string containing the normalized path to the
+table in the canonical format.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+The MySQL query cache uses this to check with InnoDB whether the query
+cache at the moment is allowed to operate on an InnoDB table. The SQL
+query must be a non-locking SELECT.
+
+The query cache is allowed to operate on a certain query only if this
+function returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB trx_sys->mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+ THD* thd, /*!< in: thd of the user who is trying to
+ store a result to the query cache or
+ retrieve it */
+ char* full_name, /*!< in: normalized path to the table */
+ uint full_name_len, /*!< in: length of the normalized path
+ to the table */
+ ulonglong *unused) /*!< unused for this engine */
+{
+ ibool is_autocommit;
+ trx_t* trx;
+ char norm_name[1000];
+
+ ut_a(full_name_len < 999);
+
+ trx = check_trx_exists(thd);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+ plain SELECT if AUTOCOMMIT is not on. */
+
+ return((my_bool)FALSE);
+ }
+
+ if (UNIV_UNLIKELY(trx->has_search_latch)) {
+ sql_print_error("The calling thread is holding the adaptive "
+ "search, latch though calling "
+ "innobase_query_caching_of_table_permitted.");
+ trx_print(stderr, trx, 1024);
+ }
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+	if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+		is_autocommit = TRUE;
+	} else {
+		is_autocommit = FALSE;
+	}
+
+ if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+ /* We are going to retrieve the query result from the query
+ cache. This cannot be a store operation to the query cache
+ because then MySQL would have locks on tables already.
+
+ TODO: if the user has used LOCK TABLES to lock the table,
+ then we open a transaction in the call of row_.. below.
+ That trx can stay open until UNLOCK TABLES. The same problem
+ exists even if we do not use the query cache. MySQL should be
+ modified so that it ALWAYS calls some cleanup function when
+ the processing of a query ends!
+
+ We can imagine we instantaneously serialize this consistent
+ read trx to the current trx id counter. If trx2 would have
+ changed the tables of a query result stored in the cache, and
+ trx2 would have already committed, making the result obsolete,
+ then trx2 would have already invalidated the cache. Thus we
+ can trust the result in the cache is ok for this query. */
+
+ return((my_bool)TRUE);
+ }
+
+ /* Normalize the table name to InnoDB format */
+ normalize_table_name(norm_name, full_name);
+
+ innobase_register_trx(innodb_hton_ptr, thd, trx);
+
+ if (row_search_check_if_query_cache_permitted(trx, norm_name)) {
+
+ /* printf("Query cache for %s permitted\n", norm_name); */
+
+ return((my_bool)TRUE);
+ }
+
+ /* printf("Query cache for %s NOT permitted\n", norm_name); */
+
+ return((my_bool)FALSE);
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len) /*!< in: full name length where
+ also the null chars count */
+{
+ /* Note that the sync0sync.h rank of the query cache mutex is just
+ above the InnoDB trx_sys_t->lock. The caller of this function must
+ not have latches of a lower rank. */
+
+#ifdef HAVE_QUERY_CACHE
+ char qcache_key_name[2 * (NAME_LEN + 1)];
+ size_t tabname_len;
+ size_t dbname_len;
+
+ /* Construct the key("db-name\0table$name\0") for the query cache using
+ the path name("db@002dname\0table@0024name\0") of the table in its
+ canonical form. */
+ dbname_len = filename_to_tablename(full_name, qcache_key_name,
+ sizeof(qcache_key_name));
+ tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1,
+ qcache_key_name + dbname_len + 1,
+ sizeof(qcache_key_name)
+ - dbname_len - 1);
+
+ /* Argument TRUE below means we are using transactions */
+ mysql_query_cache_invalidate4(trx->mysql_thd,
+ qcache_key_name,
+ (dbname_len + tabname_len + 2),
+ TRUE);
+#endif
+}
+
+/*****************************************************************//**
+Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+static
+char*
+innobase_convert_identifier(
+/*========================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool file_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an UTF-8 string */
+{
+ const char* s = id;
+ int q;
+
+ if (file_id) {
+
+ char nz[MAX_TABLE_NAME_LEN + 1];
+ char nz2[MAX_TABLE_NAME_LEN + 1];
+
+ /* Decode the table name. The MySQL function expects
+		a NUL-terminated string. The input and output string
+		buffers must not be shared. */
+ ut_a(idlen <= MAX_TABLE_NAME_LEN);
+ memcpy(nz, id, idlen);
+ nz[idlen] = 0;
+
+ s = nz2;
+ idlen = explain_filename(thd, nz, nz2, sizeof nz2,
+ EXPLAIN_PARTITIONS_AS_COMMENT);
+ goto no_quote;
+ }
+
+ /* See if the identifier needs to be quoted. */
+ if (UNIV_UNLIKELY(!thd)) {
+ q = '"';
+ } else {
+ q = get_quote_char_for_identifier(thd, s, (int) idlen);
+ }
+
+ if (q == EOF) {
+no_quote:
+ if (UNIV_UNLIKELY(idlen > buflen)) {
+ idlen = buflen;
+ }
+ memcpy(buf, s, idlen);
+ return(buf + idlen);
+ }
+
+ /* Quote the identifier. */
+ if (buflen < 2) {
+ return(buf);
+ }
+
+ *buf++ = q;
+ buflen--;
+
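+	/* Copy the identifier, writing each embedded quote character
+	twice, as long as the remaining buffer can hold the
+	character(s) and the closing quote. */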
+ for (; idlen; idlen--) {
+ int c = *s++;
+ if (UNIV_UNLIKELY(c == q)) {
+ if (UNIV_UNLIKELY(buflen < 3)) {
+ break;
+ }
+
+ *buf++ = c;
+ *buf++ = c;
+ buflen -= 2;
+ } else {
+ if (UNIV_UNLIKELY(buflen < 2)) {
+ break;
+ }
+
+ *buf++ = c;
+ buflen--;
+ }
+ }
+
+ *buf++ = q;
+ return(buf);
+}
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+{
+ char* s = buf;
+ const char* bufend = buf + buflen;
+
+ if (table_id) {
+ const char* slash = (const char*) memchr(id, '/', idlen);
+ if (!slash) {
+
+ goto no_db_name;
+ }
+
+ /* Print the database name and table name separately. */
+ s = innobase_convert_identifier(s, bufend - s, id, slash - id,
+ thd, TRUE);
+ if (UNIV_LIKELY(s < bufend)) {
+ *s++ = '.';
+ s = innobase_convert_identifier(s, bufend - s,
+ slash + 1, idlen
+ - (slash - id) - 1,
+ thd, TRUE);
+ }
+ } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+ /* Temporary index name (smart ALTER TABLE) */
+ const char temp_index_suffix[]= "--temporary--";
+
+ s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
+ thd, FALSE);
+ if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
+ memcpy(s, temp_index_suffix,
+ sizeof temp_index_suffix - 1);
+ s += sizeof temp_index_suffix - 1;
+ }
+ } else {
+no_db_name:
+ s = innobase_convert_identifier(buf, buflen, id, idlen,
+ thd, table_id);
+ }
+
+ return(s);
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), which converts a table or
+index name to the MySQL system_charset_info (UTF-8), quotes it if needed,
+and NUL-terminates the result in buf. */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* name, /*!< in: index or table name to format */
+	ibool		is_index_name)	/*!< in: TRUE if name is an
+					index name */
+{
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, buflen, name, strlen(name),
+ NULL, !is_index_name);
+
+ ut_ad((ulint) (bufend - buf) < buflen);
+
+ buf[bufend - buf] = '\0';
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd && thd_killed(trx->mysql_thd));
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode));
+}
+
+/**********************************************************************//**
+Determines if the current MySQL thread is running in strict mode.
+If thd==NULL, THDVAR returns the global value of innodb-strict-mode.
+@return TRUE if strict */
+UNIV_INLINE
+ibool
+thd_is_strict(
+/*==========*/
+ THD* thd) /*!< in: MySQL thread descriptor */
+{
+ return(THDVAR(thd, strict_mode));
+}
+
+/**************************************************************//**
+Resets some fields of a prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+inline
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+ ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+ ut_ad(prebuilt->magic_n2 == prebuilt->magic_n);
+
+ prebuilt->keep_other_fields_on_keyread = 0;
+ prebuilt->read_just_key = 0;
+ prebuilt->in_fts_query = 0;
+ /* Reset index condition pushdown state. */
+ if (prebuilt->idx_cond) {
+ prebuilt->idx_cond = NULL;
+ prebuilt->idx_cond_n_cols = 0;
+ /* Invalidate prebuilt->mysql_template
+ in ha_innobase::write_row(). */
+ prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+ }
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_idx() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function inits the necessary things even after a
+transaction commit. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a prebuilt struct, create
+ one. Update the trx pointers in the prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the prebuilt struct much like it would be inited in
+ external_lock */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ innobase_srv_conc_force_exit_innodb(prebuilt->trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started_xa(prebuilt->trx);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ trx_assign_read_view(prebuilt->trx);
+
+ innobase_register_trx(ht, user_thd, prebuilt->trx);
+
+ /* We did the necessary inits in this function, no need to repeat them
+ in row_search_for_mysql */
+
+ prebuilt->sql_stat_start = FALSE;
+
+	/* We always let HANDLER do the reads as consistent reads, even
+	if the trx isolation level was specified as SERIALIZABLE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record */
+
+ prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+	/* Do we always want to fetch all columns in the whole row? Or
+	do we? */
+
+ prebuilt->used_in_HANDLER = TRUE;
+ reset_template();
+}
+
+/*********************************************************************//**
+Opens an InnoDB database.
+@return 0 on success, error code on failure */
+static
+int
+innobase_init(
+/*==========*/
+ void *p) /*!< in: InnoDB handlerton */
+{
+ static char current_dir[3]; /*!< Set if using current lib */
+ int err;
+ bool ret;
+ char *default_path;
+ uint format_id;
+ ulong num_pll_degree;
+
+ DBUG_ENTER("innobase_init");
+ handlerton *innobase_hton= (handlerton*) p;
+ innodb_hton_ptr = innobase_hton;
+
+ innobase_hton->state = SHOW_OPTION_YES;
+ innobase_hton->db_type= DB_TYPE_INNODB;
+ innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+ innobase_hton->close_connection = innobase_close_connection;
+ innobase_hton->savepoint_set = innobase_savepoint;
+ innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+ innobase_hton->savepoint_rollback_can_release_mdl =
+ innobase_rollback_to_savepoint_can_release_mdl;
+ innobase_hton->savepoint_release = innobase_release_savepoint;
+ innobase_hton->commit = innobase_commit;
+ innobase_hton->rollback = innobase_rollback;
+ innobase_hton->prepare = innobase_xa_prepare;
+ innobase_hton->recover = innobase_xa_recover;
+ innobase_hton->commit_by_xid = innobase_commit_by_xid;
+ innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+ innobase_hton->create_cursor_read_view = innobase_create_cursor_view;
+ innobase_hton->set_cursor_read_view = innobase_set_cursor_view;
+ innobase_hton->close_cursor_read_view = innobase_close_cursor_view;
+ innobase_hton->create = innobase_create_handler;
+ innobase_hton->drop_database = innobase_drop_database;
+ innobase_hton->panic = innobase_end;
+
+ innobase_hton->start_consistent_snapshot =
+ innobase_start_trx_and_assign_read_view;
+
+ innobase_hton->flush_logs = innobase_flush_logs;
+ innobase_hton->show_status = innobase_show_status;
+ innobase_hton->flags =
+ HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS;
+
+ innobase_hton->release_temporary_latches =
+ innobase_release_temporary_latches;
+
+ innobase_hton->data = &innodb_api_cb;
+
+ ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
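+	/* Sanity check: filename_to_tablename() must decode the test
+	file name into srv_mysql50_table_name_prefix followed by the
+	original name; otherwise the table name encoding has changed
+	and we refuse to start. */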
+ static const char test_filename[] = "-@";
+ char test_tablename[sizeof test_filename
+ + sizeof(srv_mysql50_table_name_prefix) - 1];
+ if ((sizeof(test_tablename)) - 1
+ != filename_to_tablename(test_filename,
+ test_tablename,
+ sizeof(test_tablename), true)
+ || strncmp(test_tablename,
+ srv_mysql50_table_name_prefix,
+ sizeof(srv_mysql50_table_name_prefix) - 1)
+ || strcmp(test_tablename
+ + sizeof(srv_mysql50_table_name_prefix) - 1,
+ test_filename)) {
+
+ sql_print_error("tablename encoding has been changed");
+
+ goto error;
+ }
+#endif /* DBUG_OFF */
+
+ /* Check that values don't overflow on 32-bit systems. */
+ if (sizeof(ulint) == 4) {
+ if (innobase_buffer_pool_size > UINT_MAX32) {
+ sql_print_error(
+ "innobase_buffer_pool_size can't be over 4GB"
+ " on 32-bit systems");
+
+ goto error;
+ }
+ }
+
+ os_innodb_umask = (ulint) my_umask;
+
+ /* First calculate the default path for innodb_data_home_dir etc.,
+ in case the user has not given any value.
+
+	Note that when using the embedded server, the data directory is
+	not necessarily the current directory of this program. */
+
+ if (mysqld_embedded) {
+ default_path = mysql_real_data_home;
+ fil_path_to_mysql_datadir = mysql_real_data_home;
+ } else {
+ /* It's better to use current lib, to keep paths short */
+ current_dir[0] = FN_CURLIB;
+ current_dir[1] = FN_LIBCHAR;
+ current_dir[2] = 0;
+ default_path = current_dir;
+ }
+
+ ut_a(default_path);
+
+ /* Set InnoDB initialization parameters according to the values
+ read from MySQL .cnf file */
+
+ /*--------------- Data files -------------------------*/
+
+ /* The default dir for data files is the datadir of MySQL */
+
+ srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
+ default_path);
+
+	/* Set the default InnoDB data file size to 12 MB and let it be
+	auto-extending. Thus users can use InnoDB in MySQL >= 4.0 without
+	having to specify any startup options. */
+
+ if (!innobase_data_file_path) {
+ innobase_data_file_path = (char*) "ibdata1:12M:autoextend";
+ }
+
+ /* Since InnoDB edits the argument in the next call, we make another
+ copy of it: */
+
+ internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+ MYF(MY_FAE));
+
+ ret = (bool) srv_parse_data_file_paths_and_sizes(
+ internal_innobase_data_file_path);
+ if (ret == FALSE) {
+ sql_print_error(
+ "InnoDB: syntax error in innodb_data_file_path"
+ " or size specified is less than 1 megabyte");
+mem_free_and_error:
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path);
+ goto error;
+ }
+
+ /* -------------- All log files ---------------------------*/
+
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!srv_log_group_home_dir) {
+ srv_log_group_home_dir = default_path;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Since innodb_log_arch_dir has no relevance under MySQL,
+ starting from 4.0.6 we always set it the same as
+ innodb_log_group_home_dir: */
+
+ innobase_log_arch_dir = innobase_log_group_home_dir;
+
+ srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ if (strchr(srv_log_group_home_dir, ';')) {
+ sql_print_error("syntax error in innodb_log_group_home_dir");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 1) {
+ sql_print_warning(
+ "innodb_mirrored_log_groups is an unimplemented "
+ "feature and the variable will be completely "
+ "removed in a future version.");
+ }
+
+ if (innobase_mirrored_log_groups > 1) {
+ sql_print_error(
+ "innodb_mirrored_log_groups is an unimplemented feature and "
+ "the variable will be completely removed in a future version. "
+ "Using values other than 1 is not supported.");
+ goto mem_free_and_error;
+ }
+
+ if (innobase_mirrored_log_groups == 0) {
+ /* To throw a deprecation warning message when the option is
+ passed, the default was changed to '0' (as a workaround). Since
+ the only value accepted for this option is '1', reset it to 1 */
+ innobase_mirrored_log_groups = 1;
+ }
+
+ /* Validate the file format by animal name */
+ if (innobase_file_format_name != NULL) {
+
+ format_id = innobase_file_format_name_lookup(
+ innobase_file_format_name);
+
+ if (format_id > UNIV_FORMAT_MAX) {
+
+ sql_print_error("InnoDB: wrong innodb_file_format.");
+
+ goto mem_free_and_error;
+ }
+ } else {
+ /* Set it to the default file format id. Though this
+ should never happen. */
+ format_id = 0;
+ }
+
+ srv_file_format = format_id;
+
+ /* Given the type of innobase_file_format_name we have little
+ choice but to cast away the constness from the returned name.
+ innobase_file_format_name is used in the MySQL set variable
+ interface and so can't be const. */
+
+ innobase_file_format_name =
+ (char*) trx_sys_file_format_id_to_name(format_id);
+
+ /* Check innobase_file_format_check variable */
+ if (!innobase_file_format_check) {
+
+ /* Set the value to disable checking. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1;
+
+ } else {
+
+ /* Set the value to the lowest supported format. */
+ srv_max_file_format_at_startup = UNIV_FORMAT_MIN;
+ }
+
+ /* Did the user specify a format name that we support?
+ As a side effect it will update the variable
+ srv_max_file_format_at_startup */
+ if (innobase_file_format_validate_and_set(
+ innobase_file_format_max) < 0) {
+
+ sql_print_error("InnoDB: invalid "
+ "innodb_file_format_max value: "
+ "should be any value up to %s or its "
+ "equivalent numeric id",
+ trx_sys_file_format_id_to_name(
+ UNIV_FORMAT_MAX));
+
+ goto mem_free_and_error;
+ }
+
+ if (innobase_change_buffering) {
+ ulint use;
+
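+		/* Map the innodb_change_buffering option string to the
+		corresponding ibuf_use_t value with a case-insensitive
+		lookup in innobase_change_buffering_values[]. */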
+ for (use = 0;
+ use < UT_ARR_SIZE(innobase_change_buffering_values);
+ use++) {
+ if (!innobase_strcasecmp(
+ innobase_change_buffering,
+ innobase_change_buffering_values[use])) {
+ ibuf_use = (ibuf_use_t) use;
+ goto innobase_change_buffering_inited_ok;
+ }
+ }
+
+ sql_print_error("InnoDB: invalid value "
+ "innodb_change_buffering=%s",
+ innobase_change_buffering);
+ goto mem_free_and_error;
+ }
+
+innobase_change_buffering_inited_ok:
+ ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
+ innobase_change_buffering = (char*)
+ innobase_change_buffering_values[ibuf_use];
+
+ /* Check that interdependent parameters have sane values. */
+ if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+ sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.\n"
+ "InnoDB: Setting"
+ " innodb_max_dirty_pages_pct_lwm to %lu\n",
+ srv_max_buf_pool_modified_pct);
+
+ srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+ }
+
+ if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+ if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+ /* Avoid overflow. */
+ srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+ } else {
+ /* The user has not set the value. We should
+ set it based on innodb_io_capacity. */
+ srv_max_io_capacity = static_cast<ulong>(
+ ut_max(2 * srv_io_capacity, 2000));
+ }
+
+ } else if (srv_max_io_capacity < srv_io_capacity) {
+ sql_print_warning("InnoDB: innodb_io_capacity"
+ " cannot be set higher than"
+ " innodb_io_capacity_max.\n"
+ "InnoDB: Setting"
+ " innodb_io_capacity to %lu\n",
+ srv_max_io_capacity);
+
+ srv_io_capacity = srv_max_io_capacity;
+ }
+
+ if (!is_filename_allowed(srv_buf_dump_filename,
+ strlen(srv_buf_dump_filename), FALSE)) {
+ sql_print_error("InnoDB: innodb_buffer_pool_filename"
+ " cannot have colon (:) in the file name.");
+ goto mem_free_and_error;
+ }
+
+ /* --------------------------------------------------*/
+
+ srv_file_flush_method_str = innobase_file_flush_method;
+
+ srv_log_file_size = (ib_uint64_t) innobase_log_file_size;
+
+#ifdef UNIV_LOG_ARCHIVE
+ srv_log_archive_on = (ulint) innobase_log_archive;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* Check that the value of system variable innodb_page_size was
+ set correctly. Its value was put into srv_page_size. If valid,
+ return the associated srv_page_size_shift.*/
+ srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+ if (!srv_page_size_shift) {
+ sql_print_error("InnoDB: Invalid page size=%lu.\n",
+ srv_page_size);
+ goto mem_free_and_error;
+ }
+ if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: innodb-page-size has been changed"
+ " from the default value %d to %lu.\n",
+ UNIV_PAGE_SIZE_DEF, srv_page_size);
+ }
+
+ srv_log_buffer_size = (ulint) innobase_log_buffer_size;
+
+ if (innobase_buffer_pool_instances == 0) {
+ innobase_buffer_pool_instances = 8;
+
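+		/* On 32-bit Windows, if the buffer pool is larger than
+		1331 MB, use roughly one buffer pool instance per 128 MB,
+		capped at MAX_BUFFER_POOLS. */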
+#if defined(__WIN__) && !defined(_WIN64)
+ if (innobase_buffer_pool_size > 1331 * 1024 * 1024) {
+ innobase_buffer_pool_instances
+ = ut_min(MAX_BUFFER_POOLS,
+ (long) (innobase_buffer_pool_size
+ / (128 * 1024 * 1024)));
+ }
+#endif /* defined(__WIN__) && !defined(_WIN64) */
+ }
+ srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+ srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances;
+
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+ if (innobase_additional_mem_pool_size
+ != 8*1024*1024L /* the default */ ) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Using "
+ "innodb_additional_mem_pool_size is DEPRECATED. "
+ "This option may be removed in future releases, "
+ "together with the option innodb_use_sys_malloc "
+ "and with the InnoDB's internal memory "
+ "allocator.\n");
+ }
+
+ if (!srv_use_sys_malloc ) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Setting "
+ "innodb_use_sys_malloc to FALSE is DEPRECATED. "
+ "This option may be removed in future releases, "
+ "together with the InnoDB's internal memory "
+ "allocator.\n");
+ }
+
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+
+ if (!innobase_use_checksums) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Setting "
+ "innodb_checksums to OFF is DEPRECATED. "
+ "This option may be removed in future releases. "
+ "You should set innodb_checksum_algorithm=NONE "
+ "instead.\n");
+ srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE;
+ }
+
+#ifdef HAVE_LARGE_PAGES
+ if ((os_use_large_pages = (ibool) my_use_large_pages)) {
+ os_large_page_size = (ulint) opt_large_page_size;
+ }
+#endif
+
+ row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+ srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+ if (innobase_locks_unsafe_for_binlog) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: Using "
+ "innodb_locks_unsafe_for_binlog is DEPRECATED. "
+ "This option may be removed in future releases. "
+ "Please use READ COMMITTED transaction isolation "
+ "level instead, see " REFMAN "set-transaction.html.\n");
+ }
+
+ if (innobase_open_files < 10) {
+ innobase_open_files = 300;
+ if (srv_file_per_table && table_cache_size > 300) {
+ innobase_open_files = table_cache_size;
+ }
+ }
+ srv_max_n_open_files = (ulint) innobase_open_files;
+ srv_innodb_status = (ibool) innobase_create_status_file;
+
+ srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+	/* Round fts_sort_pll_degree up to the nearest power of 2,
+	e.g. 3 becomes 4 */
+ for (num_pll_degree = 1;
+ num_pll_degree < fts_sort_pll_degree;
+ num_pll_degree <<= 1) {
+
+ /* No op */
+ }
+
+ fts_sort_pll_degree = num_pll_degree;
+
+ /* Store the default charset-collation number of this MySQL
+ installation */
+
+ data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+ ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
+ my_charset_latin1.number);
+ ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
+
+ /* Store the latin1_swedish_ci character ordering table to InnoDB. For
+ non-latin1_swedish_ci charsets we use the MySQL comparison functions,
+ and consequently we do not need to know the ordering internally in
+ InnoDB. */
+
+ ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci"));
+ srv_latin1_ordering = my_charset_latin1.sort_order;
+
+ innobase_commit_concurrency_init_default();
+
+#ifdef HAVE_PSI_INTERFACE
+ /* Register keys with MySQL performance schema */
+ int count;
+
+ count = array_elements(all_pthread_mutexes);
+ mysql_mutex_register("innodb", all_pthread_mutexes, count);
+
+# ifdef UNIV_PFS_MUTEX
+ count = array_elements(all_innodb_mutexes);
+ mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+ count = array_elements(all_innodb_rwlocks);
+ mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+ count = array_elements(all_innodb_threads);
+ mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+ count = array_elements(all_innodb_files);
+ mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+
+ count = array_elements(all_innodb_conds);
+ mysql_cond_register("innodb", all_innodb_conds, count);
+#endif /* HAVE_PSI_INTERFACE */
+
+	/* Since in this module we directly access the fields of a trx
+	struct, and since different headers and flags might cause
+	ib_mutex_t to have a different size in this module than in the
+	InnoDB modules, we check at run time that the size is the same
+	in these compilation units. */
+
+ err = innobase_start_or_create_for_mysql();
+
+ if (err != DB_SUCCESS) {
+ goto mem_free_and_error;
+ }
+
+ /* Adjust the innodb_undo_logs config object */
+ innobase_undo_logs_init_default_max();
+
+ innobase_old_blocks_pct = static_cast<uint>(
+ buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE));
+
+ ibuf_max_size_update(innobase_change_buffer_max_size);
+
+ innobase_open_tables = hash_create(200);
+ mysql_mutex_init(innobase_share_mutex_key,
+ &innobase_share_mutex,
+ MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(commit_cond_mutex_key,
+ &commit_cond_m, MY_MUTEX_INIT_FAST);
+ mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+ innodb_inited= 1;
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ if (innobase_hton != p) {
+ innobase_hton = reinterpret_cast<handlerton*>(p);
+ *innobase_hton = *innodb_hton_ptr;
+ }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+ /* Get the current high water mark format. */
+ innobase_file_format_max = (char*) trx_sys_file_format_max_get();
+
+	/* Currently, monitor counter information is not persistent. */
+ memset(monitor_set_tbl, 0, sizeof monitor_set_tbl);
+
+ memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+	/* Do this as late as possible so that the server has fully
+	started up, since we might get some initial stats if the user
+	chooses to turn on some counters at startup */
+ if (innobase_enable_monitor_counter) {
+ innodb_enable_monitor_at_startup(
+ innobase_enable_monitor_counter);
+ }
+
+	/* Turn on the monitor counters that are on by default */
+ srv_mon_default_on();
+
+ DBUG_RETURN(FALSE);
+error:
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+Closes an InnoDB database.
+@return TRUE if error */
+static
+int
+innobase_end(
+/*=========*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ ha_panic_function type __attribute__((unused)))
+ /*!< in: ha_panic() parameter */
+{
+ int err= 0;
+
+ DBUG_ENTER("innobase_end");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (innodb_inited) {
+
+ srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+
+ innodb_inited = 0;
+ hash_table_free(innobase_open_tables);
+ innobase_open_tables = NULL;
+ if (innobase_shutdown_for_mysql() != DB_SUCCESS) {
+ err = 1;
+ }
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path);
+ mysql_mutex_destroy(&innobase_share_mutex);
+ mysql_mutex_destroy(&commit_cond_m);
+ mysql_cond_destroy(&commit_cond);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton) /*!< in/out: InnoDB handlerton */
+{
+ bool result = 0;
+
+ DBUG_ENTER("innobase_flush_logs");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!srv_read_only_mode) {
+ log_buffer_flush_to_disk();
+ }
+
+ DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (trx_is_started(trx)) {
+
+ trx_commit_for_mysql(trx);
+ }
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have
+one, starts a new InnoDB transaction if one is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not
+yet have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+	THD*		thd)	/*!< in: MySQL thread handle of the user for
+				whom the transaction should be started */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* Create a new trx struct for thd, if it does not yet have one */
+
+ trx = check_trx_exists(thd);
+
+ /* This is just to play safe: release a possible FIFO ticket and
+ search latch. Since we can potentially reserve the trx_sys->mutex,
+ we have to release the search system latch first to obey the latching
+ order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started_xa(trx);
+
+ /* Assign a read view if the transaction does not have it yet.
+ Do this only if transaction is using REPEATABLE READ isolation
+ level. */
+ trx->isolation_level = innobase_map_isolation_level(
+ thd_get_trx_isolation(thd));
+
+ if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
+ trx_assign_read_view(trx);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_UNSUPPORTED,
+ "InnoDB: WITH CONSISTENT SNAPSHOT "
+ "was ignored because this phrase "
+ "can only be used with "
+ "REPEATABLE READ isolation level.");
+ }
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ innobase_register_trx(hton, current_thd, trx);
+
+ DBUG_RETURN(0);
+}
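+
+/* Editor's note: the handlerton hook above backs the SQL statement
+START TRANSACTION WITH CONSISTENT SNAPSHOT. As the code shows, the read
+view is only assigned under REPEATABLE READ; at any other isolation
+level the snapshot request is ignored and a warning is pushed instead. */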
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the
+ user for whom the transaction should
+ be committed */
+ bool commit_trx) /*!< in: true - commit transaction
+ false - the current SQL statement
+ ended */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_commit");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("ending transaction"));
+
+ trx = check_trx_exists(thd);
+
+ /* Since we will reserve the trx_sys->mutex, we have to release
+ the search system latch first to obey the latching order. */
+
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ /* Transaction is deregistered only in a commit or a rollback. If
+ it is deregistered we know there cannot be resources to be freed
+ and we could return immediately. For the time being, we play safe
+ and do the cleanup though there should be nothing to clean up. */
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MySQL 2PC, "
+ "but transaction is active");
+ }
+
+ if (commit_trx
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* We were instructed to commit the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ /* We need current binlog position for mysqlbackup to work. */
+retry:
+ if (innobase_commit_concurrency > 0) {
+ mysql_mutex_lock(&commit_cond_m);
+ commit_threads++;
+
+ if (commit_threads > innobase_commit_concurrency) {
+ commit_threads--;
+ mysql_cond_wait(&commit_cond,
+ &commit_cond_m);
+ mysql_mutex_unlock(&commit_cond_m);
+ goto retry;
+			} else {
+ mysql_mutex_unlock(&commit_cond_m);
+ }
+ }
+
+		/* The following call reads the binary log position of
+ the transaction being committed.
+
+ Binary logging of other engines is not relevant to
+ InnoDB as all InnoDB requires is that committing
+ InnoDB transactions appear in the same order in the
+ MySQL binary log as they appear in InnoDB logs, which
+ is guaranteed by the server.
+
+ If the binary log is not enabled, or the transaction
+ is not written to the binary log, the file name will
+ be a NULL pointer. */
+ unsigned long long pos;
+ thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos);
+ trx->mysql_log_offset= static_cast<ib_int64_t>(pos);
+ /* Don't do write + flush right now. For group commit
+ to work we want to do the flush later. */
+ trx->flush_log_later = TRUE;
+ innobase_commit_low(trx);
+ trx->flush_log_later = FALSE;
+
+ if (innobase_commit_concurrency > 0) {
+ mysql_mutex_lock(&commit_cond_m);
+ commit_threads--;
+ mysql_cond_signal(&commit_cond);
+ mysql_mutex_unlock(&commit_cond_m);
+ }
+
+ trx_deregister_from_2pc(trx);
+
+ /* Now do a write + flush of logs. */
+ trx_commit_complete_for_mysql(trx);
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction commit */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ lock_unlock_table_autoinc(trx);
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+	trx->n_autoinc_rows = 0; /* Reset the number of AUTO-INC rows required */
+
+ /* This is a statement level variable. */
+ trx->fts_next_doc_id = 0;
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ DBUG_RETURN(0);
+}
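+
+/* Editor's note: the innobase_commit_concurrency gate in
+innobase_commit() above is a counting semaphore built from a mutex and
+a condition variable. A minimal stand-alone sketch of the same
+admission pattern, with hypothetical names and shown as an
+illustration only:
+
+	static mysql_mutex_t	gate_mutex;
+	static mysql_cond_t	gate_cond;
+	static ulong		gate_threads = 0;
+
+	static void gate_enter(ulong limit)
+	{
+		mysql_mutex_lock(&gate_mutex);
+		while (gate_threads >= limit) {
+			mysql_cond_wait(&gate_cond, &gate_mutex);
+		}
+		gate_threads++;
+		mysql_mutex_unlock(&gate_mutex);
+	}
+
+	static void gate_exit(void)
+	{
+		mysql_mutex_lock(&gate_mutex);
+		gate_threads--;
+		mysql_cond_signal(&gate_cond);
+		mysql_mutex_unlock(&gate_mutex);
+	}
+
+The code above achieves the same effect with a goto-based retry,
+because it drops the mutex before re-checking the limit. */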
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back */
+ bool rollback_trx) /*!< in: TRUE - rollback entire
+ transaction FALSE - rollback the current
+ statement only */
+{
+ dberr_t error;
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_rollback");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+	trx->n_autoinc_rows = 0; /* Reset the number of AUTO-INC rows required */
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+
+ lock_unlock_table_autoinc(trx);
+
+ /* This is a statement level variable. */
+ trx->fts_next_doc_id = 0;
+
+ if (rollback_trx
+ || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ error = trx_rollback_for_mysql(trx);
+ trx_deregister_from_2pc(trx);
+ } else {
+ error = trx_rollback_last_sql_stat_for_mysql(trx);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ dberr_t error = DB_SUCCESS;
+
+ DBUG_ENTER("innobase_rollback_trx");
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+
+ lock_unlock_table_autoinc(trx);
+
+ if (!trx->read_only) {
+ error = trx_rollback_for_mysql(trx);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton* hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+ void* savepoint) /*!< in: savepoint data */
+{
+ ib_int64_t mysql_binlog_cache_pos;
+ dberr_t error;
+ trx_t* trx;
+ char name[64];
+
+ DBUG_ENTER("innobase_rollback_to_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ longlong2str((ulint) savepoint, name, 36);
+
+ error = trx_rollback_to_savepoint_for_mysql(
+ trx, name, &mysql_binlog_cache_pos);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_rollback(trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Check whether the InnoDB state allows MDL locks to be safely released
+after a rollback to savepoint.
+When the binlog is on, MDL locks acquired after the savepoint are not
+released if any locks are still held in InnoDB.
+@return true if it is safe, false if it is not safe. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread
+ of the user whose transaction should
+ be rolled back to savepoint */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+ ut_ad(trx);
+
+	/* If the transaction has not acquired any locks, then it is
+	safe to release MDL after rollback to savepoint */
+ if (!(UT_LIST_GET_LEN(trx->lock.trx_locks))) {
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton for Innodb */
+ THD* thd, /*!< in: handle to the MySQL thread
+ of the user whose transaction's
+ savepoint should be released */
+ void* savepoint) /*!< in: savepoint data */
+{
+ dberr_t error;
+ trx_t* trx;
+ char name[64];
+
+ DBUG_ENTER("innobase_release_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ longlong2str((ulint) savepoint, name, 36);
+
+ error = trx_release_savepoint_for_mysql(trx, name);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_release(trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in: handle to the Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread */
+ void* savepoint) /*!< in: savepoint data */
+{
+ dberr_t error;
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* In autocommit mode it makes no sense to set a savepoint
+	(unless we are in a sub-statement), so the SQL layer ensures
+	that this method is never called in such a situation. */
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* Cannot happen outside of transaction */
+ DBUG_ASSERT(trx_is_registered_for_2pc(trx));
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+ char name[64];
+	longlong2str((ulint) savepoint, name, 36);
+
+ error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+
+ if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+ fts_savepoint_take(trx, trx->fts_trx, name);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
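+
+/* Editor's note: the savepoint handle passed down by the server is an
+opaque pointer; here and in the rollback/release functions above, the
+handler derives the InnoDB savepoint name by printing that pointer in
+base 36. Illustration with a hypothetical pointer value:
+
+	char name[64];
+	longlong2str((ulint) savepoint, name, 36);
+
+If the pointer value were 42, name would become "16" (1*36 + 6). */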
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in: innobase handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread of the user
+ whose resources should be free'd */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_close_connection");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ trx = thd_to_trx(thd);
+
+ ut_a(trx);
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MySQL 2PC, "
+ "but transaction is active");
+ }
+
+ if (trx_is_started(trx) && log_warnings) {
+
+ sql_print_warning(
+ "MySQL is closing a connection that has an active "
+ "InnoDB transaction. " TRX_ID_FMT " row modifications "
+ "will roll back.",
+ trx->undo_no);
+ }
+
+ innobase_rollback_trx(trx);
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+ THD* thd) /*!< in: handle to the MySQL thread of the user
+ whose resources should be free'd */
+{
+ trx_t* trx = thd_to_trx(thd);
+
+ if (!trx) {
+ return(0);
+ }
+
+ return(innobase_close_connection(innodb_hton_ptr, thd));
+}
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/****************************************************************//**
+Get the record format from the data dictionary.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+UNIV_INTERN
+enum row_type
+ha_innobase::get_row_type() const
+/*=============================*/
+{
+ if (prebuilt && prebuilt->table) {
+ const ulint flags = prebuilt->table->flags;
+
+ switch (dict_tf_get_rec_format(flags)) {
+ case REC_FORMAT_REDUNDANT:
+ return(ROW_TYPE_REDUNDANT);
+ case REC_FORMAT_COMPACT:
+ return(ROW_TYPE_COMPACT);
+ case REC_FORMAT_COMPRESSED:
+ return(ROW_TYPE_COMPRESSED);
+ case REC_FORMAT_DYNAMIC:
+ return(ROW_TYPE_DYNAMIC);
+ }
+ }
+ ut_ad(0);
+ return(ROW_TYPE_NOT_USED);
+}
+
+/****************************************************************//**
+Get the table flags to use for the statement.
+@return table flags */
+UNIV_INTERN
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+ /* Need to use tx_isolation here since table flags is (also)
+ called before prebuilt is inited. */
+ ulong const tx_isolation = thd_tx_isolation(ha_thd());
+
+ if (tx_isolation <= ISO_READ_COMMITTED) {
+ return(int_table_flags);
+ }
+
+ return(int_table_flags | HA_BINLOG_STMT_CAPABLE);
+}
+
+/****************************************************************//**
+Gives the file extension of an InnoDB single-table tablespace. */
+static const char* ha_innobase_exts[] = {
+ ".ibd",
+ ".isl",
+ NullS
+};
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return table type */
+UNIV_INTERN
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+ return(innobase_hton_name);
+}
+
+/****************************************************************//**
+Returns the index type.
+@return index type */
+UNIV_INTERN
+const char*
+ha_innobase::index_type(
+/*====================*/
+	uint	keynr)		/*!< in: index number */
+{
+ dict_index_t* index = innobase_get_index(keynr);
+
+ if (index && index->type & DICT_FTS) {
+ return("FULLTEXT");
+ } else {
+ return("BTREE");
+ }
+}
+
+/****************************************************************//**
+Returns the table file name extension.
+@return file extension string */
+UNIV_INTERN
+const char**
+ha_innobase::bas_ext() const
+/*========================*/
+{
+ return(ha_innobase_exts);
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes.
+@return flags of supported operations */
+UNIV_INTERN
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+ uint key,
+ uint,
+ bool) const
+{
+ return((table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT)
+ ? 0
+ : (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+ | HA_READ_RANGE | HA_KEYREAD_ONLY
+ | HA_DO_INDEX_COND_PUSHDOWN));
+}
+
+/****************************************************************//**
+Returns the maximum number of keys.
+@return MAX_KEY */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+ return(MAX_KEY);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return maximum supported key length, in bytes */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+ /* An InnoDB page must store >= 2 keys; a secondary key record
+ must also contain the primary key value. Therefore, if both
+ the primary key and the secondary key are at this maximum length,
+ it must be less than 1/4th of the free space on a page including
+ record overhead.
+
+	MySQL imposes its own limit on this number: MAX_KEY_LENGTH = 3072.
+
+	For page sizes = 16k, InnoDB historically reported 3500 bytes here,
+	but the MySQL limit of 3072 was always used through the handler
+	interface. */
+
+ switch (UNIV_PAGE_SIZE) {
+ case 4096:
+ return(768);
+ case 8192:
+ return(1536);
+ default:
+ return(3500);
+ }
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return key_map_full */
+UNIV_INTERN
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
+{
+ return(&key_map_full);
+}
+
+/****************************************************************//**
+Determines if table caching is supported.
+@return HA_CACHE_TBL_ASKTRANSACT */
+UNIV_INTERN
+uint8
+ha_innobase::table_cache_type()
+/*===========================*/
+{
+ return(HA_CACHE_TBL_ASKTRANSACT);
+}
+
+/****************************************************************//**
+Determines if the primary key is clustered index.
+@return true */
+UNIV_INTERN
+bool
+ha_innobase::primary_key_is_clustered()
+/*===================================*/
+{
+ return(true);
+}
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. Example: test/mytable.
+On Windows normalization puts both the database name and the
+table name always to lower case if "set_lower_case" is set to TRUE. */
+static
+void
+normalize_table_name_low(
+/*=====================*/
+ char* norm_name, /*!< out: normalized name as a
+ null-terminated string */
+ const char* name, /*!< in: table name string */
+ ibool set_lower_case) /*!< in: TRUE if we want to set name
+ to lower case */
+{
+ char* name_ptr;
+ ulint name_len;
+ char* db_ptr;
+ ulint db_len;
+ char* ptr;
+ ulint norm_len;
+
+ /* Scan name from the end */
+
+ ptr = strend(name) - 1;
+
+ /* seek to the last path separator */
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ }
+
+ name_ptr = ptr + 1;
+ name_len = strlen(name_ptr);
+
+ /* skip any number of path separators */
+ while (ptr >= name && (*ptr == '\\' || *ptr == '/')) {
+ ptr--;
+ }
+
+ DBUG_ASSERT(ptr >= name);
+
+ /* seek to the last but one path separator or one char before
+ the beginning of name */
+ db_len = 0;
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ db_len++;
+ }
+
+ db_ptr = ptr + 1;
+
+ norm_len = db_len + name_len + sizeof "/";
+ ut_a(norm_len < FN_REFLEN - 1);
+
+ memcpy(norm_name, db_ptr, db_len);
+
+ norm_name[db_len] = '/';
+
+ /* Copy the name and null-byte. */
+ memcpy(norm_name + db_len + 1, name_ptr, name_len + 1);
+
+ if (set_lower_case) {
+ innobase_casedn_str(norm_name);
+ }
+}
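+
+/* Editor's note: a quick illustration of the mapping performed by
+normalize_table_name_low(); see also the unit test below:
+
+	char norm_name[FN_REFLEN];
+	normalize_table_name_low(norm_name, "./test/t1", FALSE);
+
+After the call, norm_name contains "test/t1". */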
+
+#if !defined(DBUG_OFF)
+/*********************************************************************
+Test normalize_table_name_low(). */
+static
+void
+test_normalize_table_name_low()
+/*===========================*/
+{
+ char norm_name[FN_REFLEN];
+ const char* test_data[][2] = {
+ /* input, expected result */
+ {"./mysqltest/t1", "mysqltest/t1"},
+ {"./test/#sql-842b_2", "test/#sql-842b_2"},
+ {"./test/#sql-85a3_10", "test/#sql-85a3_10"},
+ {"./test/#sql2-842b-2", "test/#sql2-842b-2"},
+ {"./test/bug29807", "test/bug29807"},
+ {"./test/foo", "test/foo"},
+ {"./test/innodb_bug52663", "test/innodb_bug52663"},
+ {"./test/t", "test/t"},
+ {"./test/t1", "test/t1"},
+ {"./test/t10", "test/t10"},
+ {"/a/b/db/table", "db/table"},
+ {"/a/b/db///////table", "db/table"},
+ {"/a/b////db///////table", "db/table"},
+ {"/var/tmp/mysqld.1/#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+ {"db/table", "db/table"},
+ {"ddd/t", "ddd/t"},
+ {"d/ttt", "d/ttt"},
+ {"d/t", "d/t"},
+ {".\\mysqltest\\t1", "mysqltest/t1"},
+ {".\\test\\#sql-842b_2", "test/#sql-842b_2"},
+ {".\\test\\#sql-85a3_10", "test/#sql-85a3_10"},
+ {".\\test\\#sql2-842b-2", "test/#sql2-842b-2"},
+ {".\\test\\bug29807", "test/bug29807"},
+ {".\\test\\foo", "test/foo"},
+ {".\\test\\innodb_bug52663", "test/innodb_bug52663"},
+ {".\\test\\t", "test/t"},
+ {".\\test\\t1", "test/t1"},
+ {".\\test\\t10", "test/t10"},
+ {"C:\\a\\b\\db\\table", "db/table"},
+ {"C:\\a\\b\\db\\\\\\\\\\\\\\table", "db/table"},
+ {"C:\\a\\b\\\\\\\\db\\\\\\\\\\\\\\table", "db/table"},
+ {"C:\\var\\tmp\\mysqld.1\\#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+ {"db\\table", "db/table"},
+ {"ddd\\t", "ddd/t"},
+ {"d\\ttt", "d/ttt"},
+ {"d\\t", "d/t"},
+ };
+
+ for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+ printf("test_normalize_table_name_low(): "
+ "testing \"%s\", expected \"%s\"... ",
+ test_data[i][0], test_data[i][1]);
+
+ normalize_table_name_low(norm_name, test_data[i][0], FALSE);
+
+ if (strcmp(norm_name, test_data[i][1]) == 0) {
+ printf("ok\n");
+ } else {
+ printf("got \"%s\"\n", norm_name);
+ ut_error;
+ }
+ }
+}
+
+/*********************************************************************
+Test ut_format_name(). */
+static
+void
+test_ut_format_name()
+/*=================*/
+{
+ char buf[NAME_LEN * 3];
+
+ struct {
+ const char* name;
+ ibool is_table;
+ ulint buf_size;
+ const char* expected;
+ } test_data[] = {
+ {"test/t1", TRUE, sizeof(buf), "\"test\".\"t1\""},
+ {"test/t1", TRUE, 12, "\"test\".\"t1\""},
+ {"test/t1", TRUE, 11, "\"test\".\"t1"},
+ {"test/t1", TRUE, 10, "\"test\".\"t"},
+ {"test/t1", TRUE, 9, "\"test\".\""},
+ {"test/t1", TRUE, 8, "\"test\"."},
+ {"test/t1", TRUE, 7, "\"test\""},
+ {"test/t1", TRUE, 6, "\"test"},
+ {"test/t1", TRUE, 5, "\"tes"},
+ {"test/t1", TRUE, 4, "\"te"},
+ {"test/t1", TRUE, 3, "\"t"},
+ {"test/t1", TRUE, 2, "\""},
+ {"test/t1", TRUE, 1, ""},
+ {"test/t1", TRUE, 0, "BUF_NOT_CHANGED"},
+ {"table", TRUE, sizeof(buf), "\"table\""},
+ {"ta'le", TRUE, sizeof(buf), "\"ta'le\""},
+ {"ta\"le", TRUE, sizeof(buf), "\"ta\"\"le\""},
+ {"ta`le", TRUE, sizeof(buf), "\"ta`le\""},
+ {"index", FALSE, sizeof(buf), "\"index\""},
+ {"ind/ex", FALSE, sizeof(buf), "\"ind/ex\""},
+ };
+
+ for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+
+ memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
+
+ char* ret;
+
+ ret = ut_format_name(test_data[i].name,
+ test_data[i].is_table,
+ buf,
+ test_data[i].buf_size);
+
+ ut_a(ret == buf);
+
+ if (strcmp(buf, test_data[i].expected) == 0) {
+ fprintf(stderr,
+ "ut_format_name(%s, %s, buf, %lu), "
+ "expected %s, OK\n",
+ test_data[i].name,
+ test_data[i].is_table ? "TRUE" : "FALSE",
+ test_data[i].buf_size,
+ test_data[i].expected);
+ } else {
+ fprintf(stderr,
+ "ut_format_name(%s, %s, buf, %lu), "
+ "expected %s, ERROR: got %s\n",
+ test_data[i].name,
+ test_data[i].is_table ? "TRUE" : "FALSE",
+ test_data[i].buf_size,
+ test_data[i].expected,
+ buf);
+ ut_error;
+ }
+ }
+}
+#endif /* !DBUG_OFF */
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+UNIV_INTERN
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+{
+ ulonglong max_value = 0;
+
+ switch (field->key_type()) {
+ /* TINY */
+ case HA_KEYTYPE_BINARY:
+ max_value = 0xFFULL;
+ break;
+ case HA_KEYTYPE_INT8:
+ max_value = 0x7FULL;
+ break;
+ /* SHORT */
+ case HA_KEYTYPE_USHORT_INT:
+ max_value = 0xFFFFULL;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ max_value = 0x7FFFULL;
+ break;
+ /* MEDIUM */
+ case HA_KEYTYPE_UINT24:
+ max_value = 0xFFFFFFULL;
+ break;
+ case HA_KEYTYPE_INT24:
+ max_value = 0x7FFFFFULL;
+ break;
+ /* LONG */
+ case HA_KEYTYPE_ULONG_INT:
+ max_value = 0xFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ max_value = 0x7FFFFFFFULL;
+ break;
+ /* BIG */
+ case HA_KEYTYPE_ULONGLONG:
+ max_value = 0xFFFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONGLONG:
+ max_value = 0x7FFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ /* We use the maximum as per IEEE754-2008 standard, 2^24 */
+ max_value = 0x1000000ULL;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ /* We use the maximum as per IEEE754-2008 standard, 2^53 */
+ max_value = 0x20000000000000ULL;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(max_value);
+}
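+
+/* Editor's note: the FLOAT/DOUBLE limits above are the largest values
+n such that every integer in [0, n] is exactly representable: 2^24 for
+IEEE 754 binary32 (24-bit significand) and 2^53 for binary64 (53-bit
+significand). A stand-alone check, shown as an illustration only:
+
+	float	f = 16777216.0f;		(2^24)
+	double	d = 9007199254740992.0;		(2^53)
+
+	ut_a(f + 1.0f == f);	(2^24 + 1 rounds back to 2^24)
+	ut_a(d + 1.0 == d);	(2^53 + 1 rounds back to 2^53)
+*/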
+
+/*******************************************************************//**
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@return TRUE if all column types match. */
+static
+ibool
+innobase_match_index_columns(
+/*=========================*/
+ const KEY* key_info, /*!< in: Index info
+ from mysql */
+ const dict_index_t* index_info) /*!< in: Index info
+ from Innodb */
+{
+ const KEY_PART_INFO* key_part;
+ const KEY_PART_INFO* key_end;
+ const dict_field_t* innodb_idx_fld;
+ const dict_field_t* innodb_idx_fld_end;
+
+ DBUG_ENTER("innobase_match_index_columns");
+
+ /* Check whether user defined index column count matches */
+ if (key_info->user_defined_key_parts !=
+ index_info->n_user_defined_cols) {
+ DBUG_RETURN(FALSE);
+ }
+
+ key_part = key_info->key_part;
+ key_end = key_part + key_info->user_defined_key_parts;
+ innodb_idx_fld = index_info->fields;
+ innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check
+	the column name because there are cases where an index
+	column name was modified in MySQL but the change did not
+	propagate to InnoDB.
+	One hidden assumption here is that the index column sequences
+	are matched up between those in MySQL and InnoDB. */
+ for (; key_part != key_end; ++key_part) {
+ ulint col_type;
+ ibool is_unsigned;
+ ulint mtype = innodb_idx_fld->col->mtype;
+
+ /* Need to translate to InnoDB column type before
+ comparison. */
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned,
+ key_part->field);
+
+		/* Ignore InnoDB specific system columns. Re-read the
+		type after advancing; otherwise this loop could never
+		terminate. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+ if (col_type != mtype) {
+ /* Column Type mismatches */
+ DBUG_RETURN(FALSE);
+ }
+
+ innodb_idx_fld++;
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+This function builds a translation table in the INNOBASE_SHARE
+structure for fast index location, keyed by the MySQL index number
+taken from the table->key_info array. It also provides the necessary
+mapping between the key order in the MySQL key_info array and InnoDB's
+ib_table->indexes list when the two do not fully match.
+Note that we do not have any mutex protecting the translation table
+building, based on the assumption that there is no concurrent
+index creation/drop and no DML that requires index lookup. All table
+handles will be closed before the index creation/drop.
+@return TRUE if the index translation table was built successfully */
+static
+ibool
+innobase_build_index_translation(
+/*=============================*/
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table,/*!< in: table in Innodb data
+ dictionary */
+ INNOBASE_SHARE* share) /*!< in/out: share structure
+ where index translation table
+ will be constructed in. */
+{
+ ulint mysql_num_index;
+ ulint ib_num_index;
+ dict_index_t** index_mapping;
+ ibool ret = TRUE;
+
+ DBUG_ENTER("innobase_build_index_translation");
+
+ mutex_enter(&dict_sys->mutex);
+
+ mysql_num_index = table->s->keys;
+ ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+
+ index_mapping = share->idx_trans_tbl.index_mapping;
+
+	/* If there exists an inconsistency between the MySQL and InnoDB
+	dictionary (metadata) information, the number of indexes defined
+	in MySQL could exceed that in InnoDB; do not build the index
+	translation table in that case */
+ if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+	/* If the index entry count is non-zero, nothing has
+	changed since the last update; directly return TRUE */
+ if (share->idx_trans_tbl.index_count) {
+ /* Index entry count should still match mysql_num_index */
+ ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
+ goto func_exit;
+ }
+
+	/* The number of indexes increased; rebuild the mapping table */
+ if (mysql_num_index > share->idx_trans_tbl.array_size) {
+ index_mapping = (dict_index_t**) my_realloc(index_mapping,
+ mysql_num_index *
+ sizeof(*index_mapping),
+ MYF(MY_ALLOW_ZERO_PTR));
+
+ if (!index_mapping) {
+ /* Report an error if index_mapping continues to be
+ NULL and mysql_num_index is a non-zero value */
+ sql_print_error("InnoDB: fail to allocate memory for "
+ "index translation table. Number of "
+ "Index:%lu, array size:%lu",
+ mysql_num_index,
+ share->idx_trans_tbl.array_size);
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ share->idx_trans_tbl.array_size = mysql_num_index;
+ }
+
+ /* For each index in the mysql key_info array, fetch its
+ corresponding InnoDB index pointer into index_mapping
+ array. */
+ for (ulint count = 0; count < mysql_num_index; count++) {
+
+ /* Fetch index pointers into index_mapping according to mysql
+ index sequence */
+ index_mapping[count] = dict_table_get_index_on_name(
+ ib_table, table->key_info[count].name);
+
+ if (!index_mapping[count]) {
+ sql_print_error("Cannot find index %s in InnoDB "
+ "index dictionary.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* Double check fetched index has the same
+ column info as those in mysql key_info. */
+ if (!innobase_match_index_columns(&table->key_info[count],
+ index_mapping[count])) {
+ sql_print_error("Found index %s whose column info "
+ "does not match that of MySQL.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+ }
+
+ /* Successfully built the translation table */
+ share->idx_trans_tbl.index_count = mysql_num_index;
+
+func_exit:
+ if (!ret) {
+ /* Build translation table failed. */
+ my_free(index_mapping);
+
+ share->idx_trans_tbl.array_size = 0;
+ share->idx_trans_tbl.index_count = 0;
+ index_mapping = NULL;
+ }
+
+ share->idx_trans_tbl.index_mapping = index_mapping;
+
+ mutex_exit(&dict_sys->mutex);
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+This function uses the index translation table to quickly locate the
+requested index structure.
+Note that we do not have mutex protection for the index translation table
+access; this is based on the assumption that there is no concurrent
+translation table rebuild (after create/drop index) and no DML that
+requires index lookup.
+@return dict_index_t structure for the requested index, or NULL if
+we fail to locate the index structure. */
+static
+dict_index_t*
+innobase_index_lookup(
+/*==================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ uint keynr) /*!< in: index number for the requested
+ index */
+{
+ if (!share->idx_trans_tbl.index_mapping
+ || keynr >= share->idx_trans_tbl.index_count) {
+ return(NULL);
+ }
+
+ return(share->idx_trans_tbl.index_mapping[keynr]);
+}
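+
+/* Editor's note: with the translation table in place, a MySQL key
+number resolves to a dict_index_t* in O(1) instead of a name-based
+walk of ib_table->indexes. A typical caller pattern, shown as an
+illustration only:
+
+	dict_index_t*	index = innobase_index_lookup(share, keynr);
+
+	if (index == NULL) {
+		index = dict_table_get_index_on_name(
+			ib_table, table->key_info[keynr].name);
+	}
+*/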
+
+/************************************************************************
+Set the autoinc column max value. This should only be called once from
+ha_innobase::open(). Therefore there's no need for a covering lock. */
+UNIV_INTERN
+void
+ha_innobase::innobase_initialize_autoinc()
+/*======================================*/
+{
+ ulonglong auto_inc;
+ const Field* field = table->found_next_number_field;
+
+ if (field != NULL) {
+ auto_inc = innobase_get_int_col_max_value(field);
+ } else {
+		/* We have no idea what's been passed in to us as the
+		autoinc column. We set it to 0, effectively disabling
+		updates to the table. */
+ auto_inc = 0;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Unable to determine the AUTOINC "
+ "column name\n");
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ /* If the recovery level is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */
+ auto_inc = 0;
+ } else if (field == NULL) {
+		/* This is a far more serious error; it is best to avoid
+		opening the table and to return failure. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ } else {
+ dict_index_t* index;
+ const char* col_name;
+ ib_uint64_t read_auto_inc;
+ ulint err;
+
+ update_thd(ha_thd());
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ col_name = field->field_name;
+ index = innobase_get_index(table->s->next_number_index);
+
+ /* Execute SELECT MAX(col_name) FROM TABLE; */
+ err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+
+ switch (err) {
+ case DB_SUCCESS: {
+ ulonglong col_max_value;
+
+ col_max_value = innobase_get_int_col_max_value(field);
+
+			/* At this stage we know neither the increment
+			nor the offset, so use a default increment of 1. */
+
+ auto_inc = innobase_next_autoinc(
+ read_auto_inc, 1, 1, 0, col_max_value);
+
+ break;
+ }
+ case DB_RECORD_NOT_FOUND:
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: MySQL and InnoDB data "
+ "dictionaries are out of sync.\n"
+ "InnoDB: Unable to find the AUTOINC column "
+ "%s in the InnoDB table %s.\n"
+ "InnoDB: We set the next AUTOINC column "
+ "value to 0,\n"
+ "InnoDB: in effect disabling the AUTOINC "
+ "next value generation.\n"
+ "InnoDB: You can either set the next "
+ "AUTOINC value explicitly using ALTER TABLE\n"
+ "InnoDB: or fix the data dictionary by "
+ "recreating the table.\n",
+ col_name, index->table->name);
+
+ /* This will disable the AUTOINC generation. */
+ auto_inc = 0;
+
+			/* We want the open to succeed, so that the user can
+			take corrective action, i.e. reads should succeed but
+			updates should fail. */
+ err = DB_SUCCESS;
+ break;
+ default:
+ /* row_search_max_autoinc() should only return
+ one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
+ ut_error;
+ }
+ }
+
+ dict_table_autoinc_initialize(prebuilt->table, auto_inc);
+}
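+
+/* Editor's note: with increment 1 and offset 0, the
+innobase_next_autoinc() call above effectively computes
+"SELECT MAX(col) + 1", clamped to the column maximum. A simplified
+model of that case, shown as an illustration only and not the actual
+helper:
+
+	static ulonglong
+	next_autoinc_simple(ulonglong current, ulonglong col_max)
+	{
+		return(current >= col_max ? col_max : current + 1);
+	}
+*/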
+
+/*****************************************************************//**
+Creates and opens a handle to a table which already exists in an InnoDB
+database.
+@return 1 if error, 0 if success */
+UNIV_INTERN
+int
+ha_innobase::open(
+/*==============*/
+ const char* name, /*!< in: table name */
+ int mode, /*!< in: not used */
+ uint test_if_locked) /*!< in: not used */
+{
+ dict_table_t* ib_table;
+ char norm_name[FN_REFLEN];
+ THD* thd;
+ char* is_part = NULL;
+ ibool par_case_name_set = FALSE;
+ char par_case_name[FN_REFLEN];
+ dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE;
+
+ DBUG_ENTER("ha_innobase::open");
+
+ UT_NOT_USED(mode);
+ UT_NOT_USED(test_if_locked);
+
+ thd = ha_thd();
+
+	/* In some cases MySQL seems to call this function while
+	holding btr_search_latch. This breaks the latching order, as
+	we acquire dict_sys->mutex below, and leads to a deadlock. */
+ if (thd != NULL) {
+ innobase_release_temporary_latches(ht, thd);
+ }
+
+ normalize_table_name(norm_name, name);
+
+ user_thd = NULL;
+
+ if (!(share=get_share(name))) {
+
+ DBUG_RETURN(1);
+ }
+
+ /* Will be allocated if it is needed in ::update_row() */
+ upd_buf = NULL;
+ upd_buf_size = 0;
+
+	/* We look for the pattern #P# to see if the table is a partitioned
+	MySQL table. */
+#ifdef __WIN__
+ is_part = strstr(norm_name, "#p#");
+#else
+ is_part = strstr(norm_name, "#P#");
+#endif /* __WIN__ */
+
+ /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table
+ can be opened even if some FK indexes are missing. If not, the table
+ can't be opened in the same situation */
+ if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) {
+ ignore_err = DICT_ERR_IGNORE_FK_NOKEY;
+ }
+
+ /* Get pointer to a table object in InnoDB dictionary cache */
+ ib_table = dict_table_open_on_name(norm_name, FALSE, TRUE, ignore_err);
+
+ if (ib_table
+ && ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
+ && table->s->fields != dict_table_get_n_user_cols(ib_table))
+ || (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID)
+ && (table->s->fields
+ != dict_table_get_n_user_cols(ib_table) - 1)))) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "table %s contains %lu user defined columns "
+ "in InnoDB, but %lu columns in MySQL. Please "
+ "check INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and "
+ REFMAN "innodb-troubleshooting.html "
+ "for how to resolve it",
+ norm_name, (ulong) dict_table_get_n_user_cols(ib_table),
+ (ulong) table->s->fields);
+
+ /* Mark this table as corrupted, so the drop table
+ or force recovery can still use it, but not others. */
+ ib_table->corrupted = true;
+ dict_table_close(ib_table, FALSE, FALSE);
+ ib_table = NULL;
+ is_part = NULL;
+ }
+
+ if (NULL == ib_table) {
+ if (is_part) {
+ /* MySQL partition engine hard codes the file name
+ separator as "#P#". The text case is fixed even if
+ lower_case_table_names is set to 1 or 2. This is true
+ for sub-partition names as well. InnoDB always
+ normalises file names to lower case on Windows, this
+ can potentially cause problems when copying/moving
+ tables between platforms.
+
+			1) If we boot against an installation from the
+			Windows platform, its partition table names
+			could be in lower case in the system tables, so
+			we need to check the lower case name when
+			loading the table.
+
+			2) If we boot an installation from another case
+			sensitive platform on Windows, we might need to
+			check for the existence of the table name without
+			lower casing in the system tables. */
+ if (innobase_get_lower_case_table_names() == 1) {
+
+ if (!par_case_name_set) {
+#ifndef __WIN__
+ /* Check for the table using lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_name);
+ innobase_casedn_str(par_case_name);
+#else
+					/* On the Windows platform, check
+					whether the table name exists in
+					the system tables without having
+					been normalized to lower case */
+ normalize_table_name_low(
+ par_case_name, name, FALSE);
+#endif
+ par_case_name_set = TRUE;
+ }
+
+ ib_table = dict_table_open_on_name(
+ par_case_name, FALSE, TRUE,
+ ignore_err);
+ }
+
+ if (ib_table) {
+#ifndef __WIN__
+ sql_print_warning("Partition table %s opened "
+ "after converting to lower "
+ "case. The table may have "
+ "been moved from a case "
+ "in-sensitive file system. "
+ "Please recreate table in "
+ "the current file system\n",
+ norm_name);
+#else
+ sql_print_warning("Partition table %s opened "
+ "after skipping the step to "
+ "lower case the table name. "
+ "The table may have been "
+ "moved from a case sensitive "
+ "file system. Please "
+ "recreate table in the "
+ "current file system\n",
+ norm_name);
+#endif
+ goto table_opened;
+ }
+ }
+
+ if (is_part) {
+ sql_print_error("Failed to open table %s.\n",
+ norm_name);
+ }
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Cannot open table %s from the internal data "
+ "dictionary of InnoDB though the .frm file "
+ "for the table exists. See "
+ REFMAN "innodb-troubleshooting.html for how "
+ "you can resolve the problem.", norm_name);
+
+ free_share(share);
+ my_errno = ENOENT;
+
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+
+table_opened:
+
+ innobase_copy_frm_flags_from_table_share(ib_table, table->s);
+
+ dict_stats_init(ib_table);
+
+ MONITOR_INC(MONITOR_TABLE_OPEN);
+
+ bool no_tablespace;
+
+ if (dict_table_is_discarded(ib_table)) {
+
+ ib_senderrf(thd,
+ IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ /* Allow an open because a proper DISCARD should have set
+ all the flags and index root page numbers to FIL_NULL that
+ should prevent any DML from running but it should allow DDL
+ operations. */
+
+ no_tablespace = false;
+
+ } else if (ib_table->ibd_file_missing) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN,
+ ER_TABLESPACE_MISSING, norm_name);
+
+ /* This means we have no idea what happened to the tablespace
+ file, best to play it safe. */
+
+ no_tablespace = true;
+ } else {
+ no_tablespace = false;
+ }
+
+ if (!thd_tablespace_op(thd) && no_tablespace) {
+ free_share(share);
+ my_errno = ENOENT;
+
+ dict_table_close(ib_table, FALSE, FALSE);
+
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+
+ prebuilt = row_create_prebuilt(ib_table, table->s->reclength);
+
+ prebuilt->default_rec = table->s->default_values;
+ ut_ad(prebuilt->default_rec);
+
+ /* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+ primary_key = table->s->primary_key;
+ key_used_on_scan = primary_key;
+
+ if (!innobase_build_index_translation(table, ib_table, share)) {
+ sql_print_error("Build InnoDB index translation table for"
+ " Table %s failed", name);
+ }
+
+ /* Allocate a buffer for a 'row reference'. A row reference is
+ a string of bytes of length ref_length which uniquely specifies
+ a row in our table. Note that MySQL may also compare two row
+ references for equality by doing a simple memcmp on the strings
+ of length ref_length! */
+
+ if (!row_table_got_default_clust_index(ib_table)) {
+
+ prebuilt->clust_index_was_generated = FALSE;
+
+ if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
+ sql_print_error("Table %s has a primary key in "
+ "InnoDB data dictionary, but not "
+ "in MySQL!", name);
+
+			/* This mismatch could cause further problems
+			if not attended to; bring this to the user's
+			attention by pushing a warning in addition to
+			logging a message in the error log */
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has a "
+ "primary key in InnoDB data "
+ "dictionary, but not in "
+ "MySQL!", name);
+
+			/* If primary_key >= MAX_KEY, its value could be
+			out of bounds if we continue to index into the
+			key_info[] array. Find the InnoDB primary index,
+			and assign its key_length to ref_length.
+			In addition, since MySQL indexes are sorted
+			starting with the primary index, unique indexes
+			etc., initialize ref_length to the first index key
+			length in case we fail to find the InnoDB
+			clustered index.
+
+			Please note that this will not resolve the primary
+			index mismatch problem; other side effects are
+			possible if users continue to use the table.
+			However, we allow this table to be opened so
+			that the user can take the necessary measures for
+			the mismatch while the table data remains
+			accessible. */
+ if (!table->key_info) {
+ ut_ad(!table->s->keys);
+ ref_length = 0;
+ } else {
+ ref_length = table->key_info[0].key_length;
+ }
+
+			/* Find the corresponding clustered index
+			key length in MySQL's key_info[] array */
+ for (uint i = 0; i < table->s->keys; i++) {
+ dict_index_t* index;
+ index = innobase_get_index(i);
+ if (dict_index_is_clust(index)) {
+ ref_length =
+ table->key_info[i].key_length;
+ }
+ }
+ } else {
+ /* MySQL allocates the buffer for ref.
+ key_info->key_length includes space for all key
+ columns + one byte for each column that may be
+ NULL. ref_length must be as exact as possible to
+ save space, because all row reference buffers are
+ allocated based on ref_length. */
+
+ ref_length = table->key_info[primary_key].key_length;
+ }
+ } else {
+ if (primary_key != MAX_KEY) {
+ sql_print_error(
+ "Table %s has no primary key in InnoDB data "
+ "dictionary, but has one in MySQL! If you "
+ "created the table with a MySQL version < "
+ "3.23.54 and did not define a primary key, "
+ "but defined a unique key with all non-NULL "
+ "columns, then MySQL internally treats that "
+ "key as the primary key. You can fix this "
+ "error by dump + DROP + CREATE + reimport "
+ "of the table.", name);
+
+			/* This mismatch could cause further problems
+			if not attended to; bring this to the user's
+			attention by pushing a warning in addition to
+			logging a message in the error log */
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has no "
+ "primary key in InnoDB data "
+ "dictionary, but has one in "
+ "MySQL!", name);
+ }
+
+ prebuilt->clust_index_was_generated = TRUE;
+
+ ref_length = DATA_ROW_ID_LEN;
+
+ /* If we automatically created the clustered index, then
+ MySQL does not know about it, and MySQL must NOT be aware
+ of the index used on scan, to make it avoid checking if we
+ update the column of the index. That is why we assert below
+ that key_used_on_scan is the undefined value MAX_KEY.
+		The column is the row id in the automatic generation case,
+ and it will never be updated anyway. */
+
+ if (key_used_on_scan != MAX_KEY) {
+ sql_print_warning(
+ "Table %s key_used_on_scan is %lu even "
+ "though there is no primary key inside "
+ "InnoDB.", name, (ulong) key_used_on_scan);
+ }
+ }
+
+ /* Index block size in InnoDB: used by MySQL in query optimization */
+ stats.block_size = UNIV_PAGE_SIZE;
+
+ /* Init table lock structure */
+ thr_lock_data_init(&share->lock,&lock,(void*) 0);
+
+ if (prebuilt->table) {
+ /* We update the highest file format in the system table
+ space, if this table has higher file format setting. */
+
+ trx_sys_file_format_max_upgrade(
+ (const char**) &innobase_file_format_max,
+ dict_table_get_format(prebuilt->table));
+ }
+
+ /* Only if the table has an AUTOINC column. */
+ if (prebuilt->table != NULL
+ && !prebuilt->table->ibd_file_missing
+ && table->found_next_number_field != NULL) {
+ dict_table_autoinc_lock(prebuilt->table);
+
+ /* Since a table can already be "open" in InnoDB's internal
+ data dictionary, we only init the autoinc counter once, the
+ first time the table is loaded. We can safely reuse the
+ autoinc value from a previous MySQL open. */
+ if (dict_table_autoinc_read(prebuilt->table) == 0) {
+
+ innobase_initialize_autoinc();
+ }
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN
+handler*
+ha_innobase::clone(
+/*===============*/
+ const char* name, /*!< in: table name */
+ MEM_ROOT* mem_root) /*!< in: memory context */
+{
+ ha_innobase* new_handler;
+
+ DBUG_ENTER("ha_innobase::clone");
+
+ new_handler = static_cast<ha_innobase*>(handler::clone(name,
+ mem_root));
+ if (new_handler) {
+ DBUG_ASSERT(new_handler->prebuilt != NULL);
+
+ new_handler->prebuilt->select_lock_type
+ = prebuilt->select_lock_type;
+ }
+
+ DBUG_RETURN(new_handler);
+}
+
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
+{
+ /* A table format specific index column length check will be performed
+ at ha_innobase::add_index() and row_create_index_for_mysql() */
+ return(innobase_large_prefix
+ ? REC_VERSION_56_MAX_INDEX_COL_LEN
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::close()
+/*================*/
+{
+ THD* thd;
+
+ DBUG_ENTER("ha_innobase::close");
+
+ thd = ha_thd();
+ if (thd != NULL) {
+ innobase_release_temporary_latches(ht, thd);
+ }
+
+ row_prebuilt_free(prebuilt, FALSE);
+
+ if (upd_buf != NULL) {
+ ut_ad(upd_buf_size != 0);
+ my_free(upd_buf);
+ upd_buf = NULL;
+ upd_buf_size = 0;
+ }
+
+ free_share(share);
+
+ MONITOR_INC(MONITOR_TABLE_CLOSE);
+
+ /* Tell InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+/**************************************************************//**
+Gets field offset for a field in a table.
+@return offset */
+static inline
+uint
+get_field_offset(
+/*=============*/
+ const TABLE* table, /*!< in: MySQL table object */
+ const Field* field) /*!< in: MySQL field object */
+{
+ return((uint) (field->ptr - table->record[0]));
+}
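+
+/* Editor's note: Field::ptr points into the table's record buffer
+(table->record[0]), so the offset is plain pointer arithmetic.
+Illustration with a hypothetical table object:
+
+	uint	off = get_field_offset(table, table->field[2]);
+
+table->record[0] + off is then where field 2's value is stored. */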
+
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. NOTE that the prototype
+of this function is in rem0cmp.cc in InnoDB source code! If you change this
+function, remember to update the prototype there!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+{
+ CHARSET_INFO* charset;
+ enum_field_types mysql_tp;
+ int ret;
+
+ DBUG_ASSERT(a_length != UNIV_SQL_NULL);
+ DBUG_ASSERT(b_length != UNIV_SQL_NULL);
+
+ mysql_tp = (enum_field_types) mysql_type;
+
+ switch (mysql_tp) {
+
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ /* Use the charset number to pick the right charset struct for
+ the comparison. Since the MySQL function get_charset may be
+ slow before Bar removes the mutex operation there, we first
+ look at 2 common charsets directly. */
+
+ if (charset_number == default_charset_info->number) {
+ charset = default_charset_info;
+ } else if (charset_number == my_charset_latin1.number) {
+ charset = &my_charset_latin1;
+ } else {
+ charset = get_charset(charset_number, MYF(MY_WME));
+
+ if (charset == NULL) {
+ sql_print_error("InnoDB needs charset %lu for doing "
+ "a comparison, but MySQL cannot "
+ "find that charset.",
+ (ulong) charset_number);
+ ut_a(0);
+ }
+ }
+
+ /* Starting from 4.1.3, we use strnncollsp() in comparisons of
+ non-latin1_swedish_ci strings. NOTE that the collation order
+ changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users
+ having indexes on such data need to rebuild their tables! */
+
+ ret = charset->coll->strnncollsp(
+ charset, a, a_length, b, b_length, 0);
+
+ if (ret < 0) {
+ return(-1);
+ } else if (ret > 0) {
+ return(1);
+ } else {
+ return(0);
+ }
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
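+
+/* Editor's note on the strnncollsp() call above: with PAD SPACE
+collations, trailing spaces are insignificant, while an embedded NUL
+byte is a real character that sorts before them. Illustration,
+assuming latin1_swedish_ci:
+
+	int	r = my_charset_latin1.coll->strnncollsp(
+		&my_charset_latin1,
+		(const uchar*) "b", 1,
+		(const uchar*) "b  ", 3, 0);
+
+Here r == 0, because "b" and "b  " compare equal under PAD SPACE. */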
+
+
+/*************************************************************//**
+Get the character set to be used for full-text search, given the MySQL
+type and the charset number of the column.
+@return charset */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number) /*!< in: number of the charset */
+{
+ enum_field_types mysql_tp;
+ CHARSET_INFO* charset;
+
+ mysql_tp = (enum_field_types) mysql_type;
+
+ switch (mysql_tp) {
+
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ /* Use the charset number to pick the right charset struct for
+ the comparison. Since the MySQL function get_charset may be
+ slow before Bar removes the mutex operation there, we first
+ look at 2 common charsets directly. */
+
+ if (charset_number == default_charset_info->number) {
+ charset = default_charset_info;
+ } else if (charset_number == my_charset_latin1.number) {
+ charset = &my_charset_latin1;
+ } else {
+ charset = get_charset(charset_number, MYF(MY_WME));
+
+ if (charset == NULL) {
+ sql_print_error("InnoDB needs charset %lu for doing "
+ "a comparison, but MySQL cannot "
+ "find that charset.",
+ (ulong) charset_number);
+ ut_a(0);
+ }
+ }
+ break;
+ default:
+ ut_error;
+ }
+
+ return(charset);
+}
+
+/*************************************************************//**
+InnoDB uses this function to compare a prefix of two data fields for which
+the data type is such that we must use MySQL code to compare them. NOTE
+that the prototype of this function is in rem0cmp.cc in the InnoDB source
+code! If you change this function, remember to update the prototype there!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+{
+ CHARSET_INFO* charset;
+ int result;
+
+ charset = innobase_get_fts_charset(mysql_type, charset_number);
+
+ result = ha_compare_text(charset, (uchar*) a, a_length,
+ (uchar*) b, b_length, 1, 0);
+
+ return(result);
+}
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp(
+/*==================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+
+ return(ha_compare_text(
+ charset, s1->f_str, static_cast<uint>(s1->f_len),
+ s2->f_str, static_cast<uint>(s2->f_len), 0, 0));
+}
+
+/******************************************************************//**
+Compare two character strings case insensitively according to their
+charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+ ulint newlen;
+
+ my_casedn_str(charset, (char*) s2->f_str);
+
+ newlen = strlen((const char*) s2->f_str);
+
+ return(ha_compare_text(
+ charset, s1->f_str, static_cast<uint>(s1->f_len),
+ s2->f_str, static_cast<uint>(newlen), 0, 0));
+}
+
+/******************************************************************//**
+Get the first character's code position for the FTS index partition. */
+UNIV_INTERN
+ulint
+innobase_strnxfrm(
+/*==============*/
+ const CHARSET_INFO*
+ cs, /*!< in: Character set */
+ const uchar* str, /*!< in: string */
+ const ulint len) /*!< in: string length */
+{
+ uchar mystr[2];
+ ulint value;
+
+ if (!str || len == 0) {
+ return(0);
+ }
+
+ my_strnxfrm(cs, (uchar*) mystr, 2, str, len);
+
+ value = mach_read_from_2(mystr);
+
+ if (value > 255) {
+ value = value / 256;
+ }
+
+ return(value);
+}
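+
+/* Editor's note: my_strnxfrm() produces collation weight bytes, and
+innobase_strnxfrm() above keeps only the most significant weight byte
+of the first character. For case-insensitive collations this keeps
+case variants of the same letter in the same FTS index partition,
+since they share the same primary weight. */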
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+UNIV_INTERN
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: prefix key */
+ const void* p2) /*!< in: value to compare */
+{
+ const CHARSET_INFO* charset = (const CHARSET_INFO*) cs;
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+ int result;
+
+ result = ha_compare_text(
+ charset, s2->f_str, static_cast<uint>(s2->f_len),
+ s1->f_str, static_cast<uint>(s1->f_len), 1, 0);
+
+ /* We switched s1, s2 position in ha_compare_text. So we need
+ to negate the result */
+ return(-result);
+}
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+UNIV_INTERN
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ char* src, /*!< in: string to put in lower case */
+ size_t src_len,/*!< in: input string length */
+ char* dst, /*!< in: buffer for result string */
+ size_t dst_len)/*!< in: buffer size */
+{
+ if (cs->casedn_multiply == 1) {
+ memcpy(dst, src, src_len);
+ dst[src_len] = 0;
+ my_casedn_str(cs, dst);
+
+ return(strlen(dst));
+ } else {
+ return(cs->cset->casedn(cs, src, src_len, dst, dst_len));
+ }
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X) 0
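+
+/* For example, with a single-byte charset the letters and digits have
+the _MY_U, _MY_L or _MY_NMR ctype bits set, so true_word_char() accepts
+them (and '_') as word characters, while ' ' or ',' ends a token.
+misc_word_char() is a stub that always evaluates to 0 here, so no extra
+punctuation is accepted inside words. */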
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word()
+@return length of string processed */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past end of
+ text */
+ fts_string_t* token, /*!< out: token's text */
+ ulint* offset) /*!< out: offset to token,
+ measured as characters from
+ 'start' */
+{
+ int mbl;
+ const uchar* doc = start;
+
+ ut_a(cs);
+
+ token->f_n_char = token->f_len = 0;
+ token->f_str = NULL;
+
+ for (;;) {
+
+ if (doc >= end) {
+ return(doc - start);
+ }
+
+ int ctype;
+
+ mbl = cs->cset->ctype(
+ cs, &ctype, doc, (const uchar*) end);
+
+ if (true_word_char(ctype, *doc)) {
+ break;
+ }
+
+ doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+ }
+
+ ulint mwc = 0;
+ ulint length = 0;
+
+ token->f_str = const_cast<byte*>(doc);
+
+ while (doc < end) {
+
+ int ctype;
+
+ mbl = cs->cset->ctype(
+ cs, &ctype, (uchar*) doc, (uchar*) end);
+ if (true_word_char(ctype, *doc)) {
+ mwc = 0;
+ } else if (!misc_word_char(*doc) || mwc) {
+ break;
+ } else {
+ ++mwc;
+ }
+
+ ++length;
+
+ doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+ }
+
+ token->f_len = (uint) (doc - token->f_str) - mwc;
+ token->f_n_char = length;
+
+ return(doc - start);
+}
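+
+/* A rough worked example (assuming a single-byte charset): for the
+input "..hello, world", the first loop skips the two '.' bytes, the
+second loop consumes "hello", and the function returns 7 (bytes
+scanned), with token->f_str pointing at 'h', token->f_len = 5 and
+token->f_n_char = 5. A subsequent call starting past the returned
+length would extract "world". */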
+
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return DATA_BINARY, DATA_VARCHAR, ... */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+ ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
+ 'unsigned type';
+ at least ENUM and SET,
+ and unsigned integer
+ types are 'unsigned types' */
+ const void* f) /*!< in: MySQL Field */
+{
+ const class Field* field = reinterpret_cast<const class Field*>(f);
+
+ /* The following asserts try to check that the MySQL type code fits in
+ 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
+ the type */
+
+ DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+
+ if (field->flags & UNSIGNED_FLAG) {
+
+ *unsigned_flag = DATA_UNSIGNED;
+ } else {
+ *unsigned_flag = 0;
+ }
+
+ if (field->real_type() == MYSQL_TYPE_ENUM
+ || field->real_type() == MYSQL_TYPE_SET) {
+
+		/* MySQL reports field->type() as a string type for these,
+		but the data is actually internally stored as an unsigned
+		integer code! */
+
+		*unsigned_flag = DATA_UNSIGNED; /* MySQL leaves its own
+						UNSIGNED_FLAG unset here,
+						even though internally
+						this is an unsigned
+						integer type */
+ return(DATA_INT);
+ }
+
+ switch (field->type()) {
+ /* NOTE that we only allow string types in DATA_MYSQL and
+ DATA_VARMYSQL */
+ case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
+ case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */
+ if (field->binary()) {
+ return(DATA_BINARY);
+ } else if (strcmp(field->charset()->name,
+ "latin1_swedish_ci") == 0) {
+ return(DATA_VARCHAR);
+ } else {
+ return(DATA_VARMYSQL);
+ }
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING: if (field->binary()) {
+
+ return(DATA_FIXBINARY);
+ } else if (strcmp(field->charset()->name,
+ "latin1_swedish_ci") == 0) {
+ return(DATA_CHAR);
+ } else {
+ return(DATA_MYSQL);
+ }
+ case MYSQL_TYPE_NEWDECIMAL:
+ return(DATA_FIXBINARY);
+ case MYSQL_TYPE_LONG:
+ case MYSQL_TYPE_LONGLONG:
+ case MYSQL_TYPE_TINY:
+ case MYSQL_TYPE_SHORT:
+ case MYSQL_TYPE_INT24:
+ case MYSQL_TYPE_DATE:
+ case MYSQL_TYPE_YEAR:
+ case MYSQL_TYPE_NEWDATE:
+ return(DATA_INT);
+ case MYSQL_TYPE_TIME:
+ case MYSQL_TYPE_DATETIME:
+ case MYSQL_TYPE_TIMESTAMP:
+ switch (field->real_type()) {
+ case MYSQL_TYPE_TIME:
+ case MYSQL_TYPE_DATETIME:
+ case MYSQL_TYPE_TIMESTAMP:
+ return(DATA_INT);
+ default: /* Fall through */
+ DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+ case MYSQL_TYPE_TIME2:
+ case MYSQL_TYPE_DATETIME2:
+ case MYSQL_TYPE_TIMESTAMP2:
+ return(DATA_FIXBINARY);
+ }
+ case MYSQL_TYPE_FLOAT:
+ return(DATA_FLOAT);
+ case MYSQL_TYPE_DOUBLE:
+ return(DATA_DOUBLE);
+ case MYSQL_TYPE_DECIMAL:
+ return(DATA_DECIMAL);
+ case MYSQL_TYPE_GEOMETRY:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ return(DATA_BLOB);
+ case MYSQL_TYPE_NULL:
+		/* MySQL currently accepts the "NULL" datatype, but will
+		reject it in a future release. We cope with it here and
+		do not trigger an assertion failure in 5.1 */
+ break;
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Writes an unsigned integer value < 64k to 2 bytes, in the little-endian
+storage format. */
+static inline
+void
+innobase_write_to_2_little_endian(
+/*==============================*/
+ byte* buf, /*!< in: where to store */
+ ulint val) /*!< in: value to write, must be < 64k */
+{
+ ut_a(val < 256 * 256);
+
+ buf[0] = (byte)(val & 0xFF);
+ buf[1] = (byte)(val / 256);
+}
+
+/*******************************************************************//**
+Reads an unsigned integer value < 64k from 2 bytes, in the little-endian
+storage format.
+@return value */
+static inline
+uint
+innobase_read_from_2_little_endian(
+/*===============================*/
+ const uchar* buf) /*!< in: from where to read */
+{
+ return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))));
+}
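+
+/* Round-trip example: innobase_write_to_2_little_endian(buf, 0x1234)
+stores buf[0] = 0x34 and buf[1] = 0x12, and
+innobase_read_from_2_little_endian(buf) then returns
+0x34 + 256 * 0x12 = 0x1234 = 4660. */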
+
+/*******************************************************************//**
+Stores a key value for a row to a buffer.
+@return key value length as stored in buff */
+UNIV_INTERN
+uint
+ha_innobase::store_key_val_for_row(
+/*===============================*/
+ uint keynr, /*!< in: key number */
+ char* buff, /*!< in/out: buffer for the key value (in MySQL
+ format) */
+ uint buff_len,/*!< in: buffer length */
+ const uchar* record)/*!< in: row in MySQL format */
+{
+ KEY* key_info = table->key_info + keynr;
+ KEY_PART_INFO* key_part = key_info->key_part;
+ KEY_PART_INFO* end =
+ key_part + key_info->user_defined_key_parts;
+ char* buff_start = buff;
+ enum_field_types mysql_type;
+ Field* field;
+ ibool is_null;
+
+ DBUG_ENTER("store_key_val_for_row");
+
+ /* The format for storing a key field in MySQL is the following:
+
+ 1. If the column can be NULL, then in the first byte we put 1 if the
+ field value is NULL, 0 otherwise.
+
+ 2. If the column is of a BLOB type (it must be a column prefix field
+ in this case), then we put the length of the data in the field to the
+ next 2 bytes, in the little-endian format. If the field is SQL NULL,
+ then these 2 bytes are set to 0. Note that the length of data in the
+ field is <= column prefix length.
+
+	3. In a column prefix field, the next prefix_len bytes are reserved
+	for data. In a normal field, the next max-field-length bytes are
+	reserved for data. For a VARCHAR(n) the max field length is n. If the
+	stored value is the SQL NULL, then these data bytes are set to 0.
+
+ 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that
+ in the MySQL row format, the length is stored in 1 or 2 bytes,
+ depending on the maximum allowed length. But in the MySQL key value
+ format, the length always takes 2 bytes.
+
+ We have to zero-fill the buffer so that MySQL is able to use a
+ simple memcmp to compare two key values to determine if they are
+ equal. MySQL does this to compare contents of two 'ref' values. */
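+
+	/* For illustration: a nullable VARCHAR(10) key part using a
+	single-byte charset occupies 1 + 2 + 10 bytes in buff: one
+	NULL-indicator byte, a 2-byte little-endian length, and 10 data
+	bytes, zero-filled beyond the actual value. */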
+
+ memset(buff, 0, buff_len);
+
+ for (; key_part != end; key_part++) {
+ is_null = FALSE;
+
+ if (key_part->null_bit) {
+ if (record[key_part->null_offset]
+ & key_part->null_bit) {
+ *buff = 1;
+ is_null = TRUE;
+ } else {
+ *buff = 0;
+ }
+ buff++;
+ }
+
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* >= 5.0.3 true VARCHAR */
+ ulint lenlen;
+ ulint len;
+ const byte* data;
+ ulint key_len;
+ ulint true_len;
+ const CHARSET_INFO* cs;
+ int error=0;
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len + 2;
+
+ continue;
+ }
+ cs = field->charset();
+
+ lenlen = (ulint)
+ (((Field_varstring*) field)->length_bytes);
+
+ data = row_mysql_read_true_varchar(&len,
+ (byte*) (record
+ + (ulint) get_field_offset(table, field)),
+ lenlen);
+
+ true_len = len;
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char*) data,
+ (const char*) data + len,
+ (uint) (key_len / cs->mbmaxlen),
+ &error);
+ }
+
+ /* In a column prefix index, we may need to truncate
+ the stored value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ /* The length in a key value is always stored in 2
+ bytes */
+
+ row_mysql_store_true_var_len((byte*) buff, true_len, 2);
+ buff += 2;
+
+ memcpy(buff, data, true_len);
+
+ /* Note that we always reserve the maximum possible
+ length of the true VARCHAR in the key value, though
+			only the first len bytes after the 2 length bytes
+			contain actual data. The rest of the space was reset
+			to zero in the memset() call above. */
+
+ buff += key_len;
+
+ } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB
+ /* MYSQL_TYPE_GEOMETRY data is treated
+ as BLOB data in innodb. */
+ || mysql_type == MYSQL_TYPE_GEOMETRY) {
+
+ const CHARSET_INFO* cs;
+ ulint key_len;
+ ulint true_len;
+ int error=0;
+ ulint blob_len;
+ const byte* blob_data;
+
+ ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len + 2;
+
+ continue;
+ }
+
+ cs = field->charset();
+
+ blob_data = row_mysql_read_blob_ref(&blob_len,
+ (byte*) (record
+ + (ulint) get_field_offset(table, field)),
+ (ulint) field->pack_length());
+
+ true_len = blob_len;
+
+ ut_a(get_field_offset(table, field)
+ == key_part->offset);
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (blob_len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char*) blob_data,
+ (const char*) blob_data
+ + blob_len,
+ (uint) (key_len / cs->mbmaxlen),
+ &error);
+ }
+
+ /* All indexes on BLOB and TEXT are column prefix
+ indexes, and we may need to truncate the data to be
+ stored in the key value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ /* MySQL reserves 2 bytes for the length and the
+ storage of the number is little-endian */
+
+ innobase_write_to_2_little_endian(
+ (byte*) buff, true_len);
+ buff += 2;
+
+ memcpy(buff, blob_data, true_len);
+
+ /* Note that we always reserve the maximum possible
+ length of the BLOB prefix in the key value. */
+
+ buff += key_len;
+ } else {
+ /* Here we handle all other data types except the
+ true VARCHAR, BLOB and TEXT. Note that the column
+ value we store may be also in a column prefix
+ index. */
+
+ const CHARSET_INFO* cs = NULL;
+ ulint true_len;
+ ulint key_len;
+ const uchar* src_start;
+ int error=0;
+ enum_field_types real_type;
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len;
+
+ continue;
+ }
+
+ src_start = record + key_part->offset;
+ real_type = field->real_type();
+ true_len = key_len;
+
+			/* A character set is defined only for fields
+			whose type is a string type and whose real field
+			type is not ENUM or SET. For these fields, check
+			whether the character set is multi-byte. */
+
+ if (real_type != MYSQL_TYPE_ENUM
+ && real_type != MYSQL_TYPE_SET
+ && ( mysql_type == MYSQL_TYPE_VAR_STRING
+ || mysql_type == MYSQL_TYPE_STRING)) {
+
+ cs = field->charset();
+
+ /* For multi byte character sets we need to
+ calculate the true length of the key */
+
+ if (key_len > 0 && cs->mbmaxlen > 1) {
+
+ true_len = (ulint)
+ cs->cset->well_formed_len(cs,
+ (const char*) src_start,
+ (const char*) src_start
+ + key_len,
+ (uint) (key_len
+ / cs->mbmaxlen),
+ &error);
+ }
+ }
+
+ memcpy(buff, src_start, true_len);
+ buff += true_len;
+
+ /* Pad the unused space with spaces. */
+
+ if (true_len < key_len) {
+ ulint pad_len = key_len - true_len;
+ ut_a(cs != NULL);
+ ut_a(!(pad_len % cs->mbminlen));
+
+ cs->cset->fill(cs, buff, pad_len,
+ 0x20 /* space */);
+ buff += pad_len;
+ }
+ }
+ }
+
+ ut_a(buff <= buff_start + buff_len);
+
+ DBUG_RETURN((uint)(buff - buff_start));
+}
+
+/**************************************************************//**
+Determines if a field is needed in a prebuilt struct 'template'.
+@return field to use, or NULL if the field is not needed */
+static
+const Field*
+build_template_needs_field(
+/*=======================*/
+ ibool index_contains, /*!< in:
+ dict_index_contains_col_or_prefix(
+ index, i) */
+ ibool read_just_key, /*!< in: TRUE when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ ibool fetch_all_in_key,
+ /*!< in: true=fetch all fields in
+ the index */
+ ibool fetch_primary_key_cols,
+ /*!< in: true=fetch the
+ primary key columns */
+ dict_index_t* index, /*!< in: InnoDB index to use */
+ const TABLE* table, /*!< in: MySQL table object */
+ ulint i) /*!< in: field index in InnoDB table */
+{
+ const Field* field = table->field[i];
+
+ ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i));
+
+ if (!index_contains) {
+ if (read_just_key) {
+ /* If this is a 'key read', we do not need
+ columns that are not in the key */
+
+ return(NULL);
+ }
+ } else if (fetch_all_in_key) {
+ /* This field is needed in the query */
+
+ return(field);
+ }
+
+ if (bitmap_is_set(table->read_set, static_cast<uint>(i))
+ || bitmap_is_set(table->write_set, static_cast<uint>(i))) {
+ /* This field is needed in the query */
+
+ return(field);
+ }
+
+ if (fetch_primary_key_cols
+ && dict_table_col_in_clustered_key(index->table, i)) {
+ /* This field is needed in the query */
+
+ return(field);
+ }
+
+ /* This field is not needed in the query, skip it */
+
+ return(NULL);
+}
+
+/**************************************************************//**
+Determines if a field in a prebuilt struct 'template' is needed for
+index condition pushdown.
+@return whether the field is needed for index condition pushdown */
+inline
+bool
+build_template_needs_field_in_icp(
+/*==============================*/
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const row_prebuilt_t* prebuilt,/*!< in: row fetch template */
+ bool contains,/*!< in: whether the index contains
+ column i */
+ ulint i) /*!< in: column number */
+{
+ ut_ad(contains == dict_index_contains_col_or_prefix(index, i));
+
+ return(index == prebuilt->index
+ ? contains
+ : dict_index_contains_col_or_prefix(prebuilt->index, i));
+}
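+
+/* In other words: when the lookup index is prebuilt->index itself, the
+precomputed "contains" answer is reused; when "index" is the clustered
+index (the whole-row case), the column is needed for ICP only if the
+index actually being searched, prebuilt->index, contains it. */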
+
+/**************************************************************//**
+Adds a field to a prebuilt struct 'template'.
+@return the field template */
+static
+mysql_row_templ_t*
+build_template_field(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: template */
+ dict_index_t* clust_index, /*!< in: InnoDB clustered index */
+ dict_index_t* index, /*!< in: InnoDB index to use */
+ TABLE* table, /*!< in: MySQL table object */
+ const Field* field, /*!< in: field in MySQL table */
+ ulint i) /*!< in: field index in InnoDB table */
+{
+ mysql_row_templ_t* templ;
+ const dict_col_t* col;
+
+ ut_ad(field == table->field[i]);
+ ut_ad(clust_index->table == index->table);
+
+ col = dict_table_get_nth_col(index->table, i);
+
+ templ = prebuilt->mysql_template + prebuilt->n_template++;
+ UNIV_MEM_INVALID(templ, sizeof *templ);
+ templ->col_no = i;
+ templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
+ ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+ if (dict_index_is_clust(index)) {
+ templ->rec_field_no = templ->clust_rec_field_no;
+ } else {
+ templ->rec_field_no = dict_index_get_nth_col_pos(index, i);
+ }
+
+ if (field->real_maybe_null()) {
+ templ->mysql_null_byte_offset =
+ field->null_offset();
+
+ templ->mysql_null_bit_mask = (ulint) field->null_bit;
+ } else {
+ templ->mysql_null_bit_mask = 0;
+ }
+
+ templ->mysql_col_offset = (ulint) get_field_offset(table, field);
+
+ templ->mysql_col_len = (ulint) field->pack_length();
+ templ->type = col->mtype;
+ templ->mysql_type = (ulint) field->type();
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ templ->mysql_length_bytes = (ulint)
+ (((Field_varstring*) field)->length_bytes);
+ }
+
+ templ->charset = dtype_get_charset_coll(col->prtype);
+ templ->mbminlen = dict_col_get_mbminlen(col);
+ templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+ templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+
+ if (!dict_index_is_clust(index)
+ && templ->rec_field_no == ULINT_UNDEFINED) {
+ prebuilt->need_to_access_clustered = TRUE;
+ }
+
+ if (prebuilt->mysql_prefix_len < templ->mysql_col_offset
+ + templ->mysql_col_len) {
+ prebuilt->mysql_prefix_len = templ->mysql_col_offset
+ + templ->mysql_col_len;
+ }
+
+ if (templ->type == DATA_BLOB) {
+ prebuilt->templ_contains_blob = TRUE;
+ }
+
+ return(templ);
+}
+
+/**************************************************************//**
+Builds a 'template' in the prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+UNIV_INTERN
+void
+ha_innobase::build_template(
+/*========================*/
+ bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW,
+ false=ROW_MYSQL_REC_FIELDS */
+{
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ ulint n_fields;
+ ibool fetch_all_in_key = FALSE;
+ ibool fetch_primary_key_cols = FALSE;
+ ulint i;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We always retrieve the whole clustered index record if we
+ use exclusive row level locks, for example, if the read is
+ done in an UPDATE statement. */
+
+ whole_row = true;
+ } else if (!whole_row) {
+ if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_ALL_COLS) {
+
+ /* We know we must at least fetch all columns in the
+ key, or all columns in the table */
+
+ if (prebuilt->read_just_key) {
+ /* MySQL has instructed us that it is enough
+				to fetch the columns in the key; it appears
+				that MySQL can set this flag even when there
+				is only a prefix of the column in the key: in
+ that case we retrieve the whole column from
+ the clustered index */
+
+ fetch_all_in_key = TRUE;
+ } else {
+ whole_row = true;
+ }
+ } else if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_PRIMARY_KEY) {
+ /* We must at least fetch all primary key cols. Note
+ that if the clustered index was internally generated
+ by InnoDB on the row id (no primary key was
+ defined), then row_search_for_mysql() will always
+ retrieve the row id to a special buffer in the
+ prebuilt struct. */
+
+ fetch_primary_key_cols = TRUE;
+ }
+ }
+
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ index = whole_row ? clust_index : prebuilt->index;
+
+ prebuilt->need_to_access_clustered = (index == clust_index);
+
+ /* Either prebuilt->index should be a secondary index, or it
+ should be the clustered index. */
+ ut_ad(dict_index_is_clust(index) == (index == clust_index));
+
+ /* Below we check column by column if we need to access
+ the clustered index. */
+
+ n_fields = (ulint) table->s->fields; /* number of columns */
+
+ if (!prebuilt->mysql_template) {
+ prebuilt->mysql_template = (mysql_row_templ_t*)
+ mem_alloc(n_fields * sizeof(mysql_row_templ_t));
+ }
+
+ prebuilt->template_type = whole_row
+ ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS;
+ prebuilt->null_bitmap_len = table->s->null_bytes;
+
+ /* Prepare to build prebuilt->mysql_template[]. */
+ prebuilt->templ_contains_blob = FALSE;
+ prebuilt->mysql_prefix_len = 0;
+ prebuilt->n_template = 0;
+ prebuilt->idx_cond_n_cols = 0;
+
+ /* Note that in InnoDB, i is the column number in the table.
+ MySQL calls columns 'fields'. */
+
+ if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) {
+ /* Push down an index condition or an end_range check. */
+ for (i = 0; i < n_fields; i++) {
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
+
+ /* Test if an end_range or an index condition
+ refers to the field. Note that "index" and
+ "index_contains" may refer to the clustered index.
+ Index condition pushdown is relative to prebuilt->index
+ (the index that is being looked up first). */
+
+ /* When join_read_always_key() invokes this
+ code via handler::ha_index_init() and
+ ha_innobase::index_init(), end_range is not
+ yet initialized. Because of that, we must
+ always check for index_contains, instead of
+ the subset
+ field->part_of_key.is_set(active_index)
+ which would be acceptable if end_range==NULL. */
+ if (build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Needed in ICP */
+ const Field* field;
+ mysql_row_templ_t* templ;
+
+ if (whole_row) {
+ field = table->field[i];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i);
+ if (!field) {
+ continue;
+ }
+ }
+
+ templ = build_template_field(
+ prebuilt, clust_index, index,
+ table, field, i);
+ prebuilt->idx_cond_n_cols++;
+ ut_ad(prebuilt->idx_cond_n_cols
+ == prebuilt->n_template);
+
+ if (index == prebuilt->index) {
+ templ->icp_rec_field_no
+ = templ->rec_field_no;
+ } else {
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_pos(
+ prebuilt->index, i);
+ }
+
+ if (dict_index_is_clust(prebuilt->index)) {
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+ /* If the primary key includes
+ a column prefix, use it in
+ index condition pushdown,
+ because the condition is
+ evaluated before fetching any
+ off-page (externally stored)
+ columns. */
+ if (templ->icp_rec_field_no
+ < prebuilt->index->n_uniq) {
+ /* This is a key column;
+ all set. */
+ continue;
+ }
+ } else if (templ->icp_rec_field_no
+ != ULINT_UNDEFINED) {
+ continue;
+ }
+
+ /* This is a column prefix index.
+ The column prefix can be used in
+ an end_range comparison. */
+
+ templ->icp_rec_field_no
+ = dict_index_get_nth_col_or_prefix_pos(
+ prebuilt->index, i, TRUE);
+ ut_ad(templ->icp_rec_field_no
+ != ULINT_UNDEFINED);
+
+ /* Index condition pushdown can be used on
+ all columns of a secondary index, and on
+ the PRIMARY KEY columns. On the clustered
+ index, it must never be used on other than
+ PRIMARY KEY columns, because those columns
+ may be stored off-page, and we will not
+ fetch externally stored columns before
+ checking the index condition. */
+ /* TODO: test the above with an assertion
+ like this. Note that index conditions are
+ currently pushed down as part of the
+ "optimizer phase" while end_range is done
+ as part of the execution phase. Therefore,
+ we were unable to use an accurate condition
+ for end_range in the "if" condition above,
+ and the following assertion would fail.
+ ut_ad(!dict_index_is_clust(prebuilt->index)
+ || templ->rec_field_no
+ < prebuilt->index->n_uniq);
+ */
+ }
+ }
+
+ ut_ad(prebuilt->idx_cond_n_cols > 0);
+ ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template);
+
+ /* Include the fields that are not needed in index condition
+ pushdown. */
+ for (i = 0; i < n_fields; i++) {
+ const ibool index_contains
+ = dict_index_contains_col_or_prefix(index, i);
+
+ if (!build_template_needs_field_in_icp(
+ index, prebuilt, index_contains, i)) {
+ /* Not needed in ICP */
+ const Field* field;
+
+ if (whole_row) {
+ field = table->field[i];
+ } else {
+ field = build_template_needs_field(
+ index_contains,
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i);
+ if (!field) {
+ continue;
+ }
+ }
+
+ build_template_field(prebuilt,
+ clust_index, index,
+ table, field, i);
+ }
+ }
+
+ prebuilt->idx_cond = this;
+ } else {
+ /* No index condition pushdown */
+ prebuilt->idx_cond = NULL;
+
+ for (i = 0; i < n_fields; i++) {
+ const Field* field;
+
+ if (whole_row) {
+ field = table->field[i];
+ } else {
+ field = build_template_needs_field(
+ dict_index_contains_col_or_prefix(
+ index, i),
+ prebuilt->read_just_key,
+ fetch_all_in_key,
+ fetch_primary_key_cols,
+ index, table, i);
+ if (!field) {
+ continue;
+ }
+ }
+
+ build_template_field(prebuilt, clust_index, index,
+ table, field, i);
+ }
+ }
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ /* Change rec_field_no's to correspond to the clustered index
+ record */
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ mysql_row_templ_t* templ
+ = &prebuilt->mysql_template[i];
+
+ templ->rec_field_no = templ->clust_rec_field_no;
+ }
+ }
+}
+
+/********************************************************************//**
+This special handling is really to overcome the limitations of MySQL's
+binlogging. We need to eliminate the non-determinism that will arise in
+INSERT ... SELECT type of statements, since the MySQL binlog only stores
+the minimum value of the autoinc interval. Once that is fixed we can get
+rid of the special lock handling.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(!srv_read_only_mode);
+
+ switch (innobase_autoinc_lock_mode) {
+ case AUTOINC_NO_LOCKING:
+ /* Acquire only the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ break;
+
+ case AUTOINC_NEW_STYLE_LOCKING:
+		/* For simple (single/multi) row INSERTs, we fall back to the
+		old style only if another transaction has already acquired
+		the AUTOINC lock on behalf of a LOAD DATA INFILE or
+		INSERT ... SELECT etc. type of statement. */
+ if (thd_sql_command(user_thd) == SQLCOM_INSERT
+ || thd_sql_command(user_thd) == SQLCOM_REPLACE) {
+ dict_table_t* ib_table = prebuilt->table;
+
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(ib_table);
+
+ /* We need to check that another transaction isn't
+ already holding the AUTOINC lock on the table. */
+ if (ib_table->n_waiting_or_granted_auto_inc_locks) {
+ /* Release the mutex to avoid deadlocks. */
+ dict_table_autoinc_unlock(ib_table);
+ } else {
+ break;
+ }
+ }
+ /* Fall through to old style locking. */
+
+ case AUTOINC_OLD_STYLE_LOCKING:
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
+
+ if (error == DB_SUCCESS) {
+
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(error);
+}
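+
+/* The three cases above correspond to the settings of the
+innodb_autoinc_lock_mode system variable: 0 (traditional, old style
+table-level AUTOINC locking), 1 (consecutive, the new style default)
+and 2 (interleaved, AUTOINC mutex only, no AUTOINC table lock). */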
+
+/********************************************************************//**
+Reset the autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+ ulonglong autoinc) /*!< in: value to store */
+{
+ dberr_t error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_initialize(prebuilt->table, autoinc);
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ return(error);
+}
+
+/********************************************************************//**
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+ ulonglong auto_inc) /*!< in: value to store */
+{
+ dberr_t error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ return(error);
+}
+
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+ uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ int error_result= 0;
+ ibool auto_inc_used= FALSE;
+ ulint sql_command;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::write_row");
+
+ if (srv_read_only_mode) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (prebuilt->trx != trx) {
+ sql_print_error("The transaction object for the table handle "
+ "is at %p, but for the current thread it is at "
+ "%p",
+ (const void*) prebuilt->trx, (const void*) trx);
+
+ fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+ ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200);
+ fputs("\n"
+ "InnoDB: Dump of 200 bytes around ha_data: ",
+ stderr);
+ ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+ putc('\n', stderr);
+ ut_error;
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ ha_statistic_increment(&SSV::ha_write_count);
+
+ sql_command = thd_sql_command(user_thd);
+
+ if ((sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || sql_command == SQLCOM_CREATE_INDEX
+ || sql_command == SQLCOM_DROP_INDEX)
+ && num_write_row >= 10000) {
+		/* ALTER TABLE is COMMITted after every 10000 copied rows.
+		The IX table lock for the original table has to be re-issued.
+		As this method will be called on a temporary table to which
+		the contents of the original table are being copied, it is
+		a bit tricky to determine the source table. The cursor
+ position in the source table need not be adjusted after the
+ intermediate COMMIT, since writes by other transactions are
+ being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
+
+ dict_table_t* src_table;
+ enum lock_mode mode;
+
+ num_write_row = 0;
+
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+
+ /* Altering an InnoDB table */
+ /* Get the source table. */
+ src_table = lock_get_src_table(
+ prebuilt->trx, prebuilt->table, &mode);
+ if (!src_table) {
+no_commit:
+ /* Unknown situation: do not commit */
+ /*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ALTER TABLE is holding lock"
+ " on %lu tables!\n",
+ prebuilt->trx->mysql_n_tables_locked);
+ */
+ ;
+ } else if (src_table == prebuilt->table) {
+ /* Source table is not in InnoDB format:
+ no need to re-acquire locks on it. */
+
+ /* Altering to InnoDB format */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ } else {
+ /* Ensure that there are no other table locks than
+ LOCK_IX and LOCK_AUTO_INC on the destination table. */
+
+ if (!lock_is_table_exclusive(prebuilt->table,
+ prebuilt->trx)) {
+ goto no_commit;
+ }
+
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ trx_register_for_2pc(prebuilt->trx);
+ /* Re-acquire the table lock on the source table. */
+ row_lock_table_for_mysql(prebuilt, src_table, mode);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ }
+ }
+
+ num_write_row++;
+
+ /* This is the case where the table has an auto-increment column */
+ if (table->next_number_field && record == table->record[0]) {
+
+ /* Reset the error code before calling
+ innobase_get_auto_increment(). */
+ prebuilt->autoinc_error = DB_SUCCESS;
+
+ if ((error_result = update_auto_increment())) {
+ /* We don't want to mask autoinc overflow errors. */
+
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = prebuilt->autoinc_error;
+ goto report_error;
+ }
+
+ /* MySQL errors are passed straight back. */
+ goto func_exit;
+ }
+
+ auto_inc_used = TRUE;
+ }
+
+ if (prebuilt->mysql_template == NULL
+ || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
+
+ /* Build the template used in converting quickly between
+ the two database formats */
+
+ build_template(true);
+ }
+
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+ error = row_insert_for_mysql((byte*) record, prebuilt);
+ DEBUG_SYNC(user_thd, "ib_after_row_insert");
+
+ /* Handle duplicate key errors */
+ if (auto_inc_used) {
+ ulonglong auto_inc;
+ ulonglong col_max_value;
+
+ /* Note the number of rows processed for this statement, used
+ by get_auto_increment() to determine the number of AUTO-INC
+ values to reserve. This is only useful for a mult-value INSERT
+		values to reserve. This is only useful for a multi-value INSERT
+ if (trx->n_autoinc_rows > 0) {
+ --trx->n_autoinc_rows;
+ }
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ /* Get the value that MySQL attempted to store in the table.*/
+ auto_inc = table->next_number_field->val_int();
+
+ switch (error) {
+ case DB_DUPLICATE_KEY:
+
+ /* A REPLACE command and LOAD DATA INFILE REPLACE
+ handle a duplicate key error themselves, but we
+ must update the autoinc counter if we are performing
+ those statements. */
+
+ switch (sql_command) {
+ case SQLCOM_LOAD:
+ if (trx->duplicates) {
+
+ goto set_max_autoinc;
+ }
+ break;
+
+ case SQLCOM_REPLACE:
+ case SQLCOM_INSERT_SELECT:
+ case SQLCOM_REPLACE_SELECT:
+ goto set_max_autoinc;
+
+ default:
+ break;
+ }
+
+ break;
+
+ case DB_SUCCESS:
+ /* If the actual value inserted is greater than
+			the upper limit of the interval, then we try to
+ update the table upper limit. Note: last_value
+ will be 0 if get_auto_increment() was not called.*/
+
+ if (auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_a(prebuilt->autoinc_increment > 0);
+
+ ulonglong offset;
+ ulonglong increment;
+ dberr_t err;
+
+ offset = prebuilt->autoinc_offset;
+ increment = prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc,
+ 1, increment, offset,
+ col_max_value);
+
+ err = innobase_set_max_autoinc(
+ auto_inc);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+report_error:
+ if (error == DB_TABLESPACE_DELETED) {
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+ }
+
+ error_result = convert_error_code_to_mysql(error,
+ prebuilt->table->flags,
+ user_thd);
+
+ if (error_result == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ }
+
+func_exit:
+ innobase_active_small();
+
+ DBUG_RETURN(error_result);
+}
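+
+/* Note on the set_max_autoinc path above: after an INSERT that
+explicitly supplies an AUTO_INCREMENT value at or above the reserved
+range, the table's counter is advanced past the inserted value (per the
+session's auto_increment_increment/auto_increment_offset settings), so
+that later inserts do not reuse it. */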
+
+/**********************************************************************//**
+Checks which fields have changed in a row and stores information
+about them in an update vector.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+calc_row_difference(
+/*================*/
+ upd_t* uvect, /*!< in/out: update vector */
+ uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row, /*!< in: new row in MySQL format */
+ TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ uchar* upd_buff, /*!< in: buffer to use */
+ ulint buff_len, /*!< in: buffer length */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
+{
+ uchar* original_upd_buff = upd_buff;
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint o_len;
+ ulint n_len;
+ ulint col_pack_len;
+ const byte* new_mysql_row_col;
+ const byte* o_ptr;
+ const byte* n_ptr;
+ byte* buf;
+ upd_field_t* ufield;
+ ulint col_type;
+ ulint n_changed = 0;
+ dfield_t dfield;
+ dict_index_t* clust_index;
+ uint i;
+ ibool changes_fts_column = FALSE;
+ ibool changes_fts_doc_col = FALSE;
+ trx_t* trx = thd_to_trx(thd);
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+
+ ut_ad(!srv_read_only_mode);
+
+ n_fields = table->s->fields;
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ /* We use upd_buff to convert changed fields */
+ buf = (byte*) upd_buff;
+
+ for (i = 0; i < n_fields; i++) {
+ field = table->field[i];
+
+ o_ptr = (const byte*) old_row + get_field_offset(table, field);
+ n_ptr = (const byte*) new_row + get_field_offset(table, field);
+
+		/* Use new_mysql_row_col and col_pack_len to save the values */
+
+ new_mysql_row_col = n_ptr;
+ col_pack_len = field->pack_length();
+
+ o_len = col_pack_len;
+ n_len = col_pack_len;
+
+ /* We use o_ptr and n_ptr to dig up the actual data for
+ comparison. */
+
+ field_mysql_type = field->type();
+
+ col_type = prebuilt->table->cols[i].mtype;
+
+ switch (col_type) {
+
+ case DATA_BLOB:
+ o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+ n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
+
+ o_ptr = row_mysql_read_true_varchar(
+ &o_len, o_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+
+ n_ptr = row_mysql_read_true_varchar(
+ &n_len, n_ptr,
+ (ulint)
+ (((Field_varstring*) field)->length_bytes));
+ }
+
+ break;
+ default:
+ ;
+ }
+
+ if (field_mysql_type == MYSQL_TYPE_LONGLONG
+ && prebuilt->table->fts
+ && innobase_strcasecmp(
+ field->field_name, FTS_DOC_ID_COL_NAME) == 0) {
+ doc_id = (doc_id_t) mach_read_from_n_little_endian(
+ n_ptr, 8);
+ if (doc_id == 0) {
+ return(DB_FTS_INVALID_DOCID);
+ }
+ }
+
+
+ if (field->real_maybe_null()) {
+ if (field->is_null_in_record(old_row)) {
+ o_len = UNIV_SQL_NULL;
+ }
+
+ if (field->is_null_in_record(new_row)) {
+ n_len = UNIV_SQL_NULL;
+ }
+ }
+
+ if (o_len != n_len || (o_len != UNIV_SQL_NULL &&
+ 0 != memcmp(o_ptr, n_ptr, o_len))) {
+ /* The field has changed */
+
+ ufield = uvect->fields + n_changed;
+ UNIV_MEM_INVALID(ufield, sizeof *ufield);
+
+ /* Let us use a dummy dfield to make the conversion
+ from the MySQL column format to the InnoDB format */
+
+ if (n_len != UNIV_SQL_NULL) {
+ dict_col_copy_type(prebuilt->table->cols + i,
+ dfield_get_type(&dfield));
+
+ buf = row_mysql_store_col_in_innobase_format(
+ &dfield,
+ (byte*) buf,
+ TRUE,
+ new_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(prebuilt->table));
+ dfield_copy(&ufield->new_val, &dfield);
+ } else {
+ dfield_set_null(&ufield->new_val);
+ }
+
+ ufield->exp = NULL;
+ ufield->orig_len = 0;
+ ufield->field_no = dict_col_get_clust_pos(
+ &prebuilt->table->cols[i], clust_index);
+ n_changed++;
+
+ /* If an FTS indexed column was changed by this
+ UPDATE then we need to inform the FTS sub-system.
+
+ NOTE: Currently we re-index all FTS indexed columns
+ even if only a subset of the FTS indexed columns
+ have been updated. That is the reason we are
+ checking only once here. Later we will need to
+ note which columns have been updated and do
+ selective processing. */
+ if (prebuilt->table->fts != NULL) {
+ ulint offset;
+ dict_table_t* innodb_table;
+
+ innodb_table = prebuilt->table;
+
+ if (!changes_fts_column) {
+ offset = row_upd_changes_fts_column(
+ innodb_table, ufield);
+
+ if (offset != ULINT_UNDEFINED) {
+ changes_fts_column = TRUE;
+ }
+ }
+
+ if (!changes_fts_doc_col) {
+ changes_fts_doc_col =
+ row_upd_changes_doc_id(
+ innodb_table, ufield);
+ }
+ }
+ }
+ }
+
+ /* If the update changes a column with an FTS index on it, we
+ then add an update column node with a new document id to the
+ other changes. We piggy back our changes on the normal UPDATE
+ to reduce processing and IO overhead. */
+ if (!prebuilt->table->fts) {
+ trx->fts_next_doc_id = 0;
+ } else if (changes_fts_column || changes_fts_doc_col) {
+ dict_table_t* innodb_table = prebuilt->table;
+
+ ufield = uvect->fields + n_changed;
+
+ if (!DICT_TF2_FLAG_IS_SET(
+ innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+			/* If the Doc ID is managed by the user, and any
+			FTS indexed column has been updated, its corresponding
+			Doc ID must also be updated. Otherwise, return an
+			error. */
+ if (changes_fts_column && !changes_fts_doc_col) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: A new Doc ID"
+ " must be supplied while updating"
+ " FTS indexed columns.\n");
+ return(DB_FTS_INVALID_DOCID);
+ }
+
+ /* Doc ID must monotonically increase */
+ ut_ad(innodb_table->fts->cache);
+ if (doc_id < prebuilt->table->fts->cache->next_doc_id) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID must be larger than"
+ " " IB_ID_FMT " for table",
+ innodb_table->fts->cache->next_doc_id
+ - 1);
+ ut_print_name(stderr, trx,
+ TRUE, innodb_table->name);
+ putc('\n', stderr);
+
+ return(DB_FTS_INVALID_DOCID);
+ } else if ((doc_id
+ - prebuilt->table->fts->cache->next_doc_id)
+ >= FTS_DOC_ID_MAX_STEP) {
+ fprintf(stderr,
+ "InnoDB: Doc ID " UINT64PF " is too"
+					" big. Its difference from the"
+					" largest Doc ID used " UINT64PF
+					" must be less than %d\n",
+ doc_id,
+ prebuilt->table->fts->cache->next_doc_id - 1,
+ FTS_DOC_ID_MAX_STEP);
+ }
+
+
+ trx->fts_next_doc_id = doc_id;
+ } else {
+ /* If the Doc ID is a hidden column, it can't be
+ changed by user */
+ ut_ad(!changes_fts_doc_col);
+
+ /* Doc ID column is hidden, a new Doc ID will be
+ generated by following fts_update_doc_id() call */
+ trx->fts_next_doc_id = 0;
+ }
+
+ fts_update_doc_id(
+ innodb_table, ufield, &trx->fts_next_doc_id);
+
+ ++n_changed;
+ } else {
+ /* We have a Doc ID column, but none of FTS indexed
+ columns are touched, nor the Doc ID column, so set
+ fts_next_doc_id to UINT64_UNDEFINED, which means do not
+ update the Doc ID column */
+ trx->fts_next_doc_id = UINT64_UNDEFINED;
+ }
+
+ uvect->n_fields = n_changed;
+ uvect->info_bits = 0;
+
+ ut_a(buf <= (byte*) original_upd_buff + buff_len);
+
+ return(DB_SUCCESS);
+}
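+
+/* A rough example: for UPDATE t SET c2 = c2 + 1 on a table with no
+FTS indexes, only the c2 column compares unequal between old_row and
+new_row, so uvect receives a single upd_field_t whose field_no is the
+position of c2 in the clustered index, and uvect->n_fields ends up
+as 1. */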
+
+/**********************************************************************//**
+Updates a row given as a parameter to a new value. Note that we are given
+whole rows, not just the fields which are updated: this incurs some CPU
+overhead when we check which fields are actually updated.
+TODO: currently InnoDB does not prevent the 'Halloween problem':
+in a searched update a single row can get updated several times
+if its index columns are updated!
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::update_row(
+/*====================*/
+ const uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row) /*!< in: new row in MySQL format */
+{
+ upd_t* uvect;
+ dberr_t error;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::update_row");
+
+ ut_a(prebuilt->trx == trx);
+
+ if (srv_read_only_mode) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ if (upd_buf == NULL) {
+ ut_ad(upd_buf_size == 0);
+
+		/* Create a buffer for packing the fields of a record. Why
+		did table->reclength not work here? Obviously, because char
+		fields, when packed, actually become 1 byte longer when we
+		also store the string length as the first byte. */
+
+ upd_buf_size = table->s->reclength + table->s->max_key_length
+ + MAX_REF_PARTS * 3;
+ upd_buf = (uchar*) my_malloc(upd_buf_size, MYF(MY_WME));
+ if (upd_buf == NULL) {
+ upd_buf_size = 0;
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
+
+ ha_statistic_increment(&SSV::ha_update_count);
+
+ if (prebuilt->upd_node) {
+ uvect = prebuilt->upd_node->update;
+ } else {
+ uvect = row_get_prebuilt_update_vector(prebuilt);
+ }
+
+ /* Build an update vector from the modified fields in the rows
+ (uses upd_buf of the handle) */
+
+ error = calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+ upd_buf, upd_buf_size, prebuilt, user_thd);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ /* This is not a delete */
+ prebuilt->upd_node->is_delete = FALSE;
+
+ ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+
+ innobase_srv_conc_enter_innodb(trx);
+
+ error = row_update_for_mysql((byte*) old_row, prebuilt);
+
+ /* We need to do some special AUTOINC handling for the following case:
+
+ INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ...
+
+ We need to use the AUTOINC counter that was actually used by
+ MySQL in the UPDATE statement, which can be different from the
+ value used in the INSERT statement.*/
+
+ if (error == DB_SUCCESS
+ && table->next_number_field
+ && new_row == table->record[0]
+ && thd_sql_command(user_thd) == SQLCOM_INSERT
+ && trx->duplicates) {
+
+ ulonglong auto_inc;
+ ulonglong col_max_value;
+
+ auto_inc = table->next_number_field->val_int();
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ if (auto_inc <= col_max_value && auto_inc != 0) {
+
+ ulonglong offset;
+ ulonglong increment;
+
+ offset = prebuilt->autoinc_offset;
+ increment = prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc, 1, increment, offset, col_max_value);
+
+ error = innobase_set_max_autoinc(auto_inc);
+ }
+ }
+
+ innobase_srv_conc_exit_innodb(trx);
+
+func_exit:
+ int err = convert_error_code_to_mysql(error,
+ prebuilt->table->flags, user_thd);
+
+ /* If success and no columns were updated. */
+ if (err == 0 && uvect->n_fields == 0) {
+
+ /* This is the same as success, but instructs
+ MySQL that the row is not really updated and it
+ should not increase the count of updated rows.
+		This is the fix for http://bugs.mysql.com/29157 */
+ err = HA_ERR_RECORD_IS_THE_SAME;
+ } else if (err == HA_FTS_INVALID_DOCID) {
+ my_error(HA_FTS_INVALID_DOCID, MYF(0));
+ }
+
+ /* Tell InnoDB server that there might be work for
+ utility threads: */
+
+ innobase_active_small();
+
+ DBUG_RETURN(err);
+}
+
+/**********************************************************************//**
+Deletes a row given as the parameter.
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::delete_row(
+/*====================*/
+ const uchar* record) /*!< in: a row in MySQL format */
+{
+ dberr_t error;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::delete_row");
+
+ ut_a(prebuilt->trx == trx);
+
+ if (srv_read_only_mode) {
+ ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ ha_statistic_increment(&SSV::ha_delete_count);
+
+ if (!prebuilt->upd_node) {
+ row_get_prebuilt_update_vector(prebuilt);
+ }
+
+ /* This is a delete */
+
+ prebuilt->upd_node->is_delete = TRUE;
+
+ innobase_srv_conc_enter_innodb(trx);
+
+ error = row_update_for_mysql((byte*) record, prebuilt);
+
+ innobase_srv_conc_exit_innodb(trx);
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ innobase_active_small();
+
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, prebuilt->table->flags, user_thd));
+}
+
+/**********************************************************************//**
+Removes a lock that was newly set on a row, if the row was not read
+optimistically. This can be called after a row has been read in the
+processing of an UPDATE or a DELETE query, if the option
+innodb_locks_unsafe_for_binlog is set. */
+UNIV_INTERN
+void
+ha_innobase::unlock_row(void)
+/*=========================*/
+{
+ DBUG_ENTER("ha_innobase::unlock_row");
+
+ /* Consistent read does not take any locks, thus there is
+ nothing to unlock. */
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+ DBUG_VOID_RETURN;
+ }
+
+	/* Ideally, this assert should be at the beginning of the function.
+ But there are some calls to this function from the SQL layer when the
+ transaction is in state TRX_STATE_NOT_STARTED. The check on
+ prebuilt->select_lock_type above gets around this issue. */
+ ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE));
+
+ switch (prebuilt->row_read_type) {
+ case ROW_READ_WITH_LOCKS:
+ if (!srv_locks_unsafe_for_binlog
+ && prebuilt->trx->isolation_level
+ > TRX_ISO_READ_COMMITTED) {
+ break;
+ }
+ /* fall through */
+ case ROW_READ_TRY_SEMI_CONSISTENT:
+ row_unlock_for_mysql(prebuilt, FALSE);
+ break;
+ case ROW_READ_DID_SEMI_CONSISTENT:
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ break;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+bool
+ha_innobase::was_semi_consistent_read(void)
+/*=======================================*/
+{
+ return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+void
+ha_innobase::try_semi_consistent_read(bool yes)
+/*===========================================*/
+{
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	/* The row read type is set to semi-consistent read if this was
+	requested by MySQL and either the innodb_locks_unsafe_for_binlog
+	option is used or this session is using the READ COMMITTED
+	isolation level. */
+
+ if (yes
+ && (srv_locks_unsafe_for_binlog
+ || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
+ }
+}
+
+/******************************************************************//**
+Initializes a handle to use an index.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::index_init(
+/*====================*/
+ uint keynr, /*!< in: key (index) number */
+ bool sorted) /*!< in: 1 if result MUST be sorted according to index */
+{
+ DBUG_ENTER("index_init");
+
+ DBUG_RETURN(change_active_index(keynr));
+}
+
+/******************************************************************//**
+Ends the use of an index and resets the associated scan state.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::index_end(void)
+/*========================*/
+{
+ int error = 0;
+ DBUG_ENTER("index_end");
+ active_index = MAX_KEY;
+ in_range_check_pushed_down = FALSE;
+ ds_mrr.dsmrr_close();
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB. */
+static inline
+ulint
+convert_search_mode_to_innobase(
+/*============================*/
+ enum ha_rkey_function find_flag)
+{
+ switch (find_flag) {
+ case HA_READ_KEY_EXACT:
+ /* this does not require the index to be UNIQUE */
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_NEXT:
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_PREV:
+ return(PAGE_CUR_LE);
+ case HA_READ_AFTER_KEY:
+ return(PAGE_CUR_G);
+ case HA_READ_BEFORE_KEY:
+ return(PAGE_CUR_L);
+ case HA_READ_PREFIX:
+ return(PAGE_CUR_GE);
+ case HA_READ_PREFIX_LAST:
+ return(PAGE_CUR_LE);
+ case HA_READ_PREFIX_LAST_OR_PREV:
+ return(PAGE_CUR_LE);
+ /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always
+ pass a complete-field prefix of a key value as the search
+	tuple. I.e., the last field is not allowed to contain just
+	the first n bytes of the full field value.
+ MySQL uses a 'padding' trick to convert LIKE 'abc%'
+ type queries so that it can use as a search tuple
+ a complete-field-prefix of a key value. Thus, the InnoDB
+ search mode PAGE_CUR_LE_OR_EXTENDS is never used.
+ TODO: when/if MySQL starts to use also partial-field
+ prefixes, we have to deal with stripping of spaces
+ and comparison of non-latin1 char type fields in
+ innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to
+ work correctly. */
+ case HA_READ_MBR_CONTAIN:
+ case HA_READ_MBR_INTERSECT:
+ case HA_READ_MBR_WITHIN:
+ case HA_READ_MBR_DISJOINT:
+ case HA_READ_MBR_EQUAL:
+ return(PAGE_CUR_UNSUPP);
+ /* do not use "default:" in order to produce a gcc warning:
+ enumeration value '...' not handled in switch
+ (if -Wswitch or -Wall is used) */
+ }
+
+ my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
+
+ return(PAGE_CUR_UNSUPP);
+}
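+
+/* For example, a range scan for "WHERE a > 5" arrives here as
+HA_READ_AFTER_KEY and maps to PAGE_CUR_G (position on the first record
+greater than the key), while "WHERE a >= 5" arrives as
+HA_READ_KEY_OR_NEXT and maps to PAGE_CUR_GE. */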
+
+/*
+ BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
+ ---------------------------------------------------
+The following does not cover all the details, but explains how we determine
+the start of a new SQL statement, and what is associated with it.
+
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB 'prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
+
+ A) if the user has not explicitly set any MySQL table level locks:
+
+ 1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
+
+ 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
+
+ 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
+
+ 4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
+
+ 5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
+
+ B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement we at the start of
+::index_read also compare the query id to the latest query id where the
+table handle instance was used. If it has changed, we know we are at the
+start of a new SQL statement. Since the query id can theoretically wrap
+around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
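+
+/* A sketch of the resulting call sequence for an autocommit
+"SELECT * FROM t" (case A above):
+	external_lock(F_RDLCK)	-> prebuilt->sql_stat_start = TRUE
+	::index_read()/...	-> template built, rows fetched
+	external_lock(F_UNLCK)	-> trx->n_mysql_tables_in_use drops to 0,
+				   COMMIT executed for the read transaction */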
+
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_read(
+/*====================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ const uchar* key_ptr, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index; this can
+ also contain an InnoDB row id, in
+ which case key_len is the InnoDB
+ row id length; the key value can
+ also be a prefix of a full key value,
+ and the last column can be a prefix
+ of a full column */
+ uint key_len,/*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ ulint mode;
+ dict_index_t* index;
+ ulint match_mode = 0;
+ int error;
+ dberr_t ret;
+
+ DBUG_ENTER("index_read");
+ DEBUG_SYNC_C("ha_innobase_index_read_begin");
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+ ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
+
+ ha_statistic_increment(&SSV::ha_read_key_count);
+
+ index = prebuilt->index;
+
+ if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) {
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ DBUG_RETURN(dict_index_is_corrupted(index)
+ ? HA_ERR_INDEX_CORRUPT
+ : HA_ERR_TABLE_DEF_CHANGED);
+ }
+
+ if (index->type & DICT_FTS) {
+ DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
+ }
+
+	/* Note that the index for which the search template is built is not
+	necessarily prebuilt->index; it can also be the clustered index */
+
+ if (prebuilt->sql_stat_start) {
+ build_template(false);
+ }
+
+ if (key_ptr) {
+ /* Convert the search key value to InnoDB format into
+ prebuilt->search_tuple */
+
+ row_sel_convert_mysql_key_to_innobase(
+ prebuilt->search_tuple,
+ prebuilt->srch_key_val1,
+ prebuilt->srch_key_val_len,
+ index,
+ (byte*) key_ptr,
+ (ulint) key_len,
+ prebuilt->trx);
+ DBUG_ASSERT(prebuilt->search_tuple->n_fields > 0);
+ } else {
+ /* We position the cursor to the last or the first entry
+ in the index */
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+ }
+
+ mode = convert_search_mode_to_innobase(find_flag);
+
+ match_mode = 0;
+
+ if (find_flag == HA_READ_KEY_EXACT) {
+
+ match_mode = ROW_SEL_EXACT;
+
+ } else if (find_flag == HA_READ_PREFIX
+ || find_flag == HA_READ_PREFIX_LAST) {
+
+ match_mode = ROW_SEL_EXACT_PREFIX;
+ }
+
+ last_match_mode = (uint) match_mode;
+
+ if (mode != PAGE_CUR_UNSUPP) {
+
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+ ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
+ match_mode, 0);
+
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+ } else {
+
+ ret = DB_UNSUPPORTED;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+		ib_senderrf(
+			prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+			ER_TABLESPACE_MISSING,
+			table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, prebuilt->table->flags, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
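+
+/* For illustration only (assumed caller, not from this file): an exact
+lookup as the server layer would issue it. key_buf holds the key in the
+MySQL key format and buf receives rows in the MySQL row format:
+
+	int	err = h->index_read(buf, key_buf, key_len,
+				    HA_READ_KEY_EXACT);
+
+	while (err == 0) {
+		// process the row in buf, then fetch the next row with
+		// the same key value (uses last_match_mode internally)
+		err = h->index_next_same(buf, key_buf, key_len);
+	}
+*/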
+
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix.
+@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */
+UNIV_INTERN
+int
+ha_innobase::index_read_last(
+/*=========================*/
+ uchar* buf, /*!< out: fetched row */
+ const uchar* key_ptr,/*!< in: key value, or a prefix of a full
+ key value */
+ uint key_len)/*!< in: length of the key val or prefix
+ in bytes */
+{
+ return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/********************************************************************//**
+Get the index for a handle. Does not change active index.
+@return NULL or index instance. */
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always
+ clustered index, even if it was internally
+ generated by InnoDB */
+{
+ KEY* key = 0;
+ dict_index_t* index = 0;
+
+ DBUG_ENTER("innobase_get_index");
+
+ if (keynr != MAX_KEY && table->s->keys > 0) {
+ key = table->key_info + keynr;
+
+ index = innobase_index_lookup(share, keynr);
+
+ if (index) {
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_warning("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
+
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
+ } else {
+ index = dict_table_get_first_index(prebuilt->table);
+ }
+
+ if (!index) {
+		sql_print_error(
+			"InnoDB could not find key no %u with name %s "
+ "from dict cache for table %s",
+ keynr, key ? key->name : "NULL",
+ prebuilt->table->name);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/********************************************************************//**
+Changes the active index of a handle.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::change_active_index(
+/*=============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always clustered
+ index, even if it was internally generated by
+ InnoDB */
+{
+ DBUG_ENTER("change_active_index");
+
+ ut_ad(user_thd == ha_thd());
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ active_index = keynr;
+
+ prebuilt->index = innobase_get_index(keynr);
+
+ if (UNIV_UNLIKELY(!prebuilt->index)) {
+ sql_print_warning("InnoDB: change_active_index(%u) failed",
+ keynr);
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(1);
+ }
+
+ prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx,
+ prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ if (dict_index_is_corrupted(prebuilt->index)) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ index_name, sizeof index_name,
+ prebuilt->index->name, TRUE);
+
+ innobase_format_name(
+ table_name, sizeof table_name,
+ prebuilt->index->table->name, FALSE);
+
+ push_warning_printf(
+ user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s for table %s is"
+ " marked as corrupted",
+ index_name, table_name);
+ DBUG_RETURN(HA_ERR_INDEX_CORRUPT);
+ } else {
+ push_warning_printf(
+ user_thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: insufficient history for index %u",
+ keynr);
+ }
+
+ /* The caller seems to ignore this. Thus, we must check
+ this again in row_search_for_mysql(). */
+ DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED);
+ }
+
+ ut_a(prebuilt->search_tuple != 0);
+
+ dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
+
+ dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
+ prebuilt->index->n_fields);
+
+ /* MySQL changes the active index for a handle also during some
+ queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
+ and then calculates the sum. Previously we played safe and used
+ the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
+ copying. Starting from MySQL-4.1 we use a more efficient flag here. */
+
+ build_template(false);
+
+ DBUG_RETURN(0);
+}
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in keynr. Fetches the
+row if any.
+??? This is only used to read whole keys ???
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::index_read_idx(
+/*========================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ uint keynr, /*!< in: use this index */
+ const uchar* key, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index */
+ uint key_len, /*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ if (change_active_index(keynr)) {
+
+ return(1);
+ }
+
+ return(index_read(buf, key, key_len, find_flag));
+}
+
+/***********************************************************************//**
+Reads the next or previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::general_fetch(
+/*=======================*/
+ uchar* buf, /*!< in/out: buffer for next row in MySQL
+ format */
+ uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
+ uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or
+ ROW_SEL_EXACT_PREFIX */
+{
+ dberr_t ret;
+ int error;
+
+ DBUG_ENTER("general_fetch");
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+ ret = row_search_for_mysql(
+ (byte*) buf, 0, prebuilt, match_mode, direction);
+
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, prebuilt->table->flags, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/***********************************************************************//**
+Reads the next row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for next row in MySQL
+ format */
+{
+ ha_statistic_increment(&SSV::ha_read_next_count);
+
+ return(general_fetch(buf, ROW_SEL_NEXT, 0));
+}
+
+/*******************************************************************//**
+Reads the next row matching to the key value given as the parameter.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next_same(
+/*=========================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ const uchar* key, /*!< in: key value */
+ uint keylen) /*!< in: key value length */
+{
+ ha_statistic_increment(&SSV::ha_read_next_count);
+
+ return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode));
+}
+
+/***********************************************************************//**
+Reads the previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_prev(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for previous row in MySQL format */
+{
+ ha_statistic_increment(&SSV::ha_read_prev_count);
+
+ return(general_fetch(buf, ROW_SEL_PREV, 0));
+}
+
+/********************************************************************//**
+Positions a cursor on the first record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_first(
+/*=====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ int error;
+
+ DBUG_ENTER("index_first");
+ ha_statistic_increment(&SSV::ha_read_first_count);
+
+ error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/********************************************************************//**
+Positions a cursor on the last record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_last(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ int error;
+
+ DBUG_ENTER("index_last");
+ ha_statistic_increment(&SSV::ha_read_last_count);
+
+ error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/****************************************************************//**
+Initialize a table scan.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_init(
+/*==================*/
+	bool	scan)	/*!< in: TRUE if table/index scan, FALSE otherwise */
+{
+ int err;
+
+ /* Store the active index value so that we can restore the original
+ value after a scan */
+
+ if (prebuilt->clust_index_was_generated) {
+ err = change_active_index(MAX_KEY);
+ } else {
+ err = change_active_index(primary_key);
+ }
+
+ /* Don't use semi-consistent read in random row reads (by position).
+	This means we must disable semi_consistent_read if scan is false. */
+
+ if (!scan) {
+ try_semi_consistent_read(0);
+ }
+
+ start_of_scan = 1;
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ends a table scan.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_end(void)
+/*======================*/
+{
+ return(index_end());
+}
+
+/*****************************************************************//**
+Reads the next row in a table scan (also used to read the FIRST row
+in a table scan).
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_next(
+/*==================*/
+ uchar* buf) /*!< in/out: returns the row in this buffer,
+ in MySQL format */
+{
+ int error;
+
+ DBUG_ENTER("rnd_next");
+ ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+
+ if (start_of_scan) {
+ error = index_first(buf);
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ start_of_scan = 0;
+ } else {
+ error = general_fetch(buf, ROW_SEL_NEXT, 0);
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Fetches a row from the table based on a row reference.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */
+UNIV_INTERN
+int
+ha_innobase::rnd_pos(
+/*=================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ uchar* pos) /*!< in: primary key value of the row in the
+ MySQL format, or the row id if the clustered
+ index was internally generated by InnoDB; the
+ length of data in pos has to be ref_length */
+{
+ int error;
+ DBUG_ENTER("rnd_pos");
+ DBUG_DUMP("key", pos, ref_length);
+
+ ha_statistic_increment(&SSV::ha_read_rnd_count);
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ /* Note that we assume the length of the row reference is fixed
+ for the table, and it is == ref_length */
+
+ error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
+
+ if (error) {
+ DBUG_PRINT("error", ("Got error: %d", error));
+ }
+
+ DBUG_RETURN(error);
+}
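+
+/* For illustration only (assumed caller, not from this file): the server
+pairs position() and rnd_pos() to re-fetch a row later. saved_ref is a
+hypothetical buffer of ref_length bytes:
+
+	h->position(record);			// fills h->ref
+	memcpy(saved_ref, h->ref, h->ref_length);
+	// ... other work ...
+	h->rnd_pos(buf, saved_ref);		// re-reads the same row
+*/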
+
+/**********************************************************************//**
+Initialize FT index scan
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::ft_init()
+/*==================*/
+{
+ DBUG_ENTER("ft_init");
+
+ trx_t* trx = check_trx_exists(ha_thd());
+
+ /* FTS queries are not treated as autocommit non-locking selects.
+ This is because the FTS implementation can acquire locks behind
+ the scenes. This has not been verified but it is safer to treat
+ them as regular read only transactions for now. */
+
+ if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ DBUG_RETURN(rnd_init(false));
+}
+
+/**********************************************************************//**
+Initialize FT index scan
+@return FT_INFO structure if successful or NULL */
+UNIV_INTERN
+FT_INFO*
+ha_innobase::ft_init_ext(
+/*=====================*/
+	uint			flags,	/* in: FT query flags (e.g.
+					FT_BOOL for boolean mode) */
+	uint			keynr,	/* in: number of the FULLTEXT
+					index to use */
+	String*			key)	/* in: FTS search query string */
+{
+ trx_t* trx;
+ dict_table_t* ft_table;
+ dberr_t error;
+ byte* query = (byte*) key->ptr();
+ ulint query_len = key->length();
+ const CHARSET_INFO* char_set = key->charset();
+ NEW_FT_INFO* fts_hdl = NULL;
+ dict_index_t* index;
+ fts_result_t* result;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+
+ if (fts_enable_diag_print) {
+ fprintf(stderr, "keynr=%u, '%.*s'\n",
+ keynr, (int) key->length(), (byte*) key->ptr());
+
+ if (flags & FT_BOOL) {
+ fprintf(stderr, "BOOL search\n");
+ } else {
+ fprintf(stderr, "NL search\n");
+ }
+ }
+
+	/* FIXME: utf32 and utf16 are not compatible with some of the
+	string functions used below. So convert the query to utf8
+	before proceeding. */
+ if (strcmp(char_set->csname, "utf32") == 0
+ || strcmp(char_set->csname, "utf16") == 0) {
+ buf_tmp_used = innobase_convert_string(
+ buf_tmp, sizeof(buf_tmp) - 1,
+ &my_charset_utf8_general_ci,
+ query, query_len, (CHARSET_INFO*) char_set,
+ &num_errors);
+
+ query = (byte*) buf_tmp;
+ query_len = buf_tmp_used;
+ query[query_len] = 0;
+ }
+
+ trx = prebuilt->trx;
+
+ /* FTS queries are not treated as autocommit non-locking selects.
+ This is because the FTS implementation can acquire locks behind
+ the scenes. This has not been verified but it is safer to treat
+ them as regular read only transactions for now. */
+
+ if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ ft_table = prebuilt->table;
+
+ /* Table does not have an FTS index */
+ if (!ft_table->fts || ib_vector_is_empty(ft_table->fts->indexes)) {
+ my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+ return(NULL);
+ }
+
+ /* If tablespace is discarded, we should return here */
+ if (dict_table_is_discarded(ft_table)) {
+ my_error(ER_NO_SUCH_TABLE, MYF(0), table->s->db.str,
+ table->s->table_name.str);
+ return(NULL);
+ }
+
+ if (keynr == NO_SUCH_KEY) {
+ /* FIXME: Investigate the NO_SUCH_KEY usage */
+ index = (dict_index_t*) ib_vector_getp(ft_table->fts->indexes, 0);
+ } else {
+ index = innobase_get_index(keynr);
+ }
+
+ if (!index || index->type != DICT_FTS) {
+ my_error(ER_TABLE_HAS_NO_FT, MYF(0));
+ return(NULL);
+ }
+
+ if (!(ft_table->fts->fts_status & ADDED_TABLE_SYNCED)) {
+ fts_init_index(ft_table, FALSE);
+
+ ft_table->fts->fts_status |= ADDED_TABLE_SYNCED;
+ }
+
+ error = fts_query(trx, index, flags, query, query_len, &result);
+
+ if (error != DB_SUCCESS) {
+ my_error(convert_error_code_to_mysql(error, 0, NULL),
+ MYF(0));
+ return(NULL);
+ }
+
+ /* Allocate FTS handler, and instantiate it before return */
+ fts_hdl = static_cast<NEW_FT_INFO*>(my_malloc(sizeof(NEW_FT_INFO),
+ MYF(0)));
+
+ fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result);
+ fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result);
+ fts_hdl->ft_prebuilt = prebuilt;
+ fts_hdl->ft_result = result;
+
+	/* FIXME: Re-evaluate the condition when Bug 14469540
+	is resolved */
+ prebuilt->in_fts_query = true;
+
+ return((FT_INFO*) fts_hdl);
+}
+
+/*****************************************************************//**
+Set up search tuple for a query through FTS_DOC_ID_INDEX on
+supplied Doc ID. This is used by MySQL to retrieve the documents
+once the search result (Doc IDs) is available */
+static
+void
+innobase_fts_create_doc_id_key(
+/*===========================*/
+ dtuple_t* tuple, /* in/out: prebuilt->search_tuple */
+ const dict_index_t*
+ index, /* in: index (FTS_DOC_ID_INDEX) */
+ doc_id_t* doc_id) /* in/out: doc id to search, value
+ could be changed to storage format
+ used for search. */
+{
+ doc_id_t temp_doc_id;
+ dfield_t* dfield = dtuple_get_nth_field(tuple, 0);
+
+ ut_a(dict_index_get_n_unique(index) == 1);
+
+ dtuple_set_n_fields(tuple, index->n_fields);
+ dict_index_copy_types(tuple, index, index->n_fields);
+
+#ifdef UNIV_DEBUG
+	/* The unique Doc ID field should be an eight-byte integer */
+ dict_field_t* field = dict_index_get_nth_field(index, 0);
+ ut_a(field->col->mtype == DATA_INT);
+ ut_ad(sizeof(*doc_id) == field->fixed_len);
+ ut_ad(innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME) == 0);
+#endif /* UNIV_DEBUG */
+
+ /* Convert to storage byte order */
+ mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id);
+ *doc_id = temp_doc_id;
+ dfield_set_data(dfield, doc_id, sizeof(*doc_id));
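+
+	/* For example (illustrative): a Doc ID of 258 (0x102) is stored
+	as the big-endian byte sequence 00 00 00 00 00 00 01 02, so that
+	byte-wise comparison in the index orders Doc IDs numerically. */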
+
+ dtuple_set_n_fields_cmp(tuple, 1);
+
+ for (ulint i = 1; i < index->n_fields; i++) {
+ dfield = dtuple_get_nth_field(tuple, i);
+ dfield_set_null(dfield);
+ }
+}
+
+/**********************************************************************//**
+Fetch next result from the FT result set
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::ft_read(
+/*=================*/
+ uchar* buf) /*!< in/out: buf contain result row */
+{
+ fts_result_t* result;
+ int error;
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = ((NEW_FT_INFO*) ft_handler)->ft_prebuilt;
+
+ ut_a(ft_prebuilt == prebuilt);
+
+ result = ((NEW_FT_INFO*) ft_handler)->ft_result;
+
+ if (result->current == NULL) {
+		/* result->current is NULL on the first fetch, or when
+		the FTS query did not match any documents. */
+ if (result->rankings_by_id != NULL) {
+ /* Now that we have the complete result, we
+ need to sort the document ids on their rank
+ calculation. */
+
+ fts_query_sort_result_on_rank(result);
+
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_first(result->rankings_by_rank));
+ } else {
+ ut_a(result->current == NULL);
+ }
+ } else {
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_next(result->rankings_by_rank, result->current));
+ }
+
+next_record:
+
+ if (result->current != NULL) {
+ dict_index_t* index;
+ dtuple_t* tuple = prebuilt->search_tuple;
+ doc_id_t search_doc_id;
+
+ /* If we only need information from result we can return
+ without fetching the table row */
+ if (ft_prebuilt->read_just_key) {
+ table->status= 0;
+ return(0);
+ }
+
+ index = dict_table_get_index_on_name(
+ prebuilt->table, FTS_DOC_ID_INDEX_NAME);
+
+ /* Must find the index */
+ ut_a(index);
+
+ /* Switch to the FTS doc id index */
+ prebuilt->index = index;
+
+ fts_ranking_t* ranking = rbt_value(
+ fts_ranking_t, result->current);
+
+ search_doc_id = ranking->doc_id;
+
+ /* We pass a pointer of search_doc_id because it will be
+ converted to storage byte order used in the search
+ tuple. */
+ innobase_fts_create_doc_id_key(tuple, index, &search_doc_id);
+
+ innobase_srv_conc_enter_innodb(prebuilt->trx);
+
+ dberr_t ret = row_search_for_mysql(
+ (byte*) buf, PAGE_CUR_GE, prebuilt, ROW_SEL_EXACT, 0);
+
+ innobase_srv_conc_exit_innodb(prebuilt->trx);
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ break;
+ case DB_RECORD_NOT_FOUND:
+ result->current = const_cast<ib_rbt_node_t*>(
+ rbt_next(result->rankings_by_rank,
+ result->current));
+
+ if (!result->current) {
+				/* The result set is exhausted; return
+				HA_ERR_END_OF_FILE just like
+				ha_innobase::general_fetch() and
+				ha_innobase::index_first() etc. do. */
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ } else {
+ goto next_record;
+ }
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_TABLESPACE_DELETED:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ case DB_TABLESPACE_NOT_FOUND:
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ ret, 0, user_thd);
+
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ return(error);
+ }
+
+ return(HA_ERR_END_OF_FILE);
+}
+
+/*****************************************************************//**
+Ends a full-text search. */
+
+void
+ha_innobase::ft_end()
+{
+ fprintf(stderr, "ft_end()\n");
+
+ rnd_end();
+}
+
+/*********************************************************************//**
+Stores a reference to the current row to 'ref' field of the handle. Note
+that in the case where we have generated the clustered index for the
+table, the function parameter is illogical: we MUST ASSUME that 'record'
+is the current 'position' of the handle, because if row ref is actually
+the row id internally generated in InnoDB, then 'record' does not contain
+it. We just guess that the row id must be for the record where the handle
+was positioned the last time. */
+UNIV_INTERN
+void
+ha_innobase::position(
+/*==================*/
+ const uchar* record) /*!< in: row in MySQL format */
+{
+ uint len;
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (prebuilt->clust_index_was_generated) {
+ /* No primary key was defined for the table and we
+ generated the clustered index from row id: the
+ row reference will be the row id, not any key value
+ that MySQL knows of */
+
+ len = DATA_ROW_ID_LEN;
+
+ memcpy(ref, prebuilt->row_id, len);
+ } else {
+ len = store_key_val_for_row(primary_key, (char*) ref,
+ ref_length, record);
+ }
+
+ /* We assume that the 'ref' value len is always fixed for the same
+ table. */
+
+ if (len != ref_length) {
+ sql_print_error("Stored ref len is %lu, but table ref len is "
+ "%lu", (ulong) len, (ulong) ref_length);
+ }
+}
+
+/*****************************************************************//**
+Check whether there exists a column named "FTS_DOC_ID", which is
+reserved for the InnoDB FTS Doc ID.
+@return true if there exists an "FTS_DOC_ID" column */
+static
+bool
+create_table_check_doc_id_col(
+/*==========================*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+	ulint*		doc_id_col)	/*!< out: Doc ID column number if
+					there exists an FTS_DOC_ID column,
+					ULINT_UNDEFINED if the column is of
+					the wrong type/name/size */
+{
+ for (ulint i = 0; i < form->s->fields; i++) {
+ const Field* field;
+ ulint col_type;
+ ulint col_len;
+ ulint unsigned_type;
+
+ field = form->field[i];
+
+ col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+ field);
+
+ col_len = field->pack_length();
+
+ if (innobase_strcasecmp(field->field_name,
+ FTS_DOC_ID_COL_NAME) == 0) {
+
+ /* Note the name is case sensitive due to
+ our internal query parser */
+ if (col_type == DATA_INT
+ && !field->real_maybe_null()
+ && col_len == sizeof(doc_id_t)
+ && (strcmp(field->field_name,
+ FTS_DOC_ID_COL_NAME) == 0)) {
+ *doc_id_col = i;
+ } else {
+ push_warning_printf(
+ trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: FTS_DOC_ID column must be "
+					"of BIGINT NOT NULL type, and named "
+					"in all upper-case characters");
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+ *doc_id_col = ULINT_UNDEFINED;
+ }
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
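+
+/* For reference, illustrative SQL (not from this file): per the
+documented requirement, a user-supplied Doc ID column that passes the
+check above must be declared as follows, with the name in upper case:
+
+	CREATE TABLE t (
+		FTS_DOC_ID BIGINT UNSIGNED NOT NULL,
+		body TEXT,
+		FULLTEXT INDEX (body)
+	) ENGINE=InnoDB;
+*/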
+
+/*****************************************************************//**
+Creates a table definition in an InnoDB database. */
+static __attribute__((nonnull, warn_unused_result))
+int
+create_table_def(
+/*=============*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ const char* table_name, /*!< in: table name */
+ const char* temp_path, /*!< in: if this is a table explicitly
+ created by the user with the
+ TEMPORARY keyword, then this
+ parameter is the dir path where the
+ table should be placed if we create
+ an .ibd file for it (no .ibd extension
+ in the path, though). Otherwise this
+					is a zero-length string */
+	const char*	remote_path,	/*!< in: Remote path or zero-length string */
+ ulint flags, /*!< in: table flags */
+ ulint flags2) /*!< in: table flags2 */
+{
+ THD* thd = trx->mysql_thd;
+ dict_table_t* table;
+ ulint n_cols;
+ dberr_t err;
+ ulint col_type;
+ ulint col_len;
+ ulint nulls_allowed;
+ ulint unsigned_type;
+ ulint binary_type;
+ ulint long_true_varchar;
+ ulint charset_no;
+ ulint i;
+ ulint doc_id_col = 0;
+ ibool has_doc_id_col = FALSE;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("create_table_def");
+ DBUG_PRINT("enter", ("table_name: %s", table_name));
+
+ DBUG_ASSERT(thd != NULL);
+
+	/* MySQL does the name length check. But we do an additional
+	check on the name length here */
+ if (strlen(table_name) > MAX_FULL_NAME_LEN) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_TABLE_NAME,
+ "InnoDB: Table Name or Database Name is too long");
+
+ DBUG_RETURN(ER_TABLE_NAME);
+ }
+
+ n_cols = form->s->fields;
+
+	/* Check whether there already exists an FTS_DOC_ID column */
+ if (create_table_check_doc_id_col(trx, form, &doc_id_col)){
+
+ /* Raise error if the Doc ID column is of wrong type or name */
+ if (doc_id_col == ULINT_UNDEFINED) {
+ trx_commit_for_mysql(trx);
+
+ err = DB_ERROR;
+ goto error_ret;
+ } else {
+ has_doc_id_col = TRUE;
+ }
+ }
+
+	/* We pass 0 as the space id, and determine at a lower level the
+	space id in which to store the table */
+
+ if (flags2 & DICT_TF2_FTS) {
+ /* Adjust for the FTS hidden field */
+ if (!has_doc_id_col) {
+ table = dict_mem_table_create(table_name, 0, n_cols + 1,
+ flags, flags2);
+
+ /* Set the hidden doc_id column. */
+ table->fts->doc_col = n_cols;
+ } else {
+ table = dict_mem_table_create(table_name, 0, n_cols,
+ flags, flags2);
+ table->fts->doc_col = doc_id_col;
+ }
+ } else {
+ table = dict_mem_table_create(table_name, 0, n_cols,
+ flags, flags2);
+ }
+
+ if (flags2 & DICT_TF2_TEMPORARY) {
+ ut_a(strlen(temp_path));
+ table->dir_path_of_temp_table =
+ mem_heap_strdup(table->heap, temp_path);
+ }
+
+ if (DICT_TF_HAS_DATA_DIR(flags)) {
+ ut_a(strlen(remote_path));
+ table->data_dir_path = mem_heap_strdup(table->heap, remote_path);
+ } else {
+ table->data_dir_path = NULL;
+ }
+ heap = mem_heap_create(1000);
+
+ for (i = 0; i < n_cols; i++) {
+ Field* field = form->field[i];
+
+ col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+ field);
+
+ if (!col_type) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "Error creating table '%s' with "
+ "column '%s'. Please check its "
+ "column type and try to re-create "
+ "the table with an appropriate "
+ "column type.",
+ table->name, field->field_name);
+ goto err_col;
+ }
+
+ nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL;
+ binary_type = field->binary() ? DATA_BINARY_TYPE : 0;
+
+ charset_no = 0;
+
+ if (dtype_is_string_type(col_type)) {
+
+ charset_no = (ulint) field->charset()->number;
+
+ if (UNIV_UNLIKELY(charset_no > MAX_CHAR_COLL_NUM)) {
+ /* in data0type.h we assume that the
+ number fits in one byte in prtype */
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "In InnoDB, charset-collation codes"
+ " must be below 256."
+ " Unsupported code %lu.",
+ (ulong) charset_no);
+ mem_heap_free(heap);
+ DBUG_RETURN(ER_CANT_CREATE_TABLE);
+ }
+ }
+
+ /* we assume in dtype_form_prtype() that this fits in
+ two bytes */
+ ut_a(static_cast<uint>(field->type()) <= MAX_CHAR_COLL_NUM);
+ col_len = field->pack_length();
+
+ /* The MySQL pack length contains 1 or 2 bytes length field
+ for a true VARCHAR. Let us subtract that, so that the InnoDB
+ column length in the InnoDB data dictionary is the real
+ maximum byte length of the actual data. */
+
+ long_true_varchar = 0;
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ col_len -= ((Field_varstring*) field)->length_bytes;
+
+ if (((Field_varstring*) field)->length_bytes == 2) {
+ long_true_varchar = DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ /* First check whether the column to be added has a
+ system reserved name. */
+ if (dict_col_name_is_reserved(field->field_name)){
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+err_col:
+ dict_mem_table_free(table);
+ mem_heap_free(heap);
+ trx_commit_for_mysql(trx);
+
+ err = DB_ERROR;
+ goto error_ret;
+ }
+
+ dict_mem_table_add_col(table, heap,
+ field->field_name,
+ col_type,
+ dtype_form_prtype(
+ (ulint) field->type()
+ | nulls_allowed | unsigned_type
+ | binary_type | long_true_varchar,
+ charset_no),
+ col_len);
+ }
+
+ /* Add the FTS doc_id hidden column. */
+ if (flags2 & DICT_TF2_FTS && !has_doc_id_col) {
+ fts_add_doc_id_column(table, heap);
+ }
+
+ err = row_create_table_for_mysql(table, trx, false);
+
+ mem_heap_free(heap);
+
+ DBUG_EXECUTE_IF("ib_create_err_tablespace_exist",
+ err = DB_TABLESPACE_EXISTS;);
+
+ if (err == DB_DUPLICATE_KEY || err == DB_TABLESPACE_EXISTS) {
+ char display_name[FN_REFLEN];
+ char* buf_end = innobase_convert_identifier(
+ display_name, sizeof(display_name) - 1,
+ table_name, strlen(table_name),
+ thd, TRUE);
+
+ *buf_end = '\0';
+
+ my_error(err == DB_DUPLICATE_KEY
+ ? ER_TABLE_EXISTS_ERROR
+ : ER_TABLESPACE_EXISTS, MYF(0), display_name);
+ }
+
+ if (err == DB_SUCCESS && (flags2 & DICT_TF2_FTS)) {
+ fts_optimize_add_table(table);
+ }
+
+error_ret:
+ DBUG_RETURN(convert_error_code_to_mysql(err, flags, thd));
+}
+
+/*****************************************************************//**
+Creates an index in an InnoDB database. */
+static
+int
+create_index(
+/*=========*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ const TABLE* form, /*!< in: information on table
+ columns and indexes */
+ ulint flags, /*!< in: InnoDB table flags */
+ const char* table_name, /*!< in: table name */
+ uint key_num) /*!< in: index number */
+{
+ dict_index_t* index;
+ int error;
+ const KEY* key;
+ ulint ind_type;
+ ulint* field_lengths;
+
+ DBUG_ENTER("create_index");
+
+ key = form->key_info + key_num;
+
+ /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
+ ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0);
+
+ if (key->flags & HA_FULLTEXT) {
+ index = dict_mem_index_create(table_name, key->name, 0,
+ DICT_FTS,
+ key->user_defined_key_parts);
+
+ for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+ KEY_PART_INFO* key_part = key->key_part + i;
+ dict_mem_index_add_field(
+ index, key_part->field->field_name, 0);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(
+ row_create_index_for_mysql(
+ index, trx, NULL),
+ flags, NULL));
+
+ }
+
+ ind_type = 0;
+
+ if (key_num == form->s->primary_key) {
+ ind_type |= DICT_CLUSTERED;
+ }
+
+ if (key->flags & HA_NOSAME) {
+ ind_type |= DICT_UNIQUE;
+ }
+
+	field_lengths = (ulint*) my_malloc(
+		key->user_defined_key_parts * sizeof *field_lengths,
+		MYF(MY_FAE));
+
+ /* We pass 0 as the space id, and determine at a lower level the space
+ id where to store the table */
+
+ index = dict_mem_index_create(table_name, key->name, 0,
+ ind_type, key->user_defined_key_parts);
+
+ for (ulint i = 0; i < key->user_defined_key_parts; i++) {
+ KEY_PART_INFO* key_part = key->key_part + i;
+ ulint prefix_len;
+ ulint col_type;
+ ulint is_unsigned;
+
+
+		/* (The flag HA_PART_KEY_SEG denotes in MySQL a
+		column prefix field in an index: we only store a
+		specified number of the first bytes of the column
+		in the index field.) The flag does not seem to be
+		set properly by MySQL. Let us fall back on testing
+		the length of the key part versus the column. */
+
+ Field* field = NULL;
+
+ for (ulint j = 0; j < form->s->fields; j++) {
+
+ field = form->field[j];
+
+ if (0 == innobase_strcasecmp(
+ field->field_name,
+ key_part->field->field_name)) {
+ /* Found the corresponding column */
+
+ goto found;
+ }
+ }
+
+ ut_error;
+found:
+ col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, key_part->field);
+
+ if (DATA_BLOB == col_type
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*) field)->length_bytes)) {
+
+ switch (col_type) {
+ default:
+ prefix_len = key_part->length;
+ break;
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+				sql_print_error(
+					"MySQL is trying to create a column "
+					"prefix index field on an "
+					"inappropriate data type. Table "
+					"name %s, column name %s.",
+ table_name,
+ key_part->field->field_name);
+
+ prefix_len = 0;
+ }
+ } else {
+ prefix_len = 0;
+ }
+
+ field_lengths[i] = key_part->length;
+
+ dict_mem_index_add_field(
+ index, key_part->field->field_name, prefix_len);
+ }
+
+ ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS));
+
+ /* Even though we've defined max_supported_key_part_length, we
+ still do our own checking using field_lengths to be absolutely
+ sure we don't create too long indexes. */
+
+ error = convert_error_code_to_mysql(
+ row_create_index_for_mysql(index, trx, field_lengths),
+ flags, NULL);
+
+ my_free(field_lengths);
+
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Creates an index for an InnoDB table when the user has defined no
+primary index. */
+static
+int
+create_clustered_index_when_no_primary(
+/*===================================*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ ulint flags, /*!< in: InnoDB table flags */
+ const char* table_name) /*!< in: table name */
+{
+ dict_index_t* index;
+ dberr_t error;
+
+	/* We pass 0 as the space id, and determine at a lower level the
+	space id in which to store the table */
+ index = dict_mem_index_create(table_name,
+ innobase_index_reserve_name,
+ 0, DICT_CLUSTERED, 0);
+
+ error = row_create_index_for_mysql(index, trx, NULL);
+
+ return(convert_error_code_to_mysql(error, flags, NULL));
+}
+
+/*****************************************************************//**
+Return a display name for the row format
+@return row format name */
+UNIV_INTERN
+const char*
+get_row_format_name(
+/*================*/
+ enum row_type row_format) /*!< in: Row Format */
+{
+ switch (row_format) {
+ case ROW_TYPE_COMPACT:
+ return("COMPACT");
+ case ROW_TYPE_COMPRESSED:
+ return("COMPRESSED");
+ case ROW_TYPE_DYNAMIC:
+ return("DYNAMIC");
+ case ROW_TYPE_REDUNDANT:
+ return("REDUNDANT");
+ case ROW_TYPE_DEFAULT:
+ return("DEFAULT");
+ case ROW_TYPE_FIXED:
+ return("FIXED");
+ case ROW_TYPE_PAGE:
+ case ROW_TYPE_NOT_USED:
+ break;
+ }
+ return("NOT USED");
+}
+
+/** If file-per-table is missing, issue a warning and point ret at the offending option name */
+#define CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace)\
+ if (!use_tablespace) { \
+ push_warning_printf( \
+ thd, Sql_condition::WARN_LEVEL_WARN, \
+ ER_ILLEGAL_HA_CREATE_OPTION, \
+ "InnoDB: ROW_FORMAT=%s requires" \
+ " innodb_file_per_table.", \
+ get_row_format_name(row_format)); \
+ ret = "ROW_FORMAT"; \
+ }
+
+/** If the file format is Antelope, issue a warning and point ret at the offending option name */
+#define CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE \
+ if (srv_file_format < UNIV_FORMAT_B) { \
+ push_warning_printf( \
+ thd, Sql_condition::WARN_LEVEL_WARN, \
+ ER_ILLEGAL_HA_CREATE_OPTION, \
+ "InnoDB: ROW_FORMAT=%s requires" \
+ " innodb_file_format > Antelope.", \
+ get_row_format_name(row_format)); \
+ ret = "ROW_FORMAT"; \
+ }
+
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in the future. For now, it checks two specifiers:
+KEY_BLOCK_SIZE and ROW_FORMAT.
+If innodb_strict_mode is not set, then this function is a no-op.
+@return NULL if valid, the name of the offending option if not. */
+UNIV_INTERN
+const char*
+create_options_are_invalid(
+/*=======================*/
+ THD* thd, /*!< in: connection thread. */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info, /*!< in: create info. */
+ bool use_tablespace) /*!< in: srv_file_per_table */
+{
+ ibool kbs_specified = FALSE;
+ const char* ret = NULL;
+ enum row_type row_format = form->s->row_type;
+
+ ut_ad(thd != NULL);
+
+ /* If innodb_strict_mode is not set don't do any validation. */
+ if (!(THDVAR(thd, strict_mode))) {
+ return(NULL);
+ }
+
+ ut_ad(form != NULL);
+ ut_ad(create_info != NULL);
+
+ /* First check if a non-zero KEY_BLOCK_SIZE was specified. */
+ if (create_info->key_block_size) {
+ kbs_specified = TRUE;
+ switch (create_info->key_block_size) {
+ ulint kbs_max;
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ case 16:
+ /* Valid KEY_BLOCK_SIZE, check its dependencies. */
+ if (!use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_per_table.");
+ ret = "KEY_BLOCK_SIZE";
+ }
+ if (srv_file_format < UNIV_FORMAT_B) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_format > Antelope.");
+ ret = "KEY_BLOCK_SIZE";
+ }
+
+ /* The maximum KEY_BLOCK_SIZE (KBS) is 16. But if
+ UNIV_PAGE_SIZE is smaller than 16k, the maximum
+ KBS is also smaller. */
+ kbs_max = ut_min(
+ 1 << (UNIV_PAGE_SSIZE_MAX - 1),
+ 1 << (PAGE_ZIP_SSIZE_MAX - 1));
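+			/* Illustrative numbers: with the default 16k
+			UNIV_PAGE_SIZE both terms evaluate to 16, so
+			kbs_max = 16; with an 8k page the page-size
+			term caps kbs_max at 8. */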
+ if (create_info->key_block_size > kbs_max) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE=%ld"
+ " cannot be larger than %ld.",
+ create_info->key_block_size,
+ kbs_max);
+ ret = "KEY_BLOCK_SIZE";
+ }
+ break;
+ default:
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid KEY_BLOCK_SIZE = %lu."
+ " Valid values are [1, 2, 4, 8, 16]",
+ create_info->key_block_size);
+ ret = "KEY_BLOCK_SIZE";
+ break;
+ }
+ }
+
+ /* Check for a valid Innodb ROW_FORMAT specifier and
+ other incompatibilities. */
+ switch (row_format) {
+ case ROW_TYPE_COMPRESSED:
+ CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
+ CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
+ break;
+ case ROW_TYPE_DYNAMIC:
+ CHECK_ERROR_ROW_TYPE_NEEDS_FILE_PER_TABLE(use_tablespace);
+ CHECK_ERROR_ROW_TYPE_NEEDS_GT_ANTELOPE;
+ /* fall through since dynamic also shuns KBS */
+ case ROW_TYPE_COMPACT:
+ case ROW_TYPE_REDUNDANT:
+ if (kbs_specified) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: cannot specify ROW_FORMAT = %s"
+ " with KEY_BLOCK_SIZE.",
+ get_row_format_name(row_format));
+ ret = "KEY_BLOCK_SIZE";
+ }
+ break;
+ case ROW_TYPE_DEFAULT:
+ break;
+ case ROW_TYPE_FIXED:
+ case ROW_TYPE_PAGE:
+ case ROW_TYPE_NOT_USED:
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+			ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid ROW_FORMAT specifier.");
+ ret = "ROW_TYPE";
+ break;
+ }
+
+ /* Use DATA DIRECTORY only with file-per-table. */
+ if (create_info->data_file_name && !use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY requires"
+ " innodb_file_per_table.");
+ ret = "DATA DIRECTORY";
+ }
+
+ /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+ if (create_info->data_file_name
+ && create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY cannot be used"
+ " for TEMPORARY tables.");
+ ret = "DATA DIRECTORY";
+ }
+
+ /* Do not allow INDEX_DIRECTORY */
+ if (create_info->index_file_name) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: INDEX DIRECTORY is not supported");
+ ret = "INDEX DIRECTORY";
+ }
+
+ return(ret);
+}
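+
+/* Example (illustrative): with innodb_strict_mode=ON and
+innodb_file_per_table=OFF, the statement below is rejected (the caller
+maps the non-NULL return value to HA_WRONG_CREATE_OPTION), whereas with
+strict mode off it would only produce warnings:
+
+	CREATE TABLE t (a INT) ENGINE=InnoDB
+		ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=8;
+*/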
+
+/*****************************************************************//**
+Update create_info. Used in SHOW CREATE TABLE et al. */
+UNIV_INTERN
+void
+ha_innobase::update_create_info(
+/*============================*/
+ HA_CREATE_INFO* create_info) /*!< in/out: create info */
+{
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
+ ha_innobase::info(HA_STATUS_AUTO);
+ create_info->auto_increment_value = stats.auto_increment_value;
+ }
+
+ /* Update the DATA DIRECTORY name from SYS_DATAFILES. */
+ dict_get_and_save_data_dir_path(prebuilt->table, false);
+
+ if (prebuilt->table->data_dir_path) {
+ create_info->data_file_name = prebuilt->table->data_dir_path;
+ }
+}
+
+/*****************************************************************//**
+Initialize the table FTS stopword list
+@return TRUE on success */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: Table has the FTS */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+{
+ return(fts_load_stopword(table, trx,
+ innobase_server_stopword_table,
+ THDVAR(thd, ft_user_stopword_table),
+ THDVAR(thd, ft_enable_stopword), FALSE));
+}
+
+/*****************************************************************//**
+Parses the table name into a normalized name and either a temp path or
+a remote path, if needed.
+@return 0 if successful, otherwise, error number */
+UNIV_INTERN
+int
+ha_innobase::parse_table_name(
+/*==========================*/
+ const char* name, /*!< in/out: table name provided*/
+ HA_CREATE_INFO* create_info, /*!< in: more information of the
+ created table, contains also the
+ create statement string */
+ ulint flags, /*!< in: flags*/
+ ulint flags2, /*!< in: flags2*/
+ char* norm_name, /*!< out: normalized table name */
+ char* temp_path, /*!< out: absolute path of table */
+ char* remote_path) /*!< out: remote path of table */
+{
+ THD* thd = ha_thd();
+ bool use_tablespace = flags2 & DICT_TF2_USE_TABLESPACE;
+ DBUG_ENTER("ha_innobase::parse_table_name");
+
+#ifdef __WIN__
+	/* Names passed in from the server are in two formats:
+	1. <database_name>/<table_name>: for normal table creation
+	2. full path: for temp table creation, or DATA DIRECTORY.
+
+	When srv_file_per_table is on and mysqld_embedded is off,
+	check for a full path pattern, i.e.
+	X:\dir\..., where X is a drive letter, or
+	\\dir1\dir2\..., a UNC path.
+	Return an error if the name is in full path format but we are not
+	creating a temporary table. Currently InnoDB does not support
+	symbolic links on Windows. */
+
+ if (use_tablespace
+ && !mysqld_embedded
+ && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
+
+ if ((name[1] == ':')
+ || (name[0] == '\\' && name[1] == '\\')) {
+ sql_print_error("Cannot create table %s\n", name);
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+ }
+#endif
+
+ normalize_table_name(norm_name, name);
+ temp_path[0] = '\0';
+ remote_path[0] = '\0';
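+
+	/* Illustrative example (assuming the usual server calling
+	convention): a name passed in as "./test/t1" is normalized to
+	"test/t1", i.e. {database}/{tablename}. */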
+
+	/* A full path is used for TEMPORARY TABLE and DATA DIRECTORY.
+	In the case of
+	CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ...;
+	we ignore the DATA DIRECTORY. */
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ strncpy(temp_path, name, FN_REFLEN - 1);
+ }
+
+ if (create_info->data_file_name) {
+ bool ignore = false;
+
+ /* Use DATA DIRECTORY only with file-per-table. */
+ if (!use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY requires"
+ " innodb_file_per_table.");
+ ignore = true;
+ }
+
+ /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: DATA DIRECTORY cannot be"
+ " used for TEMPORARY tables.");
+ ignore = true;
+ }
+
+ if (ignore) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ WARN_OPTION_IGNORED,
+ ER_DEFAULT(WARN_OPTION_IGNORED),
+ "DATA DIRECTORY");
+ } else {
+ strncpy(remote_path, create_info->data_file_name,
+ FN_REFLEN - 1);
+ }
+ }
+
+ if (create_info->index_file_name) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ WARN_OPTION_IGNORED,
+ ER_DEFAULT(WARN_OPTION_IGNORED),
+ "INDEX DIRECTORY");
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Determines InnoDB table flags.
+@retval true if successful, false if error */
+UNIV_INTERN
+bool
+innobase_table_flags(
+/*=================*/
+ const TABLE* form, /*!< in: table */
+ const HA_CREATE_INFO* create_info, /*!< in: information
+ on table columns and indexes */
+ THD* thd, /*!< in: connection */
+ bool use_tablespace, /*!< in: whether to create
+ outside system tablespace */
+ ulint* flags, /*!< out: DICT_TF flags */
+ ulint* flags2) /*!< out: DICT_TF2 flags */
+{
+ DBUG_ENTER("innobase_table_flags");
+
+ const char* fts_doc_id_index_bad = NULL;
+ bool zip_allowed = true;
+ ulint zip_ssize = 0;
+ enum row_type row_format;
+ rec_format_t innodb_row_format = REC_FORMAT_COMPACT;
+ bool use_data_dir;
+
+ /* Cache the value of innodb_file_format, in case it is
+ modified by another thread while the table is being created. */
+ const ulint file_format_allowed = srv_file_format;
+
+ *flags = 0;
+ *flags2 = 0;
+
+ /* Check if there are any FTS indexes defined on this table. */
+ for (uint i = 0; i < form->s->keys; i++) {
+ const KEY* key = &form->key_info[i];
+
+ if (key->flags & HA_FULLTEXT) {
+ *flags2 |= DICT_TF2_FTS;
+
+ /* We don't support FTS indexes in temporary
+ tables. */
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+
+ my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0));
+ DBUG_RETURN(false);
+ }
+
+ if (key->flags & HA_USES_PARSER) {
+ my_error(ER_INNODB_NO_FT_USES_PARSER, MYF(0));
+ DBUG_RETURN(false);
+ }
+
+ if (fts_doc_id_index_bad) {
+ goto index_bad;
+ }
+ }
+
+ if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ /* Do a pre-check on FTS DOC ID index */
+ if (!(key->flags & HA_NOSAME)
+ || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+ || strcmp(key->key_part[0].field->field_name,
+ FTS_DOC_ID_COL_NAME)) {
+ fts_doc_id_index_bad = key->name;
+ }
+
+ if (fts_doc_id_index_bad && (*flags2 & DICT_TF2_FTS)) {
+index_bad:
+ my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+ fts_doc_id_index_bad);
+ DBUG_RETURN(false);
+ }
+ }
+
+ if (create_info->key_block_size) {
+ /* The requested compressed page size (key_block_size)
+ is given in kilobytes. If it is a valid number, store
+ that value as the number of log2 shifts from 512 in
+ zip_ssize. Zero means it is not compressed. */
+ ulint zssize; /* Zip Shift Size */
+ ulint kbsize; /* Key Block Size */
+ for (zssize = kbsize = 1;
+ zssize <= ut_min(UNIV_PAGE_SSIZE_MAX,
+ PAGE_ZIP_SSIZE_MAX);
+ zssize++, kbsize <<= 1) {
+ if (kbsize == create_info->key_block_size) {
+ zip_ssize = zssize;
+ break;
+ }
+ }
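+		/* Illustrative mapping produced by the loop above:
+		KEY_BLOCK_SIZE=1  -> zip_ssize=1 (1024-byte pages),
+		KEY_BLOCK_SIZE=8  -> zip_ssize=4 (8192-byte pages),
+		KEY_BLOCK_SIZE=16 -> zip_ssize=5 (16384-byte pages). */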
+
+ /* Make sure compressed row format is allowed. */
+ if (!use_tablespace) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_per_table.");
+ zip_allowed = FALSE;
+ }
+
+ if (file_format_allowed < UNIV_FORMAT_B) {
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE requires"
+ " innodb_file_format > Antelope.");
+ zip_allowed = FALSE;
+ }
+
+ if (!zip_allowed
+ || zssize > ut_min(UNIV_PAGE_SSIZE_MAX,
+ PAGE_ZIP_SSIZE_MAX)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring KEY_BLOCK_SIZE=%lu.",
+ create_info->key_block_size);
+ }
+ }
+
+ row_format = form->s->row_type;
+
+ if (zip_ssize && zip_allowed) {
+		/* If ROW_FORMAT is set to its default,
+		automatically change it to COMPRESSED. */
+ if (row_format == ROW_TYPE_DEFAULT) {
+ row_format = ROW_TYPE_COMPRESSED;
+ } else if (row_format != ROW_TYPE_COMPRESSED) {
+ /* ROW_FORMAT other than COMPRESSED
+ ignores KEY_BLOCK_SIZE. It does not
+ make sense to reject conflicting
+ KEY_BLOCK_SIZE and ROW_FORMAT, because
+ such combinations can be obtained
+ with ALTER TABLE anyway. */
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
+ " unless ROW_FORMAT=COMPRESSED.",
+ create_info->key_block_size);
+ zip_allowed = FALSE;
+ }
+ } else {
+ /* zip_ssize == 0 means no KEY_BLOCK_SIZE.*/
+ if (row_format == ROW_TYPE_COMPRESSED && zip_allowed) {
+			/* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE
+			implies half the maximum KEY_BLOCK_SIZE (in KiB)
+			or UNIV_PAGE_SIZE, whichever is less. */
+ zip_ssize = ut_min(UNIV_PAGE_SSIZE_MAX,
+ PAGE_ZIP_SSIZE_MAX) - 1;
+ }
+ }
+
+ /* Validate the row format. Correct it if necessary */
+ switch (row_format) {
+ case ROW_TYPE_REDUNDANT:
+ innodb_row_format = REC_FORMAT_REDUNDANT;
+ break;
+
+ case ROW_TYPE_COMPRESSED:
+ case ROW_TYPE_DYNAMIC:
+ if (!use_tablespace) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s requires"
+ " innodb_file_per_table.",
+ get_row_format_name(row_format));
+ } else if (file_format_allowed == UNIV_FORMAT_A) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s requires"
+ " innodb_file_format > Antelope.",
+ get_row_format_name(row_format));
+ } else {
+ innodb_row_format = (row_format == ROW_TYPE_DYNAMIC
+ ? REC_FORMAT_DYNAMIC
+ : REC_FORMAT_COMPRESSED);
+ break;
+ }
+ zip_allowed = FALSE;
+ /* fall through to set row_format = COMPACT */
+ case ROW_TYPE_NOT_USED:
+ case ROW_TYPE_FIXED:
+ case ROW_TYPE_PAGE:
+ push_warning(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: assuming ROW_FORMAT=COMPACT.");
+ case ROW_TYPE_DEFAULT:
+ /* If we fell through, set row format to Compact. */
+ row_format = ROW_TYPE_COMPACT;
+ case ROW_TYPE_COMPACT:
+ break;
+ }
+
+ /* Set the table flags */
+ if (!zip_allowed) {
+ zip_ssize = 0;
+ }
+
+ use_data_dir = use_tablespace
+ && ((create_info->data_file_name != NULL)
+ && !(create_info->options & HA_LEX_CREATE_TMP_TABLE));
+
+ dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir);
+
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ *flags2 |= DICT_TF2_TEMPORARY;
+ }
+
+ if (use_tablespace) {
+ *flags2 |= DICT_TF2_USE_TABLESPACE;
+ }
+
+ /* Set the flags2 when create table or alter tables */
+ *flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ *flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+
+ DBUG_RETURN(true);
+}
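+
+/* Worked example (illustrative): CREATE TABLE ... ROW_FORMAT=COMPRESSED
+KEY_BLOCK_SIZE=4, with innodb_file_per_table=ON and
+innodb_file_format=Barracuda, yields zip_ssize=3 (4096-byte compressed
+pages), innodb_row_format=REC_FORMAT_COMPRESSED, and
+DICT_TF2_USE_TABLESPACE set in *flags2. */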
+
+/*****************************************************************//**
+Creates a new table in an InnoDB database.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::create(
+/*================*/
+ const char* name, /*!< in: table name */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info) /*!< in: more information of the
+ created table, contains also the
+ create statement string */
+{
+ int error;
+ trx_t* parent_trx;
+ trx_t* trx;
+ int primary_key_no;
+ uint i;
+ char norm_name[FN_REFLEN]; /* {database}/{tablename} */
+ char temp_path[FN_REFLEN]; /* absolute path of temp frm */
+ char remote_path[FN_REFLEN]; /* absolute path of table */
+ THD* thd = ha_thd();
+ ib_int64_t auto_inc_value;
+
+ /* Cache the global variable "srv_file_per_table" to a local
+ variable before using it. Note that "srv_file_per_table"
+ is not under dict_sys mutex protection, and could be changed
+ while creating the table. So we read the current value here
+ and make all further decisions based on this. */
+ bool use_tablespace = srv_file_per_table;
+
+	/* Zip Shift Size: log2 of the compressed page size minus 9;
+	zero for uncompressed */
+ ulint flags;
+ ulint flags2;
+ dict_table_t* innobase_table = NULL;
+
+ const char* stmt;
+ size_t stmt_len;
+
+ DBUG_ENTER("ha_innobase::create");
+
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(create_info != NULL);
+
+ if (form->s->fields > REC_MAX_N_USER_FIELDS) {
+ DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS);
+ } else if (srv_read_only_mode) {
+ DBUG_RETURN(HA_ERR_INNODB_READ_ONLY);
+ }
+
+ /* Create the table definition in InnoDB */
+
+ /* Validate create options if innodb_strict_mode is set. */
+ if (create_options_are_invalid(
+ thd, form, create_info, use_tablespace)) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
+ if (!innobase_table_flags(form, create_info,
+ thd, use_tablespace,
+ &flags, &flags2)) {
+ DBUG_RETURN(-1);
+ }
+
+ error = parse_table_name(name, create_info, flags, flags2,
+ norm_name, temp_path, remote_path);
+ if (error) {
+ DBUG_RETURN(error);
+ }
+
+ /* Look for a primary key */
+ primary_key_no = (form->s->primary_key != MAX_KEY ?
+ (int) form->s->primary_key :
+ -1);
+
+ /* Our function innobase_get_mysql_key_number_for_index assumes
+ the primary key is always number 0, if it exists */
+ ut_a(primary_key_no == -1 || primary_key_no == 0);
+
+ /* Check for name conflicts (with reserved name) for
+ any user indices to be created. */
+ if (innobase_index_name_is_reserved(thd, form->key_info,
+ form->s->keys)) {
+ DBUG_RETURN(-1);
+ }
+
+ if (row_is_magic_monitor_table(norm_name)) {
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_WRONG_COMMAND,
+ "Using the table name %s to enable "
+ "diagnostic output is deprecated "
+ "and may be removed in future releases. "
+ "Use INFORMATION_SCHEMA or "
+ "PERFORMANCE_SCHEMA tables or "
+ "SET GLOBAL innodb_status_output=ON.",
+ dict_remove_db_name(norm_name));
+
+		/* Limit InnoDB monitor access to users with the PROCESS
+		privilege. See http://bugs.mysql.com/32710 for why we
+		chose PROCESS. */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during a table create operation.
+ Drop table etc. do this latching in row0mysql.cc. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ error = create_table_def(trx, form, norm_name, temp_path,
+ remote_path, flags, flags2);
+ if (error) {
+ goto cleanup;
+ }
+
+ /* Create the keys */
+
+ if (form->s->keys == 0 || primary_key_no == -1) {
+ /* Create an index which is used as the clustered index;
+ order the rows by their row id which is internally generated
+ by InnoDB */
+
+ error = create_clustered_index_when_no_primary(
+ trx, flags, norm_name);
+ if (error) {
+ goto cleanup;
+ }
+ }
+
+ if (primary_key_no != -1) {
+ /* In InnoDB the clustered index must always be created
+ first */
+ if ((error = create_index(trx, form, flags, norm_name,
+ (uint) primary_key_no))) {
+ goto cleanup;
+ }
+ }
+
+ /* Create the ancillary tables that are common to all FTS indexes on
+ this table. */
+ if (flags2 & DICT_TF2_FTS) {
+ enum fts_doc_id_index_enum ret;
+
+ innobase_table = dict_table_open_on_name(
+ norm_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ ut_a(innobase_table);
+
+ /* Check whether there already exists FTS_DOC_ID_INDEX */
+ ret = innobase_fts_check_doc_id_index_in_def(
+ form->s->keys, form->key_info);
+
+ switch (ret) {
+ case FTS_INCORRECT_DOC_ID_INDEX:
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ " InnoDB: Index name %s is reserved"
+ " for the unique index on"
+ " FTS_DOC_ID column for FTS"
+ " Document ID indexing"
+ " on table %s. Please check"
+ " the index definition to"
+ " make sure it is of correct"
+ " type\n",
+ FTS_DOC_ID_INDEX_NAME,
+ innobase_table->name);
+
+ if (innobase_table->fts) {
+ fts_free(innobase_table);
+ }
+
+ dict_table_close(innobase_table, TRUE, FALSE);
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ FTS_DOC_ID_INDEX_NAME);
+ error = -1;
+ goto cleanup;
+ case FTS_EXIST_DOC_ID_INDEX:
+ case FTS_NOT_EXIST_DOC_ID_INDEX:
+ break;
+ }
+
+ dberr_t err = fts_create_common_tables(
+ trx, innobase_table, norm_name,
+ (ret == FTS_EXIST_DOC_ID_INDEX));
+
+ error = convert_error_code_to_mysql(err, 0, NULL);
+
+ dict_table_close(innobase_table, TRUE, FALSE);
+
+ if (error) {
+ goto cleanup;
+ }
+ }
+
+ for (i = 0; i < form->s->keys; i++) {
+
+ if (i != static_cast<uint>(primary_key_no)) {
+
+ if ((error = create_index(trx, form, flags,
+ norm_name, i))) {
+ goto cleanup;
+ }
+ }
+ }
+
+ /* Cache all the FTS indexes on this table in the FTS specific
+ structure. They are used for FTS indexed column update handling. */
+ if (flags2 & DICT_TF2_FTS) {
+ fts_t* fts = innobase_table->fts;
+
+ ut_a(fts != NULL);
+
+ dict_table_get_all_fts_indexes(innobase_table, fts->indexes);
+ }
+
+ stmt = innobase_get_stmt(thd, &stmt_len);
+
+ if (stmt) {
+ dberr_t err = row_table_add_foreign_constraints(
+ trx, stmt, stmt_len, norm_name,
+ create_info->options & HA_LEX_CREATE_TMP_TABLE);
+
+ switch (err) {
+
+ case DB_PARENT_NO_INDEX:
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_CANNOT_ADD_FOREIGN,
+ "Create table '%s' with foreign key constraint"
+ " failed. There is no index in the referenced"
+ " table where the referenced columns appear"
+ " as the first columns.\n", norm_name);
+ break;
+
+ case DB_CHILD_NO_INDEX:
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_CANNOT_ADD_FOREIGN,
+ "Create table '%s' with foreign key constraint"
+ " failed. There is no index in the referencing"
+ " table where referencing columns appear"
+ " as the first columns.\n", norm_name);
+ break;
+ default:
+ break;
+ }
+
+ error = convert_error_code_to_mysql(err, flags, NULL);
+
+ if (error) {
+ goto cleanup;
+ }
+ }
+
+ innobase_commit_low(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_table = dict_table_open_on_name(
+ norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ DBUG_ASSERT(innobase_table != 0);
+
+ innobase_copy_frm_flags_from_create_info(innobase_table, create_info);
+
+ dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE);
+
+ if (innobase_table) {
+ /* We update the highest file format in the system table
+ space, if this table has higher file format setting. */
+
+ trx_sys_file_format_max_upgrade(
+ (const char**) &innobase_file_format_max,
+ dict_table_get_format(innobase_table));
+ }
+
+ /* Load server stopword into FTS cache */
+ if (flags2 & DICT_TF2_FTS) {
+ if (!innobase_fts_load_stopword(innobase_table, NULL, thd)) {
+ dict_table_close(innobase_table, FALSE, FALSE);
+ srv_active_wake_master_thread();
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(-1);
+ }
+ }
+
+ /* Note: We can't call update_thd() as prebuilt will not be
+ setup at this stage and so we use thd. */
+
+ /* We need to copy the AUTOINC value from the old table if
+ this is an ALTER|OPTIMIZE TABLE or CREATE INDEX because CREATE INDEX
+ does a table copy too. If the query was one of:
+
+ CREATE TABLE ...AUTO_INCREMENT = x; or
+ ALTER TABLE...AUTO_INCREMENT = x; or
+ OPTIMIZE TABLE t; or
+ CREATE INDEX x on t(...);
+
+ we look up the table definition in the dictionary and get
+ the current value of the auto increment field. We set a new
+ value for the auto increment field if the value is greater
+ than the maximum value in the column. */
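+
+ /* Illustration (hypothetical statement, not from the server
+ sources): after
+
+ CREATE TABLE t1 (id INT AUTO_INCREMENT PRIMARY KEY)
+ AUTO_INCREMENT = 100;
+
+ the branch below initializes the in-memory AUTOINC counter
+ to 100, so the first implicitly generated id is 100. */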
+
+ if (((create_info->used_fields & HA_CREATE_USED_AUTO)
+ || thd_sql_command(thd) == SQLCOM_ALTER_TABLE
+ || thd_sql_command(thd) == SQLCOM_OPTIMIZE
+ || thd_sql_command(thd) == SQLCOM_CREATE_INDEX)
+ && create_info->auto_increment_value > 0) {
+
+ auto_inc_value = create_info->auto_increment_value;
+
+ dict_table_autoinc_lock(innobase_table);
+ dict_table_autoinc_initialize(innobase_table, auto_inc_value);
+ dict_table_autoinc_unlock(innobase_table);
+ }
+
+ dict_table_close(innobase_table, FALSE, FALSE);
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(0);
+
+cleanup:
+ trx_rollback_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Discards or imports an InnoDB tablespace.
+@return 0 == success, -1 == error */
+UNIV_INTERN
+int
+ha_innobase::discard_or_import_tablespace(
+/*======================================*/
+ my_bool discard) /*!< in: TRUE if discard, else import */
+{
+ dberr_t err;
+ dict_table_t* dict_table;
+
+ DBUG_ENTER("ha_innobase::discard_or_import_tablespace");
+
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ dict_table = prebuilt->table;
+
+ if (dict_table->space == TRX_SYS_SPACE) {
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_IN_SYSTEM_TABLESPACE,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE);
+ }
+
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads. */
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ /* Obtain an exclusive lock on the table. */
+ err = row_mysql_lock_table(
+ prebuilt->trx, dict_table, LOCK_X,
+ discard ? "setting table lock for DISCARD TABLESPACE"
+ : "setting table lock for IMPORT TABLESPACE");
+
+ if (err != DB_SUCCESS) {
+ /* unable to lock the table: do nothing */
+ } else if (discard) {
+
+ /* Discarding an already discarded tablespace should be an
+ idempotent operation. Also, if the .ibd file is missing the
+ user may want to set the DISCARD flag in order to IMPORT
+ a new tablespace. */
+
+ if (dict_table->ibd_file_missing) {
+ ib_senderrf(
+ prebuilt->trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+ }
+
+ err = row_discard_tablespace_for_mysql(
+ dict_table->name, prebuilt->trx);
+
+ } else if (!dict_table->ibd_file_missing) {
+ /* Commit the transaction in order to
+ release the table lock. */
+ trx_commit_for_mysql(prebuilt->trx);
+
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_EXISTS, table->s->table_name.str);
+
+ DBUG_RETURN(HA_ERR_TABLE_EXIST);
+ } else {
+ err = row_import_for_mysql(dict_table, prebuilt);
+
+ if (err == DB_SUCCESS) {
+
+ if (table->found_next_number_field) {
+ dict_table_autoinc_lock(dict_table);
+ innobase_initialize_autoinc();
+ dict_table_autoinc_unlock(dict_table);
+ }
+
+ info(HA_STATUS_TIME
+ | HA_STATUS_CONST
+ | HA_STATUS_VARIABLE
+ | HA_STATUS_AUTO);
+ }
+ }
+
+ /* Commit the transaction in order to release the table lock. */
+ trx_commit_for_mysql(prebuilt->trx);
+
+ DBUG_RETURN(convert_error_code_to_mysql(err, dict_table->flags, NULL));
+}
+
+/*****************************************************************//**
+Deletes all rows of an InnoDB table.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::truncate()
+/*===================*/
+{
+ dberr_t err;
+ int error;
+
+ DBUG_ENTER("ha_innobase::truncate");
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created, and update prebuilt->trx */
+
+ update_thd(ha_thd());
+
+ if (!trx_is_started(prebuilt->trx)) {
+ ++prebuilt->trx->will_lock;
+ }
+ /* Truncate the table in InnoDB */
+
+ err = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
+
+ switch (err) {
+
+ case DB_TABLESPACE_DELETED:
+ case DB_TABLESPACE_NOT_FOUND:
+ ib_senderrf(
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ (err == DB_TABLESPACE_DELETED ?
+ ER_TABLESPACE_DISCARDED : ER_TABLESPACE_MISSING),
+ table->s->table_name.str);
+ table->status = STATUS_NOT_FOUND;
+ error = HA_ERR_NO_SUCH_TABLE;
+ break;
+
+ default:
+ error = convert_error_code_to_mysql(
+ err, prebuilt->table->flags,
+ prebuilt->trx->mysql_thd);
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Drops a table from an InnoDB database. Before calling this function,
+MySQL calls innobase_commit to commit the transaction of the current user.
+Then the current user cannot have locks set on the table. Drop table
+operation inside InnoDB will remove all locks any user has on the table
+inside InnoDB.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::delete_table(
+/*======================*/
+ const char* name) /*!< in: table name */
+{
+ ulint name_len;
+ dberr_t err;
+ trx_t* parent_trx;
+ trx_t* trx;
+ THD* thd = ha_thd();
+ char norm_name[FN_REFLEN];
+
+ DBUG_ENTER("ha_innobase::delete_table");
+
+ DBUG_EXECUTE_IF(
+ "test_normalize_table_name_low",
+ test_normalize_table_name_low();
+ );
+ DBUG_EXECUTE_IF(
+ "test_ut_format_name",
+ test_ut_format_name();
+ );
+
+ /* Strangely, MySQL passes the table name without the '.frm'
+ extension, in contrast to ::create */
+ normalize_table_name(norm_name, name);
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ } else if (row_is_magic_monitor_table(norm_name)
+ && check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ name_len = strlen(name);
+
+ ut_a(name_len < 1000);
+
+ /* Either the transaction is already flagged as a locking transaction
+ or it hasn't been started yet. */
+
+ ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+ /* We are doing a DDL operation. */
+ ++trx->will_lock;
+ trx->ddl = true;
+
+ /* Drop the table in InnoDB */
+ err = row_drop_table_for_mysql(
+ norm_name, trx, thd_sql_command(thd) == SQLCOM_DROP_DB);
+
+
+ if (err == DB_TABLE_NOT_FOUND
+ && innobase_get_lower_case_table_names() == 1) {
+ char* is_part = NULL;
+#ifdef __WIN__
+ is_part = strstr(norm_name, "#p#");
+#else
+ is_part = strstr(norm_name, "#P#");
+#endif /* __WIN__ */
+
+ if (is_part) {
+ char par_case_name[FN_REFLEN];
+
+#ifndef __WIN__
+ /* Check for the table using the lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_name);
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether the table name exists in
+ the system tables without having
+ been normalized to lower case */
+ normalize_table_name_low(
+ par_case_name, name, FALSE);
+#endif
+ err = row_drop_table_for_mysql(
+ par_case_name, trx,
+ thd_sql_command(thd) == SQLCOM_DROP_DB);
+ }
+ }
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_commit_low(trx);
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+}
+
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ char* path) /*!< in: database path; inside InnoDB the name
+ of the last directory in the path is used as
+ the database name: for example, in
+ 'mysql/data/test' the database name is 'test' */
+{
+ ulint len = 0;
+ trx_t* trx;
+ char* ptr;
+ char* namebuf;
+ THD* thd = current_thd;
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ /* In the Windows plugin, thd = current_thd is always NULL */
+ if (thd) {
+ trx_t* parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT
+ query, release possible adaptive hash latch to avoid
+ deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+ }
+
+ ptr = strend(path) - 2;
+
+ while (ptr >= path && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ len++;
+ }
+
+ ptr++;
+ namebuf = (char*) my_malloc((uint) len + 2, MYF(0));
+
+ memcpy(namebuf, ptr, len);
+ namebuf[len] = '/';
+ namebuf[len + 1] = '\0';
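+
+ /* Illustration: for the path 'mysql/data/test' (see the
+ function comment above), ptr ends up pointing at "test" and
+ namebuf becomes "test/". */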
+#ifdef __WIN__
+ innobase_casedn_str(namebuf);
+#endif
+ trx = innobase_trx_allocate(thd);
+
+ /* Either the transaction is already flagged as a locking transaction
+ or it hasn't been started yet. */
+
+ ut_a(!trx_is_started(trx) || trx->will_lock > 0);
+
+ /* We are doing a DDL operation. */
+ ++trx->will_lock;
+
+ row_drop_database_for_mysql(namebuf, trx);
+
+ my_free(namebuf);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+}
+
+/*********************************************************************//**
+Renames an InnoDB table.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+innobase_rename_table(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* from, /*!< in: old name of the table */
+ const char* to) /*!< in: new name of the table */
+{
+ dberr_t error;
+ char norm_to[FN_REFLEN];
+ char norm_from[FN_REFLEN];
+
+ DBUG_ENTER("innobase_rename_table");
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ ut_ad(!srv_read_only_mode);
+
+ normalize_table_name(norm_to, to);
+ normalize_table_name(norm_from, from);
+
+ DEBUG_SYNC_C("innodb_rename_table_ready");
+
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Transaction must be flagged as a locking transaction or it hasn't
+ been started yet. */
+
+ ut_a(trx->will_lock > 0);
+
+ error = row_rename_table_for_mysql(
+ norm_from, norm_to, trx, TRUE);
+
+ if (error != DB_SUCCESS) {
+ if (error == DB_TABLE_NOT_FOUND
+ && innobase_get_lower_case_table_names() == 1) {
+ char* is_part = NULL;
+#ifdef __WIN__
+ is_part = strstr(norm_from, "#p#");
+#else
+ is_part = strstr(norm_from, "#P#");
+#endif /* __WIN__ */
+
+ if (is_part) {
+ char par_case_name[FN_REFLEN];
+#ifndef __WIN__
+ /* Check for the table using the lower
+ case name, including the partition
+ separator "P" */
+ strcpy(par_case_name, norm_from);
+ innobase_casedn_str(par_case_name);
+#else
+ /* On the Windows platform, check
+ whether the table name exists in
+ the system tables without having
+ been normalized to lower case */
+ normalize_table_name_low(
+ par_case_name, from, FALSE);
+#endif
+ trx_start_if_not_started(trx);
+ error = row_rename_table_for_mysql(
+ par_case_name, norm_to, trx, TRUE);
+ }
+ }
+
+ if (error == DB_SUCCESS) {
+#ifndef __WIN__
+ sql_print_warning("Rename partition table %s "
+ "succeeds after converting to lower "
+ "case. The table may have "
+ "been moved from a case "
+ "in-sensitive file system.\n",
+ norm_from);
+#else
+ sql_print_warning("Rename partition table %s "
+ "succeeds after skipping the step to "
+ "lower case the table name. "
+ "The table may have been "
+ "moved from a case sensitive "
+ "file system.\n",
+ norm_from);
+#endif /* __WIN__ */
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm
+ files and the InnoDB data dictionary get out-of-sync
+ if the user runs with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Renames an InnoDB table.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::rename_table(
+/*======================*/
+ const char* from, /*!< in: old name of the table */
+ const char* to) /*!< in: new name of the table */
+{
+ trx_t* trx;
+ dberr_t error;
+ trx_t* parent_trx;
+ THD* thd = ha_thd();
+
+ DBUG_ENTER("ha_innobase::rename_table");
+
+ if (srv_read_only_mode) {
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ /* We are doing a DDL operation. */
+ ++trx->will_lock;
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ error = innobase_rename_table(trx, from, to);
+
+ DEBUG_SYNC(thd, "after_innobase_rename_table");
+
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+
+ if (error == DB_SUCCESS) {
+ char norm_from[MAX_FULL_NAME_LEN];
+ char norm_to[MAX_FULL_NAME_LEN];
+ char errstr[512];
+ dberr_t ret;
+
+ normalize_table_name(norm_from, from);
+ normalize_table_name(norm_to, to);
+
+ ret = dict_stats_rename_table(norm_from, norm_to,
+ errstr, sizeof(errstr));
+
+ if (ret != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: %s\n", errstr);
+
+ push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_LOCK_WAIT_TIMEOUT, errstr);
+ }
+ }
+
+ /* Add a special case to handle the Duplicated Key error
+ and return DB_ERROR instead.
+ This is to avoid a possible SIGSEGV error from mysql error
+ handling code. Currently, mysql handles the Duplicated Key
+ error by re-entering the storage layer and getting dup key
+ info by calling get_dup_key(). This operation requires a valid
+ table handle ('row_prebuilt_t' structure) which could no
+ longer be available in the error handling stage. The suggested
+ solution is to report a 'table exists' error message (since
+ the dup key error here is due to an existing table whose name
+ is the one we are trying to rename to) and return the generic
+ error code. */
+ if (error == DB_DUPLICATE_KEY) {
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+
+ error = DB_ERROR;
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return estimated number of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+ uint keynr, /*!< in: index number */
+ key_range *min_key, /*!< in: start key value of the
+ range, may also be 0 */
+ key_range *max_key) /*!< in: range end key val, may
+ also be 0 */
+{
+ KEY* key;
+ dict_index_t* index;
+ dtuple_t* range_start;
+ dtuple_t* range_end;
+ ib_int64_t n_rows;
+ ulint mode1;
+ ulint mode2;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("records_in_range");
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ prebuilt->trx->op_info = (char*)"estimating records in index range";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ active_index = keynr;
+
+ key = table->key_info + active_index;
+
+ index = innobase_get_index(keynr);
+
+ /* It is possible that the requested index cannot be found, due
+ to an inconsistency between the MySQL and InnoDB dictionary info.
+ The necessary message should have been printed in innobase_get_index() */
+ if (dict_table_is_discarded(prebuilt->table)) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
+ if (UNIV_UNLIKELY(!index)) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
+ if (dict_index_is_corrupted(index)) {
+ n_rows = HA_ERR_INDEX_CORRUPT;
+ goto func_exit;
+ }
+ if (UNIV_UNLIKELY(!row_merge_is_index_usable(prebuilt->trx, index))) {
+ n_rows = HA_ERR_TABLE_DEF_CHANGED;
+ goto func_exit;
+ }
+
+ heap = mem_heap_create(2 * (key->actual_key_parts * sizeof(dfield_t)
+ + sizeof(dtuple_t)));
+
+ range_start = dtuple_create(heap, key->actual_key_parts);
+ dict_index_copy_types(range_start, index, key->actual_key_parts);
+
+ range_end = dtuple_create(heap, key->actual_key_parts);
+ dict_index_copy_types(range_end, index, key->actual_key_parts);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_start,
+ prebuilt->srch_key_val1,
+ prebuilt->srch_key_val_len,
+ index,
+ (byte*) (min_key ? min_key->key :
+ (const uchar*) 0),
+ (ulint) (min_key ? min_key->length : 0),
+ prebuilt->trx);
+ DBUG_ASSERT(min_key
+ ? range_start->n_fields > 0
+ : range_start->n_fields == 0);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_end,
+ prebuilt->srch_key_val2,
+ prebuilt->srch_key_val_len,
+ index,
+ (byte*) (max_key ? max_key->key :
+ (const uchar*) 0),
+ (ulint) (max_key ? max_key->length : 0),
+ prebuilt->trx);
+ DBUG_ASSERT(max_key
+ ? range_end->n_fields > 0
+ : range_end->n_fields == 0);
+
+ mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag :
+ HA_READ_KEY_EXACT);
+ mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag :
+ HA_READ_KEY_EXACT);
+
+ if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
+
+ n_rows = btr_estimate_n_rows_in_range(index, range_start,
+ mode1, range_end,
+ mode2);
+ } else {
+
+ n_rows = HA_POS_ERROR;
+ }
+
+ mem_heap_free(heap);
+
+func_exit:
+
+ prebuilt->trx->op_info = (char*)"";
+
+ /* The MySQL optimizer seems to believe an estimate of 0 rows is
+ always accurate and may return the result 'Empty set' based on that.
+ The accuracy is not guaranteed, and even if it were, for a locking
+ read we should anyway perform the search to set the next-key lock.
+ Add 1 to the value to make sure MySQL does not make the assumption! */
+
+ if (n_rows == 0) {
+ n_rows = 1;
+ }
+
+ DBUG_RETURN((ha_rows) n_rows);
+}
+
+/*********************************************************************//**
+Gives an UPPER BOUND to the number of rows in a table. This is used in
+filesort.cc.
+@return upper bound of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::estimate_rows_upper_bound()
+/*====================================*/
+{
+ const dict_index_t* index;
+ ulonglong estimate;
+ ulonglong local_data_file_length;
+ ulint stat_n_leaf_pages;
+
+ DBUG_ENTER("estimate_rows_upper_bound");
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = "calculating upper bound for table rows";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ index = dict_table_get_first_index(prebuilt->table);
+
+ stat_n_leaf_pages = index->stat_n_leaf_pages;
+
+ ut_a(stat_n_leaf_pages > 0);
+
+ local_data_file_length =
+ ((ulonglong) stat_n_leaf_pages) * UNIV_PAGE_SIZE;
+
+ /* Calculate a minimum length for a clustered index record and from
+ that an upper bound for the number of rows. Since we only calculate
+ new statistics in row0mysql.cc when a table has grown by a threshold
+ factor, we must add a safety factor 2 in front of the formula below. */
+
+ estimate = 2 * local_data_file_length
+ / dict_index_calc_min_rec_len(index);
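+
+ /* Worked example (hypothetical numbers, for illustration): with
+ stat_n_leaf_pages = 100, UNIV_PAGE_SIZE = 16384 and a minimum
+ record length of 20 bytes, the estimate above is
+ 2 * 100 * 16384 / 20 = 163840 rows. */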
+
+ prebuilt->trx->op_info = "";
+
+ DBUG_RETURN((ha_rows) estimate);
+}
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+ /* Since MySQL seems to favor table scans too much over index
+ searches, we pretend that a sequential read takes the same time
+ as a random disk read, that is, we do not divide the following
+ by 10, which would be physically realistic. */
+
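+ /* Illustration (hypothetical number): a table whose clustered
+ index occupies 1000 pages yields scan_time() == 1000.0, i.e.
+ one notional "seek" per page. */
+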
+ /* The locking below is disabled for performance reasons. Without
+ it we could end up returning uninitialized value to the caller,
+ which in the worst case could make some query plan go bogus or
+ issue a Valgrind warning. */
+#if 0
+ /* avoid potential lock order violation with dict_table_stats_lock()
+ below */
+ update_thd(ha_thd());
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+#endif
+
+ ulint stat_clustered_index_size;
+
+#if 0
+ dict_table_stats_lock(prebuilt->table, RW_S_LATCH);
+#endif
+
+ ut_a(prebuilt->table->stat_initialized);
+
+ stat_clustered_index_size = prebuilt->table->stat_clustered_index_size;
+
+#if 0
+ dict_table_stats_unlock(prebuilt->table, RW_S_LATCH);
+#endif
+
+ return((double) stat_clustered_index_size);
+}
+
+/******************************************************************//**
+Calculates the time it takes to read a set of ranges through an index.
+This enables us to optimise reads for clustered indexes.
+@return estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::read_time(
+/*===================*/
+ uint index, /*!< in: key number */
+ uint ranges, /*!< in: how many ranges */
+ ha_rows rows) /*!< in: estimated number of rows in the ranges */
+{
+ ha_rows total_rows;
+ double time_for_scan;
+
+ if (index != table->s->primary_key) {
+ /* Not clustered */
+ return(handler::read_time(index, ranges, rows));
+ }
+
+ if (rows <= 2) {
+
+ return((double) rows);
+ }
+
+ /* Assume that the read time is proportional to the scan time for all
+ rows + at most one seek per range. */
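+
+ /* Worked example (hypothetical numbers, for illustration): with
+ ranges = 3, rows = 1000, total_rows = 10000 and
+ time_for_scan = 50.0, the formula below gives
+ 3 + 1000.0 / 10000.0 * 50.0 = 8.0 seeks. */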
+
+ time_for_scan = scan_time();
+
+ if ((total_rows = estimate_rows_upper_bound()) < rows) {
+
+ return(time_for_scan);
+ }
+
+ return(ranges + (double) rows / (double) total_rows * time_for_scan);
+}
+
+/******************************************************************//**
+Return the size of the InnoDB memory buffer. */
+UNIV_INTERN
+longlong
+ha_innobase::get_memory_buffer_size() const
+/*=======================================*/
+{
+ return(innobase_buffer_pool_size);
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We
+first check the "index translation table" for a match of the index to get
+the index number. If there is no "index translation table", or the index
+cannot be found in it, we fall back to the traditional way of looping
+through the dict_index_t list to find a match. In this case, we have to
+take into account whether we generated a default clustered index for the
+table.
+@return the key number used inside MySQL */
+static
+int
+innobase_get_mysql_key_number_for_index(
+/*====================================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table,/*!< in: table in Innodb data
+ dictionary */
+ const dict_index_t* index) /*!< in: index */
+{
+ const dict_index_t* ind;
+ unsigned int i;
+
+ ut_a(index);
+
+ /* If the index does not belong to the table object of the share
+ structure (ib_table comes from the share structure), search the
+ index->table object instead */
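+
+ /* Illustration (hypothetical index list): if index->table has
+ the indexes (GEN_CLUST_INDEX, k1, k2) and index == k2, the loop
+ below ends with i == 2; the generated clustered index is then
+ discounted, returning 1, which is k2's position in MySQL's
+ numbering. */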
+ if (index->table != ib_table) {
+ i = 0;
+ ind = dict_table_get_first_index(index->table);
+
+ while (index != ind) {
+ ind = dict_table_get_next_index(ind);
+ i++;
+ }
+
+ if (row_table_got_default_clust_index(index->table)) {
+ ut_a(i > 0);
+ i--;
+ }
+
+ return(i);
+ }
+
+ /* If index translation table exists, we will first check
+ the index through index translation table for a match. */
+ if (share->idx_trans_tbl.index_mapping) {
+ for (i = 0; i < share->idx_trans_tbl.index_count; i++) {
+ if (share->idx_trans_tbl.index_mapping[i] == index) {
+ return(i);
+ }
+ }
+
+ /* Print an error message if we cannot find the index
+ in the "index translation table". */
+ if (*index->name != TEMP_INDEX_PREFIX) {
+ sql_print_error("Cannot find index %s in InnoDB index "
+ "translation table.", index->name);
+ }
+ }
+
+ /* If we do not have an "index translation table", or cannot
+ find the index in the translation table, we look for a
+ matching index directly, using information from the MySQL
+ TABLE structure and the InnoDB dict_index_t list */
+ for (i = 0; i < table->s->keys; i++) {
+ ind = dict_table_get_index_on_name(
+ ib_table, table->key_info[i].name);
+
+ if (index == ind) {
+ return(i);
+ }
+ }
+
+ /* Loop through each index of the table, looking for a match */
+ for (ind = dict_table_get_first_index(ib_table);
+ ind != NULL;
+ ind = dict_table_get_next_index(ind)) {
+ if (index == ind) {
+ /* A temp index is internal to InnoDB and
+ not present in the MySQL index list, so
+ there is no need to print a mismatch
+ warning for it. */
+ if (*(index->name) != TEMP_INDEX_PREFIX) {
+ sql_print_warning(
+ "Found index %s in the InnoDB index list "
+ "but not its MySQL index number. "
+ "It could be an InnoDB internal index.",
+ index->name);
+ }
+ return(-1);
+ }
+ }
+
+ ut_error;
+
+ return(-1);
+}
+
+/*********************************************************************//**
+Calculate Record Per Key value. Need to exclude the NULL value if
+innodb_stats_method is set to "nulls_ignored"
+@return estimated record per key value */
+static
+ha_rows
+innodb_rec_per_key(
+/*===============*/
+ dict_index_t* index, /*!< in: dict_index_t structure */
+ ulint i, /*!< in: the column we are
+ calculating rec per key */
+ ha_rows records) /*!< in: estimated total records */
+{
+ ha_rows rec_per_key;
+ ib_uint64_t n_diff;
+
+ ut_a(index->table->stat_initialized);
+
+ ut_ad(i < dict_index_get_n_unique(index));
+
+ n_diff = index->stat_n_diff_key_vals[i];
+
+ if (n_diff == 0) {
+
+ rec_per_key = records;
+ } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) {
+ ib_uint64_t n_null;
+ ib_uint64_t n_non_null;
+
+ n_non_null = index->stat_n_non_null_key_vals[i];
+
+ /* In theory, index->stat_n_non_null_key_vals[i]
+ should always be less than the number of records.
+ Since this is a statistics value, it could
+ have a slight discrepancy, but we make sure
+ the number of NULL values is not negative. */
+ if (records < n_non_null) {
+ n_null = 0;
+ } else {
+ n_null = records - n_non_null;
+ }
+
+ /* If the number of NULL values is the same as or
+ larger than the number of distinct values, we
+ consider that the table consists mostly of NULL
+ values. Set rec_per_key to 1. */
+ if (n_diff <= n_null) {
+ rec_per_key = 1;
+ } else {
+ /* Need to exclude rows with NULL values from
+ rec_per_key calculation */
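+
+ /* Worked example (hypothetical numbers, for
+ illustration): with records = 100,
+ n_non_null = 80 (so n_null = 20) and
+ n_diff = 40, this branch computes
+ rec_per_key = (100 - 20) / (40 - 20) = 4. */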
+ rec_per_key = (ha_rows)
+ ((records - n_null) / (n_diff - n_null));
+ }
+ } else {
+ DEBUG_SYNC_C("after_checking_for_0");
+ rec_per_key = (ha_rows) (records / n_diff);
+ }
+
+ return(rec_per_key);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+UNIV_INTERN
+int
+ha_innobase::info_low(
+/*==================*/
+ uint flag, /*!< in: what information is requested */
+ bool is_analyze)
+{
+ dict_table_t* ib_table;
+ ha_rows rec_per_key;
+ ib_uint64_t n_rows;
+ char path[FN_REFLEN];
+ os_file_stat_t stat_info;
+
+ DBUG_ENTER("info");
+
+ /* If we are forcing recovery at a high level, we will suppress
+ statistics calculation on tables, because that may crash the
+ server if an index is badly corrupted. */
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ prebuilt->trx->op_info = (char*)"returning various info to MySQL";
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ ib_table = prebuilt->table;
+ DBUG_ASSERT(ib_table->n_ref_count > 0);
+
+ if (flag & HA_STATUS_TIME) {
+ if (is_analyze || innobase_stats_on_metadata) {
+
+ dict_stats_upd_option_t opt;
+ dberr_t ret;
+
+ prebuilt->trx->op_info = "updating table statistics";
+
+ if (dict_stats_is_persistent_enabled(ib_table)) {
+
+ if (is_analyze) {
+ opt = DICT_STATS_RECALC_PERSISTENT;
+ } else {
+ /* This is e.g. 'SHOW INDEXES', fetch
+ the persistent stats from disk. */
+ opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+ }
+ } else {
+ opt = DICT_STATS_RECALC_TRANSIENT;
+ }
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+ ret = dict_stats_update(ib_table, opt);
+
+ if (ret != DB_SUCCESS) {
+ prebuilt->trx->op_info = "";
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ prebuilt->trx->op_info =
+ "returning various info to MySQL";
+ }
+
+ my_snprintf(path, sizeof(path), "%s/%s%s",
+ mysql_data_home, ib_table->name, reg_ext);
+
+ unpack_filename(path, path);
+
+ /* Note that we do not know the access time of the table,
+ nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
+
+ if (os_file_get_status(path, &stat_info, false) == DB_SUCCESS) {
+ stats.create_time = (ulong) stat_info.ctime;
+ }
+ }
+
+ if (flag & HA_STATUS_VARIABLE) {
+
+ ulint page_size;
+ ulint stat_clustered_index_size;
+ ulint stat_sum_of_other_index_sizes;
+
+ if (!(flag & HA_STATUS_NO_LOCK)) {
+ dict_table_stats_lock(ib_table, RW_S_LATCH);
+ }
+
+ ut_a(ib_table->stat_initialized);
+
+ n_rows = ib_table->stat_n_rows;
+
+ stat_clustered_index_size
+ = ib_table->stat_clustered_index_size;
+
+ stat_sum_of_other_index_sizes
+ = ib_table->stat_sum_of_other_index_sizes;
+
+ if (!(flag & HA_STATUS_NO_LOCK)) {
+ dict_table_stats_unlock(ib_table, RW_S_LATCH);
+ }
+
+ /*
+ The MySQL optimizer seems to assume in a left join that n_rows
+ is an accurate estimate if it is zero. Of course, it is not,
+ since we do not have any locks on the rows yet at this phase.
+ Since SHOW TABLE STATUS seems to call this function with the
+ HA_STATUS_TIME flag set, while the left join optimizer does not
+ set that flag, we add one to a zero value if the flag is not
+ set. That way SHOW TABLE STATUS will show the best estimate,
+ while the optimizer never sees the table empty. */
+
+ if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
+ n_rows++;
+ }
+
+ /* Fix bug#40386: Not flushing query cache after truncate.
+ n_rows cannot be 0 unless the table is empty, so set it to 1
+ instead. The original problem of bug#29507 is actually
+ fixed in the server code. */
+ if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) {
+
+ n_rows = 1;
+
+ /* We need to reset the prebuilt value too, otherwise
+ checks for values greater than the last value written
+ to the table will fail and the autoinc counter will
+ not be updated. This will force write_row() into
+ attempting an update of the table's AUTOINC counter. */
+
+ prebuilt->autoinc_last_value = 0;
+ }
+
+ page_size = dict_table_zip_size(ib_table);
+ if (page_size == 0) {
+ page_size = UNIV_PAGE_SIZE;
+ }
+
+ stats.records = (ha_rows) n_rows;
+ stats.deleted = 0;
+ stats.data_file_length
+ = ((ulonglong) stat_clustered_index_size)
+ * page_size;
+ stats.index_file_length
+ = ((ulonglong) stat_sum_of_other_index_sizes)
+ * page_size;
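+
+ /* Illustration (hypothetical numbers): with a 16 KB page size
+ and stat_clustered_index_size = 640 pages, data_file_length
+ above is 640 * 16384 = 10485760 bytes, i.e. 10 MB. */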
+
+ /* Since fsp_get_available_space_in_free_extents() is
+ acquiring latches inside InnoDB, we do not call it if we
+ are asked by MySQL to avoid locking. Another reason to
+ avoid the call is that it uses quite a lot of CPU.
+ See Bug#38185. */
+ if (flag & HA_STATUS_NO_LOCK
+ || !(flag & HA_STATUS_VARIABLE_EXTRA)) {
+ /* We do not update delete_length if no
+ locking is requested so the "old" value can
+ remain. delete_length is initialized to 0 in
+ the ha_statistics' constructor. Also we only
+ need delete_length to be set when
+ HA_STATUS_VARIABLE_EXTRA is set */
+ } else if (UNIV_UNLIKELY
+ (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) {
+ /* Avoid accessing the tablespace if
+ innodb_crash_recovery is set to a high value. */
+ stats.delete_length = 0;
+ } else {
+ ullint avail_space;
+
+ avail_space = fsp_get_available_space_in_free_extents(
+ ib_table->space);
+
+ if (avail_space == ULLINT_UNDEFINED) {
+ THD* thd;
+ char errbuf[MYSYS_STRERROR_SIZE];
+
+ thd = ha_thd();
+
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_GET_STAT,
+ "InnoDB: Trying to get the free "
+ "space for table %s but its "
+ "tablespace has been discarded or "
+ "the .ibd file is missing. Setting "
+ "the free space to zero. "
+ "(errno: %d - %s)",
+ ib_table->name, errno,
+ my_strerror(errbuf, sizeof(errbuf),
+ errno));
+
+ stats.delete_length = 0;
+ } else {
+ stats.delete_length = avail_space * 1024;
+ }
+ }
+
+ stats.check_time = 0;
+ stats.mrr_length_per_rec = ref_length + sizeof(void*);
+
+ if (stats.records == 0) {
+ stats.mean_rec_length = 0;
+ } else {
+ stats.mean_rec_length = (ulong)
+ (stats.data_file_length / stats.records);
+ }
+ }
+
+ if (flag & HA_STATUS_CONST) {
+ ulong i;
+ /* Verify that the number of indexes in InnoDB and
+ MySQL matches up. If prebuilt->clust_index_was_generated
+ holds, InnoDB defines GEN_CLUST_INDEX internally */
+ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+ - prebuilt->clust_index_was_generated;
+ if (table->s->keys < num_innodb_index) {
+ /* If there are too many indexes defined
+ inside InnoDB, ignore those that are being
+ created, because MySQL will only consider
+ the fully built indexes here. */
+
+ for (const dict_index_t* index
+ = UT_LIST_GET_FIRST(ib_table->indexes);
+ index != NULL;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ /* First, online index creation is
+ completed inside InnoDB, and then
+ MySQL attempts to upgrade the
+ meta-data lock so that it can rebuild
+ the .frm file. If we get here in that
+ time frame, dict_index_is_online_ddl()
+ would not hold and the index would
+ still not be included in TABLE_SHARE. */
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ num_innodb_index--;
+ }
+ }
+
+ if (table->s->keys < num_innodb_index
+ && innobase_fts_check_doc_id_index(
+ ib_table, NULL, NULL)
+ == FTS_EXIST_DOC_ID_INDEX) {
+ num_innodb_index--;
+ }
+ }
+
+ if (table->s->keys != num_innodb_index) {
+ sql_print_error("InnoDB: Table %s contains %lu "
+ "indexes inside InnoDB, which "
+ "is different from the number of "
+ "indexes %u defined in the MySQL ",
+ ib_table->name, num_innodb_index,
+ table->s->keys);
+ }
+
+ if (!(flag & HA_STATUS_NO_LOCK)) {
+ dict_table_stats_lock(ib_table, RW_S_LATCH);
+ }
+
+ ut_a(ib_table->stat_initialized);
+
+ for (i = 0; i < table->s->keys; i++) {
+ ulong j;
+ /* We can get the index quickly through the internal
+ index mapping of the index translation table.
+ The identity of the index (matching the index name
+ with that of table->key_info[i]) is already verified
+ in innobase_get_index(). */
+ dict_index_t* index = innobase_get_index(i);
+
+ if (index == NULL) {
+ sql_print_error("Table %s contains fewer "
+ "indexes inside InnoDB than "
+ "are defined in the MySQL "
+ ".frm file. Have you mixed up "
+ ".frm files from different "
+ "installations? See "
+ REFMAN
+ "innodb-troubleshooting.html\n",
+ ib_table->name);
+ break;
+ }
+
+ for (j = 0; j < table->key_info[i].actual_key_parts; j++) {
+
+ if (table->key_info[i].flags & HA_FULLTEXT) {
+ /* The whole concept has no validity
+ for FTS indexes. */
+ table->key_info[i].rec_per_key[j] = 1;
+ continue;
+ }
+
+ if (j + 1 > index->n_uniq) {
+ sql_print_error(
+ "Index %s of %s has %lu columns"
+ " unique inside InnoDB, but "
+ "MySQL is asking statistics for"
+ " %lu columns. Have you mixed "
+ "up .frm files from different "
+ "installations? "
+ "See " REFMAN
+ "innodb-troubleshooting.html\n",
+ index->name,
+ ib_table->name,
+ (unsigned long)
+ index->n_uniq, j + 1);
+ break;
+ }
+
+ rec_per_key = innodb_rec_per_key(
+ index, j, stats.records);
+
+ /* Since MySQL seems to favor table scans
+ too much over index searches, we pretend
+ index selectivity is 2 times better than
+ our estimate: */
+
+ rec_per_key = rec_per_key / 2;
+
+ if (rec_per_key == 0) {
+ rec_per_key = 1;
+ }
+
+ table->key_info[i].rec_per_key[j] =
+ rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
+ (ulong) rec_per_key;
+ }
+ }
+
+ if (!(flag & HA_STATUS_NO_LOCK)) {
+ dict_table_stats_unlock(ib_table, RW_S_LATCH);
+ }
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+
+ goto func_exit;
+ }
+
+ if (flag & HA_STATUS_ERRKEY) {
+ const dict_index_t* err_index;
+
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+ err_index = trx_get_error_info(prebuilt->trx);
+
+ if (err_index) {
+ errkey = innobase_get_mysql_key_number_for_index(
+ share, table, ib_table, err_index);
+ } else {
+ errkey = (unsigned int) (
+ (prebuilt->trx->error_key_num
+ == ULINT_UNDEFINED)
+ ? ~0
+ : prebuilt->trx->error_key_num);
+ }
+ }
+
+ if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+ stats.auto_increment_value = innobase_peek_autoinc();
+ }
+
+func_exit:
+ prebuilt->trx->op_info = (char*)"";
+
+ DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+UNIV_INTERN
+int
+ha_innobase::info(
+/*==============*/
+ uint flag) /*!< in: what information is requested */
+{
+ return(this->info_low(flag, false /* not ANALYZE */));
+}
+
+/**********************************************************************//**
+Updates index cardinalities of the table, based on random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return HA_ADMIN_* error code or HA_ADMIN_OK */
+UNIV_INTERN
+int
+ha_innobase::analyze(
+/*=================*/
+ THD* thd, /*!< in: connection thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: currently ignored */
+{
+ int ret;
+
+ /* Simply call this->info_low() with all the flags
+ and request recalculation of the statistics */
+ ret = this->info_low(
+ HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+ true /* this is ANALYZE */);
+
+ if (ret != 0) {
+ return(HA_ADMIN_FAILED);
+ }
+
+ return(HA_ADMIN_OK);
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+UNIV_INTERN
+int
+ha_innobase::optimize(
+/*==================*/
+ THD* thd, /*!< in: connection thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: currently ignored */
+{
+ /*FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+ we have to hijack some existing command in order to be able to test
+ the new admin commands added in InnoDB's FTS support. For now, we
+ use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+ InnoDB (so it recreates the table anew), and map it instead to
+ InnoDB's internal FTS optimize operation.
+
+ This works OK otherwise, but MySQL locks the entire table during
+ calls to OPTIMIZE, which is undesirable. */
+
+ if (innodb_optimize_fulltext_only) {
+ if (prebuilt->table->fts && prebuilt->table->fts->cache
+ && !dict_table_is_discarded(prebuilt->table)) {
+ fts_sync_table(prebuilt->table);
+ fts_optimize_table(prebuilt->table);
+ }
+ return(HA_ADMIN_OK);
+ } else {
+
+ return(HA_ADMIN_TRY_ALTER);
+ }
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints to stderr information about it. In case of corruption
+may also assert a failure and crash the server.
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */
+UNIV_INTERN
+int
+ha_innobase::check(
+/*===============*/
+ THD* thd, /*!< in: user thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: check options */
+{
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ bool is_ok = true;
+ ulint old_isolation_level;
+ ibool table_corrupted;
+
+ DBUG_ENTER("ha_innobase::check");
+ DBUG_ASSERT(thd == ha_thd());
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->trx == thd_to_trx(thd));
+
+ if (prebuilt->mysql_template == NULL) {
+ /* Build the template; we will use a dummy template
+ in index scans done in checking */
+
+ build_template(true);
+ }
+
+ if (dict_table_is_discarded(prebuilt->table)) {
+
+ ib_senderrf(
+ thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+
+ } else if (prebuilt->table->ibd_file_missing) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
+
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+ /* We must run the index record counts at an isolation level
+ >= READ COMMITTED, because a dirty read can see a wrong number
+ of records in some index; to play it safe, we always use
+ REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Check whether the table is already marked as corrupted
+ before running the check table */
+ table_corrupted = prebuilt->table->corrupted;
+
+ /* Reset table->corrupted bit so that check table can proceed to
+ do additional check */
+ prebuilt->table->corrupted = FALSE;
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ /* If this is an index being created or dropped, skip */
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ continue;
+ }
+
+ if (!(check_opt->flags & T_QUICK)) {
+ /* Enlarge the fatal lock wait timeout during
+ CHECK TABLE. */
+ os_increment_counter_by_amount(
+ server_mutex,
+ srv_fatal_semaphore_wait_threshold,
+ SRV_SEMAPHORE_WAIT_EXTENSION);
+ bool valid = btr_validate_index(index, prebuilt->trx);
+
+ /* Restore the fatal lock wait timeout after
+ CHECK TABLE. */
+ os_decrement_counter_by_amount(
+ server_mutex,
+ srv_fatal_semaphore_wait_threshold,
+ SRV_SEMAPHORE_WAIT_EXTENSION);
+
+ if (!valid) {
+ is_ok = false;
+
+ innobase_format_name(
+ index_name, sizeof index_name,
+ index->name, TRUE);
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index %s is corrupted.",
+ index_name);
+ continue;
+ }
+ }
+
+ /* Instead of invoking change_active_index(), set up
+ a dummy template for non-locking reads, disabling
+ access to the clustered index. */
+ prebuilt->index = index;
+
+ prebuilt->index_usable = row_merge_is_index_usable(
+ prebuilt->trx, prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ innobase_format_name(
+ index_name, sizeof index_name,
+ prebuilt->index->name, TRUE);
+
+ if (dict_index_is_corrupted(prebuilt->index)) {
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_INDEX_CORRUPT,
+ "InnoDB: Index %s is marked as"
+ " corrupted",
+ index_name);
+ is_ok = false;
+ } else {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: Insufficient history for"
+ " index %s",
+ index_name);
+ }
+ continue;
+ }
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+ innobase_format_name(
+ index_name, sizeof index_name,
+ index->name, TRUE);
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index %s is corrupted.",
+ index_name);
+ is_ok = false;
+ dict_set_corrupted(
+ index, prebuilt->trx, "CHECK TABLE-check index");
+ }
+
+ if (thd_killed(user_thd)) {
+ break;
+ }
+
+#if 0
+ fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name);
+#endif
+
+ if (index == dict_table_get_first_index(prebuilt->table)) {
+ n_rows_in_table = n_rows;
+ } else if (!(index->type & DICT_FTS)
+ && (n_rows != n_rows_in_table)) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: Index '%-.200s' contains %lu"
+ " entries, should be %lu.",
+ index->name,
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ is_ok = false;
+ dict_set_corrupted(
+ index, prebuilt->trx,
+ "CHECK TABLE; Wrong count");
+ }
+ }
+
+ if (table_corrupted) {
+ /* If some previous operation has marked the table as
+ corrupted in memory, and has not propagated that to the
+ clustered index, we do so here */
+ index = dict_table_get_first_index(prebuilt->table);
+
+ if (!dict_index_is_corrupted(index)) {
+ dict_set_corrupted(
+ index, prebuilt->trx, "CHECK TABLE");
+ }
+ prebuilt->table->corrupted = TRUE;
+ }
+
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+ /* We validate the whole adaptive hash index for all tables
+ at every CHECK TABLE only when QUICK flag is not present. */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (!(check_opt->flags & T_QUICK) && !btr_search_validate()) {
+ push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The adaptive hash index is corrupted.");
+ is_ok = false;
+ }
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+ prebuilt->trx->op_info = "";
+ if (thd_killed(user_thd)) {
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ }
+
+ DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
+}
+
+/*************************************************************//**
+Adds information about free space in the InnoDB tablespace to a table comment
+which is printed out when a user calls SHOW TABLE STATUS. Also adds info on
+foreign keys.
+@return table comment + InnoDB free space + info on foreign keys */
+UNIV_INTERN
+char*
+ha_innobase::update_table_comment(
+/*==============================*/
+ const char* comment)/*!< in: table comment defined by user */
+{
+ uint length = (uint) strlen(comment);
+ char* str;
+ long flen;
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ if (length > 64000 - 3) {
+ return((char*) comment); /* string too long */
+ }
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = (char*)"returning table comment";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ str = NULL;
+
+ /* output the data to a temporary file */
+
+ if (!srv_read_only_mode) {
+
+ mutex_enter(&srv_dict_tmpfile_mutex);
+
+ rewind(srv_dict_tmpfile);
+
+ fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB",
+ fsp_get_available_space_in_free_extents(
+ prebuilt->table->space));
+
+ dict_print_info_on_foreign_keys(
+ FALSE, srv_dict_tmpfile, prebuilt->trx,
+ prebuilt->table);
+
+ flen = ftell(srv_dict_tmpfile);
+
+ if (flen < 0) {
+ flen = 0;
+ } else if (length + flen + 3 > 64000) {
+ flen = 64000 - 3 - length;
+ }
+
+ /* allocate buffer for the full string, and
+ read the contents of the temporary file */
+
+ str = (char*) my_malloc(length + flen + 3, MYF(0));
+
+ if (str) {
+ char* pos = str + length;
+ if (length) {
+ memcpy(str, comment, length);
+ *pos++ = ';';
+ *pos++ = ' ';
+ }
+ rewind(srv_dict_tmpfile);
+ flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile);
+ pos[flen] = 0;
+ }
+
+ mutex_exit(&srv_dict_tmpfile_mutex);
+ }
+
+ prebuilt->trx->op_info = (char*)"";
+
+ return(str ? str : (char*) comment);
+}
+
+/*******************************************************************//**
+Gets the foreign key create info for a table stored in InnoDB.
+@return own: character string in a form which can be inserted into the
+CREATE TABLE statement; MUST be freed with
+ha_innobase::free_foreign_key_create_info */
+UNIV_INTERN
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+ long flen;
+ char* str = 0;
+
+ ut_a(prebuilt != NULL);
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = (char*)"getting info on foreign keys";
+
+ /* In case MySQL calls this in the middle of a SELECT query,
+ release possible adaptive hash latch to avoid
+ deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ if (!srv_read_only_mode) {
+ mutex_enter(&srv_dict_tmpfile_mutex);
+ rewind(srv_dict_tmpfile);
+
+ /* Output the data to a temporary file */
+ dict_print_info_on_foreign_keys(
+ TRUE, srv_dict_tmpfile, prebuilt->trx,
+ prebuilt->table);
+
+ prebuilt->trx->op_info = (char*)"";
+
+ flen = ftell(srv_dict_tmpfile);
+
+ if (flen < 0) {
+ flen = 0;
+ }
+
+ /* Allocate buffer for the string, and
+ read the contents of the temporary file */
+
+ str = (char*) my_malloc(flen + 1, MYF(0));
+
+ if (str) {
+ rewind(srv_dict_tmpfile);
+ flen = (uint) fread(str, 1, flen, srv_dict_tmpfile);
+ str[flen] = 0;
+ }
+
+ mutex_exit(&srv_dict_tmpfile_mutex);
+ }
+
+ return(str);
+}
+
+
+/***********************************************************************//**
+Maps an InnoDB foreign key constraint to an equivalent MySQL foreign key info.
+@return pointer to foreign key info */
+static
+FOREIGN_KEY_INFO*
+get_foreign_key_info(
+/*=================*/
+ THD* thd, /*!< in: user thread handle */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ FOREIGN_KEY_INFO f_key_info;
+ FOREIGN_KEY_INFO* pf_key_info;
+ uint i = 0;
+ ulint len;
+ char tmp_buff[NAME_LEN+1];
+ char name_buff[NAME_LEN+1];
+ const char* ptr;
+ LEX_STRING* referenced_key_name;
+ LEX_STRING* name = NULL;
+
+ ptr = dict_remove_db_name(foreign->id);
+ f_key_info.foreign_id = thd_make_lex_string(thd, 0, ptr,
+ (uint) strlen(ptr), 1);
+
+ /* Name format: database name, '/', table name, '\0' */
+
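+ /* Illustration (hypothetical name): for
+ foreign->referenced_table_name = "db1/parent", the code below
+ extracts referenced_db = "db1" and referenced_table = "parent"
+ (modulo filename_to_tablename() decoding). */
+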
+ /* Referenced (parent) database name */
+ len = dict_get_db_name_len(foreign->referenced_table_name);
+ ut_a(len < sizeof(tmp_buff));
+ ut_memcpy(tmp_buff, foreign->referenced_table_name, len);
+ tmp_buff[len] = 0;
+
+ len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+ f_key_info.referenced_db = thd_make_lex_string(
+ thd, 0, name_buff, static_cast<unsigned int>(len), 1);
+
+ /* Referenced (parent) table name */
+ ptr = dict_remove_db_name(foreign->referenced_table_name);
+ len = filename_to_tablename(ptr, name_buff, sizeof(name_buff));
+ f_key_info.referenced_table = thd_make_lex_string(
+ thd, 0, name_buff, static_cast<unsigned int>(len), 1);
+
+ /* Dependent (child) database name */
+ len = dict_get_db_name_len(foreign->foreign_table_name);
+ ut_a(len < sizeof(tmp_buff));
+ ut_memcpy(tmp_buff, foreign->foreign_table_name, len);
+ tmp_buff[len] = 0;
+
+ len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+ f_key_info.foreign_db = thd_make_lex_string(
+ thd, 0, name_buff, static_cast<unsigned int>(len), 1);
+
+ /* Dependent (child) table name */
+ ptr = dict_remove_db_name(foreign->foreign_table_name);
+ len = filename_to_tablename(ptr, name_buff, sizeof(name_buff));
+ f_key_info.foreign_table = thd_make_lex_string(
+ thd, 0, name_buff, static_cast<unsigned int>(len), 1);
+
+ do {
+ ptr = foreign->foreign_col_names[i];
+ name = thd_make_lex_string(thd, name, ptr,
+ (uint) strlen(ptr), 1);
+ f_key_info.foreign_fields.push_back(name);
+ ptr = foreign->referenced_col_names[i];
+ name = thd_make_lex_string(thd, name, ptr,
+ (uint) strlen(ptr), 1);
+ f_key_info.referenced_fields.push_back(name);
+ } while (++i < foreign->n_fields);
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+ len = 7;
+ ptr = "CASCADE";
+ } else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ len = 8;
+ ptr = "SET NULL";
+ } else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ len = 9;
+ ptr = "NO ACTION";
+ } else {
+ len = 8;
+ ptr = "RESTRICT";
+ }
+
+ f_key_info.delete_method = thd_make_lex_string(
+ thd, f_key_info.delete_method, ptr,
+ static_cast<unsigned int>(len), 1);
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ len = 7;
+ ptr = "CASCADE";
+ } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ len = 8;
+ ptr = "SET NULL";
+ } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ len = 9;
+ ptr = "NO ACTION";
+ } else {
+ len = 8;
+ ptr = "RESTRICT";
+ }
+
+ f_key_info.update_method = thd_make_lex_string(
+ thd, f_key_info.update_method, ptr,
+ static_cast<unsigned int>(len), 1);
+
+ if (foreign->referenced_index && foreign->referenced_index->name) {
+ referenced_key_name = thd_make_lex_string(thd,
+ f_key_info.referenced_key_name,
+ foreign->referenced_index->name,
+ (uint) strlen(foreign->referenced_index->name),
+ 1);
+ } else {
+ referenced_key_name = NULL;
+ }
+
+ f_key_info.referenced_key_name = referenced_key_name;
+
+ pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info,
+ sizeof(FOREIGN_KEY_INFO));
+
+ return(pf_key_info);
+}
+
+/*******************************************************************//**
+Gets the list of foreign keys in this table.
+@return always 0, that is, always succeeds */
+UNIV_INTERN
+int
+ha_innobase::get_foreign_key_list(
+/*==============================*/
+ THD* thd, /*!< in: user thread handle */
+ List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
+{
+ FOREIGN_KEY_INFO* pf_key_info;
+ dict_foreign_t* foreign;
+
+ ut_a(prebuilt != NULL);
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = "getting list of foreign keys";
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ for (dict_foreign_set::iterator it
+ = prebuilt->table->foreign_set.begin();
+ it != prebuilt->table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ pf_key_info = get_foreign_key_info(thd, foreign);
+ if (pf_key_info) {
+ f_key_list->push_back(pf_key_info);
+ }
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ prebuilt->trx->op_info = "";
+
+ return(0);
+}
+
+/*******************************************************************//**
+Gets the set of foreign keys where this table is the referenced table.
+@return always 0, that is, always succeeds */
+UNIV_INTERN
+int
+ha_innobase::get_parent_foreign_key_list(
+/*=====================================*/
+ THD* thd, /*!< in: user thread handle */
+ List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */
+{
+ FOREIGN_KEY_INFO* pf_key_info;
+ dict_foreign_t* foreign;
+
+ ut_a(prebuilt != NULL);
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = "getting list of referencing foreign keys";
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ for (dict_foreign_set::iterator it
+ = prebuilt->table->referenced_set.begin();
+ it != prebuilt->table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ pf_key_info = get_foreign_key_info(thd, foreign);
+ if (pf_key_info) {
+ f_key_list->push_back(pf_key_info);
+ }
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ prebuilt->trx->op_info = "";
+
+ return(0);
+}
+
+/*****************************************************************//**
+Checks if ALTER TABLE may change the storage engine of the table.
+Changing storage engines is not allowed for tables for which there
+are foreign key constraints (parent or child tables).
+@return TRUE if can switch engines */
+UNIV_INTERN
+bool
+ha_innobase::can_switch_engines(void)
+/*=================================*/
+{
+ bool can_switch;
+
+ DBUG_ENTER("ha_innobase::can_switch_engines");
+ update_thd();
+
+ prebuilt->trx->op_info =
+ "determining if there are foreign key constraints";
+ row_mysql_freeze_data_dictionary(prebuilt->trx);
+
+ can_switch = prebuilt->table->referenced_set.empty()
+ && prebuilt->table->foreign_set.empty();
+
+ row_mysql_unfreeze_data_dictionary(prebuilt->trx);
+ prebuilt->trx->op_info = "";
+
+ DBUG_RETURN(can_switch);
+}
+
+/*******************************************************************//**
+Checks if a table is referenced by a foreign key. The MySQL manual states that
+a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
+delete is then allowed internally to resolve a duplicate key conflict in
+REPLACE, not an update.
+@return > 0 if referenced by a FOREIGN KEY */
+UNIV_INTERN
+uint
+ha_innobase::referenced_by_foreign_key(void)
+/*========================================*/
+{
+ if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) {
+
+ return(1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Frees the foreign key create info for a table stored in InnoDB, if it is
+non-NULL. */
+UNIV_INTERN
+void
+ha_innobase::free_foreign_key_create_info(
+/*======================================*/
+ char* str) /*!< in, own: create info string to free */
+{
+ if (str) {
+ my_free(str);
+ }
+}
+
+/*******************************************************************//**
+Processes an HA_EXTRA_* hint from MySQL, telling the handler something
+additional about how to do things.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::extra(
+/*===============*/
+ enum ha_extra_function operation)
+ /*!< in: HA_EXTRA_FLUSH or some other flag */
+{
+ check_trx_exists(ha_thd());
+
+ /* Warning: since it is not certain that MySQL calls external_lock
+ before calling this function, the trx field in prebuilt can be
+ obsolete! */
+
+ switch (operation) {
+ case HA_EXTRA_FLUSH:
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+ break;
+ case HA_EXTRA_RESET_STATE:
+ reset_template();
+ thd_to_trx(ha_thd())->duplicates = 0;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ prebuilt->read_just_key = 0;
+ break;
+ case HA_EXTRA_KEYREAD:
+ prebuilt->read_just_key = 1;
+ break;
+ case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ prebuilt->keep_other_fields_on_keyread = 1;
+ break;
+
+ /* IMPORTANT: prebuilt->trx can be obsolete in
+ this method, because it is not certain that MySQL
+ calls external_lock before this method with the
+ parameters below. We must not invoke update_thd()
+ either, because the calling threads may change.
+ CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
+ case HA_EXTRA_INSERT_WITH_UPDATE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_WRITE_CAN_REPLACE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+ break;
+ default:/* Do nothing */
+ ;
+ }
+
+ return(0);
+}
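+
+/* Expository sketch, not part of this commit: trx->duplicates is a
+bitmask in which HA_EXTRA_INSERT_WITH_UPDATE/HA_EXTRA_NO_IGNORE_DUP_KEY
+toggle TRX_DUP_IGNORE, and HA_EXTRA_WRITE_CAN_REPLACE/
+HA_EXTRA_WRITE_CANNOT_REPLACE toggle TRX_DUP_REPLACE, so the two bits
+can be inspected independently from inside a handler method:
+@code
+ ulint duplicates = thd_to_trx(ha_thd())->duplicates;
+
+ bool ignore_dup_key = (duplicates & TRX_DUP_IGNORE) != 0;
+ bool can_replace = (duplicates & TRX_DUP_REPLACE) != 0;
+@endcode */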
+
+/******************************************************************//**
+Resets per-statement handler state: frees a possible BLOB heap, resets
+the prebuilt template and the DS-MRR state, and clears the
+statement-level AUTOINC counter.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::reset()
+/*================*/
+{
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ reset_template();
+ ds_mrr.reset();
+
+ /* TODO: This should really be reset in reset_template() but for now
+ it's safer to do it explicitly here. */
+
+ /* This is a statement level counter. */
+ prebuilt->autoinc_last_value = 0;
+
+ return(0);
+}
+
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::start_stmt(
+/*====================*/
+ THD* thd, /*!< in: handle to the user thread */
+ thr_lock_type lock_type)
+{
+ trx_t* trx;
+ DBUG_ENTER("ha_innobase::start_stmt");
+
+ update_thd(thd);
+
+ trx = prebuilt->trx;
+
+ /* Here we release the search latch and the InnoDB thread FIFO ticket
+ if they were reserved. They should have been released already at the
+ end of the previous statement, but because inside LOCK TABLES the
+ lock count method does not work to mark the end of a SELECT statement,
+ that may not be the case. We MUST release the search latch before an
+ INSERT, for example. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* Reset the AUTOINC statement level counter for multi-row INSERTs. */
+ trx->n_autoinc_rows = 0;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+ reset_template();
+
+ if (dict_table_is_temporary(prebuilt->table)
+ && prebuilt->mysql_has_locked
+ && prebuilt->select_lock_type == LOCK_NONE) {
+ dberr_t error;
+
+ switch (thd_sql_command(thd)) {
+ case SQLCOM_INSERT:
+ case SQLCOM_UPDATE:
+ case SQLCOM_DELETE:
+ init_table_handle_for_HANDLER();
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ error = row_lock_table_for_mysql(prebuilt, NULL, 1);
+
+ if (error != DB_SUCCESS) {
+ int st = convert_error_code_to_mysql(
+ error, 0, thd);
+ DBUG_RETURN(st);
+ }
+ break;
+ }
+ }
+
+ if (!prebuilt->mysql_has_locked) {
+ /* This handle is for a temporary table created inside
+ this same LOCK TABLES; since MySQL does NOT call external_lock
+ in this case, we must use x-row locks inside InnoDB to be
+ prepared for an update of a row */
+
+ prebuilt->select_lock_type = LOCK_X;
+
+ } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
+ && thd_sql_command(thd) == SQLCOM_SELECT
+ && lock_type == TL_READ) {
+
+ /* For other than temporary tables, we obtain
+ no lock for consistent read (plain SELECT). */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ } else {
+ /* Not a consistent read: restore the
+ select_lock_type value. The value of
+ stored_select_lock_type was decided in:
+ 1) ::store_lock(),
+ 2) ::external_lock(),
+ 3) ::init_table_handle_for_HANDLER(), and
+ 4) ::transactional_table_lock(). */
+
+ ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET);
+ prebuilt->select_lock_type = prebuilt->stored_select_lock_type;
+ }
+
+ *trx->detailed_error = 0;
+
+ innobase_register_trx(ht, thd, trx);
+
+ if (!trx_is_started(trx)) {
+ ++trx->will_lock;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+Maps a MySQL trx isolation level code to the InnoDB isolation level code.
+@return InnoDB isolation level */
+static inline
+ulint
+innobase_map_isolation_level(
+/*=========================*/
+ enum_tx_isolation iso) /*!< in: MySQL isolation level code */
+{
+ switch (iso) {
+ case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
+ case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
+ case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
+ case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
+ }
+
+ ut_error;
+
+ return(0);
+}
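+
+/* Expository sketch, not part of this commit: the mapping above is
+total over enum_tx_isolation, so for example:
+@code
+ ut_a(innobase_map_isolation_level(ISO_READ_COMMITTED)
+ == TRX_ISO_READ_COMMITTED);
+@endcode */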
+
+/******************************************************************//**
+As MySQL will execute an external lock for every new table it uses when it
+starts to process an SQL statement (an exception is when MySQL calls
+start_stmt for the handle), we can use this function to store the pointer to
+the THD in the handle. We will also use this function to communicate
+to InnoDB that a new SQL statement has started and that we must store a
+savepoint to our transaction handle, so that we are able to roll back
+the SQL statement in case of an error.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::external_lock(
+/*=======================*/
+ THD* thd, /*!< in: handle to the user thread */
+ int lock_type) /*!< in: lock type */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("ha_innobase::external_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+ update_thd(thd);
+
+ /* Statement based binlogging does not work in isolation level
+ READ UNCOMMITTED and READ COMMITTED since the necessary
+ locks cannot be taken. In this case, we print an
+ informative error message and return with an error.
+ Note: decide_logging_format would give the same error message,
+ except it cannot give the extra details. */
+
+ if (lock_type == F_WRLCK
+ && !(table_flags() & HA_BINLOG_STMT_CAPABLE)
+ && thd_binlog_format(thd) == BINLOG_FORMAT_STMT
+ && thd_binlog_filter_ok(thd)
+ && thd_sqlcom_can_generate_row_events(thd)) {
+ bool skip = false;
+ /* used by test case */
+ DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;);
+ if (!skip) {
+ my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0),
+ " InnoDB is limited to row-logging when "
+ "transaction isolation level is "
+ "READ COMMITTED or READ UNCOMMITTED.");
+ DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
+ }
+ }
+
+ /* Check for UPDATEs in read-only mode. */
+ if (srv_read_only_mode
+ && (thd_sql_command(thd) == SQLCOM_UPDATE
+ || thd_sql_command(thd) == SQLCOM_INSERT
+ || thd_sql_command(thd) == SQLCOM_REPLACE
+ || thd_sql_command(thd) == SQLCOM_DROP_TABLE
+ || thd_sql_command(thd) == SQLCOM_ALTER_TABLE
+ || thd_sql_command(thd) == SQLCOM_OPTIMIZE
+ || (thd_sql_command(thd) == SQLCOM_CREATE_TABLE
+ && lock_type == F_WRLCK)
+ || thd_sql_command(thd) == SQLCOM_CREATE_INDEX
+ || thd_sql_command(thd) == SQLCOM_DROP_INDEX
+ || thd_sql_command(thd) == SQLCOM_DELETE)) {
+
+ if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+ ER_INNODB_READ_ONLY);
+ DBUG_RETURN(HA_ERR_INNODB_READ_ONLY);
+ } else {
+ ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+ ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
+ }
+ }
+
+ trx = prebuilt->trx;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+ reset_template();
+
+ switch (prebuilt->table->quiesce) {
+ case QUIESCE_START:
+ /* Check for FLUSH TABLE t WITH READ LOCK; */
+ if (!srv_read_only_mode
+ && thd_sql_command(thd) == SQLCOM_FLUSH
+ && lock_type == F_RDLCK) {
+
+ row_quiesce_table_start(prebuilt->table, trx);
+
+ /* Use the transaction instance to track UNLOCK
+ TABLES. UNLOCK TABLES can also happen implicitly,
+ e.g. via START TRANSACTION; */
+
+ ++trx->flush_tables;
+ }
+ break;
+
+ case QUIESCE_COMPLETE:
+ /* Check for UNLOCK TABLES; implicit or explicit
+ or trx interruption. */
+ if (trx->flush_tables > 0
+ && (lock_type == F_UNLCK || trx_is_interrupted(trx))) {
+
+ row_quiesce_table_complete(prebuilt->table, trx);
+
+ ut_a(trx->flush_tables > 0);
+ --trx->flush_tables;
+ }
+
+ break;
+
+ case QUIESCE_NONE:
+ break;
+ }
+
+ if (lock_type == F_WRLCK) {
+
+ /* If this is a SELECT, then it is in UPDATE TABLE ...
+ or SELECT ... FOR UPDATE */
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ }
+
+ if (lock_type != F_UNLCK) {
+ /* MySQL is setting a new table lock */
+
+ *trx->detailed_error = 0;
+
+ innobase_register_trx(ht, thd, trx);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE
+ && prebuilt->select_lock_type == LOCK_NONE
+ && thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* To get serializable execution, we let InnoDB
+ conceptually add 'LOCK IN SHARE MODE' to all SELECTs
+ which otherwise would have been consistent reads. An
+ exception is consistent reads in the AUTOCOMMIT=1 mode:
+ we know that they are read-only transactions, and they
+ can be serialized also if performed as consistent
+ reads. */
+
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK
+ TABLES if AUTOCOMMIT=1. It does not make much sense to acquire
+ an InnoDB table lock if it is released immediately at the end
+ of LOCK TABLES, and InnoDB's table locks in that case cause
+ deadlocks VERY easily.
+
+ We do not set InnoDB table locks if user has not explicitly
+ requested a table lock. Note that thd_in_lock_tables(thd)
+ can hold in some cases, e.g., at the start of a stored
+ procedure call (SQLCOM_CALL). */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+
+ if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES
+ && THDVAR(thd, table_locks)
+ && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
+ && thd_in_lock_tables(thd)) {
+
+ dberr_t error = row_lock_table_for_mysql(
+ prebuilt, NULL, 0);
+
+ if (error != DB_SUCCESS) {
+ DBUG_RETURN(
+ convert_error_code_to_mysql(
+ error, 0, thd));
+ }
+ }
+
+ trx->mysql_n_tables_locked++;
+ }
+
+ trx->n_mysql_tables_in_use++;
+ prebuilt->mysql_has_locked = TRUE;
+
+ if (!trx_is_started(trx)
+ && (prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ ++trx->will_lock;
+ }
+
+ DBUG_RETURN(0);
+ }
+
+ /* MySQL is releasing a table lock */
+
+ trx->n_mysql_tables_in_use--;
+ prebuilt->mysql_has_locked = FALSE;
+
+ /* Release a possible FIFO ticket and search latch. Since we
+ may reserve the trx_sys->mutex, we have to release the search
+ system latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* If the MySQL lock count drops to zero we know that the current SQL
+ statement has ended */
+
+ if (trx->n_mysql_tables_in_use == 0) {
+
+ trx->mysql_n_tables_locked = 0;
+ prebuilt->used_in_HANDLER = FALSE;
+
+ if (!thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ if (trx_is_started(trx)) {
+ innobase_commit(ht, thd, TRUE);
+ }
+
+ } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && trx->global_read_view) {
+
+ /* At low transaction isolation levels we let
+ each consistent read set its own snapshot */
+
+ read_view_close_for_mysql(trx);
+ }
+ }
+
+ if (!trx_is_started(trx)
+ && (prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ ++trx->will_lock;
+ }
+
+ DBUG_RETURN(0);
+}
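+
+/* Expository sketch, not part of this commit: for a single-table
+statement under AUTOCOMMIT=1, MySQL brackets the statement with a
+lock/unlock pair on a hypothetical ha_innobase* named "handle":
+@code
+ handle->external_lock(thd, F_WRLCK);
+ handle->write_row(record);
+ handle->external_lock(thd, F_UNLCK);
+@endcode
+At the F_UNLCK call trx->n_mysql_tables_in_use drops to zero, and
+because neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN is set, the
+transaction is committed right there; "record" stands for the row
+buffer in the MySQL row format. */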
+
+/******************************************************************//**
+With this function MySQL requests a transactional lock on a table when
+the user has issued LOCK TABLES ... WHERE ENGINE = InnoDB.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::transactional_table_lock(
+/*==================================*/
+ THD* thd, /*!< in: handle to the user thread */
+ int lock_type) /*!< in: lock type */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("ha_innobase::transactional_table_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(thd);
+
+ if (!thd_tablespace_op(thd)) {
+
+ if (dict_table_is_discarded(prebuilt->table)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
+
+ } else if (prebuilt->table->ibd_file_missing) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLESPACE_MISSING,
+ table->s->table_name.str);
+ }
+
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ trx = prebuilt->trx;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+ reset_template();
+
+ if (lock_type == F_WRLCK) {
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ } else if (lock_type == F_RDLCK) {
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "MySQL is trying to set transactional table lock "
+ "with corrupted lock type to table %s, lock type "
+ "%d does not exist.",
+ table->s->table_name.str, lock_type);
+
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ /* MySQL is setting a new transactional table lock */
+
+ innobase_register_trx(ht, thd, trx);
+
+ if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) {
+ dberr_t error;
+
+ error = row_lock_table_for_mysql(prebuilt, NULL, 0);
+
+ if (error != DB_SUCCESS) {
+ DBUG_RETURN(
+ convert_error_code_to_mysql(
+ error, prebuilt->table->flags, thd));
+ }
+
+ if (thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* Store the current undo_no of the transaction
+ so that we know where to roll back if we have
+ to roll back the next SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
+/************************************************************************//**
+Here we export InnoDB status variables to MySQL. */
+static
+void
+innodb_export_status()
+/*==================*/
+{
+ if (innodb_inited) {
+ srv_export_innodb_status();
+ }
+}
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+@return 0 on success */
+static
+int
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the caller */
+ stat_print_fn* stat_print)
+{
+ trx_t* trx;
+ static const char truncated_msg[] = "... truncated...\n";
+ const long MAX_STATUS_SIZE = 1048576;
+ ulint trx_list_start = ULINT_UNDEFINED;
+ ulint trx_list_end = ULINT_UNDEFINED;
+ bool ret_val;
+
+ DBUG_ENTER("innodb_show_status");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* We don't create the temp files or associated
+ mutexes in read-only mode */
+
+ if (srv_read_only_mode) {
+ DBUG_RETURN(0);
+ }
+
+ trx = check_trx_exists(thd);
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ /* We let the InnoDB Monitor output at most MAX_STATUS_SIZE
+ bytes of text. */
+
+ char* str;
+ ssize_t flen, usable_len;
+
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+
+ srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+ &trx_list_start, &trx_list_end);
+
+ os_file_set_eof(srv_monitor_file);
+
+ if ((flen = ftell(srv_monitor_file)) < 0) {
+ flen = 0;
+ }
+
+ if (flen > MAX_STATUS_SIZE) {
+ usable_len = MAX_STATUS_SIZE;
+ srv_truncated_status_writes++;
+ } else {
+ usable_len = flen;
+ }
+
+ /* Allocate a buffer for the string, and
+ read the contents of the temporary file */
+
+ if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
+ mutex_exit(&srv_monitor_file_mutex);
+ DBUG_RETURN(1);
+ }
+
+ rewind(srv_monitor_file);
+
+ if (flen < MAX_STATUS_SIZE) {
+ /* Display the entire output. */
+ flen = fread(str, 1, flen, srv_monitor_file);
+ } else if (trx_list_end < (ulint) flen
+ && trx_list_start < trx_list_end
+ && trx_list_start + (flen - trx_list_end)
+ < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+
+ /* Omit the beginning of the list of active transactions. */
+ ssize_t len = fread(str, 1, trx_list_start, srv_monitor_file);
+
+ memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
+ len += sizeof truncated_msg - 1;
+ usable_len = (MAX_STATUS_SIZE - 1) - len;
+ fseek(srv_monitor_file,
+ static_cast<long>(flen - usable_len), SEEK_SET);
+ len += fread(str + len, 1, usable_len, srv_monitor_file);
+ flen = len;
+ } else {
+ /* Omit the end of the output. */
+ flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+ }
+
+ mutex_exit(&srv_monitor_file_mutex);
+
+ ret_val = stat_print(
+ thd, innobase_hton_name,
+ static_cast<uint>(strlen(innobase_hton_name)),
+ STRING_WITH_LEN(""), str, static_cast<uint>(flen));
+
+ my_free(str);
+
+ DBUG_RETURN(ret_val);
+}
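+
+/* Expository sketch, not part of this commit: when the monitor text
+exceeds MAX_STATUS_SIZE, the code above keeps the head of the output
+up to trx_list_start, splices in truncated_msg, and keeps as large a
+tail as still fits. On plain buffers the same assembly would be:
+@code
+ memcpy(out, src, trx_list_start);
+ memcpy(out + trx_list_start, truncated_msg,
+ sizeof truncated_msg - 1);
+ memcpy(out + trx_list_start + sizeof truncated_msg - 1,
+ src + src_len - tail_len, tail_len);
+@endcode
+with tail_len chosen so that the total stays below MAX_STATUS_SIZE;
+out, src, src_len and tail_len are hypothetical names. */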
+
+/************************************************************************//**
+Implements the SHOW MUTEX STATUS command.
+@return 0 on success. */
+static
+int
+innodb_mutex_show_status(
+/*=====================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the
+ caller */
+ stat_print_fn* stat_print) /*!< in: function for printing
+ statistics */
+{
+ char buf1[IO_SIZE];
+ char buf2[IO_SIZE];
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+ ulint block_mutex_oswait_count = 0;
+ ulint block_lock_oswait_count = 0;
+ ib_mutex_t* block_mutex = NULL;
+ rw_lock_t* block_lock = NULL;
+#ifdef UNIV_DEBUG
+ ulint rw_lock_count= 0;
+ ulint rw_lock_count_spin_loop= 0;
+ ulint rw_lock_count_spin_rounds= 0;
+ ulint rw_lock_count_os_wait= 0;
+ ulint rw_lock_count_os_yield= 0;
+ ulonglong rw_lock_wait_time= 0;
+#endif /* UNIV_DEBUG */
+ uint buf1len;
+ uint buf2len;
+ uint hton_name_len;
+
+ hton_name_len = (uint) strlen(innobase_hton_name);
+
+ DBUG_ENTER("innodb_mutex_show_status");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ mutex_enter(&mutex_list_mutex);
+
+ for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+ if (mutex->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_mutex(mutex)) {
+ block_mutex = mutex;
+ block_mutex_oswait_count += mutex->count_os_wait;
+ continue;
+ }
+
+ buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s:%lu",
+ innobase_basename(mutex->cfile_name),
+ (ulong) mutex->cline);
+ buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
+ (ulong) mutex->count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_mutex) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s:%lu",
+ innobase_basename(
+ block_mutex->cfile_name),
+ (ulong) block_mutex->cline);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_mutex_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ mutex_exit(&mutex_list_mutex);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+ lock = UT_LIST_GET_NEXT(list, lock)) {
+ if (lock->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_lock(lock)) {
+ block_lock = lock;
+ block_lock_oswait_count += lock->count_os_wait;
+ continue;
+ }
+
+ buf1len = (uint) my_snprintf(
+ buf1, sizeof buf1, "%s:%lu",
+ innobase_basename(lock->cfile_name),
+ static_cast<ulong>(lock->cline));
+ buf2len = (uint) my_snprintf(
+ buf2, sizeof buf2, "os_waits=%lu",
+ static_cast<ulong>(lock->count_os_wait));
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_lock) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s:%lu",
+ innobase_basename(
+ block_lock->cfile_name),
+ (ulong) block_lock->cline);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_lock_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+#ifdef UNIV_DEBUG
+ buf2len = static_cast<uint>(my_snprintf(buf2, sizeof buf2,
+ "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
+ "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
+ (ulong) rw_lock_count,
+ (ulong) rw_lock_count_spin_loop,
+ (ulong) rw_lock_count_spin_rounds,
+ (ulong) rw_lock_count_os_wait,
+ (ulong) rw_lock_count_os_yield,
+ (ulong) (rw_lock_wait_time / 1000)));
+
+ if (stat_print(thd, innobase_hton_name, hton_name_len,
+ STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) {
+ DBUG_RETURN(1);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Success */
+ DBUG_RETURN(0);
+}
+
+/************************************************************************//**
+Return 0 on success and non-zero on failure. Note: the bool return type
+seems to be abused here, should be an int. */
+static
+bool
+innobase_show_status(
+/*=================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread
+ of the caller */
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ switch (stat_type) {
+ case HA_ENGINE_STATUS:
+ /* Non-zero return value means there was an error. */
+ return(innodb_show_status(hton, thd, stat_print) != 0);
+
+ case HA_ENGINE_MUTEX:
+ /* Non-zero return value means there was an error. */
+ return(innodb_mutex_show_status(hton, thd, stat_print) != 0);
+
+ case HA_ENGINE_LOGS:
+ /* Not handled */
+ break;
+ }
+
+ /* Success */
+ return(false);
+}
+
+/************************************************************************//**
+Handling the shared INNOBASE_SHARE structure that is needed to provide table
+locking. Register the table name if it doesn't exist in the hash table. */
+static
+INNOBASE_SHARE*
+get_share(
+/*======*/
+ const char* table_name)
+{
+ INNOBASE_SHARE* share;
+
+ mysql_mutex_lock(&innobase_share_mutex);
+
+ ulint fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(table_name_hash, innobase_open_tables, fold,
+ INNOBASE_SHARE*, share,
+ ut_ad(share->use_count > 0),
+ !strcmp(share->table_name, table_name));
+
+ if (!share) {
+
+ uint length = (uint) strlen(table_name);
+
+ /* TODO: invoke HASH_MIGRATE if innobase_open_tables
+ grows too big */
+
+ share = (INNOBASE_SHARE*) my_malloc(sizeof(*share)+length+1,
+ MYF(MY_FAE | MY_ZEROFILL));
+
+ share->table_name = (char*) memcpy(share + 1,
+ table_name, length + 1);
+
+ HASH_INSERT(INNOBASE_SHARE, table_name_hash,
+ innobase_open_tables, fold, share);
+
+ thr_lock_init(&share->lock);
+
+ /* Index translation table initialization */
+ share->idx_trans_tbl.index_mapping = NULL;
+ share->idx_trans_tbl.index_count = 0;
+ share->idx_trans_tbl.array_size = 0;
+ }
+
+ share->use_count++;
+ mysql_mutex_unlock(&innobase_share_mutex);
+
+ return(share);
+}
+
+/************************************************************************//**
+Free the shared object that was registered with get_share(). */
+static
+void
+free_share(
+/*=======*/
+ INNOBASE_SHARE* share) /*!< in/own: table share to free */
+{
+ mysql_mutex_lock(&innobase_share_mutex);
+
+#ifdef UNIV_DEBUG
+ INNOBASE_SHARE* share2;
+ ulint fold = ut_fold_string(share->table_name);
+
+ HASH_SEARCH(table_name_hash, innobase_open_tables, fold,
+ INNOBASE_SHARE*, share2,
+ ut_ad(share->use_count > 0),
+ !strcmp(share->table_name, share2->table_name));
+
+ ut_a(share2 == share);
+#endif /* UNIV_DEBUG */
+
+ if (!--share->use_count) {
+ ulint fold = ut_fold_string(share->table_name);
+
+ HASH_DELETE(INNOBASE_SHARE, table_name_hash,
+ innobase_open_tables, fold, share);
+ thr_lock_delete(&share->lock);
+
+ /* Free any memory from index translation table */
+ my_free(share->idx_trans_tbl.index_mapping);
+
+ my_free(share);
+
+ /* TODO: invoke HASH_MIGRATE if innobase_open_tables
+ shrinks too much */
+ }
+
+ mysql_mutex_unlock(&innobase_share_mutex);
+}
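+
+/* Expository sketch, not part of this commit: get_share() and
+free_share() implement a mutex-protected reference count, so every
+successful get_share() must be paired with exactly one free_share().
+A hypothetical open/close sequence:
+@code
+ INNOBASE_SHARE* share = get_share(norm_name);
+
+ thr_lock_data_init(&share->lock, &lock_data, NULL);
+
+ free_share(share);
+@endcode
+norm_name stands for the normalized table name used as the hash key,
+and lock_data for a THR_LOCK_DATA owned by the handler; this mirrors
+how ha_innobase::open() and ::close() typically consume the share. */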
+
+/*****************************************************************//**
+Converts a MySQL table lock stored in the 'lock' field of the handle to
+a proper type before storing pointer to the lock into an array of pointers.
+MySQL also calls this if it wants to reset some table locks to a not-locked
+state during the processing of an SQL query. An example is that during a
+SELECT the read lock is released early on the 'const' tables where we only
+fetch one row. MySQL does not call this when it releases all locks at the
+end of an SQL statement.
+@return pointer to the next element in the 'to' array */
+UNIV_INTERN
+THR_LOCK_DATA**
+ha_innobase::store_lock(
+/*====================*/
+ THD* thd, /*!< in: user thread handle */
+ THR_LOCK_DATA** to, /*!< in: pointer to an array
+ of pointers to lock structs;
+ pointer to the 'lock' field
+ of current handle is stored
+ next to this array */
+ enum thr_lock_type lock_type) /*!< in: lock type to store in
+ 'lock'; this may also be
+ TL_IGNORE */
+{
+ trx_t* trx;
+
+ /* Note that trx in this function is NOT necessarily prebuilt->trx
+ because we call update_thd() later, in ::external_lock()! Failure to
+ understand this caused a serious memory corruption bug in 5.1.11. */
+
+ trx = check_trx_exists(thd);
+
+ /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE!
+ Be careful to ignore TL_IGNORE if we are going to do something with
+ only 'real' locks! */
+
+ /* If no MySQL table is in use, we need to set the isolation level
+ of the transaction. */
+
+ if (lock_type != TL_IGNORE
+ && trx->n_mysql_tables_in_use == 0) {
+ trx->isolation_level = innobase_map_isolation_level(
+ (enum_tx_isolation) thd_tx_isolation(thd));
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && trx->global_read_view) {
+
+ /* At low transaction isolation levels we let
+ each consistent read set its own snapshot */
+
+ read_view_close_for_mysql(trx);
+ }
+ }
+
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ const bool in_lock_tables = thd_in_lock_tables(thd);
+ const uint sql_command = thd_sql_command(thd);
+
+ if (srv_read_only_mode
+ && (sql_command == SQLCOM_UPDATE
+ || sql_command == SQLCOM_INSERT
+ || sql_command == SQLCOM_REPLACE
+ || sql_command == SQLCOM_DROP_TABLE
+ || sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || (sql_command == SQLCOM_CREATE_TABLE
+ && (lock_type >= TL_WRITE_CONCURRENT_INSERT
+ && lock_type <= TL_WRITE))
+ || sql_command == SQLCOM_CREATE_INDEX
+ || sql_command == SQLCOM_DROP_INDEX
+ || sql_command == SQLCOM_DELETE)) {
+
+ ib_senderrf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+ } else if (sql_command == SQLCOM_FLUSH
+ && lock_type == TL_READ_NO_INSERT) {
+
+ /* Check for FLUSH TABLES ... WITH READ LOCK */
+
+ /* Note: This call can fail, but there is no way to return
+ the error to the caller. We simply ignore it for now here
+ and push the error code to the caller where the error is
+ detected in the function. */
+
+ dberr_t err = row_quiesce_set_state(
+ prebuilt->table, QUIESCE_START, trx);
+
+ ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ } else {
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ }
+
+ /* Check for DROP TABLE */
+ } else if (sql_command == SQLCOM_DROP_TABLE) {
+
+ /* MySQL calls this function in DROP TABLE though this table
+ handle may belong to another thd that is running a query. Let
+ us in that case skip any changes to the prebuilt struct. */
+
+ /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */
+ } else if ((lock_type == TL_READ && in_lock_tables)
+ || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
+ || lock_type == TL_READ_WITH_SHARED_LOCKS
+ || lock_type == TL_READ_NO_INSERT
+ || (lock_type != TL_IGNORE
+ && sql_command != SQLCOM_SELECT)) {
+
+ /* The OR cases above are in this order:
+ 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we
+ are processing a stored procedure or function, or
+ 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or
+ 3) this is a SELECT ... IN SHARE MODE, or
+ 4) we are doing a complex SQL statement like
+ INSERT INTO ... SELECT ... and the logical logging (MySQL
+ binlog) requires the use of a locking read, or
+ MySQL is doing LOCK TABLES ... READ.
+ 5) we let InnoDB do locking reads for all SQL statements that
+ are not simple SELECTs; note that select_lock_type in this
+ case may get strengthened in ::external_lock() to LOCK_X.
+ Note that we MUST use a locking read in all data modifying
+ SQL statements, because otherwise the execution would not be
+ serializable, and also the results from the update could be
+ unexpected if an obsolete consistent read view would be
+ used. */
+
+ /* Use consistent read for checksum table */
+
+ if (sql_command == SQLCOM_CHECKSUM
+ || ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && trx->isolation_level != TRX_ISO_SERIALIZABLE
+ && (lock_type == TL_READ
+ || lock_type == TL_READ_NO_INSERT)
+ && (sql_command == SQLCOM_INSERT_SELECT
+ || sql_command == SQLCOM_REPLACE_SELECT
+ || sql_command == SQLCOM_UPDATE
+ || sql_command == SQLCOM_CREATE_TABLE))) {
+
+ /* If we either have innobase_locks_unsafe_for_binlog
+ option set or this session is using READ COMMITTED
+ isolation level and isolation level of the transaction
+ is not set to serializable and MySQL is doing
+ INSERT INTO...SELECT or REPLACE INTO...SELECT
+ or UPDATE ... = (SELECT ...) or CREATE ...
+ SELECT... without FOR UPDATE or IN SHARE
+ MODE in select, then we use consistent read
+ for select. */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ } else {
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ } else if (lock_type != TL_IGNORE) {
+
+ /* We set possible LOCK_X value in external_lock, not yet
+ here even if this would be SELECT ... FOR UPDATE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ }
+
+ if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
+
+ /* Starting from 5.0.7, we also weaken the table locks
+ set at the start of a MySQL stored procedure call, just like
+ we weaken the locks set at the start of an SQL statement.
+ MySQL does set in_lock_tables TRUE there, but in reality
+ we do not need table locks to make the execution of a
+ single transaction stored procedure call deterministic
+ (if it does not use a consistent read). */
+
+ if (lock_type == TL_READ
+ && sql_command == SQLCOM_LOCK_TABLES) {
+ /* We come here if MySQL is processing LOCK TABLES
+ ... READ LOCAL. MyISAM under that table lock type
+ reads the table as it was at the time the lock was
+ granted (new inserts are allowed, but not seen by the
+ reader). To get a similar effect on an InnoDB table,
+ we must use LOCK TABLES ... READ. We convert the lock
+ type here, so that for InnoDB, READ LOCAL is
+ equivalent to READ. This will change the InnoDB
+ behavior in mysqldump, so that dumps of InnoDB tables
+ are consistent with dumps of MyISAM tables. */
+
+ lock_type = TL_READ_NO_INSERT;
+ }
+
+ /* If we are not doing a LOCK TABLE, DISCARD/IMPORT
+ TABLESPACE or TRUNCATE TABLE then allow multiple
+ writers. Note that ALTER TABLE uses a TL_WRITE_ALLOW_READ
+ < TL_WRITE_CONCURRENT_INSERT.
+
+ We especially allow multiple writers if MySQL is at the
+ start of a stored procedure call (SQLCOM_CALL) or a
+ stored function call (MySQL does have in_lock_tables
+ TRUE there). */
+
+ if ((lock_type >= TL_WRITE_CONCURRENT_INSERT
+ && lock_type <= TL_WRITE)
+ && !(in_lock_tables
+ && sql_command == SQLCOM_LOCK_TABLES)
+ && !thd_tablespace_op(thd)
+ && sql_command != SQLCOM_TRUNCATE
+ && sql_command != SQLCOM_OPTIMIZE
+ && sql_command != SQLCOM_CREATE_TABLE) {
+
+ lock_type = TL_WRITE_ALLOW_WRITE;
+ }
+
+ /* In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
+ MySQL would use the lock TL_READ_NO_INSERT on t2, and that
+ would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
+ to t2. Convert the lock to a normal read lock to allow
+ concurrent inserts to t2.
+
+ We especially allow concurrent inserts if MySQL is at the
+ start of a stored procedure call (SQLCOM_CALL)
+ (MySQL does have thd_in_lock_tables() TRUE there). */
+
+ if (lock_type == TL_READ_NO_INSERT
+ && sql_command != SQLCOM_LOCK_TABLES) {
+
+ lock_type = TL_READ;
+ }
+
+ lock.type = lock_type;
+ }
+
+ *to++= &lock;
+
+ if (!trx_is_started(trx)
+ && (prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->stored_select_lock_type != LOCK_NONE)) {
+
+ ++trx->will_lock;
+ }
+
+ return(to);
+}
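+
+/* Expository sketch, not part of this commit: setting aside the
+DISCARD/IMPORT, TRUNCATE, OPTIMIZE and CREATE TABLE exceptions above,
+the two lock-weakening conversions reduce to a small pure function:
+@code
+ static thr_lock_type
+ weaken_lock_type(thr_lock_type t, bool in_lock_tables_stmt)
+ {
+ if (in_lock_tables_stmt) {
+ return(t);
+ }
+ if (t >= TL_WRITE_CONCURRENT_INSERT && t <= TL_WRITE) {
+ return(TL_WRITE_ALLOW_WRITE);
+ }
+ if (t == TL_READ_NO_INSERT) {
+ return(TL_READ);
+ }
+ return(t);
+ }
+@endcode */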
+
+/*********************************************************************//**
+Read the next autoinc value. Acquire the relevant locks before reading
+the AUTOINC value. On success, the table AUTOINC mutex will be locked
+on return and all relevant locks acquired.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+ha_innobase::innobase_get_autoinc(
+/*==============================*/
+ ulonglong* value) /*!< out: autoinc value */
+{
+ *value = 0;
+
+ prebuilt->autoinc_error = innobase_lock_autoinc();
+
+ if (prebuilt->autoinc_error == DB_SUCCESS) {
+
+ /* Determine the first value of the interval */
+ *value = dict_table_autoinc_read(prebuilt->table);
+
+ /* It should have been initialized during open. */
+ if (*value == 0) {
+ prebuilt->autoinc_error = DB_UNSUPPORTED;
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+ }
+
+ return(prebuilt->autoinc_error);
+}
+
+/*******************************************************************//**
+This function reads the global auto-inc counter. It doesn't use the
+AUTOINC lock even if the lock mode is set to TRADITIONAL.
+@return the autoinc value */
+UNIV_INTERN
+ulonglong
+ha_innobase::innobase_peek_autoinc(void)
+/*====================================*/
+{
+ ulonglong auto_inc;
+ dict_table_t* innodb_table;
+
+ ut_a(prebuilt != NULL);
+ ut_a(prebuilt->table != NULL);
+
+ innodb_table = prebuilt->table;
+
+ dict_table_autoinc_lock(innodb_table);
+
+ auto_inc = dict_table_autoinc_read(innodb_table);
+
+ if (auto_inc == 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: AUTOINC next value generation "
+ "is disabled for '%s'\n", innodb_table->name);
+ }
+
+ dict_table_autoinc_unlock(innodb_table);
+
+ return(auto_inc);
+}
+
+/*********************************************************************//**
+Returns the value of the auto-inc counter in *first_value and ~0 on failure. */
+UNIV_INTERN
+void
+ha_innobase::get_auto_increment(
+/*============================*/
+ ulonglong offset, /*!< in: table autoinc offset */
+ ulonglong increment, /*!< in: table autoinc
+ increment */
+ ulonglong nb_desired_values, /*!< in: number of values
+ reqd */
+ ulonglong* first_value, /*!< out: the autoinc value */
+ ulonglong* nb_reserved_values) /*!< out: count of reserved
+ values */
+{
+ trx_t* trx;
+ dberr_t error;
+ ulonglong autoinc = 0;
+
+ /* Prepare prebuilt->trx in the table handle */
+ update_thd(ha_thd());
+
+ error = innobase_get_autoinc(&autoinc);
+
+ if (error != DB_SUCCESS) {
+ *first_value = (~(ulonglong) 0);
+ return;
+ }
+
+ /* This is a hack, since nb_desired_values seems to be accurate only
+ for the first call to get_auto_increment() for a multi-row INSERT and
+ meaningless for other statements, e.g. LOAD. Subsequent calls to
+ this method for the same statement result in different values which
+ don't make sense. Therefore we store the value the first time we are
+ called and count down from that as rows are written (see write_row()).
+ */
+
+ trx = prebuilt->trx;
+
+ /* Note: We can't rely on *first_value since some MySQL engines,
+ in particular the partition engine, don't initialize it to 0 when
+ invoking this method. So we are not sure if it's guaranteed to
+ be 0 or not. */
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ ulonglong col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ /* Called for the first time ? */
+ if (trx->n_autoinc_rows == 0) {
+
+ trx->n_autoinc_rows = (ulint) nb_desired_values;
+
+ /* It's possible for nb_desired_values to be 0:
+ e.g., INSERT INTO T1(C) SELECT C FROM T2; */
+ if (nb_desired_values == 0) {
+
+ trx->n_autoinc_rows = 1;
+ }
+
+ set_if_bigger(*first_value, autoinc);
+ /* Not in the middle of a multi-row INSERT. */
+ } else if (prebuilt->autoinc_last_value == 0) {
+ set_if_bigger(*first_value, autoinc);
+ /* Check for negative values (wrapped around as large unsigned values). */
+ } else if (*first_value > col_max_value && trx->n_autoinc_rows > 0) {
+ /* Set to next logical value. */
+ ut_a(autoinc > trx->n_autoinc_rows);
+ *first_value = (autoinc - trx->n_autoinc_rows) - 1;
+ }
+
+ *nb_reserved_values = trx->n_autoinc_rows;
+
+ /* With old style AUTOINC locking we only update the table's
+ AUTOINC counter after attempting to insert the row. */
+ if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) {
+ ulonglong current;
+ ulonglong next_value;
+
+ current = *first_value > col_max_value ? autoinc : *first_value;
+
+ /* If the increment step of the auto increment column
+ decreases, then it does not affect the immediate
+ next value in the series. */
+ if (prebuilt->autoinc_increment > increment) {
+
+ current = autoinc - prebuilt->autoinc_increment;
+
+ current = innobase_next_autoinc(
+ current, 1, increment, 1, col_max_value);
+
+ dict_table_autoinc_initialize(prebuilt->table, current);
+
+ *first_value = current;
+ }
+
+ /* Compute the last value in the interval */
+ next_value = innobase_next_autoinc(
+ current, *nb_reserved_values, increment, offset,
+ col_max_value);
+
+ prebuilt->autoinc_last_value = next_value;
+
+ if (prebuilt->autoinc_last_value < *first_value) {
+ *first_value = (~(ulonglong) 0);
+ } else {
+ /* Update the table autoinc variable */
+ dict_table_autoinc_update_if_greater(
+ prebuilt->table, prebuilt->autoinc_last_value);
+ }
+ } else {
+ /* This will force write_row() into attempting an update
+ of the table's AUTOINC counter. */
+ prebuilt->autoinc_last_value = 0;
+ }
+
+ /* This is the increment used to increase the AUTOINC value; we use
+ it in write_row() and update_row() to increase the autoinc counter
+ for columns that are filled by the user. We need both the offset and
+ the increment. */
+ prebuilt->autoinc_offset = offset;
+ prebuilt->autoinc_increment = increment;
+
+ dict_table_autoinc_unlock(prebuilt->table);
+}
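+
+/* Expository sketch, not part of this commit: with
+auto_increment_offset = 1 and auto_increment_increment = 10, a
+multi-row INSERT reserving 3 values from autoinc = 1 hands MySQL the
+interval 1, 11, 21, and autoinc_last_value becomes the next value in
+the series:
+@code
+ ulonglong next_value = innobase_next_autoinc(1, 3, 10, 1, 255);
+
+ ut_a(next_value == 31);
+@endcode
+The arguments are current = 1, number of values needed = 3,
+increment = 10, offset = 1 and col_max_value = 255 (e.g. TINYINT
+UNSIGNED); the concrete result assumes the offset <= 1 fast path of
+innobase_next_autoinc(). */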
+
+/*******************************************************************//**
+Reset the auto-increment counter to the given value, i.e. the next row
+inserted will get the given value. This is called e.g. after TRUNCATE
+is emulated by doing a 'DELETE FROM t'. HA_ERR_WRONG_COMMAND is
+returned by storage engines that don't support this operation.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::reset_auto_increment(
+/*==============================*/
+ ulonglong value) /*!< in: new value for table autoinc */
+{
+ DBUG_ENTER("ha_innobase::reset_auto_increment");
+
+ dberr_t error;
+
+ update_thd(ha_thd());
+
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
+
+ if (error != DB_SUCCESS) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, prebuilt->table->flags, user_thd));
+ }
+
+ /* The next value can never be 0. */
+ if (value == 0) {
+ value = 1;
+ }
+
+ innobase_reset_autoinc(value);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+See comment in handler.cc */
+UNIV_INTERN
+bool
+ha_innobase::get_error_message(
+/*===========================*/
+ int error,
+ String* buf)
+{
+ trx_t* trx = check_trx_exists(ha_thd());
+
+ buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error),
+ system_charset_info);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+ Retrieves the names of the table and the key for which there was a
+ duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY.
+
+ If any of the names is not available, then this method will return
+ false and will not change any of child_table_name or child_key_name.
+
+ @param child_table_name[out] Table name
+ @param child_table_name_len[in] Table name buffer size
+ @param child_key_name[out] Key name
+ @param child_key_name_len[in] Key name buffer size
+
+ @retval true table and key names were available
+ and were written into the corresponding
+ out parameters.
+ @retval false table and key names were not available,
+ the out parameters were not touched.
+*/
+bool
+ha_innobase::get_foreign_dup_key(
+/*=============================*/
+ char* child_table_name,
+ uint child_table_name_len,
+ char* child_key_name,
+ uint child_key_name_len)
+{
+ const dict_index_t* err_index;
+
+ ut_a(prebuilt->trx != NULL);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+ err_index = trx_get_error_info(prebuilt->trx);
+
+ if (err_index == NULL) {
+ return(false);
+ }
+ /* else */
+
+ /* copy table name (and convert from filename-safe encoding to
+ system_charset_info) */
+ char* p;
+ p = strchr(err_index->table->name, '/');
+ /* strip ".../" prefix if any */
+ if (p != NULL) {
+ p++;
+ } else {
+ p = err_index->table->name;
+ }
+ uint len;
+ len = filename_to_tablename(p, child_table_name, child_table_name_len);
+ child_table_name[len] = '\0';
+
+ /* copy index name */
+ ut_snprintf(child_key_name, child_key_name_len, "%s", err_index->name);
+
+ return(true);
+}
+
+/*******************************************************************//**
+Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
+If there is no explicitly declared non-null unique key or a primary key, then
+InnoDB internally uses the row id as the primary key.
+@return < 0 if ref1 < ref2, 0 if equal, else > 0 */
+UNIV_INTERN
+int
+ha_innobase::cmp_ref(
+/*=================*/
+ const uchar* ref1, /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+ const uchar* ref2) /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+{
+ enum_field_types mysql_type;
+ Field* field;
+ KEY_PART_INFO* key_part;
+ KEY_PART_INFO* key_part_end;
+ uint len1;
+ uint len2;
+ int result;
+
+ if (prebuilt->clust_index_was_generated) {
+ /* The 'ref' is an InnoDB row id */
+
+ return(memcmp(ref1, ref2, DATA_ROW_ID_LEN));
+ }
+
+ /* Do a type-aware comparison of primary key fields. PK fields
+ are always NOT NULL, so no checks for NULL are performed. */
+
+ key_part = table->key_info[table->s->primary_key].key_part;
+
+ key_part_end = key_part
+ + table->key_info[table->s->primary_key].user_defined_key_parts;
+
+ for (; key_part != key_part_end; ++key_part) {
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB) {
+
+ /* In the MySQL key value format, a column prefix of
+ a BLOB is preceded by a 2-byte length field */
+
+ len1 = innobase_read_from_2_little_endian(ref1);
+ len2 = innobase_read_from_2_little_endian(ref2);
+
+ result = ((Field_blob*) field)->cmp(
+ ref1 + 2, len1, ref2 + 2, len2);
+ } else {
+ result = field->key_cmp(ref1, ref2);
+ }
+
+ if (result) {
+
+ return(result);
+ }
+
+ ref1 += key_part->store_length;
+ ref2 += key_part->store_length;
+ }
+
+ return(0);
+}
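+
+/* Expository sketch, not part of this commit: in the MySQL key value
+format a BLOB column prefix is stored as a 2-byte little-endian length
+followed by the prefix bytes, which is what
+innobase_read_from_2_little_endian() decodes above:
+@code
+ const uchar ref[] = { 0x03, 0x00, 'a', 'b', 'c' };
+
+ uint len = ref[0] | (ref[1] << 8);
+
+ ut_a(len == 3);
+@endcode
+The column bytes then start at ref + 2, and the next key part starts
+at ref + key_part->store_length. */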
+
+/*******************************************************************//**
+Ask InnoDB if a query to a table can be cached.
+@return TRUE if query caching of the table is permitted */
+UNIV_INTERN
+my_bool
+ha_innobase::register_query_cache_table(
+/*====================================*/
+ THD* thd, /*!< in: user thread handle */
+ char* table_key, /*!< in: normalized path to the
+ table */
+ uint key_length, /*!< in: length of the normalized
+ path to the table */
+ qc_engine_callback*
+ call_back, /*!< out: pointer to function for
+ checking if query caching
+ is permitted */
+ ulonglong *engine_data) /*!< in/out: data to call_back */
+{
+ *call_back = innobase_query_caching_of_table_permitted;
+ *engine_data = 0;
+ return(innobase_query_caching_of_table_permitted(thd, table_key,
+ key_length,
+ engine_data));
+}
+
+/*******************************************************************//**
+Get the bin log name. */
+UNIV_INTERN
+const char*
+ha_innobase::get_mysql_bin_log_name()
+/*=================================*/
+{
+ return(trx_sys_mysql_bin_log_name);
+}
+
+/*******************************************************************//**
+Get the bin log offset (or file position). */
+UNIV_INTERN
+ulonglong
+ha_innobase::get_mysql_bin_log_pos()
+/*================================*/
+{
+ /* trx... is ib_int64_t, which is a typedef for a 64-bit integer
+ (__int64 or longlong) so it's ok to cast it to ulonglong. */
+
+ return(trx_sys_mysql_bin_log_pos);
+}
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds the charset information and returns the length in bytes of prefix_len
+characters in the index field.
+@return number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str) /*!< in: character string */
+{
+ ulint char_length; /*!< character length in bytes */
+ ulint n_chars; /*!< number of characters in prefix */
+ CHARSET_INFO* charset; /*!< charset used in the field */
+
+ charset = get_charset((uint) charset_id, MYF(MY_WME));
+
+ ut_ad(charset);
+ ut_ad(charset->mbmaxlen);
+
+ /* Calculate how many characters at most the prefix index contains */
+
+ n_chars = prefix_len / charset->mbmaxlen;
+
+ /* If the charset is multi-byte, then we must find the length of the
+ first at most n chars in the string. If the string contains fewer
+ characters than n, then we return the length to the end of the last
+ character. */
+
+ if (charset->mbmaxlen > 1) {
+ /* my_charpos() returns the byte length of the first n_chars
+ characters, or a value bigger than the length of str, if
+ there were not enough full characters in str.
+
+ Why does the code below work:
+ Suppose that we are looking for n UTF-8 characters.
+
+ 1) If the string is long enough, then the prefix contains at
+ least n complete UTF-8 characters + maybe some extra
+ characters + an incomplete UTF-8 character. No problem in
+ this case. The function returns the pointer to the
+ end of the nth character.
+
+ 2) If the string is not long enough, then the string contains
+ the complete value of a column, that is, only complete UTF-8
+ characters, and we can store in the column prefix index the
+ whole string. */
+
+ char_length = my_charpos(charset, str,
+ str + data_len, (int) n_chars);
+ if (char_length > data_len) {
+ char_length = data_len;
+ }
+ } else {
+ if (data_len < prefix_len) {
+ char_length = data_len;
+ } else {
+ char_length = prefix_len;
+ }
+ }
+
+ return(char_length);
+}
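+
+/* Expository sketch, not part of this commit: for a utf8 index prefix
+of 30 bytes (mbmaxlen = 3) the function looks for at most
+30 / 3 = 10 characters, so a 10-byte ASCII string yields 10, assuming
+charset_id names a utf8 collation:
+@code
+ ulint len = innobase_get_at_most_n_mbchars(
+ charset_id, 30, 10, "aaaaaaaaaa");
+
+ ut_a(len == 10);
+@endcode */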
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool prepare_trx) /*!< in: true - prepare transaction
+ false - the current SQL statement
+ ended */
+{
+ int error = 0;
+ trx_t* trx = check_trx_exists(thd);
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* We use the support_xa value as it was seen at transaction start
+ time, not the current session variable value. Any possible changes
+ to the session variable take effect only in the next transaction. */
+ if (!trx->support_xa) {
+
+ return(0);
+ }
+
+ thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the trx_sys->mutex, we have to release the search system
+ latch first to obey the latching order. */
+
+ trx_search_latch_release_if_reserved(trx);
+
+ innobase_srv_conc_force_exit_innodb(trx);
+
+ if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+ sql_print_error("Transaction not registered for MySQL 2PC, "
+ "but transaction is active");
+ }
+
+ if (prepare_trx
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* We were instructed to prepare the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ ut_ad(trx_is_registered_for_2pc(trx));
+
+ trx_prepare_for_mysql(trx);
+
+ error = 0;
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction prepare */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ lock_unlock_table_autoinc(trx);
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+ if (thd_sql_command(thd) != SQLCOM_XA_PREPARE
+ && (prepare_trx
+ || !thd_test_options(
+ thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* For mysqlbackup to work the order of transactions in binlog
+ and InnoDB must be the same. Consider the situation
+
+ thread1> prepare; write to binlog; ...
+ <context switch>
+ thread2> prepare; write to binlog; commit
+ thread1> ... commit
+
+ The server guarantees that writes to the binary log
+ and commits are in the same order, so we do not have
+ to handle this case. */
+ }
+
+ return(error);
+}
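+
+/* Illustrative example of the two branches above: an autocommit
+"INSERT INTO t VALUES (1);" reaches this function with
+prepare_trx = false, but since neither OPTION_NOT_AUTOCOMMIT nor
+OPTION_BEGIN is set, the whole transaction is prepared. The same
+statement inside BEGIN ... COMMIT only reaches trx_mark_sql_stat_end()
+at statement end; the transaction is prepared later, when the server
+calls this function with prepare_trx = true during two-phase commit. */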
+
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list,/*!< in/out: prepared transactions */
+ uint len) /*!< in: number of slots in xid_list */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (len == 0 || xid_list == NULL) {
+
+ return(0);
+ }
+
+ return(trx_recover_for_mysql(xid_list, len));
+}
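+
+/* Illustrative note: the server invokes this callback during crash
+recovery and for the SQL statement "XA RECOVER", passing a
+caller-allocated xid_list; trx_recover_for_mysql() fills in at most
+"len" prepared transactions and returns their count. */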
+
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state.
+@return XA_OK or XAER_NOTA */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton* hton,
+ XID* xid) /*!< in: X/Open XA transaction identification */
+{
+ trx_t* trx;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = trx_get_trx_by_xid(xid);
+
+ if (trx) {
+ innobase_commit_low(trx);
+ trx_free_for_background(trx);
+ return(XA_OK);
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state.
+@return 0, XAER_NOTA or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid) /*!< in: X/Open XA transaction
+ identification */
+{
+ trx_t* trx;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = trx_get_trx_by_xid(xid);
+
+ if (trx) {
+ int ret = innobase_rollback_trx(trx);
+ trx_free_for_background(trx);
+ return(ret);
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd) /*!< in: user thread handle */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ return(read_cursor_view_create_for_mysql(check_trx_exists(thd)));
+}
+
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd, /*!< in: user thread handle */
+ void* curview)/*!< in: Consistent read view to be closed */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ read_cursor_view_close_for_mysql(check_trx_exists(thd),
+ (cursor_view_t*) curview);
+}
+
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL global read view of a transaction is
+restored to a transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd, /*!< in: user thread handle */
+ void* curview)/*!< in: Consistent cursor view to be set */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ read_cursor_set_for_mysql(check_trx_exists(thd),
+ (cursor_view_t*) curview);
+}
+
+/*******************************************************************//**
+Check whether any of the given changes to the table definition are
+incompatible with the existing data, so that the table would have to
+be rebuilt.
+@return COMPATIBLE_DATA_YES or COMPATIBLE_DATA_NO */
+UNIV_INTERN
+bool
+ha_innobase::check_if_incompatible_data(
+/*====================================*/
+ HA_CREATE_INFO* info,
+ uint table_changes)
+{
+ innobase_copy_frm_flags_from_create_info(prebuilt->table, info);
+
+ if (table_changes != IS_EQUAL_YES) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that auto_increment value was not changed */
+ if ((info->used_fields & HA_CREATE_USED_AUTO) &&
+ info->auto_increment_value != 0) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that row format didn't change */
+ if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT)
+ && info->row_type != get_row_type()) {
+
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
+ if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
+ return(COMPATIBLE_DATA_NO);
+ }
+
+ return(COMPATIBLE_DATA_YES);
+}
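+
+/* For example (illustrative): "ALTER TABLE t ROW_FORMAT=COMPRESSED",
+"ALTER TABLE t KEY_BLOCK_SIZE=8" or specifying a nonzero AUTO_INCREMENT
+value all make this function return COMPATIBLE_DATA_NO, which tells
+MySQL that the table must be rebuilt. */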
+
+/****************************************************************//**
+Update the system variable innodb_io_capacity_max using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_io_capacity_max_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+ if (in_val < srv_io_capacity) {
+ in_val = srv_io_capacity;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_io_capacity_max cannot be"
+ " set lower than innodb_io_capacity.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_io_capacity_max to %lu",
+ srv_io_capacity);
+ }
+
+ srv_max_io_capacity = in_val;
+}
+
+/****************************************************************//**
+Update the system variable innodb_io_capacity using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_io_capacity_update(
+/*======================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+ if (in_val > srv_max_io_capacity) {
+ in_val = srv_max_io_capacity;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_io_capacity cannot be set"
+ " higher than innodb_io_capacity_max.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_io_capacity to %lu",
+ srv_max_io_capacity);
+ }
+
+ srv_io_capacity = in_val;
+}
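+
+/* The two callbacks above keep the pair of settings consistent. An
+illustrative session:
+	SET GLOBAL innodb_io_capacity_max = 2000;
+	SET GLOBAL innodb_io_capacity = 4000;
+clamps innodb_io_capacity to 2000 and pushes warnings to the client
+instead of raising an error. */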
+
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_update(
+/*==============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+ if (in_val < srv_max_dirty_pages_pct_lwm) {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct cannot be"
+ " set lower than"
+ " innodb_max_dirty_pages_pct_lwm.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Lowering"
+				    " innodb_max_dirty_pages_pct_lwm to %lu",
+ in_val);
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+ }
+
+ srv_max_buf_pool_modified_pct = in_val;
+}
+
+/****************************************************************//**
+Update the system variable innodb_max_dirty_pages_pct_lwm using the
+"saved" value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_max_dirty_pages_pct_lwm_update(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ulong in_val = *static_cast<const ulong*>(save);
+ if (in_val > srv_max_buf_pool_modified_pct) {
+ in_val = srv_max_buf_pool_modified_pct;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+				    "Setting innodb_max_dirty_pages_pct_lwm"
+ " to %lu",
+ in_val);
+ }
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+}
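+
+/* Similarly, the two callbacks above keep the low water mark at or
+below the maximum. An illustrative session:
+	SET GLOBAL innodb_max_dirty_pages_pct = 75;
+	SET GLOBAL innodb_max_dirty_pages_pct_lwm = 90;
+clamps the low water mark to 75 and pushes a warning. */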
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+ const char* format_name) /*!< in: pointer to file format name */
+{
+ char* endp;
+ uint format_id;
+
+ ut_a(format_name != NULL);
+
+ /* The format name can contain the format id itself instead of
+ the name and we check for that. */
+ format_id = (uint) strtoul(format_name, &endp, 10);
+
+ /* Check for valid parse. */
+ if (*endp == '\0' && *format_name != '\0') {
+
+ if (format_id <= UNIV_FORMAT_MAX) {
+
+ return(format_id);
+ }
+ } else {
+
+ for (format_id = 0; format_id <= UNIV_FORMAT_MAX;
+ format_id++) {
+ const char* name;
+
+ name = trx_sys_file_format_id_to_name(format_id);
+
+ if (!innobase_strcasecmp(format_name, name)) {
+
+ return(format_id);
+ }
+ }
+ }
+
+ return(UNIV_FORMAT_MAX + 1);
+}
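+
+/* Example (assuming the default name map where 0 = Antelope and
+1 = Barracuda): innobase_file_format_name_lookup("Barracuda") and
+innobase_file_format_name_lookup("1") both return 1, while an
+unrecognized name returns UNIV_FORMAT_MAX + 1. */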
+
+/************************************************************//**
+Validate the file format check configuration parameter; as a side
+effect it sets the srv_max_file_format_at_startup variable.
+@return the format_id if the config value is valid, otherwise -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*==================================*/
+ const char* format_max) /*!< in: parameter value */
+{
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(format_max);
+
+ if (format_id < UNIV_FORMAT_MAX + 1) {
+ srv_max_file_format_at_startup = format_id;
+
+ return((int) format_id);
+ } else {
+ return(-1);
+ }
+}
+
+/*************************************************************//**
+Check if it is a valid file format. This function is registered as
+a callback with MySQL.
+@return 0 for valid file format */
+static
+int
+innodb_file_format_name_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* file_format_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ file_format_input = value->val_str(value, buff, &len);
+
+ if (file_format_input != NULL) {
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(
+ file_format_input);
+
+ if (format_id <= UNIV_FORMAT_MAX) {
+
+ /* Save a pointer to the name in the
+ 'file_format_name_map' constant array. */
+ *static_cast<const char**>(save) =
+ trx_sys_file_format_id_to_name(format_id);
+
+ return(0);
+ }
+ }
+
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_file_format using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_file_format_name_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ const char* format_name;
+
+ ut_a(var_ptr != NULL);
+ ut_a(save != NULL);
+
+ format_name = *static_cast<const char*const*>(save);
+
+ if (format_name) {
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(format_name);
+
+ if (format_id <= UNIV_FORMAT_MAX) {
+ srv_file_format = format_id;
+ }
+ }
+
+ *static_cast<const char**>(var_ptr)
+ = trx_sys_file_format_id_to_name(srv_file_format);
+}
+
+/*************************************************************//**
+Check if valid argument to innodb_file_format_max. This function
+is registered as a callback with MySQL.
+@return 0 for valid file format */
+static
+int
+innodb_file_format_max_validate(
+/*============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* file_format_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ int format_id;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ file_format_input = value->val_str(value, buff, &len);
+
+ if (file_format_input != NULL) {
+
+ format_id = innobase_file_format_validate_and_set(
+ file_format_input);
+
+ if (format_id >= 0) {
+ /* Save a pointer to the name in the
+ 'file_format_name_map' constant array. */
+ *static_cast<const char**>(save) =
+ trx_sys_file_format_id_to_name(
+ (uint) format_id);
+
+ return(0);
+
+ } else {
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: invalid innodb_file_format_max "
+ "value; can be any format up to %s "
+ "or equivalent id of %d",
+ trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX),
+ UNIV_FORMAT_MAX);
+ }
+ }
+
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_file_format_max using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_file_format_max_update(
+/*==========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ const char* format_name_in;
+ const char** format_name_out;
+ uint format_id;
+
+ ut_a(save != NULL);
+ ut_a(var_ptr != NULL);
+
+ format_name_in = *static_cast<const char*const*>(save);
+
+ if (!format_name_in) {
+
+ return;
+ }
+
+ format_id = innobase_file_format_name_lookup(format_name_in);
+
+ if (format_id > UNIV_FORMAT_MAX) {
+ /* DEFAULT is "on", which is invalid at runtime. */
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+				    "Ignoring SET innodb_file_format_max=%s",
+ format_name_in);
+ return;
+ }
+
+ format_name_out = static_cast<const char**>(var_ptr);
+
+ /* Update the max format id in the system tablespace. */
+ if (trx_sys_file_format_max_set(format_id, format_name_out)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " [Info] InnoDB: the file format in the system "
+ "tablespace is now set to %s.\n", *format_name_out);
+ }
+}
+
+/*************************************************************//**
+Check whether a valid argument was given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for a valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* stopword_table_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ trx_t* trx;
+ int ret = 1;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ stopword_table_name = value->val_str(value, buff, &len);
+
+ trx = check_trx_exists(thd);
+
+ row_mysql_lock_data_dictionary(trx);
+
+	/* If a stopword table is supplied, validate that it exists
+	and is of the right format */
+ if (!stopword_table_name
+ || fts_valid_stopword_table(stopword_table_name)) {
+ *static_cast<const char**>(save) = stopword_table_name;
+ ret = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ return(ret);
+}
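+
+/* Illustrative usage (assuming the documented stopword table format,
+a single VARCHAR column named "value"):
+	SET GLOBAL innodb_ft_server_stopword_table = 'db_name/stopwords';
+The SET statement fails if the table does not exist or does not have
+the expected format. */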
+
+/*************************************************************//**
+Check whether a valid argument was given to "innodb_fts_internal_tbl_name".
+This function is registered as a callback with MySQL.
+@return 0 for a valid table name */
+static
+int
+innodb_internal_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* table_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ int ret = 1;
+ dict_table_t* user_table;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ table_name = value->val_str(value, buff, &len);
+
+ if (!table_name) {
+ *static_cast<const char**>(save) = NULL;
+ return(0);
+ }
+
+ user_table = dict_table_open_on_name(
+ table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE);
+
+ if (user_table) {
+ if (dict_table_has_fts_index(user_table)) {
+ *static_cast<const char**>(save) = table_name;
+ ret = 0;
+ }
+
+ dict_table_close(user_table, FALSE, TRUE);
+ }
+
+ return(ret);
+}
+
+/****************************************************************//**
+Update the global variable "fts_internal_tbl_name" with the "saved"
+table name value. This function is registered as a callback
+with MySQL. */
+static
+void
+innodb_internal_table_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ const char* table_name;
+ char* old;
+
+ ut_a(save != NULL);
+ ut_a(var_ptr != NULL);
+
+ table_name = *static_cast<const char*const*>(save);
+ old = *(char**) var_ptr;
+
+ if (table_name) {
+ *(char**) var_ptr = my_strdup(table_name, MYF(0));
+ } else {
+ *(char**) var_ptr = NULL;
+ }
+
+ if (old) {
+ my_free(old);
+ }
+
+ fts_internal_tbl_name = *(char**) var_ptr;
+}
+
+/****************************************************************//**
+Update the system variable innodb_adaptive_hash_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_adaptive_hash_index_update(
+/*==============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ if (*(my_bool*) save) {
+ btr_search_enable();
+ } else {
+ btr_search_disable();
+ }
+}
+
+/****************************************************************//**
+Update the system variable innodb_cmp_per_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_cmp_per_index_update(
+/*========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ /* Reset the stats whenever we enable the table
+ INFORMATION_SCHEMA.innodb_cmp_per_index. */
+ if (!srv_cmp_per_index_enabled && *(my_bool*) save) {
+ page_zip_reset_stat_per_index();
+ }
+
+ srv_cmp_per_index_enabled = !!(*(my_bool*) save);
+}
+
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_old_blocks_pct_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innobase_old_blocks_pct = static_cast<uint>(
+ buf_LRU_old_ratio_update(
+ *static_cast<const uint*>(save), TRUE));
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffer_max_size using the
+"saved" value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffer_max_size_update(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innobase_change_buffer_max_size =
+ (*static_cast<const uint*>(save));
+ ibuf_max_size_update(innobase_change_buffer_max_size);
+}
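+
+/* Illustrative usage: innodb_change_buffer_max_size is a percentage
+of the buffer pool size, so
+	SET GLOBAL innodb_change_buffer_max_size = 25;
+limits the change buffer to a quarter of the buffer pool. */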
+
+#ifdef UNIV_DEBUG
+ulong srv_fil_make_page_dirty_debug = 0;
+ulong srv_saved_page_number_debug = 0;
+
+/****************************************************************//**
+Save an InnoDB page number. */
+static
+void
+innodb_save_page_no(
+/*================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ srv_saved_page_number_debug = *static_cast<const ulong*>(save);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Saving InnoDB page number: %lu",
+ srv_saved_page_number_debug);
+}
+
+/****************************************************************//**
+Make the first page of given user tablespace dirty. */
+static
+void
+innodb_make_page_dirty(
+/*===================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ mtr_t mtr;
+ ulong space_id = *static_cast<const ulong*>(save);
+
+ mtr_start(&mtr);
+
+ buf_block_t* block = buf_page_get(
+ space_id, 0, srv_saved_page_number_debug, RW_X_LATCH, &mtr);
+
+ if (block) {
+ byte* page = block->frame;
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Dirtying page:%lu of space:%lu",
+ page_get_page_no(page),
+ page_get_space_id(page));
+ mlog_write_ulint(page + FIL_PAGE_TYPE,
+ fil_page_get_type(page),
+ MLOG_2BYTES, &mtr);
+ }
+ mtr_commit(&mtr);
+}
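+
+/* Illustrative debug-build usage (assuming the corresponding system
+variable names innodb_saved_page_number_debug and
+innodb_fil_make_page_dirty_debug):
+	SET GLOBAL innodb_saved_page_number_debug = 0;
+	SET GLOBAL innodb_fil_make_page_dirty_debug = <space_id>;
+rewrites the FIL_PAGE_TYPE field of that page within a
+mini-transaction, which redo-logs the page and marks it dirty. */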
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Find the corresponding ibuf_use_t value that indexes into
+innobase_change_buffering_values[] array for the input
+change buffering option name.
+@return corresponding IBUF_USE_* value for the input variable
+name, or IBUF_USE_COUNT if not able to find a match */
+static
+ibuf_use_t
+innodb_find_change_buffering_value(
+/*===============================*/
+ const char* input_name) /*!< in: input change buffering
+ option name */
+{
+ ulint use;
+
+ for (use = 0; use < UT_ARR_SIZE(innobase_change_buffering_values);
+ use++) {
+		/* Check for a match */
+ if (!innobase_strcasecmp(
+ input_name, innobase_change_buffering_values[use])) {
+ return((ibuf_use_t) use);
+ }
+ }
+
+ /* Did not find any match */
+ return(IBUF_USE_COUNT);
+}
+
+/*************************************************************//**
+Check if it is a valid value of innodb_change_buffering. This function is
+registered as a callback with MySQL.
+@return 0 for valid innodb_change_buffering */
+static
+int
+innodb_change_buffering_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* change_buffering_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ change_buffering_input = value->val_str(value, buff, &len);
+
+ if (change_buffering_input != NULL) {
+ ibuf_use_t use;
+
+ use = innodb_find_change_buffering_value(
+ change_buffering_input);
+
+ if (use != IBUF_USE_COUNT) {
+			/* Found a matching change_buffering option value. */
+ *static_cast<const char**>(save) =
+ innobase_change_buffering_values[use];
+
+ return(0);
+ }
+ }
+
+ /* No corresponding change buffering option for user supplied
+ "change_buffering_input" */
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffering using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffering_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ibuf_use_t use;
+
+ ut_a(var_ptr != NULL);
+ ut_a(save != NULL);
+
+ use = innodb_find_change_buffering_value(
+ *static_cast<const char*const*>(save));
+
+ ut_a(use < IBUF_USE_COUNT);
+
+ ibuf_use = use;
+ *static_cast<const char**>(var_ptr) =
+ *static_cast<const char*const*>(save);
+}
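+
+/* Illustrative usage (assuming the standard names in
+innobase_change_buffering_values[]: "none", "inserts", "deletes",
+"changes", "purges", "all"):
+	SET GLOBAL innodb_change_buffering = 'inserts';
+buffers only insert operations; any other string is rejected by the
+validation function above. */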
+
+/*************************************************************//**
+Emit a warning that the variable is deprecated, and update
+innodb_stats_transient_sample_pages using the "saved" value. */
+static
+void
+innodb_stats_sample_pages_update(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+#define STATS_SAMPLE_PAGES_DEPRECATED_MSG \
+ "Using innodb_stats_sample_pages is deprecated and " \
+ "the variable may be removed in future releases. " \
+ "Please use innodb_stats_transient_sample_pages " \
+ "instead."
+
+ push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_WRONG_COMMAND, STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: %s\n",
+ STATS_SAMPLE_PAGES_DEPRECATED_MSG);
+
+ srv_stats_transient_sample_pages =
+ *static_cast<const unsigned long long*>(save);
+}
+
+/****************************************************************//**
+Update the monitor counter according to the "set_option", turn
+on/off or reset specified monitor counter. */
+static
+void
+innodb_monitor_set_option(
+/*======================*/
+ const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor
+ to set */
+	mon_option_t	set_option)	/*!< in: whether to turn on/off
+					or reset the counter */
+{
+ monitor_id_t monitor_id = monitor_info->monitor_id;
+
+	/* If the module type is MONITOR_GROUP_MODULE, it cannot be
+	turned on/off individually, and this function should never
+	be used to set its options */
+ ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE));
+
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ MONITOR_ON(monitor_id);
+ MONITOR_INIT(monitor_id);
+ MONITOR_SET_START(monitor_id);
+
+		/* If the monitor to be turned on uses an
+		existing monitor counter (status variable),
+		do special processing to remember the existing
+		counter value. */
+ if (monitor_info->monitor_type
+ & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ monitor_id, MONITOR_TURN_ON);
+ }
+ break;
+
+ case MONITOR_TURN_OFF:
+ if (monitor_info->monitor_type & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ monitor_id, MONITOR_TURN_OFF);
+ }
+
+ MONITOR_OFF(monitor_id);
+ MONITOR_SET_OFF(monitor_id);
+ break;
+
+ case MONITOR_RESET_VALUE:
+ srv_mon_reset(monitor_id);
+ break;
+
+ case MONITOR_RESET_ALL_VALUE:
+ srv_mon_reset_all(monitor_id);
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/****************************************************************//**
+Find matching InnoDB monitor counters and update their status
+according to the "set_option", turn on/off or reset specified
+monitor counter. */
+static
+void
+innodb_monitor_update_wildcard(
+/*===========================*/
+ const char* name, /*!< in: monitor name to match */
+ mon_option_t set_option) /*!< in: the set option, whether
+ to turn on/off or reset the counter */
+{
+ ut_a(name);
+
+ for (ulint use = 0; use < NUM_MONITOR; use++) {
+ ulint type;
+ monitor_id_t monitor_id = static_cast<monitor_id_t>(use);
+ monitor_info_t* monitor_info;
+
+ if (!innobase_wildcasecmp(
+ srv_mon_get_name(monitor_id), name)) {
+ monitor_info = srv_mon_get_info(monitor_id);
+
+ type = monitor_info->monitor_type;
+
+ /* If the monitor counter is of MONITOR_MODULE
+ type, skip it. Except for those also marked with
+ MONITOR_GROUP_MODULE flag, which can be turned
+ on only as a module. */
+ if (!(type & MONITOR_MODULE)
+ && !(type & MONITOR_GROUP_MODULE)) {
+ innodb_monitor_set_option(monitor_info,
+ set_option);
+ }
+
+			/* Counters marked with MONITOR_GROUP_MODULE
+			need special handling: turn on the whole module
+			if any one of them comes here. Currently, only
+			"module_buf_page" is marked with MONITOR_GROUP_MODULE */
+ if (type & MONITOR_GROUP_MODULE) {
+ if ((monitor_id >= MONITOR_MODULE_BUF_PAGE)
+ && (monitor_id < MONITOR_MODULE_OS)) {
+ if (set_option == MONITOR_TURN_ON
+ && MONITOR_IS_ON(
+ MONITOR_MODULE_BUF_PAGE)) {
+ continue;
+ }
+
+ srv_mon_set_module_control(
+ MONITOR_MODULE_BUF_PAGE,
+ set_option);
+ } else {
+ /* If new monitor is added with
+ MONITOR_GROUP_MODULE, it needs
+ to be added here. */
+ ut_ad(0);
+ }
+ }
+ }
+ }
+}
+
+/*************************************************************//**
+Given a configuration variable name, find corresponding monitor counter
+and return its monitor ID if found.
+@return monitor ID if found, MONITOR_NO_MATCH if there is no match */
+static
+ulint
+innodb_monitor_id_by_name_get(
+/*==========================*/
+	const char*	name)	/*!< in: monitor counter name */
+{
+ ut_a(name);
+
+	/* Search for the wildcard character '%' in the name; if it is
+	found, we treat it as a wildcard match. We do not search for the
+	single character wildcard '_' since our monitor names already
+	contain such characters. To avoid confusion, we require the user
+	to include at least one '%' character to activate the wildcard
+	search. */
+ if (strchr(name, '%')) {
+ return(MONITOR_WILDCARD_MATCH);
+ }
+
+ /* Not wildcard match, check for an exact match */
+ for (ulint i = 0; i < NUM_MONITOR; i++) {
+ if (!innobase_strcasecmp(
+ name, srv_mon_get_name(static_cast<monitor_id_t>(i)))) {
+ return(i);
+ }
+ }
+
+ return(MONITOR_NO_MATCH);
+}
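+
+/* Example of the wildcard rule above (illustrative): a name such as
+"buffer%" contains '%' and is treated as a wildcard matching every
+counter whose name starts with "buffer", while a name without any '%'
+must match a single counter name exactly. */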
+/*************************************************************//**
+Validate that the passed in monitor name matches at least one
+monitor counter name with wildcard compare.
+@return TRUE if at least one monitor name matches */
+static
+ibool
+innodb_monitor_validate_wildcard_name(
+/*==================================*/
+	const char*	name)	/*!< in: monitor counter name */
+{
+ for (ulint i = 0; i < NUM_MONITOR; i++) {
+ if (!innobase_wildcasecmp(
+ srv_mon_get_name(static_cast<monitor_id_t>(i)), name)) {
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+/*************************************************************//**
+Validate the passed in monitor name, find and save the
+corresponding monitor name in the function parameter "save".
+@return 0 if monitor name is valid */
+static
+int
+innodb_monitor_valid_byname(
+/*========================*/
+ void* save, /*!< out: immediate result
+ for update function */
+ const char* name) /*!< in: incoming monitor name */
+{
+ ulint use;
+ monitor_info_t* monitor_info;
+
+ if (!name) {
+ return(1);
+ }
+
+ use = innodb_monitor_id_by_name_get(name);
+
+	/* No monitor name matches, nor is it a wildcard match */
+ if (use == MONITOR_NO_MATCH) {
+ return(1);
+ }
+
+ if (use < NUM_MONITOR) {
+ monitor_info = srv_mon_get_info((monitor_id_t) use);
+
+ /* If the monitor counter is marked with
+ MONITOR_GROUP_MODULE flag, then this counter
+ cannot be turned on/off individually, instead
+ it shall be turned on/off as a group using
+ its module name */
+ if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE)
+ && (!(monitor_info->monitor_type & MONITOR_MODULE))) {
+ sql_print_warning(
+ "Monitor counter '%s' cannot"
+ " be turned on/off individually."
+ " Please use its module name"
+ " to turn on/off the counters"
+ " in the module as a group.\n",
+ name);
+
+ return(1);
+ }
+
+ } else {
+ ut_a(use == MONITOR_WILDCARD_MATCH);
+
+ /* For wildcard match, if there is not a single monitor
+ counter name that matches, treat it as an invalid
+ value for the system configuration variables */
+ if (!innodb_monitor_validate_wildcard_name(name)) {
+ return(1);
+ }
+ }
+
+ /* Save the configure name for innodb_monitor_update() */
+ *static_cast<const char**>(save) = name;
+
+ return(0);
+}
+/*************************************************************//**
+Validate passed-in "value" is a valid monitor counter name.
+This function is registered as a callback with MySQL.
+@return 0 for valid name */
+static
+int
+innodb_monitor_validate(
+/*====================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* name;
+ char* monitor_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ int ret;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ name = value->val_str(value, buff, &len);
+
+ /* monitor_name could point to memory from MySQL
+ or buff[]. Always dup the name to memory allocated
+ by InnoDB, so we can access it in another callback
+ function innodb_monitor_update() and free it appropriately */
+ if (name) {
+ monitor_name = my_strdup(name, MYF(0));
+ } else {
+ return(1);
+ }
+
+ ret = innodb_monitor_valid_byname(save, monitor_name);
+
+ if (ret) {
+ /* Validation failed */
+ my_free(monitor_name);
+ } else {
+		/* monitor_name will be freed in the separate callback
+		function innodb_monitor_update(). Assert that "save"
+		points to the "monitor_name" string */
+ ut_ad(*static_cast<char**>(save) == monitor_name);
+ }
+
+ return(ret);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable/disable/reset/
+reset_all according to the "set_option" and turn on/off or reset the
+specified monitor counter. */
+static
+void
+innodb_monitor_update(
+/*==================*/
+ THD* thd, /*!< in: thread handle */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save, /*!< in: immediate result
+ from check function */
+ mon_option_t set_option, /*!< in: the set option,
+ whether to turn on/off or
+ reset the counter */
+ ibool free_mem) /*!< in: whether we will
+ need to free the memory */
+{
+ monitor_info_t* monitor_info;
+ ulint monitor_id;
+ ulint err_monitor = 0;
+ const char* name;
+
+ ut_a(save != NULL);
+
+ name = *static_cast<const char*const*>(save);
+
+ if (!name) {
+ monitor_id = MONITOR_DEFAULT_START;
+ } else {
+ monitor_id = innodb_monitor_id_by_name_get(name);
+
+ /* Double check we have a valid monitor ID */
+ if (monitor_id == MONITOR_NO_MATCH) {
+ return;
+ }
+ }
+
+ if (monitor_id == MONITOR_DEFAULT_START) {
+		/* If the user set the variable to "default", we will
+		print a message and make this set operation a "noop".
+		The check is made here because "set default" does not
+		go through the validation function */
+ if (thd) {
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_NO_DEFAULT,
+ "Default value is not defined for "
+ "this set option. Please specify "
+ "correct counter or module name.");
+ } else {
+ sql_print_error(
+ "Default value is not defined for "
+ "this set option. Please specify "
+ "correct counter or module name.\n");
+ }
+
+ if (var_ptr) {
+ *(const char**) var_ptr = NULL;
+ }
+ } else if (monitor_id == MONITOR_WILDCARD_MATCH) {
+ innodb_monitor_update_wildcard(name, set_option);
+ } else {
+ monitor_info = srv_mon_get_info(
+ static_cast<monitor_id_t>(monitor_id));
+
+ ut_a(monitor_info);
+
+		/* If the monitor is already turned on, someone could
+		already be collecting monitor data; exit and ask the user
+		to turn off the monitor before turning it on again. */
+ if (set_option == MONITOR_TURN_ON
+ && MONITOR_IS_ON(monitor_id)) {
+ err_monitor = monitor_id;
+ goto exit;
+ }
+
+ if (var_ptr) {
+ *(const char**) var_ptr = monitor_info->monitor_name;
+ }
+
+		/* Depending on whether the monitor name is for a module
+		or a counter, process all counters in the module or the
+		individual counter. */
+ if (monitor_info->monitor_type & MONITOR_MODULE) {
+ srv_mon_set_module_control(
+ static_cast<monitor_id_t>(monitor_id),
+ set_option);
+ } else {
+ innodb_monitor_set_option(monitor_info, set_option);
+ }
+ }
+exit:
+	/* err_monitor is only set if we tried to turn on a monitor
+	that had already been turned on. Print the related
+	information */
+ if (err_monitor) {
+ sql_print_warning("Monitor %s is already enabled.",
+ srv_mon_get_name((monitor_id_t) err_monitor));
+ }
+
+ if (free_mem && name) {
+ my_free((void*) name);
+ }
+
+ return;
+}
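+
+/* Illustrative usage of the set options handled above:
+	SET GLOBAL innodb_monitor_enable = 'module_dml';
+	SET GLOBAL innodb_monitor_disable = 'module_dml';
+	SET GLOBAL innodb_monitor_reset_all = '%';
+enable or disable every counter in the dml module, and reset the
+values of all counters, respectively. */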
+
+#ifdef __WIN__
+/*************************************************************//**
+Validate if passed-in "value" is a valid value for
+innodb_buffer_pool_filename. On Windows, file names with colon (:)
+are not allowed.
+
+@return 0 for valid name */
+static
+int
+innodb_srv_buf_dump_filename_validate(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* buf_name;
+ char buff[OS_FILE_MAX_PATH];
+ int len= sizeof(buff);
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ buf_name = value->val_str(value, buff, &len);
+
+ if (buf_name) {
+ if (is_filename_allowed(buf_name, len, FALSE)){
+ *static_cast<const char**>(save) = buf_name;
+ return(0);
+ } else {
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: innodb_buffer_pool_filename "
+ "cannot have colon (:) in the file name.");
+
+ }
+ }
+
+ return(1);
+}
+#else /* __WIN__ */
+# define innodb_srv_buf_dump_filename_validate NULL
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
+static char* srv_buffer_pool_evict;
+
+/****************************************************************//**
+Evict all uncompressed pages of compressed tables from the buffer pool.
+Keep the compressed pages in the buffer pool.
+@return whether all uncompressed pages were evicted */
+static __attribute__((warn_unused_result))
+bool
+innodb_buffer_pool_evict_uncompressed(void)
+/*=======================================*/
+{
+ bool all_evicted = true;
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool = &buf_pool_ptr[i];
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (buf_block_t* block = UT_LIST_GET_LAST(
+ buf_pool->unzip_LRU);
+ block != NULL; ) {
+ buf_block_t* prev_block = UT_LIST_GET_PREV(
+ unzip_LRU, block);
+ ut_ad(buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ if (!buf_LRU_free_page(&block->page, false)) {
+ all_evicted = false;
+ }
+
+ block = prev_block;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+ }
+
+ return(all_evicted);
+}
+
+/****************************************************************//**
+Called on SET GLOBAL innodb_buffer_pool_evict=...
+Handles some values specially, to evict pages from the buffer pool.
+SET GLOBAL innodb_buffer_pool_evict='uncompressed'
+evicts all uncompressed page frames of compressed tablespaces. */
+static
+void
+innodb_buffer_pool_evict_update(
+/*============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var*var, /*!< in: pointer to system variable */
+ void* var_ptr,/*!< out: ignored */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ if (const char* op = *static_cast<const char*const*>(save)) {
+ if (!strcmp(op, "uncompressed")) {
+ for (uint tries = 0; tries < 10000; tries++) {
+ if (innodb_buffer_pool_evict_uncompressed()) {
+ return;
+ }
+
+ os_thread_sleep(10000);
+ }
+
+ /* We failed to evict all uncompressed pages. */
+ ut_ad(0);
+ }
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Update the system variable innodb_monitor_enable and enable
+specified monitor counter.
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_enable_monitor_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_disable and turn
+off specified monitor counter. */
+static
+void
+innodb_disable_monitor_update(
+/*==========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset and reset
+specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_monitor_update(
+/*========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE, TRUE);
+}
+
+/****************************************************************//**
+Update the system variable innodb_monitor_reset_all and reset
+all values of the specified monitor counter(s).
+This function is registered as a callback with MySQL. */
+static
+void
+innodb_reset_all_monitor_update(
+/*============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE,
+ TRUE);
+}
+
+/****************************************************************//**
+Parse and enable InnoDB monitor counters during server startup.
+The user can list the monitor counters/groups to be enabled by specifying
+"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
+in the server configuration file or on the command line. The
+separator can be ";", "," or a space. */
+static
+void
+innodb_enable_monitor_at_startup(
+/*=============================*/
+ char* str) /*!< in/out: monitor counter enable list */
+{
+ static const char* sep = " ;,";
+ char* last;
+
+ ut_a(str);
+
+	/* Walk through the string, separate each monitor counter
+	and/or counter group name, and call innodb_monitor_update()
+	for each name that validates. Please note that "str" is
+	modified by strtok_r() as it walks through it. */
+ for (char* option = strtok_r(str, sep, &last);
+ option;
+ option = strtok_r(NULL, sep, &last)) {
+ ulint ret;
+ char* option_name;
+
+ ret = innodb_monitor_valid_byname(&option_name, option);
+
+ /* The name is validated if ret == 0 */
+ if (!ret) {
+ innodb_monitor_update(NULL, NULL, &option,
+ MONITOR_TURN_ON, FALSE);
+ } else {
+ sql_print_warning("Invalid monitor counter"
+ " name: '%s'", option);
+ }
+ }
+}
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
+static
+int
+show_innodb_vars(
+/*=============*/
+ THD* thd,
+ SHOW_VAR* var,
+ char* buff)
+{
+ innodb_export_status();
+ var->type = SHOW_ARRAY;
+ var->value = (char*) &innodb_status_variables;
+
+ return(0);
+}
+
+/****************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+{
+ const KEY* key;
+ uint key_num; /* index number */
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ key = &key_info[key_num];
+
+ if (innobase_strcasecmp(key->name,
+ innobase_index_reserve_name) == 0) {
+ /* Push warning to mysql */
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+					   "Cannot create index with name "
+ "'%s'. The name is reserved "
+ "for the system default primary "
+ "index.",
+ innobase_index_reserve_name);
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ innobase_index_reserve_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
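+
+/* For example, "CREATE INDEX GEN_CLUST_INDEX ON t (a)" is rejected
+here (in any letter case, since the comparison is case-insensitive)
+with ER_WRONG_NAME_FOR_INDEX. */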
+
+/***********************************************************************
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*============================*/
+ FT_INFO * fts_hdl) /*!< in: FTS handler */
+{
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* result;
+
+ result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+ ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+
+ if (ft_prebuilt->read_just_key) {
+ fts_ranking_t* ranking =
+ rbt_value(fts_ranking_t, result->current);
+ return(ranking->rank);
+ }
+
+ /* Retrieve the ranking value for doc_id with value of
+ prebuilt->fts_doc_id */
+ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+/***********************************************************************
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+ FT_INFO * fts_hdl)
+{
+ fts_result_t* result;
+
+ ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt->in_fts_query = false;
+
+ result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+ fts_query_free_result(result);
+
+	my_free((uchar*) fts_hdl);
+}
+
+/***********************************************************************
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*======================*/
+ FT_INFO* fts_hdl, /*!< in: FTS handler */
+ uchar* record, /*!< in: Unused */
+ uint len) /*!< in: Unused */
+{
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* result;
+
+ ft_prebuilt = ((NEW_FT_INFO*) fts_hdl)->ft_prebuilt;
+ result = ((NEW_FT_INFO*) fts_hdl)->ft_result;
+
+ /* Retrieve the ranking value for doc_id with value of
+ prebuilt->fts_doc_id */
+ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+#ifdef UNIV_DEBUG
+static my_bool innodb_purge_run_now = TRUE;
+static my_bool innodb_purge_stop_now = TRUE;
+static my_bool innodb_log_checkpoint_now = TRUE;
+static my_bool innodb_buf_flush_list_now = TRUE;
+
+/****************************************************************//**
+Set the purge state to RUN. If purge is disabled then it
+is a no-op. This function is registered as a callback with MySQL. */
+static
+void
+purge_run_now_set(
+/*==============*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) {
+ trx_purge_run();
+ }
+}
+
+/****************************************************************//**
+Set the purge state to STOP. If purge is disabled then it
+is a no-op. This function is registered as a callback with MySQL. */
+static
+void
+purge_stop_now_set(
+/*===============*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && trx_purge_state() != PURGE_STATE_DISABLED) {
+ trx_purge_stop();
+ }
+}
+
+/****************************************************************//**
+Force innodb to checkpoint. */
+static
+void
+checkpoint_now_set(
+/*===============*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save) {
+ while (log_sys->last_checkpoint_lsn < log_sys->lsn) {
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+ fil_flush_file_spaces(FIL_LOG);
+ }
+ fil_write_flushed_lsn_to_data_files(log_sys->lsn, 0);
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ }
+}
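+
+/* Illustrative debug-build usage:
+	SET GLOBAL innodb_log_checkpoint_now = ON;
+loops until the last checkpoint LSN catches up with the current LSN,
+then writes the flushed LSN to the data files. */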
+
+/****************************************************************//**
+Force a dirty pages flush now. */
+static
+void
+buf_flush_list_now_set(
+/*===================*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save) {
+ buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************
+@return version of the extended FTS API */
+uint
+innobase_fts_get_version()
+/*======================*/
+{
+	/* Currently this doesn't make much sense, as returning
+	HA_CAN_FULLTEXT_EXT automatically means this version is supported.
+	It is supposed to ease future extensions. */
+ return(2);
+}
+
+/***********************************************************************
+@return Which part of the extended FTS API is supported */
+ulonglong
+innobase_fts_flags()
+/*================*/
+{
+ return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT);
+}
+
+
+/***********************************************************************
+Find and Retrieve the FTS doc_id for the current result row
+@return the document ID */
+ulonglong
+innobase_fts_retrieve_docid(
+/*========================*/
+ FT_INFO_EXT * fts_hdl) /*!< in: FTS handler */
+{
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* result;
+
+ ft_prebuilt = ((NEW_FT_INFO *)fts_hdl)->ft_prebuilt;
+ result = ((NEW_FT_INFO *)fts_hdl)->ft_result;
+
+ if (ft_prebuilt->read_just_key) {
+ fts_ranking_t* ranking =
+ rbt_value(fts_ranking_t, result->current);
+ return(ranking->doc_id);
+ }
+
+ return(ft_prebuilt->fts_doc_id);
+}
+
+/***********************************************************************
+Find and retrieve the size of the current result
+@return number of matching rows */
+ulonglong
+innobase_fts_count_matches(
+/*=======================*/
+ FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */
+{
+ NEW_FT_INFO* handle = (NEW_FT_INFO *) fts_hdl;
+
+ if (handle->ft_result->rankings_by_id != 0) {
+ return rbt_size(handle->ft_result->rankings_by_id);
+ } else {
+ return(0);
+ }
+}
+
+/* These variables are never read or changed by InnoDB. They are dummy
+variables needed by the MySQL infrastructure so that the user can invoke
+buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort()
+by doing:
+ SET GLOBAL innodb_buffer_pool_dump_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+ SELECT @@innodb_buffer_pool_dump_now;
+ SELECT @@innodb_buffer_pool_load_now;
+ SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool innodb_buffer_pool_dump_now = FALSE;
+static my_bool innodb_buffer_pool_load_now = FALSE;
+static my_bool innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save && !srv_read_only_mode) {
+ buf_dump_start();
+ }
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save) {
+ buf_load_start();
+ }
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+ THD* thd /*!< in: thread handle */
+ __attribute__((unused)),
+ struct st_mysql_sys_var* var /*!< in: pointer to system
+ variable */
+ __attribute__((unused)),
+ void* var_ptr /*!< out: where the formal
+ string goes */
+ __attribute__((unused)),
+ const void* save) /*!< in: immediate result from
+ check function */
+{
+ if (*(my_bool*) save) {
+ buf_load_abort();
+ }
+}
+
+/** Update innodb_status_output or innodb_status_output_locks,
+which control InnoDB "status monitor" output to the error log.
+@param[in] thd thread handle
+@param[in] var system variable
+@param[out] var_ptr current value
+@param[in] save to-be-assigned value */
+static
+void
+innodb_status_output_update(
+ THD* thd __attribute__((unused)),
+ struct st_mysql_sys_var* var __attribute__((unused)),
+ void* var_ptr __attribute__((unused)),
+ const void* save __attribute__((unused)))
+{
+ *static_cast<my_bool*>(var_ptr) = *static_cast<const my_bool*>(save);
+ /* The lock timeout monitor thread also takes care of this
+ output. */
+ os_event_set(lock_sys->timeout_event);
+}
+
+static SHOW_VAR innodb_status_variables_export[]= {
+ {"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+ {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/* plugin options */
+
+static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
+ PLUGIN_VAR_RQCMDARG,
+ "The algorithm InnoDB uses for page checksumming. Possible values are "
+ "CRC32 (hardware accelerated if the CPU supports it) "
+ "write crc32, allow any of the other checksums to match when reading; "
+ "STRICT_CRC32 "
+ "write crc32, do not allow other algorithms to match when reading; "
+ "INNODB "
+ "write a software calculated checksum, allow any other checksums "
+ "to match when reading; "
+ "STRICT_INNODB "
+ "write a software calculated checksum, do not allow other algorithms "
+ "to match when reading; "
+ "NONE "
+ "write a constant magic number, do not do any checksum verification "
+ "when reading (same as innodb_checksums=OFF); "
+ "STRICT_NONE "
+ "write a constant magic number, do not allow values other than that "
+ "magic number when reading; "
+ "Files updated when this option is set to crc32 or strict_crc32 will "
+ "not be readable by MySQL versions older than 5.6.3",
+ NULL, NULL, SRV_CHECKSUM_ALGORITHM_INNODB,
+ &innodb_checksum_algorithm_typelib);
+
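+/* Editor's note: MYSQL_SYSVAR_ENUM requires a TYPELIB that maps the
+permitted value names to integers; innodb_checksum_algorithm_typelib is
+defined earlier in this file. A hedged sketch of the shape of such a
+typelib, with hypothetical names:
+
+static const char*	example_algorithm_names[] = {
+	"crc32", "strict_crc32", "innodb", "strict_innodb",
+	"none", "strict_none", NullS
+};
+
+static TYPELIB example_algorithm_typelib = {
+	array_elements(example_algorithm_names) - 1,
+	"example_algorithm_typelib",
+	example_algorithm_names, NULL
+};
+*/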
+static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "DEPRECATED. Use innodb_checksum_algorithm=NONE instead of setting "
+ "this to OFF. "
+ "Enable InnoDB checksums validation (enabled by default). "
+ "Disable with --skip-innodb-checksums.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
+ PLUGIN_VAR_READONLY,
+ "The common part for InnoDB table spaces.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable InnoDB doublewrite buffer (enabled by default). "
+ "Disable with --skip-innodb-doublewrite.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of IOPs the server can do. Tunes the background IO rate",
+ NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
+ PLUGIN_VAR_RQCMDARG,
+ "Limit to which innodb_io_capacity can be inflated.",
+ NULL, innodb_io_capacity_max_update,
+ SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100,
+ SRV_MAX_IO_CAPACITY_LIMIT, 0);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Set purge state to RUN",
+ NULL, purge_run_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(purge_stop_now, innodb_purge_stop_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Set purge state to STOP",
+ NULL, purge_stop_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Force checkpoint now",
+ NULL, checkpoint_now_set, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now,
+ PLUGIN_VAR_OPCMDARG,
+ "Force dirty page flush now",
+ NULL, buf_flush_list_now_set, FALSE);
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of UNDO log pages to purge in one batch from the history list.",
+ NULL, NULL,
+ 300, /* Default setting */
+ 1, /* Minimum value */
+ 5000, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Purge threads can be from 1 to 32. Default is 1.",
+ NULL, NULL,
+ 1, /* Default setting */
+ 1, /* Minimum value */
+ 32, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Size of the mutex/lock wait array.",
+ NULL, NULL,
+ 1, /* Default setting */
+ 1, /* Minimum value */
+ 1024, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
+ PLUGIN_VAR_OPCMDARG,
+ "Speeds up the shutdown process of the InnoDB storage engine. Possible "
+ "values are 0, 1 (faster) or 2 (fastest - crash-like).",
+ NULL, NULL, 1, 0, 2, 0);
+
+static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
+ PLUGIN_VAR_NOCMDARG,
+ "Stores each InnoDB table to an .ibd file in the database dir.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name,
+ PLUGIN_VAR_RQCMDARG,
+ "File format to use for new tables in .ibd files.",
+ innodb_file_format_name_validate,
+ innodb_file_format_name_update, "Antelope");
+
+/* "innobase_file_format_check" decides whether we would continue
+booting the server if the file format stamped on the system
+table space exceeds the maximum file format supported
+by the server. Can be set during server startup at command
+line or configure file, and a read only variable after
+server startup */
+static MYSQL_SYSVAR_BOOL(file_format_check, innobase_file_format_check,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Whether to perform system file format check.",
+ NULL, NULL, TRUE);
+
+/* If a new file format is introduced, the file format
+name needs to be updated accordingly. Please refer to
+file_format_name_map[] defined in trx0sys.cc for the next
+file format name. */
+static MYSQL_SYSVAR_STR(file_format_max, innobase_file_format_max,
+ PLUGIN_VAR_OPCMDARG,
+ "The highest file format in the tablespace.",
+ innodb_file_format_max_validate,
+ innodb_file_format_max_update, "Antelope");
+
+static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
+ "The user supplied stopword table name.",
+ innodb_stopword_table_validate,
+ NULL,
+ NULL);
+
+static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout,
+ PLUGIN_VAR_OPCMDARG,
+ "Write and flush logs every (n) second.",
+ NULL, NULL, 1, 0, 2700, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
+ PLUGIN_VAR_OPCMDARG,
+ "Set to 0 (write and flush once per second),"
+ " 1 (write and flush at each commit)"
+ " or 2 (write at commit, flush once per second).",
+ NULL, NULL, 1, 0, 2, 0);
+
+static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "With which method to flush data.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(large_prefix, innobase_large_prefix,
+ PLUGIN_VAR_NOCMDARG,
+ "Support large index prefix length of REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Force InnoDB to load metadata of corrupted table.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "DEPRECATED. This option may be removed in future releases. "
+ "Please use READ COMMITTED transaction isolation level instead. "
+ "Force InnoDB to not use next-key locking, to use only row-level locking.",
+ NULL, NULL, FALSE);
+
+#ifdef UNIV_LOG_ARCHIVE
+static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Where full logs should be archived.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to InnoDB log files.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of dirty pages allowed in bufferpool.",
+ NULL, innodb_max_dirty_pages_pct_update, 75, 0, 99, 0);
+
+static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct_lwm,
+ srv_max_dirty_pages_pct_lwm,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of dirty pages at which flushing kicks in.",
+ NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99, 0);
+
+static MYSQL_SYSVAR_ULONG(adaptive_flushing_lwm,
+ srv_adaptive_flushing_lwm,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of log capacity below which no adaptive flushing happens.",
+ NULL, NULL, 10, 0, 70, 0);
+
+static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
+ PLUGIN_VAR_NOCMDARG,
+ "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(flushing_avg_loops,
+ srv_flushing_avg_loops,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of iterations over which the background flushing is averaged.",
+ NULL, NULL, 30, 1, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
+ PLUGIN_VAR_RQCMDARG,
+ "Desired maximum length of the purge queue (0 = no limit)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum delay of user threads in micro-seconds",
+ NULL, NULL,
+ 0L, /* Default setting */
+ 0L, /* Minimum value */
+ 10000000UL, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
+ "Enable SHOW ENGINE INNODB STATUS output in the innodb_status.<pid> file",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable statistics gathering for metadata commands such as "
+ "SHOW TABLE STATUS for tables that use transient statistics (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Deprecated, use innodb_stats_transient_sample_pages instead",
+ NULL, innodb_stats_sample_pages_update, 8, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
+ srv_stats_transient_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of leaf index pages to sample when calculating transient "
+ "statistics (if persistent statistics are not used, default 8)",
+ NULL, NULL, 8, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB persistent statistics enabled for all tables unless overridden "
+ "at table level",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB automatic recalculation of persistent statistics enabled for all "
+ "tables unless overridden at table level (automatic recalculation is only "
+ "done when InnoDB decides that the table has changed too much and needs a "
+ "new statistics)",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages,
+ srv_stats_persistent_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of leaf index pages to sample when calculating persistent "
+ "statistics (by ANALYZE, default 20)",
+ NULL, NULL, 20, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB adaptive hash index (enabled by default). "
+ "Disable with --skip-innodb-adaptive-hash-index.",
+ NULL, innodb_adaptive_hash_index_update, TRUE);
+
+static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Replication thread delay (ms) on the slave server if "
+ "innodb_thread_concurrency is reached (0 by default)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
+ PLUGIN_VAR_RQCMDARG,
+ "Compression level used for compressed row format. 0 is no compression"
+ ", 1 is fastest, 9 is best compression and default is 6.",
+ NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
+
+static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages,
+ PLUGIN_VAR_OPCMDARG,
+ "Enables/disables the logging of entire compressed page images."
+ " InnoDB logs the compressed pages to prevent corruption if"
+ " the zlib compression algorithm changes."
+ " When turned OFF, InnoDB will assume that the zlib"
+ " compression algorithm doesn't change.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "DEPRECATED. This option may be removed in future releases, "
+ "together with the option innodb_use_sys_malloc and with the InnoDB's "
+ "internal memory allocator. "
+ "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.",
+ NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment,
+ PLUGIN_VAR_RQCMDARG,
+ "Data file autoextend increment in megabytes",
+ NULL, NULL, 64L, 1L, 1000L, 0);
+
+static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+ NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+static MYSQL_SYSVAR_ULONG(page_hash_locks, srv_n_page_hash_locks,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Number of rw_locks protecting buffer pool page_hash. Rounded up to the next power of 2",
+ NULL, NULL, 16, 1, MAX_PAGE_HASH_LOCKS, 0);
+
+static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Number of pages reserved in doublewrite buffer for batch flushing",
+ NULL, NULL, 120, 1, 127, 0);
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+
+static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
+ NULL, NULL, 0L, 0L, MAX_BUFFER_POOLS, 1L);
+
+static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
+ "Filename to/from which to dump/load the InnoDB buffer pool",
+ innodb_srv_buf_dump_filename_validate, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now,
+ PLUGIN_VAR_RQCMDARG,
+ "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename",
+ NULL, buffer_pool_dump_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown,
+ PLUGIN_VAR_RQCMDARG,
+ "Dump the buffer pool into a file named @@innodb_buffer_pool_filename",
+ NULL, NULL, FALSE);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict,
+ PLUGIN_VAR_RQCMDARG,
+ "Evict pages from the buffer pool",
+ NULL, innodb_buffer_pool_evict_update, "");
+#endif /* UNIV_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now,
+ PLUGIN_VAR_RQCMDARG,
+ "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename",
+ NULL, buffer_pool_load_now, FALSE);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort,
+ PLUGIN_VAR_RQCMDARG,
+ "Abort a currently running load of the buffer pool",
+ NULL, buffer_pool_load_abort, FALSE);
+
+/* there is no point in changing this at runtime, thus it is read-only */
+static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Load the buffer pool from a file named @@innodb_buffer_pool_filename",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth,
+ PLUGIN_VAR_RQCMDARG,
+ "How deep to scan LRU to keep it clean",
+ NULL, NULL, 1024, 100, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
+ PLUGIN_VAR_OPCMDARG,
+ "Set to 0 (don't flush neighbors from buffer pool),"
+ " 1 (flush contiguous neighbors from buffer pool)"
+ " or 2 (flush neighbors from buffer pool),"
+ " when flushing a block",
+ NULL, NULL, 1, 0, 2, 0);
+
+static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ "Helps in performance tuning in heavily concurrent environments.",
+ innobase_commit_concurrency_validate, NULL, 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
+ NULL, NULL, 5000L, 1L, ~0UL, 0);
+
+static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
+ "Number of file I/O threads in InnoDB.",
+ NULL, NULL, 4, 4, 64, 0);
+
+static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether to enable additional FTS diagnostic printout ",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether to disable OS system file cache for sort I/O",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
+ PLUGIN_VAR_NOCMDARG,
+ "FTS internal auxiliary table to be checked",
+ innodb_internal_table_validate,
+ innodb_internal_table_update, NULL);
+
+static MYSQL_SYSVAR_ULONG(ft_cache_size, fts_max_cache_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search cache size in bytes",
+ NULL, NULL, 8000000, 1600000, 80000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_total_cache_size, fts_max_total_cache_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Total memory allocated for InnoDB Fulltext Search cache",
+ NULL, NULL, 640000000, 32000000, 1600000000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_result_cache_limit, fts_result_cache_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "InnoDB Fulltext search query result cache limit in bytes",
+ NULL, NULL, 2000000000L, 1000000L, 4294967295UL, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search minimum token size in characters",
+ NULL, NULL, 3, 0, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search maximum token size in characters",
+ NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB Fulltext search number of words to optimize for each optimize table call ",
+ NULL, NULL, 2000, 1000, 10000, 0);
+
+static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number",
+ NULL, NULL, 2, 1, 16, 0);
+
+static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Memory buffer size for index creation",
+ NULL, NULL, 1048576, 65536, 64<<20, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum modification log file size for online index creation",
+ NULL, NULL, 128<<20, 65536, ~0ULL, 0);
+
+static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only,
+ PLUGIN_VAR_NOCMDARG,
+ "Only optimize the Fulltext index of the table",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background read I/O threads in InnoDB.",
+ NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background write I/O threads in InnoDB.",
+ NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Helps to save your data in case the disk image of the database becomes corrupt.",
+ NULL, NULL, 0, 0, 6, 0);
+
+#ifndef DBUG_OFF
+static MYSQL_SYSVAR_ULONG(force_recovery_crash, srv_force_recovery_crash,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Kills the server during crash recovery.",
+ NULL, NULL, 0, 0, 10, 0);
+#endif /* !DBUG_OFF */
+
+static MYSQL_SYSVAR_ULONG(page_size, srv_page_size,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Page size to use for all InnoDB tablespaces.",
+ NULL, NULL, UNIV_PAGE_SIZE_DEF,
+ UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0);
+
+static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the buffer which InnoDB uses to write log to the log files on disk.",
+ NULL, NULL, 8*1024*1024L, 256*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Size of each log file in a log group.",
+ NULL, NULL, 48*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+static MYSQL_SYSVAR_ULONG(log_files_in_group, srv_n_log_files,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of log files in the log group. InnoDB writes to the files in a circular fashion.",
+ NULL, NULL, 2, 2, SRV_N_LOG_FILES_MAX, 0);
+
+/* Note that the default and minimum values are set to 0 to
+detect whether the option was passed, so that a deprecation
+message can be printed. */
+static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.",
+ NULL, NULL, 0, 0, 10, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of the buffer pool to reserve for 'old' blocks.",
+ NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0);
+
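+/* Editor's note: the default above, 100 * 3 / 8, is evaluated with C
+integer arithmetic, so the compiled-in default is 37 percent (37.5
+truncated toward zero), not 38. Illustratively, in C++11 terms:
+
+static_assert(100 * 3 / 8 == 37, "integer division truncates 37.5 to 37");
+*/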
+static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms,
+ PLUGIN_VAR_RQCMDARG,
+ "Move blocks to the 'new' end of the buffer pool if the first access"
+ " was at least this many milliseconds ago."
+ " The timeout is disabled if 0.",
+ NULL, NULL, 1000, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_LONG(open_files, innobase_open_files,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "How many files at the maximum InnoDB keeps open at the same time.",
+ NULL, NULL, 0L, 0L, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
+ PLUGIN_VAR_RQCMDARG,
+ "Count of spin-loop rounds in InnoDB mutexes (30 by default)",
+ NULL, NULL, 30L, 0L, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay,
+ PLUGIN_VAR_OPCMDARG,
+ "Maximum delay between polling for a spin lock (6 by default)",
+ NULL, NULL, 6L, 0L, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
+ NULL, NULL, 0, 0, 1000, 0);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+static MYSQL_SYSVAR_ULONG(
+ adaptive_max_sleep_delay, srv_adaptive_max_sleep_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "The upper limit of the sleep delay in usec. Value of 0 disables it.",
+ NULL, NULL,
+ 150000, /* Default setting */
+ 0, /* Minimum value */
+ 1000000, 0); /* Maximum value */
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Time of innodb thread sleeping before joining InnoDB queue (usec). "
+ "Value 0 disable a sleep",
+ NULL, NULL,
+ 10000L,
+ 0L,
+ 1000000L, 0);
+
+static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to individual files and their sizes.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Directory where undo tablespace files live, this path can be absolute.",
+ NULL, NULL, ".");
+
+static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of undo tablespaces to use. ",
+ NULL, NULL,
+ 0L, /* Default setting */
+ 0L, /* Minimum value */
+ 126L, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(undo_logs, srv_undo_logs,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of undo logs to use.",
+ NULL, NULL,
+ TRX_SYS_N_RSEGS, /* Default setting */
+ 1, /* Minimum value */
+ TRX_SYS_N_RSEGS, 0); /* Maximum value */
+
+/* Alias for innodb_undo_logs, this config variable is deprecated. */
+static MYSQL_SYSVAR_ULONG(rollback_segments, srv_undo_logs,
+ PLUGIN_VAR_OPCMDARG,
+ "Number of undo logs to use (deprecated).",
+ NULL, NULL,
+ TRX_SYS_N_RSEGS, /* Default setting */
+ 1, /* Minimum value */
+ TRX_SYS_N_RSEGS, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The AUTOINC lock modes supported by InnoDB: "
+ "0 => Old style AUTOINC locking (for backward"
+ " compatibility) "
+ "1 => New style AUTOINC locking "
+ "2 => No AUTOINC locking (unsafe for SBR)",
+ NULL, NULL,
+ AUTOINC_NEW_STYLE_LOCKING, /* Default setting */
+ AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */
+ AUTOINC_NO_LOCKING, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_STR(version, innodb_version_str,
+ PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
+ "InnoDB version", NULL, NULL, INNODB_VERSION_STR);
+
+static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "DEPRECATED. This option may be removed in future releases, "
+ "together with the InnoDB's internal memory allocator. "
+ "Use OS memory allocator instead of InnoDB's internal memory allocator",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use native AIO if supported on this platform.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(api_enable_binlog, ib_binlog_enabled,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable binlog for applications direct access InnoDB through InnoDB APIs",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_enable_mdl, ib_mdl_enabled,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable MDL for applications direct access InnoDB through InnoDB APIs",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(api_disable_rowlock, ib_disable_row_lock,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Disable row lock when direct access InnoDB through InnoDB APIs",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(api_trx_level, ib_trx_level_setting,
+ PLUGIN_VAR_OPCMDARG,
+ "InnoDB API transaction isolation level",
+ NULL, NULL,
+ 0, /* Default setting */
+ 0, /* Minimum value */
+ 3, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_ULONG(api_bk_commit_interval, ib_bk_commit_interval,
+ PLUGIN_VAR_OPCMDARG,
+ "Background commit interval in seconds",
+ NULL, NULL,
+ 5, /* Default setting */
+ 1, /* Minimum value */
+ 1024 * 1024 * 1024, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
+ PLUGIN_VAR_RQCMDARG,
+ "Buffer changes to reduce random access: "
+ "OFF, ON, inserting, deleting, changing, or purging.",
+ innodb_change_buffering_validate,
+ innodb_change_buffering_update, "all");
+
+static MYSQL_SYSVAR_UINT(change_buffer_max_size,
+ innobase_change_buffer_max_size,
+ PLUGIN_VAR_RQCMDARG,
+ "Maximum on-disk size of change buffer in terms of percentage"
+ " of the buffer pool.",
+ NULL, innodb_change_buffer_max_size_update,
+ CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0);
+
+static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how InnoDB index statistics collection code should "
+ "treat NULLs. Possible values are NULLS_EQUAL (default), "
+ "NULLS_UNEQUAL and NULLS_IGNORED",
+ NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug,
+ PLUGIN_VAR_RQCMDARG,
+ "Debug flags for InnoDB change buffering (0=none, 2=crash at merge)",
+ NULL, NULL, 0, 0, 2, 0);
+
+static MYSQL_SYSVAR_BOOL(disable_background_merge,
+ srv_ibuf_disable_background_merge,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_RQCMDARG,
+ "Disable change buffering merges by the master thread",
+ NULL, NULL, FALSE);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead,
+ PLUGIN_VAR_NOCMDARG,
+ "Whether to use read ahead for random access within an extent.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages that must be accessed sequentially for InnoDB to "
+ "trigger a readahead.",
+ NULL, NULL, 56, 0, 64, 0);
+
+static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Turn on a monitor counter",
+ innodb_monitor_validate,
+ innodb_enable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Turn off a monitor counter",
+ innodb_monitor_validate,
+ innodb_disable_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Reset a monitor counter",
+ innodb_monitor_validate,
+ innodb_reset_monitor_update, NULL);
+
+static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter,
+ PLUGIN_VAR_RQCMDARG,
+ "Reset all values for a monitor counter",
+ innodb_monitor_validate,
+ innodb_reset_all_monitor_update, NULL);
+
+static MYSQL_SYSVAR_BOOL(status_output, srv_print_innodb_monitor,
+ PLUGIN_VAR_OPCMDARG, "Enable InnoDB monitor output to the error log.",
+ NULL, innodb_status_output_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_output_locks, srv_print_innodb_lock_monitor,
+ PLUGIN_VAR_OPCMDARG, "Enable InnoDB lock monitor output to the error log."
+ " Requires innodb_status_output=ON.",
+ NULL, innodb_status_output_update, FALSE);
+
+static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks,
+ PLUGIN_VAR_OPCMDARG,
+ "Print all deadlocks to MySQL error log (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct,
+ zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG,
+ "If the compression failure rate of a table is greater than this number"
+ " more padding is added to the pages to reduce the failures. A value of"
+ " zero implies no padding",
+ NULL, NULL, 5, 0, 100, 0);
+
+static MYSQL_SYSVAR_ULONG(compression_pad_pct_max,
+ zip_pad_max, PLUGIN_VAR_OPCMDARG,
+ "Percentage of empty space on a data page that can be reserved"
+ " to make the page compressible.",
+ NULL, NULL, 50, 0, 75, 0);
+
+static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Start InnoDB in read only mode (off by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable INFORMATION_SCHEMA.innodb_cmp_per_index, "
+ "may have negative impact on performance (off by default)",
+ NULL, innodb_cmp_per_index_update, FALSE);
+
+#ifdef UNIV_DEBUG
+static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug,
+ PLUGIN_VAR_RQCMDARG,
+ "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()",
+ NULL, NULL, 0, 0, 1024, 0);
+
+static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug,
+ btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG,
+ "Artificially limit the number of records per B-tree page (0=unlimited).",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug,
+ srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDARG,
+ "Pause actual purging any delete-marked records, but merely update the purge view. "
+ "It is to create artificially the situation the purge view have been updated "
+ "but the each purges were not done yet.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(fil_make_page_dirty_debug,
+ srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG,
+ "Make the first page of the given tablespace dirty.",
+ NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_ULONG(saved_page_number_debug,
+ srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG,
+ "An InnoDB page number.",
+ NULL, innodb_save_page_no, 0, 0, UINT_MAX32, 0);
+#endif /* UNIV_DEBUG */
+
+static struct st_mysql_sys_var* innobase_system_variables[]= {
+ MYSQL_SYSVAR(additional_mem_pool_size),
+ MYSQL_SYSVAR(api_trx_level),
+ MYSQL_SYSVAR(api_bk_commit_interval),
+ MYSQL_SYSVAR(autoextend_increment),
+ MYSQL_SYSVAR(buffer_pool_size),
+ MYSQL_SYSVAR(buffer_pool_instances),
+ MYSQL_SYSVAR(buffer_pool_filename),
+ MYSQL_SYSVAR(buffer_pool_dump_now),
+ MYSQL_SYSVAR(buffer_pool_dump_at_shutdown),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(buffer_pool_evict),
+#endif /* UNIV_DEBUG */
+ MYSQL_SYSVAR(buffer_pool_load_now),
+ MYSQL_SYSVAR(buffer_pool_load_abort),
+ MYSQL_SYSVAR(buffer_pool_load_at_startup),
+ MYSQL_SYSVAR(lru_scan_depth),
+ MYSQL_SYSVAR(flush_neighbors),
+ MYSQL_SYSVAR(checksum_algorithm),
+ MYSQL_SYSVAR(checksums),
+ MYSQL_SYSVAR(commit_concurrency),
+ MYSQL_SYSVAR(concurrency_tickets),
+ MYSQL_SYSVAR(compression_level),
+ MYSQL_SYSVAR(data_file_path),
+ MYSQL_SYSVAR(data_home_dir),
+ MYSQL_SYSVAR(doublewrite),
+ MYSQL_SYSVAR(api_enable_binlog),
+ MYSQL_SYSVAR(api_enable_mdl),
+ MYSQL_SYSVAR(api_disable_rowlock),
+ MYSQL_SYSVAR(fast_shutdown),
+ MYSQL_SYSVAR(file_io_threads),
+ MYSQL_SYSVAR(read_io_threads),
+ MYSQL_SYSVAR(write_io_threads),
+ MYSQL_SYSVAR(file_per_table),
+ MYSQL_SYSVAR(file_format),
+ MYSQL_SYSVAR(file_format_check),
+ MYSQL_SYSVAR(file_format_max),
+ MYSQL_SYSVAR(flush_log_at_timeout),
+ MYSQL_SYSVAR(flush_log_at_trx_commit),
+ MYSQL_SYSVAR(flush_method),
+ MYSQL_SYSVAR(force_recovery),
+#ifndef DBUG_OFF
+ MYSQL_SYSVAR(force_recovery_crash),
+#endif /* !DBUG_OFF */
+ MYSQL_SYSVAR(ft_cache_size),
+ MYSQL_SYSVAR(ft_total_cache_size),
+ MYSQL_SYSVAR(ft_result_cache_limit),
+ MYSQL_SYSVAR(ft_enable_stopword),
+ MYSQL_SYSVAR(ft_max_token_size),
+ MYSQL_SYSVAR(ft_min_token_size),
+ MYSQL_SYSVAR(ft_num_word_optimize),
+ MYSQL_SYSVAR(ft_sort_pll_degree),
+ MYSQL_SYSVAR(large_prefix),
+ MYSQL_SYSVAR(force_load_corrupted),
+ MYSQL_SYSVAR(locks_unsafe_for_binlog),
+ MYSQL_SYSVAR(lock_wait_timeout),
+#ifdef UNIV_LOG_ARCHIVE
+ MYSQL_SYSVAR(log_arch_dir),
+ MYSQL_SYSVAR(log_archive),
+#endif /* UNIV_LOG_ARCHIVE */
+ MYSQL_SYSVAR(page_size),
+ MYSQL_SYSVAR(log_buffer_size),
+ MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_files_in_group),
+ MYSQL_SYSVAR(log_group_home_dir),
+ MYSQL_SYSVAR(log_compressed_pages),
+ MYSQL_SYSVAR(max_dirty_pages_pct),
+ MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
+ MYSQL_SYSVAR(adaptive_flushing_lwm),
+ MYSQL_SYSVAR(adaptive_flushing),
+ MYSQL_SYSVAR(flushing_avg_loops),
+ MYSQL_SYSVAR(max_purge_lag),
+ MYSQL_SYSVAR(max_purge_lag_delay),
+ MYSQL_SYSVAR(mirrored_log_groups),
+ MYSQL_SYSVAR(old_blocks_pct),
+ MYSQL_SYSVAR(old_blocks_time),
+ MYSQL_SYSVAR(open_files),
+ MYSQL_SYSVAR(optimize_fulltext_only),
+ MYSQL_SYSVAR(rollback_on_timeout),
+ MYSQL_SYSVAR(ft_aux_table),
+ MYSQL_SYSVAR(ft_enable_diag_print),
+ MYSQL_SYSVAR(ft_server_stopword_table),
+ MYSQL_SYSVAR(ft_user_stopword_table),
+ MYSQL_SYSVAR(disable_sort_file_cache),
+ MYSQL_SYSVAR(stats_on_metadata),
+ MYSQL_SYSVAR(stats_sample_pages),
+ MYSQL_SYSVAR(stats_transient_sample_pages),
+ MYSQL_SYSVAR(stats_persistent),
+ MYSQL_SYSVAR(stats_persistent_sample_pages),
+ MYSQL_SYSVAR(stats_auto_recalc),
+ MYSQL_SYSVAR(adaptive_hash_index),
+ MYSQL_SYSVAR(stats_method),
+ MYSQL_SYSVAR(replication_delay),
+ MYSQL_SYSVAR(status_file),
+ MYSQL_SYSVAR(strict_mode),
+ MYSQL_SYSVAR(support_xa),
+ MYSQL_SYSVAR(sort_buffer_size),
+ MYSQL_SYSVAR(online_alter_log_max_size),
+ MYSQL_SYSVAR(sync_spin_loops),
+ MYSQL_SYSVAR(spin_wait_delay),
+ MYSQL_SYSVAR(table_locks),
+ MYSQL_SYSVAR(thread_concurrency),
+#ifdef HAVE_ATOMIC_BUILTINS
+ MYSQL_SYSVAR(adaptive_max_sleep_delay),
+#endif /* HAVE_ATOMIC_BUILTINS */
+ MYSQL_SYSVAR(thread_sleep_delay),
+ MYSQL_SYSVAR(autoinc_lock_mode),
+ MYSQL_SYSVAR(version),
+ MYSQL_SYSVAR(use_sys_malloc),
+ MYSQL_SYSVAR(use_native_aio),
+ MYSQL_SYSVAR(change_buffering),
+ MYSQL_SYSVAR(change_buffer_max_size),
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ MYSQL_SYSVAR(change_buffering_debug),
+ MYSQL_SYSVAR(disable_background_merge),
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ MYSQL_SYSVAR(random_read_ahead),
+ MYSQL_SYSVAR(read_ahead_threshold),
+ MYSQL_SYSVAR(read_only),
+ MYSQL_SYSVAR(io_capacity),
+ MYSQL_SYSVAR(io_capacity_max),
+ MYSQL_SYSVAR(monitor_enable),
+ MYSQL_SYSVAR(monitor_disable),
+ MYSQL_SYSVAR(monitor_reset),
+ MYSQL_SYSVAR(monitor_reset_all),
+ MYSQL_SYSVAR(purge_threads),
+ MYSQL_SYSVAR(purge_batch_size),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(purge_run_now),
+ MYSQL_SYSVAR(purge_stop_now),
+ MYSQL_SYSVAR(log_checkpoint_now),
+ MYSQL_SYSVAR(buf_flush_list_now),
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_PERF_DEBUG
+ MYSQL_SYSVAR(page_hash_locks),
+ MYSQL_SYSVAR(doublewrite_batch_size),
+#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+ MYSQL_SYSVAR(status_output),
+ MYSQL_SYSVAR(status_output_locks),
+ MYSQL_SYSVAR(print_all_deadlocks),
+ MYSQL_SYSVAR(cmp_per_index_enabled),
+ MYSQL_SYSVAR(undo_logs),
+ MYSQL_SYSVAR(rollback_segments),
+ MYSQL_SYSVAR(undo_directory),
+ MYSQL_SYSVAR(undo_tablespaces),
+ MYSQL_SYSVAR(sync_array_size),
+ MYSQL_SYSVAR(compression_failure_threshold_pct),
+ MYSQL_SYSVAR(compression_pad_pct_max),
+#ifdef UNIV_DEBUG
+ MYSQL_SYSVAR(trx_rseg_n_slots_debug),
+ MYSQL_SYSVAR(limit_optimistic_insert_debug),
+ MYSQL_SYSVAR(trx_purge_view_update_only_debug),
+ MYSQL_SYSVAR(fil_make_page_dirty_debug),
+ MYSQL_SYSVAR(saved_page_number_debug),
+#endif /* UNIV_DEBUG */
+ NULL
+};
+
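+/* Editor's note: innobase_system_variables[] must remain
+NULL-terminated; MySQL walks the array until it hits the NULL sentinel.
+Registering a new option therefore means both defining the sysvar and
+appending it before the terminator. A hypothetical sketch, not part of
+this patch:
+
+static my_bool srv_example_feature = FALSE;
+
+static MYSQL_SYSVAR_BOOL(example_feature, srv_example_feature,
+	PLUGIN_VAR_OPCMDARG,
+	"Enable a hypothetical example feature",
+	NULL, NULL, FALSE);
+
+static struct st_mysql_sys_var* example_system_variables[] = {
+	MYSQL_SYSVAR(example_feature),
+	NULL	// the sentinel must stay last
+};
+*/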
+mysql_declare_plugin(innobase)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &innobase_storage_engine,
+ innobase_hton_name,
+ plugin_author,
+ "Supports transactions, row-level locking, and foreign keys",
+ PLUGIN_LICENSE_GPL,
+ innobase_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ INNODB_VERSION_SHORT,
+ innodb_status_variables_export,/* status variables */
+ innobase_system_variables, /* system variables */
+ NULL, /* reserved */
+ 0, /* flags */
+},
+i_s_innodb_trx,
+i_s_innodb_locks,
+i_s_innodb_lock_waits,
+i_s_innodb_cmp,
+i_s_innodb_cmp_reset,
+i_s_innodb_cmpmem,
+i_s_innodb_cmpmem_reset,
+i_s_innodb_cmp_per_index,
+i_s_innodb_cmp_per_index_reset,
+i_s_innodb_buffer_page,
+i_s_innodb_buffer_page_lru,
+i_s_innodb_buffer_stats,
+i_s_innodb_metrics,
+i_s_innodb_ft_default_stopword,
+i_s_innodb_ft_deleted,
+i_s_innodb_ft_being_deleted,
+i_s_innodb_ft_config,
+i_s_innodb_ft_index_cache,
+i_s_innodb_ft_index_table,
+i_s_innodb_sys_tables,
+i_s_innodb_sys_tablestats,
+i_s_innodb_sys_indexes,
+i_s_innodb_sys_columns,
+i_s_innodb_sys_fields,
+i_s_innodb_sys_foreign,
+i_s_innodb_sys_foreign_cols,
+i_s_innodb_sys_tablespaces,
+i_s_innodb_sys_datafiles
+
+mysql_declare_plugin_end;
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default()
+/*======================================*/
+{
+ MYSQL_SYSVAR_NAME(commit_concurrency).def_val
+ = innobase_commit_concurrency;
+}
+
+/** @brief Initialize the default and max value of innodb_undo_logs.
+
+Once InnoDB is running, the default value and the max value of
+innodb_undo_logs must be equal to the available undo logs,
+given by srv_available_undo_logs. */
+static
+void
+innobase_undo_logs_init_default_max()
+/*=================================*/
+{
+ MYSQL_SYSVAR_NAME(undo_logs).max_val
+ = MYSQL_SYSVAR_NAME(undo_logs).def_val
+ = static_cast<unsigned long>(srv_available_undo_logs);
+}
+
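+/* Editor's note: MYSQL_SYSVAR_NAME(x) expands to the generated sysvar
+descriptor (mysql_sysvar_x), so the two helpers above patch the
+descriptor's def_val and max_val fields in place before the variable is
+first used. A hypothetical sketch of the call site during startup:
+
+	innobase_commit_concurrency_init_default();
+	innobase_undo_logs_init_default_max();
+*/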
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+struct innobase_convert_name_test_t {
+ char* buf;
+ ulint buflen;
+ const char* id;
+ ulint idlen;
+ void* thd;
+ ibool file_id;
+
+ const char* expected;
+};
+
+void
+test_innobase_convert_name()
+{
+ char buf[1024];
+ ulint i;
+
+ innobase_convert_name_test_t test_input[] = {
+ {buf, sizeof(buf), "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 7, "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 6, "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 5, "abcd", 4, NULL, TRUE, "\"abc\""},
+ {buf, 4, "abcd", 4, NULL, TRUE, "\"ab\""},
+
+ {buf, sizeof(buf), "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 9, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 8, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 7, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 6, "ab@0060cd", 9, NULL, TRUE, "\"ab`c\""},
+ {buf, 5, "ab@0060cd", 9, NULL, TRUE, "\"ab`\""},
+ {buf, 4, "ab@0060cd", 9, NULL, TRUE, "\"ab\""},
+
+ {buf, sizeof(buf), "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"cd\""},
+ {buf, 17, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"cd\""},
+ {buf, 16, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"c\""},
+ {buf, 15, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"\""},
+ {buf, 14, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\""},
+ {buf, 13, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\""},
+ {buf, 12, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#a\""},
+ {buf, 11, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#\""},
+ {buf, 10, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50\""},
+
+ {buf, sizeof(buf), "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+ {buf, 9, "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+ {buf, 8, "ab/cd", 5, NULL, TRUE, "\"ab\".\"c\""},
+ {buf, 7, "ab/cd", 5, NULL, TRUE, "\"ab\".\"\""},
+ {buf, 6, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+ {buf, 5, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+ {buf, 4, "ab/cd", 5, NULL, TRUE, "\"ab\""},
+ {buf, 3, "ab/cd", 5, NULL, TRUE, "\"a\""},
+ {buf, 2, "ab/cd", 5, NULL, TRUE, "\"\""},
+ /* XXX probably "" is a better result in this case
+ {buf, 1, "ab/cd", 5, NULL, TRUE, "."},
+ */
+ {buf, 0, "ab/cd", 5, NULL, TRUE, ""},
+ };
+
+ for (i = 0; i < sizeof(test_input) / sizeof(test_input[0]); i++) {
+
+ char* end;
+ ibool ok = TRUE;
+ size_t res_len;
+
+ fprintf(stderr, "TESTING %lu, %s, %lu, %s\n",
+ test_input[i].buflen,
+ test_input[i].id,
+ test_input[i].idlen,
+ test_input[i].expected);
+
+ end = innobase_convert_name(
+ test_input[i].buf,
+ test_input[i].buflen,
+ test_input[i].id,
+ test_input[i].idlen,
+ test_input[i].thd,
+ test_input[i].file_id);
+
+ res_len = (size_t) (end - test_input[i].buf);
+
+ if (res_len != strlen(test_input[i].expected)) {
+
+ fprintf(stderr, "unexpected len of the result: %u, "
+ "expected: %u\n", (unsigned) res_len,
+ (unsigned) strlen(test_input[i].expected));
+ ok = FALSE;
+ }
+
+ if (memcmp(test_input[i].buf,
+ test_input[i].expected,
+ strlen(test_input[i].expected)) != 0
+ || !ok) {
+
+ fprintf(stderr, "unexpected result: %.*s, "
+ "expected: %s\n", (int) res_len,
+ test_input[i].buf,
+ test_input[i].expected);
+ ok = FALSE;
+ }
+
+ if (ok) {
+ fprintf(stderr, "OK: res: %.*s\n\n", (int) res_len,
+ buf);
+ } else {
+ fprintf(stderr, "FAILED\n\n");
+ return;
+ }
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+/****************************************************************************
+ * DS-MRR implementation
+ ***************************************************************************/
+
+/**
+ * Multi Range Read interface, DS-MRR calls
+ */
+
+int
+ha_innobase::multi_range_read_init(
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint mode,
+ HANDLER_BUFFER* buf)
+{
+ return(ds_mrr.dsmrr_init(this, seq, seq_init_param,
+ n_ranges, mode, buf));
+}
+
+int
+ha_innobase::multi_range_read_next(
+ char** range_info)
+{
+ return(ds_mrr.dsmrr_next(range_info));
+}
+
+ha_rows
+ha_innobase::multi_range_read_info_const(
+ uint keyno,
+ RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges,
+ uint* bufsz,
+ uint* flags,
+ Cost_estimate* cost)
+{
+ /* See comments in ha_myisam::multi_range_read_info_const */
+ ds_mrr.init(this, table);
+ return(ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param,
+ n_ranges, bufsz, flags, cost));
+}
+
+ha_rows
+ha_innobase::multi_range_read_info(
+ uint keyno,
+ uint n_ranges,
+ uint keys,
+ uint* bufsz,
+ uint* flags,
+ Cost_estimate* cost)
+{
+ ds_mrr.init(this, table);
+ return(ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost));
+}
+
+
+/**
+ * Index Condition Pushdown interface implementation
+ */
+
+/*************************************************************//**
+InnoDB index push-down condition check
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+ void* file) /*!< in/out: pointer to ha_innobase */
+{
+ DBUG_ENTER("innobase_index_cond");
+
+ ha_innobase* h = reinterpret_cast<class ha_innobase*>(file);
+
+ DBUG_ASSERT(h->pushed_idx_cond);
+ DBUG_ASSERT(h->pushed_idx_cond_keyno != MAX_KEY);
+
+ if (h->end_range && h->compare_key_icp(h->end_range) > 0) {
+
+ /* caller should return HA_ERR_END_OF_FILE already */
+ DBUG_RETURN(ICP_OUT_OF_RANGE);
+ }
+
+ DBUG_RETURN(h->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH);
+}
+
+/** Attempt to push down an index condition.
+* @param[in] keyno MySQL key number
+* @param[in] idx_cond Index condition to be checked
+* @return Part of idx_cond which the handler will not evaluate
+*/
+UNIV_INTERN
+class Item*
+ha_innobase::idx_cond_push(
+ uint keyno,
+ class Item* idx_cond)
+{
+ DBUG_ENTER("ha_innobase::idx_cond_push");
+ DBUG_ASSERT(keyno != MAX_KEY);
+ DBUG_ASSERT(idx_cond != NULL);
+
+ pushed_idx_cond = idx_cond;
+ pushed_idx_cond_keyno = keyno;
+ in_range_check_pushed_down = TRUE;
+ /* We will evaluate the condition entirely */
+ DBUG_RETURN(NULL);
+}
+
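+/* Editor's note: returning NULL from idx_cond_push() tells the server
+that the handler will evaluate the entire pushed condition, as above. A
+handler capable of only partial pushdown would keep the part it can
+evaluate and hand the remainder back. A hypothetical sketch; ha_example
+and split_pushable_condition() are made-up names:
+
+class Item*
+ha_example::idx_cond_push(uint keyno, class Item* idx_cond)
+{
+	class Item*	remainder = NULL;
+
+	pushed_idx_cond = split_pushable_condition(idx_cond, &remainder);
+	pushed_idx_cond_keyno = keyno;
+
+	return(remainder);	// the server re-checks this part itself
+}
+*/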
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_senderrf(
+/*========*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ ...) /*!< Args */
+{
+ char* str;
+ va_list args;
+ const char* format = innobase_get_err_msg(code);
+
+ /* If the caller wants to push a message to the client then
+ the caller must pass a valid session handle. */
+
+ ut_a(thd != 0);
+
+ /* The error code must exist in the errmsg-utf8.txt file. */
+ ut_a(format != 0);
+
+ va_start(args, code);
+
+#ifdef __WIN__
+ int size = _vscprintf(format, args) + 1;
+ str = static_cast<char*>(malloc(size));
+ str[size - 1] = 0x0;
+ vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+ int ret;
+ ret = vasprintf(&str, format, args);
+ ut_a(ret != -1);
+#else
+ /* Use a fixed length string. */
+ str = static_cast<char*>(malloc(BUFSIZ));
+ my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+ Sql_condition::enum_warning_level l;
+
+ l = Sql_condition::WARN_LEVEL_NOTE;
+
+ switch(level) {
+ case IB_LOG_LEVEL_INFO:
+ break;
+ case IB_LOG_LEVEL_WARN:
+ l = Sql_condition::WARN_LEVEL_WARN;
+ break;
+ case IB_LOG_LEVEL_ERROR:
+ /* We can't use push_warning_printf(), it is a hard error. */
+ my_printf_error(code, "%s", MYF(0), str);
+ break;
+ case IB_LOG_LEVEL_FATAL:
+ l = Sql_condition::WARN_LEVEL_END;
+ break;
+ }
+
+ if (level != IB_LOG_LEVEL_ERROR) {
+ push_warning_printf(thd, l, code, "InnoDB: %s", str);
+ }
+
+ va_end(args);
+ free(str);
+
+ if (level == IB_LOG_LEVEL_FATAL) {
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_errf(
+/*====*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+{
+ char* str;
+ va_list args;
+
+ /* If the caller wants to push a message to the client then
+ the caller must pass a valid session handle. */
+
+ ut_a(thd != 0);
+ ut_a(format != 0);
+
+ va_start(args, format);
+
+#ifdef __WIN__
+ int size = _vscprintf(format, args) + 1;
+ str = static_cast<char*>(malloc(size));
+ str[size - 1] = 0x0;
+ vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+ int ret;
+ ret = vasprintf(&str, format, args);
+ ut_a(ret != -1);
+#else
+ /* Use a fixed length string. */
+ str = static_cast<char*>(malloc(BUFSIZ));
+ my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+ ib_senderrf(thd, level, code, str);
+
+ va_end(args);
+ free(str);
+}
+
+/******************************************************************//**
+Write a message to the MySQL log, prefixed with "InnoDB: " */
+UNIV_INTERN
+void
+ib_logf(
+/*====*/
+ ib_log_level_t level, /*!< in: warning level */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+{
+ char* str;
+ va_list args;
+
+ va_start(args, format);
+
+#ifdef __WIN__
+ int size = _vscprintf(format, args) + 1;
+ str = static_cast<char*>(malloc(size));
+ str[size - 1] = 0x0;
+ vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+ int ret;
+ ret = vasprintf(&str, format, args);
+ ut_a(ret != -1);
+#else
+ /* Use a fixed length string. */
+ str = static_cast<char*>(malloc(BUFSIZ));
+ my_vsnprintf(str, BUFSIZ, format, args);
+#endif /* __WIN__ */
+
+ switch(level) {
+ case IB_LOG_LEVEL_INFO:
+ sql_print_information("InnoDB: %s", str);
+ break;
+ case IB_LOG_LEVEL_WARN:
+ sql_print_warning("InnoDB: %s", str);
+ break;
+ case IB_LOG_LEVEL_ERROR:
+ sql_print_error("InnoDB: %s", str);
+ break;
+ case IB_LOG_LEVEL_FATAL:
+ sql_print_error("InnoDB: %s", str);
+ break;
+ }
+
+ va_end(args);
+ free(str);
+
+ if (level == IB_LOG_LEVEL_FATAL) {
+ ut_error;
+ }
+}
+
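+/* Editor's note: ib_logf() takes an ordinary printf-style format
+string, so a typical call looks like the following (the message text
+and arguments are made up for illustration):
+
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"Retrying read of page %lu in tablespace %s",
+		(ulong) page_no, space_name);
+
+Note that IB_LOG_LEVEL_FATAL additionally triggers ut_error after the
+message is written, as the code above shows. */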
+/**********************************************************************
+Converts an identifier from the UTF-8 system charset to my_charset_filename.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len) /* in: length of 'to', in bytes */
+{
+ uint errors;
+ CHARSET_INFO* cs_to = &my_charset_filename;
+ CHARSET_INFO* cs_from = system_charset_info;
+
+ return(strconvert(
+ cs_from, from, cs_to, to, static_cast<uint>(len), &errors));
+}
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors) /* out: error return */
+{
+ CHARSET_INFO* cs1 = &my_charset_filename;
+ CHARSET_INFO* cs2 = system_charset_info;
+
+ return(strconvert(
+ cs1, from, cs2, to, static_cast<uint>(len), errors));
+}
+
+/**********************************************************************
+Issue a warning that the row is too big. */
+void
+ib_warn_row_too_big(const dict_table_t* table)
+{
+ /* If prefix is true then a 768-byte prefix is stored
+ locally for BLOB fields. Refer to dict_table_get_format() */
+ const bool prefix = (dict_tf_get_format(table->flags)
+ == UNIV_FORMAT_A);
+
+ const ulint free_space = page_get_free_space_of_empty(
+ table->flags & DICT_TF_COMPACT) / 2;
+
+ THD* thd = current_thd;
+
+ push_warning_printf(
+ thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW,
+ "Row size too large (> %lu). Changing some columns to TEXT"
+ " or BLOB %smay help. In current row format, BLOB prefix of"
+ " %d bytes is stored inline.", free_space
+ , prefix ? "or using ROW_FORMAT=DYNAMIC or"
+ " ROW_FORMAT=COMPRESSED ": ""
+ , prefix ? DICT_MAX_FIXED_COL_LEN : 0);
+}
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
new file mode 100644
index 00000000000..f735b6fef2d
--- /dev/null
+++ b/storage/innobase/handler/ha_innodb.h
@@ -0,0 +1,642 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+ This file is based on ha_berkeley.h of MySQL distribution
+
+ This file defines the Innodb handler: the interface between MySQL and
+ Innodb
+*/
+
+#include "dict0stats.h"
+
+/* This structure defines the translation table between the MySQL
+index and InnoDB index structures */
+struct innodb_idx_translate_t {
+ ulint index_count; /*!< number of valid index entries
+ in the index_mapping array */
+ ulint array_size; /*!< array size of index_mapping */
+ dict_index_t** index_mapping; /*!< index pointer array directly
+ maps to index in Innodb from MySQL
+ array index */
+};
+
+
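+/* Editor's note: the translation table above lets the handler map a
+MySQL key number directly to a dict_index_t pointer without a name
+lookup. A minimal hedged sketch of such a lookup, assuming
+index_mapping has been populated (the function name is hypothetical):
+
+static dict_index_t*
+example_translate_index(
+	const innodb_idx_translate_t*	t,
+	uint				keynr)
+{
+	if (t->index_mapping != NULL && keynr < t->index_count) {
+		return(t->index_mapping[keynr]);
+	}
+
+	return(NULL);	// caller falls back to a name-based lookup
+}
+*/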
+/** InnoDB table share */
+typedef struct st_innobase_share {
+ THR_LOCK lock; /*!< MySQL lock protecting
+ this structure */
+ const char* table_name; /*!< InnoDB table name */
+ uint use_count; /*!< reference count,
+ incremented in get_share()
+ and decremented in
+ free_share() */
+ void* table_name_hash;/*!< hash table chain node */
+ innodb_idx_translate_t idx_trans_tbl; /*!< index translation
+ table between MySQL and
+ Innodb */
+} INNOBASE_SHARE;
+
+
+/** Prebuilt structures in an InnoDB table handle used within MySQL */
+struct row_prebuilt_t;
+
+/** The class defining a handle to an Innodb table */
+class ha_innobase: public handler
+{
+ row_prebuilt_t* prebuilt; /*!< prebuilt struct in InnoDB, used
+ to save CPU time with prebuilt data
+ structures*/
+ THD* user_thd; /*!< the thread handle of the user
+ currently using the handle; this is
+ set in external_lock function */
+ THR_LOCK_DATA lock;
+ INNOBASE_SHARE* share; /*!< information for MySQL
+ table locking */
+
+ uchar* upd_buf; /*!< buffer used in updates */
+ ulint upd_buf_size; /*!< the size of upd_buf in bytes */
+ Table_flags int_table_flags;
+ uint primary_key;
+ ulong start_of_scan; /*!< this is set to 1 when we are
+ starting a table scan but have not
+ yet fetched any row, else 0 */
+ uint last_match_mode;/* match mode of the latest search:
+ ROW_SEL_EXACT, ROW_SEL_EXACT_PREFIX,
+ or undefined */
+ uint num_write_row; /*!< number of write_row() calls */
+
+ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
+ const uchar* record);
+ inline void update_thd(THD* thd);
+ void update_thd();
+ int change_active_index(uint keynr);
+ int general_fetch(uchar* buf, uint direction, uint match_mode);
+ dberr_t innobase_lock_autoinc();
+ ulonglong innobase_peek_autoinc();
+ dberr_t innobase_set_max_autoinc(ulonglong auto_inc);
+ dberr_t innobase_reset_autoinc(ulonglong auto_inc);
+ dberr_t innobase_get_autoinc(ulonglong* value);
+ void innobase_initialize_autoinc();
+ dict_index_t* innobase_get_index(uint keynr);
+
+ /* Init values for the class: */
+ public:
+ ha_innobase(handlerton *hton, TABLE_SHARE *table_arg);
+ ~ha_innobase();
+ /*
+ Get the row type from the storage engine. If this method returns
+ ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used.
+ */
+ enum row_type get_row_type() const;
+
+ const char* table_type() const;
+ const char* index_type(uint key_number);
+ const char** bas_ext() const;
+ Table_flags table_flags() const;
+ ulong index_flags(uint idx, uint part, bool all_parts) const;
+ uint max_supported_keys() const;
+ uint max_supported_key_length() const;
+ uint max_supported_key_part_length() const;
+ const key_map* keys_to_use_for_scanning();
+
+ int open(const char *name, int mode, uint test_if_locked);
+ handler* clone(const char *name, MEM_ROOT *mem_root);
+ int close(void);
+ double scan_time();
+ double read_time(uint index, uint ranges, ha_rows rows);
+ longlong get_memory_buffer_size() const;
+
+ int write_row(uchar * buf);
+ int update_row(const uchar * old_data, uchar * new_data);
+ int delete_row(const uchar * buf);
+ bool was_semi_consistent_read();
+ void try_semi_consistent_read(bool yes);
+ void unlock_row();
+
+ int index_init(uint index, bool sorted);
+ int index_end();
+ int index_read(uchar * buf, const uchar * key,
+ uint key_len, enum ha_rkey_function find_flag);
+ int index_read_idx(uchar * buf, uint index, const uchar * key,
+ uint key_len, enum ha_rkey_function find_flag);
+ int index_read_last(uchar * buf, const uchar * key, uint key_len);
+ int index_next(uchar * buf);
+ int index_next_same(uchar * buf, const uchar *key, uint keylen);
+ int index_prev(uchar * buf);
+ int index_first(uchar * buf);
+ int index_last(uchar * buf);
+
+ int rnd_init(bool scan);
+ int rnd_end();
+ int rnd_next(uchar *buf);
+ int rnd_pos(uchar * buf, uchar *pos);
+
+ int ft_init();
+ void ft_end();
+ FT_INFO *ft_init_ext(uint flags, uint inx, String* key);
+ int ft_read(uchar* buf);
+
+ void position(const uchar *record);
+ int info(uint);
+	int analyze(THD* thd, HA_CHECK_OPT* check_opt);
+	int optimize(THD* thd, HA_CHECK_OPT* check_opt);
+ int discard_or_import_tablespace(my_bool discard);
+ int extra(enum ha_extra_function operation);
+ int reset();
+ int external_lock(THD *thd, int lock_type);
+ int transactional_table_lock(THD *thd, int lock_type);
+ int start_stmt(THD *thd, thr_lock_type lock_type);
+ ha_rows records_in_range(uint inx, key_range *min_key, key_range
+ *max_key);
+ ha_rows estimate_rows_upper_bound();
+
+ void update_create_info(HA_CREATE_INFO* create_info);
+ int parse_table_name(const char*name,
+ HA_CREATE_INFO* create_info,
+ ulint flags,
+ ulint flags2,
+ char* norm_name,
+ char* temp_path,
+ char* remote_path);
+ int create(const char *name, register TABLE *form,
+ HA_CREATE_INFO *create_info);
+ int truncate();
+ int delete_table(const char *name);
+ int rename_table(const char* from, const char* to);
+ int check(THD* thd, HA_CHECK_OPT* check_opt);
+ char* update_table_comment(const char* comment);
+ char* get_foreign_key_create_info();
+ int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list);
+ int get_parent_foreign_key_list(THD *thd,
+ List<FOREIGN_KEY_INFO> *f_key_list);
+ bool can_switch_engines();
+ uint referenced_by_foreign_key();
+ void free_foreign_key_create_info(char* str);
+ THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
+ enum thr_lock_type lock_type);
+ void init_table_handle_for_HANDLER();
+ virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values);
+ int reset_auto_increment(ulonglong value);
+
+ virtual bool get_error_message(int error, String *buf);
+ virtual bool get_foreign_dup_key(char*, uint, char*, uint);
+ uint8 table_cache_type();
+ /*
+ ask handler about permission to cache table during query registration
+ */
+ my_bool register_query_cache_table(THD *thd, char *table_key,
+ uint key_length,
+ qc_engine_callback *call_back,
+ ulonglong *engine_data);
+ static const char *get_mysql_bin_log_name();
+ static ulonglong get_mysql_bin_log_pos();
+ bool primary_key_is_clustered();
+ int cmp_ref(const uchar *ref1, const uchar *ref2);
+ /** On-line ALTER TABLE interface @see handler0alter.cc @{ */
+
+ /** Check if InnoDB supports a particular alter table in-place
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported
+ @retval HA_ALTER_INPLACE_NO_LOCK Supported
+ @retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE
+ Supported, but requires lock
+ during main phase and exclusive
+ lock during prepare phase.
+ @retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+ Supported, prepare phase
+ requires exclusive lock.
+ */
+ enum_alter_inplace_result check_if_supported_inplace_alter(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info);
+ /** Allows InnoDB to update internal structures with concurrent
+ writes blocked (provided that check_if_supported_inplace_alter()
+ did not return HA_ALTER_INPLACE_NO_LOCK).
+ This will be invoked before inplace_alter_table().
+
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval true Failure
+ @retval false Success
+ */
+ bool prepare_inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info);
+
+ /** Alter the table structure in-place with operations
+ specified using HA_ALTER_FLAGS and Alter_inplace_information.
+ The level of concurrency allowed during this operation depends
+ on the return value from check_if_supported_inplace_alter().
+
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+
+ @retval true Failure
+ @retval false Success
+ */
+ bool inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info);
+
+ /** Commit or rollback the changes made during
+ prepare_inplace_alter_table() and inplace_alter_table() inside
+ the storage engine. Note that the allowed level of concurrency
+ during this operation will be the same as for
+ inplace_alter_table() and thus might be higher than during
+	prepare_inplace_alter_table(). (E.g. concurrent writes were
+ blocked during prepare, but might not be during commit).
+ @param altered_table TABLE object for new version of table.
+ @param ha_alter_info Structure describing changes to be done
+ by ALTER TABLE and holding data used during in-place alter.
+ @param commit true => Commit, false => Rollback.
+ @retval true Failure
+ @retval false Success
+ */
+ bool commit_inplace_alter_table(
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info,
+ bool commit);
+ /** @} */
+ bool check_if_incompatible_data(HA_CREATE_INFO *info,
+ uint table_changes);
+private:
+	/** Builds a 'template' in the prebuilt struct.
+
+ The template is used in fast retrieval of just those column
+ values MySQL needs in its processing.
+ @param whole_row true if access is needed to a whole row,
+ false if accessing individual fields is enough */
+ void build_template(bool whole_row);
+ /** Resets a query execution 'template'.
+ @see build_template() */
+ inline void reset_template();
+
+ int info_low(uint, bool);
+
+public:
+ /** @name Multi Range Read interface @{ */
+ /** Initialize multi range read @see DsMrr_impl::dsmrr_init
+ * @param seq
+ * @param seq_init_param
+ * @param n_ranges
+ * @param mode
+ * @param buf
+ */
+ int multi_range_read_init(RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges, uint mode,
+ HANDLER_BUFFER* buf);
+ /** Process next multi range read @see DsMrr_impl::dsmrr_next
+ * @param range_info
+ */
+ int multi_range_read_next(char** range_info);
+ /** Initialize multi range read and get information.
+ * @see ha_myisam::multi_range_read_info_const
+ * @see DsMrr_impl::dsmrr_info_const
+ * @param keyno
+ * @param seq
+ * @param seq_init_param
+ * @param n_ranges
+ * @param bufsz
+ * @param flags
+ * @param cost
+ */
+ ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF* seq,
+ void* seq_init_param,
+ uint n_ranges, uint* bufsz,
+ uint* flags, Cost_estimate* cost);
+ /** Initialize multi range read and get information.
+ * @see DsMrr_impl::dsmrr_info
+ * @param keyno
+ * @param seq
+ * @param seq_init_param
+ * @param n_ranges
+ * @param bufsz
+ * @param flags
+ * @param cost
+ */
+ ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+ uint* bufsz, uint* flags,
+ Cost_estimate* cost);
+
+ /** Attempt to push down an index condition.
+ * @param[in] keyno MySQL key number
+ * @param[in] idx_cond Index condition to be checked
+ * @return idx_cond if pushed; NULL if not pushed
+ */
+ class Item* idx_cond_push(uint keyno, class Item* idx_cond);
+
+private:
+ /** The multi range read session object */
+ DsMrr_impl ds_mrr;
+ /* @} */
+};
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+cannot be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+LEX_STRING* thd_query_string(MYSQL_THD thd);
+
+extern "C" {
+
+struct charset_info_st *thd_charset(MYSQL_THD thd);
+
+/**
+ Check if a user thread is a replication slave thread
+ @param thd user thread
+ @retval 0 the user thread is not a replication slave thread
+ @retval 1 the user thread is a replication slave thread
+*/
+int thd_slave_thread(const MYSQL_THD thd);
+
+/**
+ Check if a user thread is running a non-transactional update
+ @param thd user thread
+ @retval 0 the user thread is not running a non-transactional update
+ @retval 1 the user thread is running a non-transactional update
+*/
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/**
+ Get the user thread's binary logging format
+ @param thd user thread
+ @return Value to be used as index into the binlog_format_names array
+*/
+int thd_binlog_format(const MYSQL_THD thd);
+
+/**
+ Mark transaction to rollback and mark error as fatal to a sub-statement.
+ @param thd Thread handle
+ @param all TRUE <=> rollback main transaction.
+*/
+void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
+
+/**
+ Check if binary logging is filtered for thread's current db.
+ @param thd Thread handle
+ @retval 1 the query is not filtered, 0 otherwise.
+*/
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+
+/**
+  may end up in the binary log.
+ may end up in the binary.
+ @param thd Thread handle
+ @return 1 the query may generate row changes, 0 otherwise.
+*/
+bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
+
+/**
+ Gets information on the durability property requested by
+ a thread.
+ @param thd Thread handle
+ @return a durability property.
+*/
+enum durability_properties thd_get_durability_property(const MYSQL_THD thd);
+
+/** Get the auto_increment_offset and auto_increment_increment.
+@param thd Thread object
+@param off auto_increment_offset
+@param inc auto_increment_increment */
+void thd_get_autoinc(const MYSQL_THD thd, ulong* off, ulong* inc)
+__attribute__((nonnull));
+
+/** Check whether strict sql_mode is set.
+@param thd Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise.
+*/
+bool thd_is_strict_mode(const MYSQL_THD thd)
+__attribute__((nonnull));
+} /* extern "C" */
+
+struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/* Structure returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+ struct _ft_vft *please;
+ struct _ft_vft_ext *could_you;
+ row_prebuilt_t* ft_prebuilt;
+ fts_result_t* ft_result;
+} NEW_FT_INFO;
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ MYSQL_THD thd); /*!< in: user thread handle */
+
+/*********************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client
+and returns true.
+@return true if the index name matches the reserved name */
+UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+ __attribute__((nonnull, warn_unused_result));
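+
+/* For illustration (a hedged example, not part of the original
+declaration): a statement such as
+
+	CREATE TABLE t (a INT, INDEX GEN_CLUST_INDEX (a)) ENGINE=InnoDB;
+
+should make innobase_index_name_is_reserved() push a warning to the
+client and return true, because the name is reserved for the implicit
+clustered index. */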
+
+/*****************************************************************//**
+Determines InnoDB table flags.
+@retval true if successful, false if error */
+UNIV_INTERN
+bool
+innobase_table_flags(
+/*=================*/
+ const TABLE* form, /*!< in: table */
+ const HA_CREATE_INFO* create_info, /*!< in: information
+ on table columns and indexes */
+ THD* thd, /*!< in: connection */
+ bool use_tablespace, /*!< in: whether to create
+ outside system tablespace */
+ ulint* flags, /*!< out: DICT_TF flags */
+ ulint* flags2) /*!< out: DICT_TF2 flags */
+ __attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in the future. For now, it checks two specifiers:
+KEY_BLOCK_SIZE and ROW_FORMAT.
+If innodb_strict_mode is not set, then this function is a no-op.
+@return NULL if valid, error string if not. */
+UNIV_INTERN
+const char*
+create_options_are_invalid(
+/*=======================*/
+ THD* thd, /*!< in: connection thread. */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info, /*!< in: create info. */
+ bool use_tablespace) /*!< in: srv_file_per_table */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_retrieve_ranking(
+/*==========================*/
+ FT_INFO* fts_hdl); /*!< in: FTS handler */
+
+/*********************************************************************//**
+Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of prebuilt->fts_doc_id
+@return the relevance ranking value */
+UNIV_INTERN
+float
+innobase_fts_find_ranking(
+/*======================*/
+ FT_INFO* fts_hdl, /*!< in: FTS handler */
+ uchar* record, /*!< in: Unused */
+ uint len); /*!< in: Unused */
+/*********************************************************************//**
+Free the memory for the FTS handler */
+UNIV_INTERN
+void
+innobase_fts_close_ranking(
+/*=======================*/
+ FT_INFO* fts_hdl) /*!< in: FTS handler */
+ __attribute__((nonnull));
+/*****************************************************************//**
+Initialize the table FTS stopword list
+@return TRUE on success */
+UNIV_INTERN
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: Table has the FTS */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+ __attribute__((nonnull(1,3), warn_unused_result));
+
+/** Possible return values of innobase_fts_check_doc_id_index() */
+enum fts_doc_id_index_enum {
+ FTS_INCORRECT_DOC_ID_INDEX,
+ FTS_EXIST_DOC_ID_INDEX,
+ FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+ const dict_table_t* table, /*!< in: table definition */
+ const TABLE* altered_table, /*!< in: MySQL table
+ that is being altered */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID */
+ __attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if the FTS_DOC_ID index exists,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of the wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definitions */
+ __attribute__((nonnull, warn_unused_result));
+
+/***********************************************************************
+@return version of the extended FTS API */
+uint
+innobase_fts_get_version();
+
+/***********************************************************************
+@return Which part of the extended FTS API is supported */
+ulonglong
+innobase_fts_flags();
+
+/***********************************************************************
+Find and Retrieve the FTS doc_id for the current result row
+@return the document ID */
+ulonglong
+innobase_fts_retrieve_docid(
+/*============================*/
+ FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */
+
+/***********************************************************************
+Find and retrieve the size of the current result
+@return number of matching rows */
+ulonglong
+innobase_fts_count_matches(
+/*============================*/
+ FT_INFO_EXT* fts_hdl); /*!< in: FTS handler */
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+extern const char innobase_index_reserve_name[];
+
+/*********************************************************************//**
+Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object.
+These flags are stored in the .frm file and end up in the MySQL table
+object, but are frequently used inside InnoDB, so we keep copies of
+them in the InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_create_info(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const HA_CREATE_INFO* create_info); /*!< in: create info */
+
+/*********************************************************************//**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+These flags are stored in the .frm file and end up in the MySQL table
+object, but are frequently used inside InnoDB, so we keep copies of
+them in the InnoDB table object. */
+UNIV_INTERN
+void
+innobase_copy_frm_flags_from_table_share(
+/*=====================================*/
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share); /*!< in: table share */
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
new file mode 100644
index 00000000000..19812ce12f2
--- /dev/null
+++ b/storage/innobase/handler/handler0alter.cc
@@ -0,0 +1,5966 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+#include <unireg.h>
+#include <mysqld_error.h>
+#include <log.h>
+#include <debug_sync.h>
+#include <mysql/innodb_priv.h>
+#include <sql_alter.h>
+#include <sql_class.h>
+
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0priv.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "log0log.h"
+#include "rem0types.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "srv0srv.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "ha_prototypes.h"
+#include "handler0alter.h"
+#include "srv0mon.h"
+#include "fts0priv.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "ha_innodb.h"
+
+/** Operations for creating secondary indexes (no rebuild needed) */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ONLINE_CREATE
+ = Alter_inplace_info::ADD_INDEX
+ | Alter_inplace_info::ADD_UNIQUE_INDEX;
+
+/** Operations for rebuilding a table in place */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_REBUILD
+ = Alter_inplace_info::ADD_PK_INDEX
+ | Alter_inplace_info::DROP_PK_INDEX
+ | Alter_inplace_info::CHANGE_CREATE_OPTION
+ /* CHANGE_CREATE_OPTION needs to check innobase_need_rebuild() */
+ | Alter_inplace_info::ALTER_COLUMN_NULLABLE
+ | Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE
+ | Alter_inplace_info::ALTER_COLUMN_ORDER
+ | Alter_inplace_info::DROP_COLUMN
+ | Alter_inplace_info::ADD_COLUMN
+ | Alter_inplace_info::RECREATE_TABLE
+ /*
+ | Alter_inplace_info::ALTER_COLUMN_TYPE
+ | Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH
+ */
+ ;
+
+/** Operations that require changes to data */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_DATA
+ = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+
+/** Operations for altering a table that InnoDB does not care about */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_INPLACE_IGNORE
+ = Alter_inplace_info::ALTER_COLUMN_DEFAULT
+ | Alter_inplace_info::ALTER_COLUMN_COLUMN_FORMAT
+ | Alter_inplace_info::ALTER_COLUMN_STORAGE_TYPE
+ | Alter_inplace_info::ALTER_RENAME;
+
+/** Operations on foreign key definitions (changing the schema only) */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_FOREIGN_OPERATIONS
+ = Alter_inplace_info::DROP_FOREIGN_KEY
+ | Alter_inplace_info::ADD_FOREIGN_KEY;
+
+/** Operations that InnoDB cares about and can perform without rebuild */
+static const Alter_inplace_info::HA_ALTER_FLAGS INNOBASE_ALTER_NOREBUILD
+ = INNOBASE_ONLINE_CREATE
+ | INNOBASE_FOREIGN_OPERATIONS
+ | Alter_inplace_info::DROP_INDEX
+ | Alter_inplace_info::DROP_UNIQUE_INDEX
+ | Alter_inplace_info::ALTER_COLUMN_NAME;
+
+/* Report an InnoDB error to the client by invoking my_error(). */
+static UNIV_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+ dberr_t error, /*!< in: InnoDB error code */
+ const char* table, /*!< in: table name */
+ ulint flags) /*!< in: table flags */
+{
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+ break;
+ case DB_RECORD_NOT_FOUND:
+ my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+ break;
+ case DB_DEADLOCK:
+ my_error(ER_LOCK_DEADLOCK, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ break;
+ case DB_OUT_OF_MEMORY:
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+ break;
+ case DB_TEMP_FILE_WRITE_FAILURE:
+ my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
+ break;
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_TABLE_FULL:
+ my_error(ER_LOCK_TABLE_FULL, MYF(0));
+ break;
+ case DB_UNDO_RECORD_TOO_BIG:
+ my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+ break;
+ case DB_CORRUPTION:
+ my_error(ER_NOT_KEYFILE, MYF(0), table);
+ break;
+ case DB_TOO_BIG_RECORD:
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0),
+ page_get_free_space_of_empty(
+ flags & DICT_TF_COMPACT) / 2);
+ break;
+ case DB_INVALID_NULL:
+ /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+ my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+ break;
+#ifdef UNIV_DEBUG
+ case DB_SUCCESS:
+ case DB_DUPLICATE_KEY:
+ case DB_TABLESPACE_EXISTS:
+ case DB_ONLINE_LOG_TOO_BIG:
+ /* These codes should not be passed here. */
+ ut_error;
+#endif /* UNIV_DEBUG */
+ default:
+ my_error(ER_GET_ERRNO, MYF(0), error);
+ break;
+ }
+}
+
+/** Determine if fulltext indexes exist in a given table.
+@param table MySQL table
+@return whether fulltext indexes exist on the table */
+static
+bool
+innobase_fulltext_exist(
+/*====================*/
+ const TABLE* table)
+{
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_FULLTEXT) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Determine if ALTER TABLE needs to rebuild the table.
+@param ha_alter_info the DDL operation
+@return whether it is necessary to rebuild the table */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_need_rebuild(
+/*==================*/
+ const Alter_inplace_info* ha_alter_info)
+{
+ if (ha_alter_info->handler_flags
+ == Alter_inplace_info::CHANGE_CREATE_OPTION
+ && !(ha_alter_info->create_info->used_fields
+ & (HA_CREATE_USED_ROW_FORMAT
+ | HA_CREATE_USED_KEY_BLOCK_SIZE))) {
+ /* Any other CHANGE_CREATE_OPTION than changing
+ ROW_FORMAT or KEY_BLOCK_SIZE is ignored. */
+ return(false);
+ }
+
+ return(!!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD));
+}
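+
+/* For illustration (not part of the original code): with the logic
+above,
+
+	ALTER TABLE t ROW_FORMAT=COMPRESSED;	-- rebuild required
+
+sets CHANGE_CREATE_OPTION together with HA_CREATE_USED_ROW_FORMAT and
+therefore rebuilds the table, while
+
+	ALTER TABLE t COMMENT='audited';	-- no rebuild
+
+is also CHANGE_CREATE_OPTION but touches neither ROW_FORMAT nor
+KEY_BLOCK_SIZE, so innobase_need_rebuild() returns false. */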
+
+/** Check if InnoDB supports a particular alter table in-place
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported
+@retval HA_ALTER_INPLACE_NO_LOCK Supported
+@retval HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE Supported, but requires
+lock during main phase and exclusive lock during prepare phase.
+@retval HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE Supported, prepare phase
+requires exclusive lock (any transactions that have accessed the table
+must commit or roll back first, and no transactions can access the table
+while prepare_inplace_alter_table() is executing)
+*/
+UNIV_INTERN
+enum_alter_inplace_result
+ha_innobase::check_if_supported_inplace_alter(
+/*==========================================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ DBUG_ENTER("check_if_supported_inplace_alter");
+
+ if (srv_read_only_mode) {
+ ha_alter_info->unsupported_reason =
+ innobase_get_err_msg(ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ } else if (srv_created_new_raw || srv_force_recovery) {
+
+		ha_alter_info->unsupported_reason = srv_force_recovery
+			? innobase_get_err_msg(ER_INNODB_FORCED_RECOVERY)
+			: innobase_get_err_msg(ER_READ_ONLY_MODE);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) {
+ /* Deny the inplace ALTER TABLE. MySQL will try to
+ re-create the table and ha_innobase::create() will
+ return an error too. This is how we effectively
+ deny adding too many columns to a table. */
+ ha_alter_info->unsupported_reason =
+ innobase_get_err_msg(ER_TOO_MANY_FIELDS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ update_thd();
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ if (ha_alter_info->handler_flags
+ & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOREBUILD
+ | INNOBASE_ALTER_REBUILD)) {
+
+		if (ha_alter_info->handler_flags
+		    & (Alter_inplace_info::ALTER_COLUMN_EQUAL_PACK_LENGTH
+		       | Alter_inplace_info::ALTER_COLUMN_TYPE)) {
+			ha_alter_info->unsupported_reason = innobase_get_err_msg(
+				ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE);
+		}
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ /* Only support online add foreign key constraint when
+ check_foreigns is turned off */
+ if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_FOREIGN_KEY)
+ && prebuilt->trx->check_foreigns) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+ DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK);
+ }
+
+	/* Only support a NULL -> NOT NULL change if strict table sql_mode
+	is set. Fall back to COPY for the conversion if strict mode is not
+	set; an in-place ALTER would fail with an error when trying to
+	convert NULL to a NOT NULL value. */
+ if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NOT_NULLABLE)
+ && !thd_is_strict_mode(user_thd)) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ /* InnoDB cannot IGNORE when creating unique indexes. IGNORE
+ should silently delete some duplicate rows. Our inplace_alter
+ code will not delete anything from existing indexes. */
+ if (ha_alter_info->ignore
+ && (ha_alter_info->handler_flags
+ & (Alter_inplace_info::ADD_PK_INDEX
+ | Alter_inplace_info::ADD_UNIQUE_INDEX))) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ /* DROP PRIMARY KEY is only allowed in combination with ADD
+ PRIMARY KEY. */
+ if ((ha_alter_info->handler_flags
+ & (Alter_inplace_info::ADD_PK_INDEX
+ | Alter_inplace_info::DROP_PK_INDEX))
+ == Alter_inplace_info::DROP_PK_INDEX) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+	/* If a column is changed from NOT NULL to NULL, and there is
+	an implicit primary key on this column, the table must be
+	rebuilt. The change should only go through the "Copy"
+	method. */
+ if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NULLABLE)) {
+ uint primary_key = altered_table->s->primary_key;
+
+		/* See if the MySQL table has no primary key but InnoDB does. */
+ if (UNIV_UNLIKELY(primary_key >= MAX_KEY)
+ && !row_table_got_default_clust_index(prebuilt->table)) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_PRIMARY_CANT_HAVE_NULL);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ /* We should be able to do the operation in-place.
+ See if we can do it online (LOCK=NONE). */
+ bool online = true;
+
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+
+ /* Fix the key parts. */
+ for (KEY* new_key = ha_alter_info->key_info_buffer;
+ new_key < ha_alter_info->key_info_buffer
+ + ha_alter_info->key_count;
+ new_key++) {
+ for (KEY_PART_INFO* key_part = new_key->key_part;
+ key_part < new_key->key_part + new_key->user_defined_key_parts;
+ key_part++) {
+ const Create_field* new_field;
+
+ DBUG_ASSERT(key_part->fieldnr
+ < altered_table->s->fields);
+
+ cf_it.rewind();
+ for (uint fieldnr = 0; (new_field = cf_it++);
+ fieldnr++) {
+ if (fieldnr == key_part->fieldnr) {
+ break;
+ }
+ }
+
+ DBUG_ASSERT(new_field);
+
+ key_part->field = altered_table->field[
+ key_part->fieldnr];
+			/* In some special cases InnoDB emits "false"
+			duplicate key errors with NULL key values. Let
+			us play it safe and ensure that we can correctly
+			print key values even in such cases. */
+ key_part->null_offset = key_part->field->null_offset();
+ key_part->null_bit = key_part->field->null_bit;
+
+ if (new_field->field) {
+ /* This is an existing column. */
+ continue;
+ }
+
+ /* This is an added column. */
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_COLUMN);
+
+ /* We cannot replace a hidden FTS_DOC_ID
+ with a user-visible FTS_DOC_ID. */
+ if (prebuilt->table->fts
+ && innobase_fulltext_exist(altered_table)
+ && !my_strcasecmp(
+ system_charset_info,
+ key_part->field->field_name,
+ FTS_DOC_ID_COL_NAME)) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
+ DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check)
+ == Field::NEXT_NUMBER)
+ == !!(key_part->field->flags
+ & AUTO_INCREMENT_FLAG));
+
+ if (key_part->field->flags & AUTO_INCREMENT_FLAG) {
+				/* We cannot assign AUTO_INCREMENT
+				column values during online ALTER. */
+ DBUG_ASSERT(key_part->field == altered_table
+ -> found_next_number_field);
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC);
+ online = false;
+ }
+ }
+ }
+
+ DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+ <= table->s->fields);
+ DBUG_ASSERT(!prebuilt->table->fts || prebuilt->table->fts->doc_col
+ < dict_table_get_n_user_cols(prebuilt->table));
+
+ if (prebuilt->table->fts
+ && innobase_fulltext_exist(altered_table)) {
+ /* FULLTEXT indexes are supposed to remain. */
+ /* Disallow DROP INDEX FTS_DOC_ID_INDEX */
+
+ for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+ if (!my_strcasecmp(
+ system_charset_info,
+ ha_alter_info->index_drop_buffer[i]->name,
+ FTS_DOC_ID_INDEX_NAME)) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
+ /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a
+ visible FTS_DOC_ID column as well. Prevent dropping or
+ renaming the FTS_DOC_ID. */
+
+ for (Field** fp = table->field; *fp; fp++) {
+ if (!((*fp)->flags
+ & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) {
+ continue;
+ }
+
+ if (!my_strcasecmp(
+ system_charset_info,
+ (*fp)->field_name,
+ FTS_DOC_ID_COL_NAME)) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+ }
+
+ prebuilt->trx->will_lock++;
+
+ if (!online) {
+		/* We already determined that only a locking
+		(non-online) operation is possible. */
+ } else if (((ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_PK_INDEX)
+ || innobase_need_rebuild(ha_alter_info))
+ && (innobase_fulltext_exist(altered_table)
+ || (prebuilt->table->flags2
+ & DICT_TF2_FTS_HAS_DOC_ID))) {
+ /* Refuse to rebuild the table online, if
+ fulltext indexes are to survive the rebuild,
+ or if the table contains a hidden FTS_DOC_ID column. */
+ online = false;
+ /* If the table already contains fulltext indexes,
+ refuse to rebuild the table natively altogether. */
+ if (prebuilt->table->fts) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_INNODB_FT_LIMIT);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+ } else if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_INDEX)) {
+ /* Building a full-text index requires a lock.
+ We could do without a lock if the table already contains
+ an FTS_DOC_ID column, but in that case we would have
+ to apply the modification log to the full-text indexes. */
+
+ for (uint i = 0; i < ha_alter_info->index_add_count; i++) {
+ const KEY* key =
+ &ha_alter_info->key_info_buffer[
+ ha_alter_info->index_add_buffer[i]];
+ if (key->flags & HA_FULLTEXT) {
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_GENERATED_KEY
+ | HA_BINARY_PACK_KEY)));
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS);
+ online = false;
+ break;
+ }
+ }
+ }
+
+ DBUG_RETURN(online
+ ? HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+ : HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE);
+}
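+
+/* Illustrative summary (not part of the original code) of how the
+checks above map to return values, assuming no other blockers apply:
+
+	ALTER TABLE t ALTER COLUMN c SET DEFAULT 0;
+		-- only INNOBASE_INPLACE_IGNORE flags:
+		-- HA_ALTER_INPLACE_NO_LOCK
+	ALTER TABLE t ADD INDEX i (c);
+		-- online remains true:
+		-- HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE
+	ALTER TABLE t ADD FULLTEXT INDEX f (c);
+		-- online is set to false:
+		-- HA_ALTER_INPLACE_SHARED_LOCK_AFTER_PREPARE
+	ALTER TABLE t MODIFY c BIGINT;
+		-- column type change:
+		-- HA_ALTER_INPLACE_NOT_SUPPORTED
+*/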
+
+/*************************************************************//**
+Initialize the dict_foreign_t structure with supplied info
+@return true if added, false if duplicate foreign->id */
+static __attribute__((nonnull(1,3,5,7)))
+bool
+innobase_init_foreign(
+/*==================*/
+ dict_foreign_t* foreign, /*!< in/out: structure to
+ initialize */
+ char* constraint_name, /*!< in/out: constraint name if
+ exists */
+ dict_table_t* table, /*!< in: foreign table */
+ dict_index_t* index, /*!< in: foreign key index */
+ const char** column_names, /*!< in: foreign key column
+ names */
+ ulint num_field, /*!< in: number of columns */
+ const char* referenced_table_name, /*!< in: referenced table
+ name */
+ dict_table_t* referenced_table, /*!< in: referenced table */
+ dict_index_t* referenced_index, /*!< in: referenced index */
+ const char** referenced_column_names,/*!< in: referenced column
+ names */
+ ulint referenced_num_field) /*!< in: number of referenced
+ columns */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ if (constraint_name) {
+ ulint db_len;
+
+ /* Catenate 'databasename/' to the constraint name specified
+ by the user: we conceive the constraint as belonging to the
+ same MySQL 'database' as the table itself. We store the name
+ to foreign->id. */
+
+ db_len = dict_get_db_name_len(table->name);
+
+ foreign->id = static_cast<char*>(mem_heap_alloc(
+ foreign->heap, db_len + strlen(constraint_name) + 2));
+
+ ut_memcpy(foreign->id, table->name, db_len);
+ foreign->id[db_len] = '/';
+ strcpy(foreign->id + db_len + 1, constraint_name);
+
+ /* Check if any existing foreign key has the same id,
+ this is needed only if user supplies the constraint name */
+
+ if (table->foreign_set.find(foreign)
+ != table->foreign_set.end()) {
+ return(false);
+ }
+ }
+
+ foreign->foreign_table = table;
+ foreign->foreign_table_name = mem_heap_strdup(
+ foreign->heap, table->name);
+ dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+ foreign->foreign_index = index;
+ foreign->n_fields = (unsigned int) num_field;
+
+ foreign->foreign_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ foreign->foreign_col_names[i] = mem_heap_strdup(
+ foreign->heap, column_names[i]);
+ }
+
+ foreign->referenced_index = referenced_index;
+ foreign->referenced_table = referenced_table;
+
+ foreign->referenced_table_name = mem_heap_strdup(
+ foreign->heap, referenced_table_name);
+ dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+ foreign->referenced_col_names = static_cast<const char**>(
+ mem_heap_alloc(foreign->heap,
+ referenced_num_field * sizeof(void*)));
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ foreign->referenced_col_names[i]
+ = mem_heap_strdup(foreign->heap,
+ referenced_column_names[i]);
+ }
+
+ return(true);
+}
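+
+/* For illustration (not part of the original code): with InnoDB's
+"database/table" naming, a user-supplied constraint name "fk1" on the
+table "test/child" is stored above as foreign->id = "test/fk1", which
+is also the key used for the duplicate check in table->foreign_set. */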
+
+/*************************************************************//**
+Check whether the foreign key options are valid
+@return true if they are */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+ const dict_foreign_t* foreign) /*!< in: foreign key */
+{
+ if (!foreign->foreign_index) {
+ return(true);
+ }
+
+ if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL
+ | DICT_FOREIGN_ON_DELETE_SET_NULL)) {
+
+ for (ulint j = 0; j < foreign->n_fields; j++) {
+ if ((dict_index_get_nth_col(
+ foreign->foreign_index, j)->prtype)
+ & DATA_NOT_NULL) {
+
+ /* It is not sensible to define
+ SET NULL if the column is not
+ allowed to be NULL! */
+ return(false);
+ }
+ }
+ }
+
+ return(true);
+}
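+
+/* For illustration (not part of the original code): the check above
+rejects a definition such as
+
+	CREATE TABLE child (a INT NOT NULL,
+		FOREIGN KEY (a) REFERENCES parent (a) ON DELETE SET NULL);
+
+because ON DELETE SET NULL would have to store NULL in the
+NOT NULL column a. */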
+
+/*************************************************************//**
+Set foreign key options
+@return true if successfully set */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_set_foreign_key_option(
+/*============================*/
+ dict_foreign_t* foreign, /*!< in:InnoDB Foreign key */
+ Foreign_key* fk_key) /*!< in: Foreign key info from
+ MySQL */
+{
+ ut_ad(!foreign->type);
+
+ switch (fk_key->delete_opt) {
+ case Foreign_key::FK_OPTION_NO_ACTION:
+ case Foreign_key::FK_OPTION_RESTRICT:
+ case Foreign_key::FK_OPTION_DEFAULT:
+ foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION;
+ break;
+ case Foreign_key::FK_OPTION_CASCADE:
+ foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE;
+ break;
+ case Foreign_key::FK_OPTION_SET_NULL:
+ foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL;
+ break;
+ }
+
+ switch (fk_key->update_opt) {
+ case Foreign_key::FK_OPTION_NO_ACTION:
+ case Foreign_key::FK_OPTION_RESTRICT:
+ case Foreign_key::FK_OPTION_DEFAULT:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+ break;
+ case Foreign_key::FK_OPTION_CASCADE:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+ break;
+ case Foreign_key::FK_OPTION_SET_NULL:
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+ break;
+ }
+
+ return(innobase_check_fk_option(foreign));
+}
+
+/*******************************************************************//**
+Check if a foreign key constraint can make use of an index
+that is being created.
+@return usable index, or NULL if none found */
+static __attribute__((nonnull, warn_unused_result))
+const KEY*
+innobase_find_equiv_index(
+/*======================*/
+ const char*const* col_names,
+ /*!< in: column names */
+ uint n_cols, /*!< in: number of columns */
+ const KEY* keys, /*!< in: index information */
+ const uint* add, /*!< in: indexes being created */
+ uint n_add) /*!< in: number of indexes to create */
+{
+ for (uint i = 0; i < n_add; i++) {
+ const KEY* key = &keys[add[i]];
+
+ if (key->user_defined_key_parts < n_cols) {
+no_match:
+ continue;
+ }
+
+ for (uint j = 0; j < n_cols; j++) {
+ const KEY_PART_INFO& key_part = key->key_part[j];
+ uint32 col_len
+ = key_part.field->pack_length();
+
+			/* The MySQL pack length contains a 1- or 2-byte
+			length field for a true VARCHAR. */
+
+ if (key_part.field->type() == MYSQL_TYPE_VARCHAR) {
+ col_len -= static_cast<const Field_varstring*>(
+ key_part.field)->length_bytes;
+ }
+
+ if (key_part.length < col_len) {
+
+ /* Column prefix indexes cannot be
+ used for FOREIGN KEY constraints. */
+ goto no_match;
+ }
+
+ if (innobase_strcasecmp(col_names[j],
+ key_part.field->field_name)) {
+ /* Name mismatch */
+ goto no_match;
+ }
+ }
+
+ return(key);
+ }
+
+ return(NULL);
+}
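+
+/* Illustrative example (not part of the original code): given
+
+	ALTER TABLE child ADD INDEX i1 (name(10)),
+		ADD FOREIGN KEY (name) REFERENCES parent (name);
+
+the prefix index i1 does not qualify (key_part.length < col_len above),
+so innobase_find_equiv_index() returns NULL and the caller reports
+ER_FK_NO_INDEX_CHILD. */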
+
+/*************************************************************//**
+Find an index whose first fields are the columns in the array in the
+same order, and that is not marked for deletion
+@return matching index, NULL if not found */
+static __attribute__((nonnull(1,2,6), warn_unused_result))
+dict_index_t*
+innobase_find_fk_index(
+/*===================*/
+ Alter_inplace_info* ha_alter_info,
+ /*!< in: alter table info */
+ dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ dict_index_t** drop_index,
+ /*!< in: indexes to be dropped */
+ ulint n_drop_index,
+ /*!< in: size of drop_index[] */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols) /*!< in: number of columns */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (!(index->type & DICT_FTS)
+ && dict_foreign_qualify_index(
+ table, col_names, columns, n_cols,
+ index, NULL, true, 0)) {
+ for (ulint i = 0; i < n_drop_index; i++) {
+ if (index == drop_index[i]) {
+ /* Skip to-be-dropped indexes. */
+ goto next_rec;
+ }
+ }
+
+ return(index);
+ }
+
+next_rec:
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Create InnoDB foreign key structure from MySQL alter_info
+@retval true if successful
+@retval false on error (will call my_error()) */
+static __attribute__((nonnull(1,2,3,7,8), warn_unused_result))
+bool
+innobase_get_foreign_key_info(
+/*==========================*/
+ Alter_inplace_info*
+ ha_alter_info, /*!< in: alter table info */
+ const TABLE_SHARE*
+ table_share, /*!< in: the TABLE_SHARE */
+ dict_table_t* table, /*!< in: table */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ dict_index_t** drop_index, /*!< in: indexes to be dropped */
+ ulint n_drop_index, /*!< in: size of drop_index[] */
+ dict_foreign_t**add_fk, /*!< out: foreign constraint added */
+ ulint* n_add_fk, /*!< out: number of foreign
+ constraints added */
+ const trx_t* trx) /*!< in: user transaction */
+{
+ Key* key;
+ Foreign_key* fk_key;
+ dict_table_t* referenced_table = NULL;
+ char* referenced_table_name = NULL;
+ ulint num_fk = 0;
+ Alter_info* alter_info = ha_alter_info->alter_info;
+
+ *n_add_fk = 0;
+
+ List_iterator<Key> key_iterator(alter_info->key_list);
+
+ while ((key=key_iterator++)) {
+ if (key->type != Key::FOREIGN_KEY) {
+ continue;
+ }
+
+ const char* column_names[MAX_NUM_FK_COLUMNS];
+ dict_index_t* index = NULL;
+ const char* referenced_column_names[MAX_NUM_FK_COLUMNS];
+ dict_index_t* referenced_index = NULL;
+ ulint num_col = 0;
+ ulint referenced_num_col = 0;
+ bool correct_option;
+ char* db_namep = NULL;
+ char* tbl_namep = NULL;
+ ulint db_name_len = 0;
+ ulint tbl_name_len = 0;
+#ifdef __WIN__
+ char db_name[MAX_DATABASE_NAME_LEN];
+ char tbl_name[MAX_TABLE_NAME_LEN];
+#endif
+
+ fk_key = static_cast<Foreign_key*>(key);
+
+ if (fk_key->columns.elements > 0) {
+ ulint i = 0;
+ Key_part_spec* column;
+ List_iterator<Key_part_spec> key_part_iterator(
+ fk_key->columns);
+
+ /* Get all the foreign key column info for the
+ current table */
+ while ((column = key_part_iterator++)) {
+ column_names[i] = column->field_name.str;
+ ut_ad(i < MAX_NUM_FK_COLUMNS);
+ i++;
+ }
+
+ index = innobase_find_fk_index(
+ ha_alter_info,
+ table, col_names,
+ drop_index, n_drop_index,
+ column_names, i);
+
+			/* MySQL would add an index to the creation
+			list if no such index exists on the foreign
+			table, so we have to use DBUG_EXECUTE_IF to
+			simulate the scenario */
+ DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+ index = NULL;);
+
+			/* Check whether such an index exists in
+			the index create clause */
+ if (!index && !innobase_find_equiv_index(
+ column_names, static_cast<uint>(i),
+ ha_alter_info->key_info_buffer,
+ ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count)) {
+ my_error(
+ ER_FK_NO_INDEX_CHILD,
+ MYF(0),
+ fk_key->name.str
+ ? fk_key->name.str : "",
+ table_share->table_name.str);
+ goto err_exit;
+ }
+
+ num_col = i;
+ }
+
+ add_fk[num_fk] = dict_mem_foreign_create();
+
+#ifndef __WIN__
+ tbl_namep = fk_key->ref_table.str;
+ tbl_name_len = fk_key->ref_table.length;
+ db_namep = fk_key->ref_db.str;
+ db_name_len = fk_key->ref_db.length;
+#else
+ ut_ad(fk_key->ref_table.str);
+
+ memcpy(tbl_name, fk_key->ref_table.str,
+ fk_key->ref_table.length);
+ tbl_name[fk_key->ref_table.length] = 0;
+ innobase_casedn_str(tbl_name);
+ tbl_name_len = strlen(tbl_name);
+ tbl_namep = &tbl_name[0];
+
+ if (fk_key->ref_db.str != NULL) {
+ memcpy(db_name, fk_key->ref_db.str,
+ fk_key->ref_db.length);
+ db_name[fk_key->ref_db.length] = 0;
+ innobase_casedn_str(db_name);
+ db_name_len = strlen(db_name);
+ db_namep = &db_name[0];
+ }
+#endif
+ mutex_enter(&dict_sys->mutex);
+
+ referenced_table_name = dict_get_referenced_table(
+ table->name,
+ db_namep,
+ db_name_len,
+ tbl_namep,
+ tbl_name_len,
+ &referenced_table,
+ add_fk[num_fk]->heap);
+
+		/* Test the case where the referenced table failed to
+		open; if trx->check_foreigns is not set, we should
+		still be able to add the foreign key */
+ DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+ referenced_table = NULL;);
+
+ if (!referenced_table && trx->check_foreigns) {
+ mutex_exit(&dict_sys->mutex);
+ my_error(ER_FK_CANNOT_OPEN_PARENT,
+ MYF(0), tbl_namep);
+
+ goto err_exit;
+ }
+
+ if (fk_key->ref_columns.elements > 0) {
+ ulint i = 0;
+ Key_part_spec* column;
+ List_iterator<Key_part_spec> key_part_iterator(
+ fk_key->ref_columns);
+
+ while ((column = key_part_iterator++)) {
+ referenced_column_names[i] =
+ column->field_name.str;
+ ut_ad(i < MAX_NUM_FK_COLUMNS);
+ i++;
+ }
+
+ if (referenced_table) {
+ referenced_index =
+ dict_foreign_find_index(
+ referenced_table, 0,
+ referenced_column_names,
+ i, index,
+ TRUE, FALSE);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_no_reference_idx",
+ referenced_index = NULL;);
+
+				/* Check whether such an index exists
+				on the referenced table */
+ if (!referenced_index) {
+ mutex_exit(&dict_sys->mutex);
+ my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+ fk_key->name.str
+ ? fk_key->name.str : "",
+ tbl_namep);
+ goto err_exit;
+ }
+ } else {
+ ut_a(!trx->check_foreigns);
+ }
+
+ referenced_num_col = i;
+ } else {
+ /* Not possible to add a foreign key without a
+ referenced column */
+ mutex_exit(&dict_sys->mutex);
+ my_error(ER_CANNOT_ADD_FOREIGN, MYF(0), tbl_namep);
+ goto err_exit;
+ }
+
+ if (!innobase_init_foreign(
+ add_fk[num_fk], fk_key->name.str,
+ table, index, column_names,
+ num_col, referenced_table_name,
+ referenced_table, referenced_index,
+ referenced_column_names, referenced_num_col)) {
+ mutex_exit(&dict_sys->mutex);
+ my_error(
+ ER_FK_DUP_NAME,
+ MYF(0),
+ add_fk[num_fk]->id);
+ goto err_exit;
+ }
+
+ mutex_exit(&dict_sys->mutex);
+
+ correct_option = innobase_set_foreign_key_option(
+ add_fk[num_fk], fk_key);
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fk_option",
+ correct_option = false;);
+
+ if (!correct_option) {
+ my_error(ER_FK_INCORRECT_OPTION,
+ MYF(0),
+ table_share->table_name.str,
+ add_fk[num_fk]->id);
+ goto err_exit;
+ }
+
+ num_fk++;
+ }
+
+ *n_add_fk = num_fk;
+
+ return(true);
+err_exit:
+ for (ulint i = 0; i <= num_fk; i++) {
+ if (add_fk[i]) {
+ dict_foreign_free(add_fk[i]);
+ }
+ }
+
+ return(false);
+}
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field. This function is
+adapted from row_sel_field_store_in_mysql_format(). */
+static
+void
+innobase_col_to_mysql(
+/*==================*/
+ const dict_col_t* col, /*!< in: InnoDB column */
+ const uchar* data, /*!< in: InnoDB column data */
+ ulint len, /*!< in: length of data, in bytes */
+ Field* field) /*!< in/out: MySQL field */
+{
+ uchar* ptr;
+ uchar* dest = field->ptr;
+ ulint flen = field->pack_length();
+
+ switch (col->mtype) {
+ case DATA_INT:
+ ut_ad(len == flen);
+
+ /* Convert integer data from Innobase to little-endian
+ format, sign bit restored to normal */
+
+ for (ptr = dest + len; ptr != dest; ) {
+ *--ptr = *data++;
+ }
+
+ if (!(field->flags & UNSIGNED_FLAG)) {
+ ((byte*) dest)[len - 1] ^= 0x80;
+ }
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field->reset();
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, flen - field->key_length());
+ }
+
+ /* Copy the actual data */
+ memcpy(dest, data, len);
+ break;
+
+ case DATA_BLOB:
+ /* Skip MySQL BLOBs when reporting an erroneous row
+ during index creation or table rebuild. */
+ field->set_null();
+ break;
+
+#ifdef UNIV_DEBUG
+ case DATA_MYSQL:
+ ut_ad(flen >= len);
+ ut_ad(DATA_MBMAXLEN(col->mbminmaxlen)
+ >= DATA_MBMINLEN(col->mbminmaxlen));
+ memcpy(dest, data, len);
+ break;
+
+ default:
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+ ut_ad(flen == len);
+ /* fall through */
+ case DATA_FIXBINARY:
+ case DATA_CHAR:
+ /* We may have flen > len when there is a shorter
+ prefix on the CHAR and BINARY column. */
+ ut_ad(flen >= len);
+#else /* UNIV_DEBUG */
+ default:
+#endif /* UNIV_DEBUG */
+ memcpy(dest, data, len);
+ }
+}
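+
+/* Worked example (not part of the original code) for the DATA_INT
+branch above: InnoDB stores a signed 4-byte integer big-endian with the
+sign bit inverted, so the value 1 is stored as 80 00 00 01. The
+byte-reversal loop writes 01 00 00 80 into dest, and the final
+dest[len - 1] ^= 0x80 yields 01 00 00 00, the little-endian MySQL
+representation of 1. */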
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets)/*!< in: rec_get_offsets(
+ rec, index, ...) */
+{
+ uint n_fields = table->s->fields;
+
+ ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+ - !!(DICT_TF2_FLAG_IS_SET(index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ ulint ipos;
+ ulint ilen;
+ const uchar* ifield;
+
+ field->reset();
+
+ ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
+
+ if (ipos == ULINT_UNDEFINED
+ || rec_offs_nth_extern(offsets, ipos)) {
+null_field:
+ field->set_null();
+ continue;
+ }
+
+ ifield = rec_get_nth_field(rec, offsets, ipos, &ilen);
+
+ /* Assign the NULL flag */
+ if (ilen == UNIV_SQL_NULL) {
+ ut_ad(field->real_maybe_null());
+ goto null_field;
+ }
+
+ field->set_notnull();
+
+ innobase_col_to_mysql(
+ dict_field_get_col(
+ dict_index_get_nth_field(index, ipos)),
+ ifield, ilen, field);
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+UNIV_INTERN
+void
+innobase_fields_to_mysql(
+/*=====================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const dfield_t* fields) /*!< in: InnoDB index fields */
+{
+ uint n_fields = table->s->fields;
+
+ ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+ - !!(DICT_TF2_FLAG_IS_SET(index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ ulint ipos;
+
+ field->reset();
+
+ ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE);
+
+ if (ipos == ULINT_UNDEFINED
+ || dfield_is_ext(&fields[ipos])
+ || dfield_is_null(&fields[ipos])) {
+
+ field->set_null();
+ } else {
+ field->set_notnull();
+
+ const dfield_t* df = &fields[ipos];
+
+ innobase_col_to_mysql(
+ dict_field_get_col(
+ dict_index_get_nth_field(index, ipos)),
+ static_cast<const uchar*>(dfield_get_data(df)),
+ dfield_get_len(df), field);
+ }
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+UNIV_INTERN
+void
+innobase_row_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_table_t* itab, /*!< in: InnoDB table */
+ const dtuple_t* row) /*!< in: InnoDB row */
+{
+ uint n_fields = table->s->fields;
+
+ /* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */
+ ut_ad(row->n_fields == dict_table_get_n_cols(itab));
+ ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS
+ - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID)));
+
+ for (uint i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ const dfield_t* df = dtuple_get_nth_field(row, i);
+
+ field->reset();
+
+ if (dfield_is_ext(df) || dfield_is_null(df)) {
+ field->set_null();
+ } else {
+ field->set_notnull();
+
+ innobase_col_to_mysql(
+ dict_table_get_nth_col(itab, i),
+ static_cast<const uchar*>(dfield_get_data(df)),
+ dfield_get_len(df), field);
+ }
+ }
+}
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+ TABLE* table) /*!< in/out: MySQL table */
+{
+ uint n_fields = table->s->fields;
+ uint i;
+
+ for (i = 0; i < n_fields; i++) {
+ table->field[i]->set_default();
+ }
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return 0 or error number */
+static __attribute__((nonnull, warn_unused_result))
+int
+innobase_check_index_keys(
+/*======================*/
+ const Alter_inplace_info* info,
+ /*!< in: indexes to be created or dropped */
+ const dict_table_t* innodb_table)
+ /*!< in: Existing indexes */
+{
+ for (uint key_num = 0; key_num < info->index_add_count;
+ key_num++) {
+ const KEY& key = info->key_info_buffer[
+ info->index_add_buffer[key_num]];
+
+ /* Check that the same index name does not appear
+ twice in indexes to be created. */
+
+ for (ulint i = 0; i < key_num; i++) {
+ const KEY& key2 = info->key_info_buffer[
+ info->index_add_buffer[i]];
+
+ if (0 == strcmp(key.name, key2.name)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that the same index name does not already exist. */
+
+ const dict_index_t* index;
+
+ for (index = dict_table_get_first_index(innodb_table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (!strcmp(key.name, index->name)) {
+ break;
+ }
+ }
+
+ if (index) {
+ /* If a key by the same name is being created and
+ dropped, the name clash is OK. */
+ for (uint i = 0; i < info->index_drop_count;
+ i++) {
+ const KEY* drop_key
+ = info->index_drop_buffer[i];
+
+ if (0 == strcmp(key.name, drop_key->name)) {
+ goto name_ok;
+ }
+ }
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+
+name_ok:
+ for (ulint i = 0; i < key.user_defined_key_parts; i++) {
+ const KEY_PART_INFO& key_part1
+ = key.key_part[i];
+ const Field* field
+ = key_part1.field;
+ ibool is_unsigned;
+
+ switch (get_innobase_type_from_mysql_type(
+ &is_unsigned, field)) {
+ default:
+ break;
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Check that MySQL does not try to
+ create a column prefix index field on
+ an inappropriate data type. */
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ if (key_part1.length
+ >= field->pack_length()
+ - ((Field_varstring*) field)
+ ->length_bytes) {
+ break;
+ }
+ } else {
+ if (key_part1.length
+ >= field->pack_length()) {
+ break;
+ }
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+
+ /* Check that the same column does not appear
+ twice in the index. */
+
+ for (ulint j = 0; j < i; j++) {
+ const KEY_PART_INFO& key_part2
+ = key.key_part[j];
+
+ if (key_part1.fieldnr != key_part2.fieldnr) {
+ continue;
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+ }
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Create index field definition for key part */
+static __attribute__((nonnull(2,3)))
+void
+innobase_create_index_field_def(
+/*============================*/
+ const TABLE* altered_table, /*!< in: MySQL table that is
+ being altered, or NULL
+ if a new clustered index is
+ not being created */
+ const KEY_PART_INFO* key_part, /*!< in: MySQL key definition */
+ index_field_t* index_field) /*!< out: index field
+ definition for key_part */
+{
+ const Field* field;
+ ibool is_unsigned;
+ ulint col_type;
+
+ DBUG_ENTER("innobase_create_index_field_def");
+
+ ut_ad(key_part);
+ ut_ad(index_field);
+
+ field = altered_table
+ ? altered_table->field[key_part->fieldnr]
+ : key_part->field;
+ ut_a(field);
+
+ index_field->col_no = key_part->fieldnr;
+
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned, field);
+
+ if (DATA_BLOB == col_type
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*) field)->length_bytes)) {
+
+ index_field->prefix_len = key_part->length;
+ } else {
+ index_field->prefix_len = 0;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Create index definition for key */
+static __attribute__((nonnull))
+void
+innobase_create_index_def(
+/*======================*/
+ const TABLE* altered_table, /*!< in: MySQL table that is
+ being altered */
+ const KEY* keys, /*!< in: key definitions */
+ ulint key_number, /*!< in: MySQL key number */
+ bool new_clustered, /*!< in: true if generating
+ a new clustered index
+ on the table */
+ bool key_clustered, /*!< in: true if this is
+ the new clustered index */
+ index_def_t* index, /*!< out: index definition */
+ mem_heap_t* heap) /*!< in: heap where memory
+ is allocated */
+{
+ const KEY* key = &keys[key_number];
+ ulint i;
+ ulint len;
+ ulint n_fields = key->user_defined_key_parts;
+ char* index_name;
+
+ DBUG_ENTER("innobase_create_index_def");
+ DBUG_ASSERT(!key_clustered || new_clustered);
+
+ index->fields = static_cast<index_field_t*>(
+ mem_heap_alloc(heap, n_fields * sizeof *index->fields));
+
+ index->ind_type = 0;
+ index->key_number = key_number;
+ index->n_fields = n_fields;
+ len = strlen(key->name) + 1;
+ index->name = index_name = static_cast<char*>(
+ mem_heap_alloc(heap, len + !new_clustered));
+
+ if (!new_clustered) {
+ *index_name++ = TEMP_INDEX_PREFIX;
+ }
+
+ memcpy(index_name, key->name, len);
+
+ if (key->flags & HA_NOSAME) {
+ index->ind_type |= DICT_UNIQUE;
+ }
+
+ if (key_clustered) {
+ DBUG_ASSERT(!(key->flags & HA_FULLTEXT));
+ index->ind_type |= DICT_CLUSTERED;
+ } else if (key->flags & HA_FULLTEXT) {
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_BINARY_PACK_KEY)));
+ DBUG_ASSERT(!(key->flags & HA_NOSAME));
+ DBUG_ASSERT(!index->ind_type);
+ index->ind_type |= DICT_FTS;
+ }
+
+ if (!new_clustered) {
+ altered_table = NULL;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ innobase_create_index_field_def(
+ altered_table, &key->key_part[i], &index->fields[i]);
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Check whether the table has the FTS_DOC_ID column
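+(The only accepted column definition, as checked below, is
+FTS_DOC_ID BIGINT UNSIGNED NOT NULL.)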
+@return whether there exists an FTS_DOC_ID column */
+static
+bool
+innobase_fts_check_doc_id_col(
+/*==========================*/
+	const dict_table_t*	table,	/*!< in: InnoDB table with
+					fulltext index, or NULL */
+ const TABLE* altered_table,
+ /*!< in: MySQL table with
+ fulltext index */
+ ulint* fts_doc_col_no)
+ /*!< out: The column number for
+ Doc ID, or ULINT_UNDEFINED
+ if it is of wrong type */
+{
+ *fts_doc_col_no = ULINT_UNDEFINED;
+
+ const uint n_cols = altered_table->s->fields;
+ uint i;
+
+ for (i = 0; i < n_cols; i++) {
+ const Field* field = altered_table->field[i];
+
+ if (my_strcasecmp(system_charset_info,
+ field->field_name, FTS_DOC_ID_COL_NAME)) {
+ continue;
+ }
+
+ if (strcmp(field->field_name, FTS_DOC_ID_COL_NAME)) {
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+ } else if (field->type() != MYSQL_TYPE_LONGLONG
+ || field->pack_length() != 8
+ || field->real_maybe_null()
+ || !(field->flags & UNSIGNED_FLAG)) {
+ my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, MYF(0),
+ field->field_name);
+ } else {
+ *fts_doc_col_no = i;
+ }
+
+ return(true);
+ }
+
+ if (!table) {
+ return(false);
+ }
+
+ for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) {
+ const char* name = dict_table_get_col_name(table, i);
+
+ if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+#ifdef UNIV_DEBUG
+ const dict_col_t* col;
+
+ col = dict_table_get_nth_col(table, i);
+
+ /* Because the FTS_DOC_ID does not exist in
+ the MySQL data dictionary, this must be the
+ internally created FTS_DOC_ID column. */
+ ut_ad(col->mtype == DATA_INT);
+ ut_ad(col->len == 8);
+ ut_ad(col->prtype & DATA_NOT_NULL);
+ ut_ad(col->prtype & DATA_UNSIGNED);
+#endif /* UNIV_DEBUG */
+ *fts_doc_col_no = i;
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+/*============================*/
+	const dict_table_t*	table,		/*!< in: table definition,
+						or NULL */
+	const TABLE*		altered_table,	/*!< in: MySQL table
+						that is being altered,
+						or NULL */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID, or ULINT_UNDEFINED
+ if it is being created in
+ ha_alter_info */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+
+ if (altered_table) {
+ /* Check if a unique index with the name of
+ FTS_DOC_ID_INDEX_NAME is being created. */
+
+ for (uint i = 0; i < altered_table->s->keys; i++) {
+ const KEY& key = altered_table->key_info[i];
+
+ if (innobase_strcasecmp(
+ key.name, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ if ((key.flags & HA_NOSAME)
+ && key.user_defined_key_parts == 1
+ && !strcmp(key.name, FTS_DOC_ID_INDEX_NAME)
+ && !strcmp(key.key_part[0].field->field_name,
+ FTS_DOC_ID_COL_NAME)) {
+ if (fts_doc_col_no) {
+ *fts_doc_col_no = ULINT_UNDEFINED;
+ }
+ return(FTS_EXIST_DOC_ID_INDEX);
+ } else {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+ }
+ }
+
+ if (!table) {
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+ }
+
+ for (index = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ /* Check if there exists a unique index with the name of
+ FTS_DOC_ID_INDEX_NAME */
+ if (innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+ if (!dict_index_is_unique(index)
+ || dict_index_get_n_unique(index) > 1
+ || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+
+ /* Check whether the index has FTS_DOC_ID as its
+ first column */
+ field = dict_index_get_nth_field(index, 0);
+
+		/* The column must be of the BIGINT data type */
+ if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0
+ && field->col->mtype == DATA_INT
+ && field->col->len == 8
+ && field->col->prtype & DATA_NOT_NULL) {
+ if (fts_doc_col_no) {
+ *fts_doc_col_no = dict_col_get_no(field->col);
+ }
+ return(FTS_EXIST_DOC_ID_INDEX);
+ } else {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+ }
+
+ /* Not found */
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
+Check whether the MySQL CREATE INDEX definitions contain a unique index
+named FTS_DOC_ID_INDEX_NAME on the Doc ID column.
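+(Illustrative note: the only accepted definition is a UNIQUE index
+named FTS_DOC_ID_INDEX on the single column FTS_DOC_ID.)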
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+UNIV_INTERN
+enum fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+/*===================================*/
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definition */
+{
+	/* Check whether an "FTS_DOC_ID_INDEX" appears in the list of
+	indexes to be built */
+ for (ulint j = 0; j < n_key; j++) {
+ const KEY* key = &key_info[j];
+
+ if (innobase_strcasecmp(key->name, FTS_DOC_ID_INDEX_NAME)) {
+ continue;
+ }
+
+		/* Check the FTS_DOC_ID_INDEX: it must be unique, named
+		"FTS_DOC_ID_INDEX" and defined on the column "FTS_DOC_ID" */
+ if (!(key->flags & HA_NOSAME)
+ || key->user_defined_key_parts != 1
+ || strcmp(key->name, FTS_DOC_ID_INDEX_NAME)
+ || strcmp(key->key_part[0].field->field_name,
+ FTS_DOC_ID_COL_NAME)) {
+ return(FTS_INCORRECT_DOC_ID_INDEX);
+ }
+
+ return(FTS_EXIST_DOC_ID_INDEX);
+ }
+
+ return(FTS_NOT_EXIST_DOC_ID_INDEX);
+}
+/*******************************************************************//**
+Create an array of index definitions, ordered as follows:
+
+IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) The remaining keys in key_info
+
+ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ENDIF
+
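+For example (illustrative only): ALTER TABLE t ADD PRIMARY KEY(a),
+ADD INDEX(b) on a table that already has KEY(c) yields the definitions
+{PRIMARY(a), b, c}, while the same ADD INDEX(b) without a new
+PRIMARY KEY yields only {b}.
+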
+@return key definitions */
+static __attribute__((nonnull, warn_unused_result, malloc))
+index_def_t*
+innobase_create_key_defs(
+/*=====================*/
+ mem_heap_t* heap,
+				/*!< in/out: memory heap where space for key
+				definitions is allocated */
+ const Alter_inplace_info* ha_alter_info,
+ /*!< in: alter operation */
+ const TABLE* altered_table,
+ /*!< in: MySQL table that is being altered */
+ ulint& n_add,
+ /*!< in/out: number of indexes to be created */
+ ulint& n_fts_add,
+ /*!< out: number of FTS indexes to be created */
+ bool got_default_clust,
+ /*!< in: whether the table lacks a primary key */
+ ulint& fts_doc_id_col,
+				/*!< in/out: the column number for Doc ID */
+	bool&			add_fts_doc_id,
+				/*!< in/out: whether we need to add a new
+				Doc ID column for the FTS index */
+	bool&			add_fts_doc_idx)
+				/*!< in/out: whether we need to add a new
+				Doc ID index for the FTS index */
+{
+ index_def_t* indexdef;
+ index_def_t* indexdefs;
+ bool new_primary;
+ const uint*const add
+ = ha_alter_info->index_add_buffer;
+ const KEY*const key_info
+ = ha_alter_info->key_info_buffer;
+
+ DBUG_ENTER("innobase_create_key_defs");
+ DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx);
+ DBUG_ASSERT(ha_alter_info->index_add_count == n_add);
+
+ /* If there is a primary key, it is always the first index
+ defined for the innodb_table. */
+
+ new_primary = n_add > 0
+ && !my_strcasecmp(system_charset_info,
+ key_info[*add].name, "PRIMARY");
+ n_fts_add = 0;
+
+ /* If there is a UNIQUE INDEX consisting entirely of NOT NULL
+ columns and if the index does not contain column prefix(es)
+ (only prefix/part of the column is indexed), MySQL will treat the
+ index as a PRIMARY KEY unless the table already has one. */
+
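+	/* Illustrative example (added comment): on a table without a
+	PRIMARY KEY, ALTER TABLE t ADD UNIQUE INDEX(a), where column a
+	is NOT NULL and indexed in full, will set new_primary below. */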
+ if (n_add > 0 && !new_primary && got_default_clust
+ && (key_info[*add].flags & HA_NOSAME)
+ && !(key_info[*add].flags & HA_KEY_HAS_PART_KEY_SEG)) {
+ uint key_part = key_info[*add].user_defined_key_parts;
+
+ new_primary = true;
+
+ while (key_part--) {
+ const uint maybe_null
+ = key_info[*add].key_part[key_part].key_type
+ & FIELDFLAG_MAYBE_NULL;
+ DBUG_ASSERT(!maybe_null
+ == !key_info[*add].key_part[key_part].
+ field->real_maybe_null());
+
+ if (maybe_null) {
+ new_primary = false;
+ break;
+ }
+ }
+ }
+
+ const bool rebuild = new_primary || add_fts_doc_id
+ || innobase_need_rebuild(ha_alter_info);
+	/* Reserve extra slots: one in case the table is rebuilt (we
+	might then need to add the FTS_DOC_ID_INDEX) and one in case a
+	generated clustered index must be created */
+ indexdef = indexdefs = static_cast<index_def_t*>(
+ mem_heap_alloc(
+ heap, sizeof *indexdef
+ * (ha_alter_info->key_count
+ + rebuild
+ + got_default_clust)));
+
+ if (rebuild) {
+ ulint primary_key_number;
+
+ if (new_primary) {
+ DBUG_ASSERT(n_add > 0);
+ primary_key_number = *add;
+ } else if (got_default_clust) {
+ /* Create the GEN_CLUST_INDEX */
+ index_def_t* index = indexdef++;
+
+ index->fields = NULL;
+ index->n_fields = 0;
+ index->ind_type = DICT_CLUSTERED;
+ index->name = mem_heap_strdup(
+ heap, innobase_index_reserve_name);
+ index->key_number = ~0;
+ primary_key_number = ULINT_UNDEFINED;
+ goto created_clustered;
+ } else {
+ primary_key_number = 0;
+ }
+
+ /* Create the PRIMARY key index definition */
+ innobase_create_index_def(
+ altered_table, key_info, primary_key_number,
+ TRUE, TRUE, indexdef++, heap);
+
+created_clustered:
+ n_add = 1;
+
+ for (ulint i = 0; i < ha_alter_info->key_count; i++) {
+ if (i == primary_key_number) {
+ continue;
+ }
+ /* Copy the index definitions. */
+ innobase_create_index_def(
+ altered_table, key_info, i, TRUE, FALSE,
+ indexdef, heap);
+
+ if (indexdef->ind_type & DICT_FTS) {
+ n_fts_add++;
+ }
+
+ indexdef++;
+ n_add++;
+ }
+
+ if (n_fts_add > 0) {
+ if (!add_fts_doc_id
+ && !innobase_fts_check_doc_id_col(
+ NULL, altered_table,
+ &fts_doc_id_col)) {
+ fts_doc_id_col = altered_table->s->fields;
+ add_fts_doc_id = true;
+ }
+
+ if (!add_fts_doc_idx) {
+ fts_doc_id_index_enum ret;
+ ulint doc_col_no;
+
+ ret = innobase_fts_check_doc_id_index(
+ NULL, altered_table, &doc_col_no);
+
+ /* This should have been checked before */
+ ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX);
+
+ if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) {
+ add_fts_doc_idx = true;
+ } else {
+ ut_ad(ret == FTS_EXIST_DOC_ID_INDEX);
+ ut_ad(doc_col_no == ULINT_UNDEFINED
+ || doc_col_no == fts_doc_id_col);
+ }
+ }
+ }
+ } else {
+ /* Create definitions for added secondary indexes. */
+
+ for (ulint i = 0; i < n_add; i++) {
+ innobase_create_index_def(
+ altered_table, key_info, add[i], FALSE, FALSE,
+ indexdef, heap);
+
+ if (indexdef->ind_type & DICT_FTS) {
+ n_fts_add++;
+ }
+
+ indexdef++;
+ }
+ }
+
+ DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+ if (add_fts_doc_idx) {
+ index_def_t* index = indexdef++;
+
+ index->fields = static_cast<index_field_t*>(
+ mem_heap_alloc(heap, sizeof *index->fields));
+ index->n_fields = 1;
+ index->fields->col_no = fts_doc_id_col;
+ index->fields->prefix_len = 0;
+ index->ind_type = DICT_UNIQUE;
+
+ if (rebuild) {
+ index->name = mem_heap_strdup(
+ heap, FTS_DOC_ID_INDEX_NAME);
+ ut_ad(!add_fts_doc_id
+ || fts_doc_id_col == altered_table->s->fields);
+ } else {
+ char* index_name;
+ index->name = index_name = static_cast<char*>(
+ mem_heap_alloc(
+ heap,
+ 1 + sizeof FTS_DOC_ID_INDEX_NAME));
+ *index_name++ = TEMP_INDEX_PREFIX;
+ memcpy(index_name, FTS_DOC_ID_INDEX_NAME,
+ sizeof FTS_DOC_ID_INDEX_NAME);
+ }
+
+ /* TODO: assign a real MySQL key number for this */
+ index->key_number = ULINT_UNDEFINED;
+ n_add++;
+ }
+
+ DBUG_ASSERT(indexdef > indexdefs);
+ DBUG_ASSERT((ulint) (indexdef - indexdefs)
+ <= ha_alter_info->key_count
+ + add_fts_doc_idx + got_default_clust);
+ DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
+ DBUG_RETURN(indexdefs);
+}
+
+/*******************************************************************//**
+Check that the length of each index key part does not exceed the maximum.
+@return true if any index key part length exceeds the limit */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_check_column_length(
+/*=========================*/
+ ulint max_col_len, /*!< in: maximum column length */
+ const KEY* key_info) /*!< in: Indexes to be created */
+{
+	for (ulint key_part = 0;
+	     key_part < key_info->user_defined_key_parts;
+	     key_part++) {
+ if (key_info->key_part[key_part].length > max_col_len) {
+ return(true);
+ }
+ }
+ return(false);
+}
+
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+ /** Dummy query graph */
+ que_thr_t* thr;
+ /** reference to the prebuilt struct of the creating instance */
+ row_prebuilt_t*&prebuilt;
+ /** InnoDB indexes being created */
+ dict_index_t** add_index;
+ /** MySQL key numbers for the InnoDB indexes that are being created */
+ const ulint* add_key_numbers;
+ /** number of InnoDB indexes being created */
+ ulint num_to_add_index;
+ /** InnoDB indexes being dropped */
+ dict_index_t** drop_index;
+ /** number of InnoDB indexes being dropped */
+ const ulint num_to_drop_index;
+ /** InnoDB foreign key constraints being dropped */
+ dict_foreign_t** drop_fk;
+ /** number of InnoDB foreign key constraints being dropped */
+ const ulint num_to_drop_fk;
+ /** InnoDB foreign key constraints being added */
+ dict_foreign_t** add_fk;
+	/** number of InnoDB foreign key constraints being added */
+ const ulint num_to_add_fk;
+ /** whether to create the indexes online */
+ bool online;
+ /** memory heap */
+ mem_heap_t* heap;
+ /** dictionary transaction */
+ trx_t* trx;
+ /** original table (if rebuilt, differs from indexed_table) */
+ dict_table_t* old_table;
+ /** table where the indexes are being created or dropped */
+ dict_table_t* new_table;
+ /** mapping of old column numbers to new ones, or NULL */
+ const ulint* col_map;
+ /** new column names, or NULL if nothing was renamed */
+ const char** col_names;
+ /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+ const ulint add_autoinc;
+ /** default values of ADD COLUMN, or NULL */
+ const dtuple_t* add_cols;
+ /** autoinc sequence to use */
+ ib_sequence_t sequence;
+ /** maximum auto-increment value */
+ ulonglong max_autoinc;
+ /** temporary table name to use for old table when renaming tables */
+ const char* tmp_name;
+
+ ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
+ dict_index_t** drop_arg,
+ ulint num_to_drop_arg,
+ dict_foreign_t** drop_fk_arg,
+ ulint num_to_drop_fk_arg,
+ dict_foreign_t** add_fk_arg,
+ ulint num_to_add_fk_arg,
+ bool online_arg,
+ mem_heap_t* heap_arg,
+ dict_table_t* new_table_arg,
+ const char** col_names_arg,
+ ulint add_autoinc_arg,
+ ulonglong autoinc_col_min_value_arg,
+ ulonglong autoinc_col_max_value_arg) :
+ inplace_alter_handler_ctx(),
+ prebuilt (prebuilt_arg),
+ add_index (0), add_key_numbers (0), num_to_add_index (0),
+ drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
+ drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
+ add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
+ online (online_arg), heap (heap_arg), trx (0),
+ old_table (prebuilt_arg->table),
+ new_table (new_table_arg),
+ col_map (0), col_names (col_names_arg),
+ add_autoinc (add_autoinc_arg),
+ add_cols (0),
+ sequence(prebuilt->trx->mysql_thd,
+ autoinc_col_min_value_arg, autoinc_col_max_value_arg),
+ max_autoinc (0),
+ tmp_name (0)
+ {
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < num_to_add_index; i++) {
+ ut_ad(!add_index[i]->to_be_dropped);
+ }
+ for (ulint i = 0; i < num_to_drop_index; i++) {
+ ut_ad(drop_index[i]->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ thr = pars_complete_graph_for_exec(NULL, prebuilt->trx, heap);
+ }
+
+ ~ha_innobase_inplace_ctx()
+ {
+ mem_heap_free(heap);
+ }
+
+ /** Determine if the table will be rebuilt.
+ @return whether the table will be rebuilt */
+ bool need_rebuild () const { return(old_table != new_table); }
+
+private:
+ // Disable copying
+ ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&);
+ ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&);
+};
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static
+void
+online_retry_drop_indexes_low(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ /* We can have table->n_ref_count > 1, because other threads
+ may have prebuilt->table pointing to the table. However, these
+ other threads should be between statements, waiting for the
+ next statement to execute, or for a meta-data lock. */
+ ut_ad(table->n_ref_count >= 1);
+
+ if (table->drop_aborted) {
+ row_merge_drop_indexes(trx, table, TRUE);
+ }
+}
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static __attribute__((nonnull))
+void
+online_retry_drop_indexes(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ THD* user_thd) /*!< in/out: MySQL connection */
+{
+ if (table->drop_aborted) {
+ trx_t* trx = innobase_trx_allocate(user_thd);
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ row_mysql_lock_data_dictionary(trx);
+ online_retry_drop_indexes_low(table, trx);
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_mysql(trx);
+ }
+
+#ifdef UNIV_DEBUG
+ mutex_enter(&dict_sys->mutex);
+ dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE);
+ mutex_exit(&dict_sys->mutex);
+ ut_a(!table->drop_aborted);
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Commit a dictionary transaction and drop any indexes that we were not
+able to free previously due to open table handles. */
+static __attribute__((nonnull))
+void
+online_retry_drop_indexes_with_trx(
+/*===============================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Now that the dictionary is being locked, check if we can
+ drop any incompletely created indexes that may have been left
+ behind in rollback_inplace_alter_table() earlier. */
+ if (table->drop_aborted) {
+
+ trx->table_id = 0;
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+
+ online_retry_drop_indexes_low(table, trx);
+ trx_commit_for_mysql(trx);
+ }
+}
+
+/** Determines if InnoDB is dropping a foreign key constraint.
+@param foreign the constraint
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@return whether the constraint is being dropped */
+inline __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_dropping_foreign(
+/*======================*/
+ const dict_foreign_t* foreign,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk)
+{
+ while (n_drop_fk--) {
+ if (*drop_fk++ == foreign) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param user_table InnoDB table as it is before the ALTER operation
+@param col_name Name of the column being altered
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@param drop true=drop column, false=set NOT NULL
+@retval true Not allowed (will call my_error())
+@retval false Allowed
+*/
+static __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_check_foreigns_low(
+/*========================*/
+ const dict_table_t* user_table,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk,
+ const char* col_name,
+ bool drop)
+{
+ dict_foreign_t* foreign;
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* Check if any FOREIGN KEY constraints are defined on this
+ column. */
+
+ for (dict_foreign_set::iterator it = user_table->foreign_set.begin();
+ it != user_table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (!drop && !(foreign->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+ continue;
+ }
+
+ if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+ continue;
+ }
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ if (!strcmp(foreign->foreign_col_names[f],
+ col_name)) {
+ my_error(drop
+ ? ER_FK_COLUMN_CANNOT_DROP
+ : ER_FK_COLUMN_NOT_NULL, MYF(0),
+ col_name, foreign->id);
+ return(true);
+ }
+ }
+ }
+
+ if (!drop) {
+ /* SET NULL clauses on foreign key constraints of
+ child tables affect the child tables, not the parent table.
+ The column can be NOT NULL in the parent table. */
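+		/* For example (illustrative): a child table declaring
+		FOREIGN KEY (c) REFERENCES parent(p) ON DELETE SET NULL
+		requires child.c to be nullable, while parent.p may still
+		be changed to NOT NULL. */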
+ return(false);
+ }
+
+ /* Check if any FOREIGN KEY constraints in other tables are
+ referring to the column that is being dropped. */
+ for (dict_foreign_set::iterator it
+ = user_table->referenced_set.begin();
+ it != user_table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) {
+ continue;
+ }
+
+ for (unsigned f = 0; f < foreign->n_fields; f++) {
+ char display_name[FN_REFLEN];
+
+ if (strcmp(foreign->referenced_col_names[f],
+ col_name)) {
+ continue;
+ }
+
+ char* buf_end = innobase_convert_name(
+ display_name, (sizeof display_name) - 1,
+ foreign->foreign_table_name,
+ strlen(foreign->foreign_table_name),
+ NULL, TRUE);
+ *buf_end = '\0';
+ my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD,
+ MYF(0), col_name, foreign->id,
+ display_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determines if an InnoDB FOREIGN KEY constraint depends on a
+column that is being dropped or modified to NOT NULL.
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param user_table InnoDB table as it is before the ALTER operation
+@param drop_fk constraints being dropped
+@param n_drop_fk number of constraints that are being dropped
+@retval true Not allowed (will call my_error())
+@retval false Allowed
+*/
+static __attribute__((pure, nonnull, warn_unused_result))
+bool
+innobase_check_foreigns(
+/*====================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* old_table,
+ const dict_table_t* user_table,
+ dict_foreign_t** drop_fk,
+ ulint n_drop_fk)
+{
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+
+ for (Field** fp = old_table->field; *fp; fp++) {
+ cf_it.rewind();
+ const Create_field* new_field;
+
+ ut_ad(!(*fp)->real_maybe_null()
+ == !!((*fp)->flags & NOT_NULL_FLAG));
+
+ while ((new_field = cf_it++)) {
+ if (new_field->field == *fp) {
+ break;
+ }
+ }
+
+ if (!new_field || (new_field->flags & NOT_NULL_FLAG)) {
+ if (innobase_check_foreigns_low(
+ user_table, drop_fk, n_drop_fk,
+ (*fp)->field_name, !new_field)) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/** Convert a default value for ADD COLUMN.
+
+@param heap Memory heap where allocated
+@param dfield InnoDB data field to copy to
+@param field MySQL field containing the default value for the column
+@param comp nonzero if in compact format */
+static __attribute__((nonnull))
+void
+innobase_build_col_map_add(
+/*=======================*/
+ mem_heap_t* heap,
+ dfield_t* dfield,
+ const Field* field,
+ ulint comp)
+{
+ if (field->is_real_null()) {
+ dfield_set_null(dfield);
+ return;
+ }
+
+ ulint size = field->pack_length();
+
+ byte* buf = static_cast<byte*>(mem_heap_alloc(heap, size));
+
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf, TRUE, field->ptr, size, comp);
+}
+
+/** Construct the translation table for reordering, dropping or
+adding columns.
+
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param new_table InnoDB table corresponding to MySQL altered_table
+@param old_table InnoDB table corresponding to MYSQL table
+@param add_cols Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap Memory heap where allocated
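+
+For example (illustrative only): if the old table has columns (a,b,c)
+and the ALTER TABLE drops b and adds a new column d, the returned map
+is {0, ULINT_UNDEFINED, 1}, and the default value of d is stored in
+add_cols.
+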
+@return array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static __attribute__((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table,
+ const dict_table_t* new_table,
+ const dict_table_t* old_table,
+ dtuple_t* add_cols,
+ mem_heap_t* heap)
+{
+ DBUG_ENTER("innobase_build_col_map");
+ DBUG_ASSERT(altered_table != table);
+ DBUG_ASSERT(new_table != old_table);
+ DBUG_ASSERT(dict_table_get_n_cols(new_table)
+ >= altered_table->s->fields + DATA_N_SYS_COLS);
+ DBUG_ASSERT(dict_table_get_n_cols(old_table)
+ >= table->s->fields + DATA_N_SYS_COLS);
+ DBUG_ASSERT(!!add_cols == !!(ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_COLUMN));
+ DBUG_ASSERT(!add_cols || dtuple_get_n_fields(add_cols)
+ == dict_table_get_n_cols(new_table));
+
+ ulint* col_map = static_cast<ulint*>(
+ mem_heap_alloc(heap, old_table->n_cols * sizeof *col_map));
+
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+ uint i = 0;
+
+ /* Any dropped columns will map to ULINT_UNDEFINED. */
+ for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols;
+ old_i++) {
+ col_map[old_i] = ULINT_UNDEFINED;
+ }
+
+ while (const Create_field* new_field = cf_it++) {
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ const Field* field = table->field[old_i];
+ if (new_field->field == field) {
+ col_map[old_i] = i;
+ goto found_col;
+ }
+ }
+
+ innobase_build_col_map_add(
+ heap, dtuple_get_nth_field(add_cols, i),
+ altered_table->field[i],
+ dict_table_is_comp(new_table));
+found_col:
+ i++;
+ }
+
+ DBUG_ASSERT(i == altered_table->s->fields);
+
+ i = table->s->fields;
+
+ /* Add the InnoDB hidden FTS_DOC_ID column, if any. */
+ if (i + DATA_N_SYS_COLS < old_table->n_cols) {
+ /* There should be exactly one extra field,
+ the FTS_DOC_ID. */
+ DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+ DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+ old_table, table->s->fields),
+ FTS_DOC_ID_COL_NAME));
+ if (altered_table->s->fields + DATA_N_SYS_COLS
+ < new_table->n_cols) {
+ DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+ new_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ DBUG_ASSERT(altered_table->s->fields
+ + DATA_N_SYS_COLS + 1
+ == new_table->n_cols);
+ col_map[i] = altered_table->s->fields;
+ } else {
+ DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+ new_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ col_map[i] = ULINT_UNDEFINED;
+ }
+
+ i++;
+ } else {
+ DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+ old_table,
+ DICT_TF2_FTS_HAS_DOC_ID));
+ }
+
+ for (; i < old_table->n_cols; i++) {
+ col_map[i] = i + new_table->n_cols - old_table->n_cols;
+ }
+
+ DBUG_RETURN(col_map);
+}
+
+/** Drop the auxiliary tables of newly created FTS indexes during the
+fast index creation process, before fts_add_index is called
+@param table table that was being rebuilt online
+@param trx transaction
+@return DB_SUCCESS if successful, otherwise last error code
+*/
+static
+dberr_t
+innobase_drop_fts_index_table(
+/*==========================*/
+ dict_table_t* table,
+ trx_t* trx)
+{
+ dberr_t ret_err = DB_SUCCESS;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ dberr_t err;
+
+ err = fts_drop_index_tables(trx, index);
+
+ if (err != DB_SUCCESS) {
+ ret_err = err;
+ }
+ }
+ }
+
+ return(ret_err);
+}
+
+/** Get the new column names if any columns were renamed
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param table MySQL table as it is before the ALTER operation
+@param user_table InnoDB table as it is before the ALTER operation
+@param heap Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static __attribute__((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* table,
+ const dict_table_t* user_table,
+ mem_heap_t* heap)
+{
+ const char** cols;
+ uint i;
+
+ DBUG_ENTER("innobase_get_col_names");
+ DBUG_ASSERT(user_table->n_def > table->s->fields);
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME);
+
+ cols = static_cast<const char**>(
+ mem_heap_zalloc(heap, user_table->n_def * sizeof *cols));
+
+ i = 0;
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+ while (const Create_field* new_field = cf_it++) {
+ DBUG_ASSERT(i < altered_table->s->fields);
+
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ if (new_field->field == table->field[old_i]) {
+ cols[old_i] = new_field->field_name;
+ break;
+ }
+ }
+
+ i++;
+ }
+
+ /* Copy the internal column names. */
+ i = table->s->fields;
+ cols[i] = dict_table_get_col_name(user_table, i);
+
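+	/* The internal column names are stored back to back in
+	user_table->col_names, each terminated by NUL, so the next name
+	starts right after the terminator of the previous one. */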
+ while (++i < user_table->n_def) {
+ cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+ }
+
+ DBUG_RETURN(cols);
+}
+
+/** Update internal structures with concurrent writes blocked,
+while preparing ALTER TABLE.
+
+@param ha_alter_info Data used during in-place alter
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param table_name Table name in MySQL
+@param flags Table and tablespace flags
+@param flags2 Additional table flags
+@param fts_doc_id_col The column number of FTS_DOC_ID
+@param add_fts_doc_id Flag: add column FTS_DOC_ID?
+@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)?
+
+@retval true Failure
+@retval false Success
+*/
+static __attribute__((warn_unused_result, nonnull(1,2,3,4)))
+bool
+prepare_inplace_alter_table_dict(
+/*=============================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* altered_table,
+ const TABLE* old_table,
+ const char* table_name,
+ ulint flags,
+ ulint flags2,
+ ulint fts_doc_id_col,
+ bool add_fts_doc_id,
+ bool add_fts_doc_id_idx)
+{
+ bool dict_locked = false;
+ ulint* add_key_nums; /* MySQL key numbers */
+ index_def_t* index_defs; /* index definitions */
+ dict_table_t* user_table;
+ dict_index_t* fts_index = NULL;
+ ulint new_clustered = 0;
+ dberr_t error;
+ ulint num_fts_index;
+ ha_innobase_inplace_ctx*ctx;
+
+ DBUG_ENTER("prepare_inplace_alter_table_dict");
+
+ ctx = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED)
+ == (ctx->sequence.m_max_value > 0));
+ DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index);
+ DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk);
+ DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx);
+ DBUG_ASSERT(!add_fts_doc_id_idx
+ || innobase_fulltext_exist(altered_table));
+ DBUG_ASSERT(!ctx->add_cols);
+ DBUG_ASSERT(!ctx->add_index);
+ DBUG_ASSERT(!ctx->add_key_numbers);
+ DBUG_ASSERT(!ctx->num_to_add_index);
+
+ user_table = ctx->new_table;
+
+ trx_start_if_not_started_xa(ctx->prebuilt->trx);
+
+ /* Create a background transaction for the operations on
+ the data dictionary tables. */
+ ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd);
+
+ trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+
+	/* Build the definitions of all indexes to be created by this
+	ALTER TABLE, so that they will be added to the table in the
+	correct order. */
+
+ ctx->num_to_add_index = ha_alter_info->index_add_count;
+
+ index_defs = innobase_create_key_defs(
+ ctx->heap, ha_alter_info, altered_table, ctx->num_to_add_index,
+ num_fts_index,
+ row_table_got_default_clust_index(ctx->new_table),
+ fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx);
+
+ new_clustered = DICT_CLUSTERED & index_defs[0].ind_type;
+
+ if (num_fts_index > 1) {
+ my_error(ER_INNODB_FT_LIMIT, MYF(0));
+ goto error_handled;
+ }
+
+ if (!ctx->online) {
+ /* This is not an online operation (LOCK=NONE). */
+ } else if (ctx->add_autoinc == ULINT_UNDEFINED
+ && num_fts_index == 0
+ && (!innobase_need_rebuild(ha_alter_info)
+ || !innobase_fulltext_exist(altered_table))) {
+ /* InnoDB can perform an online operation (LOCK=NONE). */
+ } else {
+ /* This should have been blocked in
+ check_if_supported_inplace_alter(). */
+ ut_ad(0);
+ my_error(ER_NOT_SUPPORTED_YET, MYF(0),
+ thd_query_string(ctx->prebuilt->trx->mysql_thd)->str);
+ goto error_handled;
+ }
+
+	/* The primary index would be rebuilt if an FTS Doc ID
+	column is to be added; the primary index definition
+	is simply copied from the old table and stored in indexdefs[0] */
+ DBUG_ASSERT(!add_fts_doc_id || new_clustered);
+ DBUG_ASSERT(!!new_clustered ==
+ (innobase_need_rebuild(ha_alter_info)
+ || add_fts_doc_id));
+
+ /* Allocate memory for dictionary index definitions */
+
+ ctx->add_index = static_cast<dict_index_t**>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+ * sizeof *ctx->add_index));
+ ctx->add_key_numbers = add_key_nums = static_cast<ulint*>(
+ mem_heap_alloc(ctx->heap, ctx->num_to_add_index
+ * sizeof *ctx->add_key_numbers));
+
+	/* This transaction should be a dictionary operation, so that
+ the data dictionary will be locked during crash recovery. */
+
+ ut_ad(ctx->trx->dict_operation == TRX_DICT_OP_INDEX);
+
+ /* Acquire a lock on the table before creating any indexes. */
+
+ if (ctx->online) {
+ error = DB_SUCCESS;
+ } else {
+ error = row_merge_lock_table(
+ ctx->prebuilt->trx, ctx->new_table, LOCK_S);
+
+ if (error != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during an index create operation. */
+
+ row_mysql_lock_data_dictionary(ctx->trx);
+ dict_locked = true;
+
+	/* Wait for background stats processing to stop using the table
+	that we are going to alter. We know that background stats will not
+	start using it again while we hold the data dictionary lock, and we
+	hold it here at least until the ut_ad(user_table->n_ref_count == 1)
+	check below.
+	XXX what may happen if background stats opens the table after we
+	have unlocked the data dictionary below? */
+ dict_stats_wait_bg_to_stop_using_table(user_table, ctx->trx);
+
+ online_retry_drop_indexes_low(ctx->new_table, ctx->trx);
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ABORTED_OK));
+
+ /* If a new clustered index is defined for the table we need
+ to rebuild the table with a temporary name. */
+
+ if (new_clustered) {
+ const char* new_table_name
+ = dict_mem_create_temporary_tablename(
+ ctx->heap,
+ ctx->new_table->name,
+ ctx->new_table->id);
+ ulint n_cols;
+ dtuple_t* add_cols;
+
+ if (innobase_check_foreigns(
+ ha_alter_info, altered_table, old_table,
+ user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
+ goto new_clustered_failed;
+ }
+
+ n_cols = altered_table->s->fields;
+
+ if (add_fts_doc_id) {
+ n_cols++;
+ DBUG_ASSERT(flags2 & DICT_TF2_FTS);
+ DBUG_ASSERT(add_fts_doc_id_idx);
+ flags2 |= DICT_TF2_FTS_ADD_DOC_ID
+ | DICT_TF2_FTS_HAS_DOC_ID
+ | DICT_TF2_FTS;
+ }
+
+ DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS));
+
+ /* Create the table. */
+ trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE);
+
+ if (dict_table_get_low(new_table_name)) {
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0),
+ new_table_name);
+ goto new_clustered_failed;
+ }
+
+ /* The initial space id 0 may be overridden later. */
+ ctx->new_table = dict_mem_table_create(
+ new_table_name, 0, n_cols, flags, flags2);
+ /* The rebuilt indexed_table will use the renamed
+ column names. */
+ ctx->col_names = NULL;
+
+ if (DICT_TF_HAS_DATA_DIR(flags)) {
+ ctx->new_table->data_dir_path =
+ mem_heap_strdup(ctx->new_table->heap,
+ user_table->data_dir_path);
+ }
+
+ for (uint i = 0; i < altered_table->s->fields; i++) {
+ const Field* field = altered_table->field[i];
+ ulint is_unsigned;
+ ulint field_type
+ = (ulint) field->type();
+ ulint col_type
+ = get_innobase_type_from_mysql_type(
+ &is_unsigned, field);
+ ulint charset_no;
+ ulint col_len;
+
+ /* we assume in dtype_form_prtype() that this
+ fits in two bytes */
+ ut_a(field_type <= MAX_CHAR_COLL_NUM);
+
+ if (!field->real_maybe_null()) {
+ field_type |= DATA_NOT_NULL;
+ }
+
+ if (field->binary()) {
+ field_type |= DATA_BINARY_TYPE;
+ }
+
+ if (is_unsigned) {
+ field_type |= DATA_UNSIGNED;
+ }
+
+ if (dtype_is_string_type(col_type)) {
+ charset_no = (ulint) field->charset()->number;
+
+ if (charset_no > MAX_CHAR_COLL_NUM) {
+ dict_mem_table_free(
+ ctx->new_table);
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
+ goto new_clustered_failed;
+ }
+ } else {
+ charset_no = 0;
+ }
+
+ col_len = field->pack_length();
+
+ /* The MySQL pack length contains 1 or 2 bytes
+ length field for a true VARCHAR. Let us
+ subtract that, so that the InnoDB column
+ length in the InnoDB data dictionary is the
+ real maximum byte length of the actual data. */
+
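+			/* Illustrative example (added comment): a
+			VARCHAR(300) column in a single-byte charset has
+			pack_length() = 302 with length_bytes = 2, so
+			col_len becomes 300 below. */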
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ uint32 length_bytes
+ = static_cast<const Field_varstring*>(
+ field)->length_bytes;
+
+ col_len -= length_bytes;
+
+ if (length_bytes == 2) {
+ field_type |= DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ if (dict_col_name_is_reserved(field->field_name)) {
+ dict_mem_table_free(ctx->new_table);
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+ goto new_clustered_failed;
+ }
+
+ dict_mem_table_add_col(
+ ctx->new_table, ctx->heap,
+ field->field_name,
+ col_type,
+ dtype_form_prtype(field_type, charset_no),
+ col_len);
+ }
+
+ if (add_fts_doc_id) {
+ fts_add_doc_id_column(ctx->new_table, ctx->heap);
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ ut_ad(fts_doc_id_col == altered_table->s->fields);
+ } else if (ctx->new_table->fts) {
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ }
+
+ error = row_create_table_for_mysql(
+ ctx->new_table, ctx->trx, false);
+
+ switch (error) {
+ dict_table_t* temp_table;
+ case DB_SUCCESS:
+ /* We need to bump up the table ref count and
+ before we can use it we need to open the
+ table. The new_table must be in the data
+ dictionary cache, because we are still holding
+ the dict_sys->mutex. */
+ ut_ad(mutex_own(&dict_sys->mutex));
+ temp_table = dict_table_open_on_name(
+ ctx->new_table->name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+ ut_a(ctx->new_table == temp_table);
+ /* n_ref_count must be 1, because purge cannot
+ be executing on this very table as we are
+ holding dict_operation_lock X-latch. */
+ DBUG_ASSERT(ctx->new_table->n_ref_count == 1);
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0),
+ new_table_name);
+ goto new_clustered_failed;
+ case DB_DUPLICATE_KEY:
+ my_error(HA_ERR_TABLE_EXIST, MYF(0),
+ altered_table->s->table_name.str);
+ goto new_clustered_failed;
+ default:
+ my_error_innodb(error, table_name, flags);
+ new_clustered_failed:
+ DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx);
+ trx_rollback_to_savepoint(ctx->trx, NULL);
+
+ ut_ad(user_table->n_ref_count == 1);
+
+ online_retry_drop_indexes_with_trx(
+ user_table, ctx->trx);
+ goto err_exit;
+ }
+
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_COLUMN) {
+ add_cols = dtuple_create(
+ ctx->heap,
+ dict_table_get_n_cols(ctx->new_table));
+
+ dict_table_copy_types(add_cols, ctx->new_table);
+ } else {
+ add_cols = NULL;
+ }
+
+ ctx->col_map = innobase_build_col_map(
+ ha_alter_info, altered_table, old_table,
+ ctx->new_table, user_table,
+ add_cols, ctx->heap);
+ ctx->add_cols = add_cols;
+ } else {
+ DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info));
+
+ if (!ctx->new_table->fts
+ && innobase_fulltext_exist(altered_table)) {
+ ctx->new_table->fts = fts_create(
+ ctx->new_table);
+ ctx->new_table->fts->doc_col = fts_doc_id_col;
+ }
+ }
+
+ /* Assign table_id, so that no table id of
+ fts_create_index_tables() will be written to the undo logs. */
+ DBUG_ASSERT(ctx->new_table->id != 0);
+ ctx->trx->table_id = ctx->new_table->id;
+
+ /* Create the indexes in SYS_INDEXES and load into dictionary. */
+
+ for (ulint a = 0; a < ctx->num_to_add_index; a++) {
+
+ ctx->add_index[a] = row_merge_create_index(
+ ctx->trx, ctx->new_table,
+ &index_defs[a]);
+
+ add_key_nums[a] = index_defs[a].key_number;
+
+ if (!ctx->add_index[a]) {
+ error = ctx->trx->error_state;
+ DBUG_ASSERT(error != DB_SUCCESS);
+ goto error_handling;
+ }
+
+ if (ctx->add_index[a]->type & DICT_FTS) {
+ DBUG_ASSERT(num_fts_index);
+ DBUG_ASSERT(!fts_index);
+ DBUG_ASSERT(ctx->add_index[a]->type == DICT_FTS);
+ fts_index = ctx->add_index[a];
+ }
+
+ /* If only online ALTER TABLE operations have been
+ requested, allocate a modification log. If the table
+ will be locked anyway, the modification
+ log is unnecessary. When rebuilding the table
+ (new_clustered), we will allocate the log for the
+ clustered index of the old table, later. */
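+		/* (Added note: for LOCK=NONE creation of a plain
+		secondary index, the row_log_allocate() call below sets
+		up the log that buffers concurrent DML on that index.) */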
+ if (new_clustered
+ || !ctx->online
+ || user_table->ibd_file_missing
+ || dict_table_is_discarded(user_table)) {
+ /* No need to allocate a modification log. */
+ ut_ad(!ctx->add_index[a]->online_log);
+ } else if (ctx->add_index[a]->type & DICT_FTS) {
+ /* Fulltext indexes are not covered
+ by a modification log. */
+ } else {
+ DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter",
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;);
+ rw_lock_x_lock(&ctx->add_index[a]->lock);
+ bool ok = row_log_allocate(ctx->add_index[a],
+ NULL, true, NULL, NULL);
+ rw_lock_x_unlock(&ctx->add_index[a]->lock);
+
+ if (!ok) {
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;
+ }
+ }
+ }
+
+ ut_ad(new_clustered == ctx->need_rebuild());
+
+ DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter",
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;);
+
+ if (new_clustered && ctx->online) {
+ /* Allocate a log for online table rebuild. */
+ dict_index_t* clust_index = dict_table_get_first_index(
+ user_table);
+
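+		/* Note (added for clarity): the third argument to
+		row_log_allocate() below indicates whether the PRIMARY
+		KEY definition stays the same; it only does when no new
+		PRIMARY KEY is being added. */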
+ rw_lock_x_lock(&clust_index->lock);
+ bool ok = row_log_allocate(
+ clust_index, ctx->new_table,
+ !(ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_PK_INDEX),
+ ctx->add_cols, ctx->col_map);
+ rw_lock_x_unlock(&clust_index->lock);
+
+ if (!ok) {
+ error = DB_OUT_OF_MEMORY;
+ goto error_handling;
+ }
+ }
+
+ if (ctx->online) {
+ /* Assign a consistent read view for
+ row_merge_read_clustered_index(). */
+ trx_assign_read_view(ctx->prebuilt->trx);
+ }
+
+ if (fts_index) {
+ /* Ensure that the dictionary operation mode will
+ not change while creating the auxiliary tables. */
+ trx_dict_op_t op = trx_get_dict_operation(ctx->trx);
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ goto op_ok;
+ }
+ ut_error;
+op_ok:
+#endif /* UNIV_DEBUG */
+ ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
+
+ /* This function will commit the transaction and reset
+ the trx_t::dict_operation flag on success. */
+
+ error = fts_create_index_tables(ctx->trx, fts_index);
+
+ DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table",
+ error = DB_LOCK_WAIT_TIMEOUT;
+ goto error_handling;);
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+
+ trx_start_for_ddl(ctx->trx, op);
+
+ if (!ctx->new_table->fts
+ || ib_vector_size(ctx->new_table->fts->indexes) == 0) {
+ error = fts_create_common_tables(
+ ctx->trx, ctx->new_table,
+ user_table->name, TRUE);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_fail_after_fts_common_table",
+ error = DB_LOCK_WAIT_TIMEOUT;);
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+
+ ctx->new_table->fts->fts_status
+ |= TABLE_DICT_LOCKED;
+
+ error = innobase_fts_load_stopword(
+ ctx->new_table, ctx->trx,
+ ctx->prebuilt->trx->mysql_thd)
+ ? DB_SUCCESS : DB_ERROR;
+ ctx->new_table->fts->fts_status
+ &= ~TABLE_DICT_LOCKED;
+
+ if (error != DB_SUCCESS) {
+ goto error_handling;
+ }
+ }
+
+ ut_ad(trx_get_dict_operation(ctx->trx) == op);
+ }
+
+ DBUG_ASSERT(error == DB_SUCCESS);
+
+ /* Commit the data dictionary transaction in order to release
+ the table locks on the system tables. This means that if
+ MySQL crashes while creating a new primary key inside
+ row_merge_build_indexes(), ctx->new_table will not be dropped
+ by trx_rollback_active(). It will have to be recovered or
+ dropped by the database administrator. */
+ trx_commit_for_mysql(ctx->trx);
+
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ dict_locked = false;
+
+ ut_a(ctx->trx->lock.n_active_thrs == 0);
+
+error_handling:
+	/* After an error, remove from the data dictionary any index
+	definitions that were created. */
+
+ switch (error) {
+ case DB_SUCCESS:
+ ut_a(!dict_locked);
+
+ ut_d(mutex_enter(&dict_sys->mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ user_table, CHECK_PARTIAL_OK));
+ ut_d(mutex_exit(&dict_sys->mutex));
+ DBUG_RETURN(false);
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)");
+ break;
+ case DB_DUPLICATE_KEY:
+ my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES");
+ break;
+ default:
+ my_error_innodb(error, table_name, user_table->flags);
+ }
+
+error_handled:
+
+ ctx->prebuilt->trx->error_info = NULL;
+ ctx->trx->error_state = DB_SUCCESS;
+
+ if (!dict_locked) {
+ row_mysql_lock_data_dictionary(ctx->trx);
+ }
+
+ if (new_clustered) {
+ if (ctx->need_rebuild()) {
+
+ if (DICT_TF2_FLAG_IS_SET(
+ ctx->new_table, DICT_TF2_FTS)) {
+ innobase_drop_fts_index_table(
+ ctx->new_table, ctx->trx);
+ }
+
+ dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+ /* Nobody should have initialized the stats of the
+ newly created table yet. When this is the case, we
+ know that it has not been added for background stats
+ gathering. */
+ ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+ row_merge_drop_table(ctx->trx, ctx->new_table);
+
+ /* Free the log for online table rebuild, if
+ one was allocated. */
+
+ dict_index_t* clust_index = dict_table_get_first_index(
+ user_table);
+
+ rw_lock_x_lock(&clust_index->lock);
+
+ if (clust_index->online_log) {
+ ut_ad(ctx->online);
+ row_log_abort_sec(clust_index);
+ clust_index->online_status
+ = ONLINE_INDEX_COMPLETE;
+ }
+
+ rw_lock_x_unlock(&clust_index->lock);
+ }
+
+ trx_commit_for_mysql(ctx->trx);
+ /* n_ref_count must be 1, because purge cannot
+ be executing on this very table as we are
+ holding dict_operation_lock X-latch. */
+ DBUG_ASSERT(user_table->n_ref_count == 1 || ctx->online);
+
+ online_retry_drop_indexes_with_trx(user_table, ctx->trx);
+ } else {
+ ut_ad(!ctx->need_rebuild());
+ row_merge_drop_indexes(ctx->trx, user_table, TRUE);
+ trx_commit_for_mysql(ctx->trx);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE));
+ ut_ad(!user_table->drop_aborted);
+
+err_exit:
+ /* Clear the to_be_dropped flag in the data dictionary cache. */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ DBUG_ASSERT(*ctx->drop_index[i]->name != TEMP_INDEX_PREFIX);
+ DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped);
+ ctx->drop_index[i]->to_be_dropped = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(ctx->trx);
+
+ trx_free_for_mysql(ctx->trx);
+ trx_commit_for_mysql(ctx->prebuilt->trx);
+
+ delete ctx;
+ ha_alter_info->handler_ctx = NULL;
+
+ DBUG_RETURN(true);
+}
+
+/* Check whether an index is needed by a foreign key constraint.
+If it is, and it is being dropped, check whether an equivalent index
+can take over its role.
+@return true if the index is needed and cannot be dropped */
+static __attribute__((nonnull(1,2,3,5), warn_unused_result))
+bool
+innobase_check_foreign_key_index(
+/*=============================*/
+ Alter_inplace_info* ha_alter_info, /*!< in: Structure describing
+ changes to be done by ALTER
+ TABLE */
+ dict_index_t* index, /*!< in: index to check */
+ dict_table_t* indexed_table, /*!< in: table that owns the
+ foreign keys */
+ const char** col_names, /*!< in: column names, or NULL
+ for indexed_table->col_names */
+ trx_t* trx, /*!< in/out: transaction */
+ dict_foreign_t** drop_fk, /*!< in: Foreign key constraints
+ to drop */
+ ulint n_drop_fk) /*!< in: Number of foreign keys
+ to drop */
+{
+ dict_foreign_t* foreign;
+
+ /* Check if the index is referenced. */
+ foreign = dict_table_get_referenced_constraint(indexed_table, index);
+
+ ut_ad(!foreign || indexed_table
+ == foreign->referenced_table);
+
+ if (foreign
+ && !dict_foreign_find_index(
+ indexed_table, col_names,
+ foreign->referenced_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE,
+ /*check_null=*/FALSE)
+ && !innobase_find_equiv_index(
+ foreign->referenced_col_names,
+ foreign->n_fields,
+ ha_alter_info->key_info_buffer,
+ ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count)
+ ) {
+ trx->error_info = index;
+ return(true);
+ }
+
+ /* Check if this index references some
+ other table */
+ foreign = dict_table_get_foreign_constraint(
+ indexed_table, index);
+
+ ut_ad(!foreign || indexed_table
+ == foreign->foreign_table);
+
+ if (foreign
+ && !innobase_dropping_foreign(
+ foreign, drop_fk, n_drop_fk)
+ && !dict_foreign_find_index(
+ indexed_table, col_names,
+ foreign->foreign_col_names,
+ foreign->n_fields, index,
+ /*check_charsets=*/TRUE,
+ /*check_null=*/FALSE)
+ && !innobase_find_equiv_index(
+ foreign->foreign_col_names,
+ foreign->n_fields,
+ ha_alter_info->key_info_buffer,
+ ha_alter_info->index_add_buffer,
+ ha_alter_info->index_add_count)
+ ) {
+ trx->error_info = index;
+ return(true);
+ }
+
+ return(false);
+}
+
+/** Allows InnoDB to update internal structures with concurrent
+writes blocked (provided that check_if_supported_inplace_alter()
+did not return HA_ALTER_INPLACE_NO_LOCK).
+This will be invoked before inplace_alter_table().
+
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true Failure
+@retval false Success
+*/
+UNIV_INTERN
+bool
+ha_innobase::prepare_inplace_alter_table(
+/*=====================================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ dict_index_t** drop_index; /*!< Index to be dropped */
+ ulint n_drop_index; /*!< Number of indexes to drop */
+ dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */
+ ulint n_drop_fk; /*!< Number of foreign keys to drop */
+	dict_foreign_t**add_fk = NULL;	/*!< Foreign key constraints to add */
+	ulint		n_add_fk;	/*!< Number of foreign keys to add */
+ dict_table_t* indexed_table; /*!< Table where indexes are created */
+ mem_heap_t* heap;
+ const char** col_names;
+ int error;
+ ulint flags;
+ ulint flags2;
+ ulint max_col_len;
+ ulint add_autoinc_col_no = ULINT_UNDEFINED;
+ ulonglong autoinc_col_max_value = 0;
+ ulint fts_doc_col_no = ULINT_UNDEFINED;
+ bool add_fts_doc_id = false;
+ bool add_fts_doc_id_idx = false;
+ bool add_fts_idx = false;
+
+ DBUG_ENTER("prepare_inplace_alter_table");
+ DBUG_ASSERT(!ha_alter_info->handler_ctx);
+ DBUG_ASSERT(ha_alter_info->create_info);
+ DBUG_ASSERT(!srv_read_only_mode);
+
+ MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE);
+
+#ifdef UNIV_DEBUG
+ for (dict_index_t* index = dict_table_get_first_index(prebuilt->table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ ut_ad(!index->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_d(mutex_enter(&dict_sys->mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ prebuilt->table, CHECK_ABORTED_OK));
+ ut_d(mutex_exit(&dict_sys->mutex));
+
+ if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+ /* Nothing to do */
+ goto func_exit;
+ }
+
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::CHANGE_CREATE_OPTION) {
+ if (const char* invalid_opt = create_options_are_invalid(
+ user_thd, altered_table,
+ ha_alter_info->create_info,
+ prebuilt->table->space != 0)) {
+ my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+ table_type(), invalid_opt);
+ goto err_exit_no_heap;
+ }
+ }
+
+ /* Check if any index name is reserved. */
+ if (innobase_index_name_is_reserved(
+ user_thd,
+ ha_alter_info->key_info_buffer,
+ ha_alter_info->key_count)) {
+err_exit_no_heap:
+ DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0);
+ if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+ online_retry_drop_indexes(prebuilt->table, user_thd);
+ }
+ DBUG_RETURN(true);
+ }
+
+ indexed_table = prebuilt->table;
+
+ /* Check that index keys are sensible */
+ error = innobase_check_index_keys(ha_alter_info, indexed_table);
+
+ if (error) {
+ goto err_exit_no_heap;
+ }
+
+ /* Prohibit renaming a column to something that the table
+ already contains. */
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME) {
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+
+ for (Field** fp = table->field; *fp; fp++) {
+ if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+ continue;
+ }
+
+ const char* name = 0;
+
+ cf_it.rewind();
+ while (Create_field* cf = cf_it++) {
+ if (cf->field == *fp) {
+ name = cf->field_name;
+ goto check_if_ok_to_rename;
+ }
+ }
+
+ ut_error;
+check_if_ok_to_rename:
+ /* Prohibit renaming a column from FTS_DOC_ID
+ if full-text indexes exist. */
+ if (!my_strcasecmp(system_charset_info,
+ (*fp)->field_name,
+ FTS_DOC_ID_COL_NAME)
+ && innobase_fulltext_exist(altered_table)) {
+ my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN,
+ MYF(0), name);
+ goto err_exit_no_heap;
+ }
+
+ /* Prohibit renaming a column to an internal column. */
+ const char* s = prebuilt->table->col_names;
+ unsigned j;
+ /* Skip user columns.
+ MySQL should have checked these already.
+ We want to allow renaming of c1 to c2, c2 to c1. */
+ for (j = 0; j < table->s->fields; j++) {
+ s += strlen(s) + 1;
+ }
+
+ for (; j < prebuilt->table->n_def; j++) {
+ if (!my_strcasecmp(
+ system_charset_info, name, s)) {
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ s);
+ goto err_exit_no_heap;
+ }
+
+ s += strlen(s) + 1;
+ }
+ }
+ }
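+
+ /* Note on the name walk above (informal): dict_table_t's
+ col_names is a single buffer of NUL-terminated names laid out
+ back to back, user columns first, then internal columns, e.g.
+ (hypothetically) "c1\0c2\0DB_ROW_ID\0DB_TRX_ID\0DB_ROLL_PTR\0";
+ s += strlen(s) + 1 therefore advances to the next name. */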
+
+ if (!innobase_table_flags(altered_table,
+ ha_alter_info->create_info,
+ user_thd,
+ srv_file_per_table
+ || indexed_table->space != 0,
+ &flags, &flags2)) {
+ goto err_exit_no_heap;
+ }
+
+ max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
+
+ /* Check each index's column length to make sure they do not
+ exceed limit */
+ for (ulint i = 0; i < ha_alter_info->index_add_count; i++) {
+ const KEY* key = &ha_alter_info->key_info_buffer[
+ ha_alter_info->index_add_buffer[i]];
+
+ if (key->flags & HA_FULLTEXT) {
+ /* The column length does not matter for
+ fulltext search indexes. But, UNIQUE
+ fulltext indexes are not supported. */
+ DBUG_ASSERT(!(key->flags & HA_NOSAME));
+ DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK
+ & ~(HA_FULLTEXT
+ | HA_PACK_KEY
+ | HA_BINARY_PACK_KEY)));
+ add_fts_idx = true;
+ continue;
+ }
+
+ if (innobase_check_column_length(max_col_len, key)) {
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ max_col_len);
+ goto err_exit_no_heap;
+ }
+ }
+
+ /* Adding an FTS index is not allowed on a table that already
+ has FTS indexes but does not have AUX_HEX_NAME set: the
+ existing auxiliary tables were never renamed to the hex
+ format, while any newly created auxiliary tables would use
+ it, which would be inconsistent. */
+ if (!DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS_AUX_HEX_NAME)
+ && indexed_table->fts != NULL && add_fts_idx) {
+ my_error(ER_INNODB_FT_AUX_NOT_HEX_ID, MYF(0));
+ goto err_exit_no_heap;
+ }
+
+ /* Check existing index definitions for too-long column
+ prefixes as well, in case max_col_len shrunk. */
+ for (const dict_index_t* index
+ = dict_table_get_first_index(indexed_table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS
+ || (index->type & DICT_CORRUPT));
+ continue;
+ }
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ if (field->prefix_len > max_col_len) {
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ max_col_len);
+ goto err_exit_no_heap;
+ }
+ }
+ }
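+
+ /* For reference (informal; values for this InnoDB version):
+ DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG() is 767 bytes for the
+ REDUNDANT and COMPACT row formats and 3072 bytes when large
+ prefixes apply (DYNAMIC and COMPRESSED), so changing the row
+ format can shrink max_col_len below an existing prefix_len. */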
+
+ n_drop_index = 0;
+ n_drop_fk = 0;
+
+ if (ha_alter_info->handler_flags
+ & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD)) {
+ heap = mem_heap_create(1024);
+
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME) {
+ col_names = innobase_get_col_names(
+ ha_alter_info, altered_table, table,
+ indexed_table, heap);
+ } else {
+ col_names = NULL;
+ }
+ } else {
+ heap = NULL;
+ col_names = NULL;
+ }
+
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::DROP_FOREIGN_KEY) {
+ DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0);
+
+ drop_fk = static_cast<dict_foreign_t**>(
+ mem_heap_alloc(
+ heap,
+ ha_alter_info->alter_info->drop_list.elements
+ * sizeof(dict_foreign_t*)));
+
+ List_iterator<Alter_drop> drop_it(
+ ha_alter_info->alter_info->drop_list);
+
+ while (Alter_drop* drop = drop_it++) {
+ if (drop->type != Alter_drop::FOREIGN_KEY) {
+ continue;
+ }
+
+ for (dict_foreign_set::iterator it
+ = prebuilt->table->foreign_set.begin();
+ it != prebuilt->table->foreign_set.end();
+ ++it) {
+
+ dict_foreign_t* foreign = *it;
+ const char* fid = strchr(foreign->id, '/');
+
+ DBUG_ASSERT(fid);
+ /* If no database/ prefix was present in
+ the FOREIGN KEY constraint name, compare
+ to the full constraint name. */
+ fid = fid ? fid + 1 : foreign->id;
+
+ if (!my_strcasecmp(system_charset_info,
+ fid, drop->name)) {
+ drop_fk[n_drop_fk++] = foreign;
+ goto found_fk;
+ }
+ }
+
+ my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0),
+ drop->name);
+ goto err_exit;
+found_fk:
+ continue;
+ }
+
+ DBUG_ASSERT(n_drop_fk > 0);
+ DBUG_ASSERT(n_drop_fk
+ == ha_alter_info->alter_info->drop_list.elements);
+ } else {
+ drop_fk = NULL;
+ }
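+
+ /* Matching note (informal): foreign->id is stored as
+ "databasename/constraintname", e.g. a hypothetical
+ "test/fk_child_parent", so the text after the '/' is what is
+ compared case-insensitively against the name given in
+ ALTER TABLE ... DROP FOREIGN KEY. */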
+
+ if (ha_alter_info->index_drop_count) {
+ dict_index_t* drop_primary = NULL;
+
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & (Alter_inplace_info::DROP_INDEX
+ | Alter_inplace_info::DROP_UNIQUE_INDEX
+ | Alter_inplace_info::DROP_PK_INDEX));
+ /* Check which indexes to drop. */
+ drop_index = static_cast<dict_index_t**>(
+ mem_heap_alloc(
+ heap, (ha_alter_info->index_drop_count + 1)
+ * sizeof *drop_index));
+
+ for (uint i = 0; i < ha_alter_info->index_drop_count; i++) {
+ const KEY* key
+ = ha_alter_info->index_drop_buffer[i];
+ dict_index_t* index
+ = dict_table_get_index_on_name_and_min_id(
+ indexed_table, key->name);
+
+ if (!index) {
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_WRONG_INDEX,
+ "InnoDB could not find key "
+ "with name %s", key->name);
+ } else {
+ ut_ad(!index->to_be_dropped);
+ if (!dict_index_is_clust(index)) {
+ drop_index[n_drop_index++] = index;
+ } else {
+ drop_primary = index;
+ }
+ }
+ }
+
+ /* If all FULLTEXT indexes were removed, drop the internal
+ FTS_DOC_ID_INDEX as well, unless that index was defined by
+ the user and thus exists in the MySQL data dictionary. */
+
+ if (innobase_fulltext_exist(table)
+ && !innobase_fulltext_exist(altered_table)
+ && !DICT_TF2_FLAG_IS_SET(
+ indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ dict_index_t* fts_doc_index
+ = dict_table_get_index_on_name(
+ indexed_table, FTS_DOC_ID_INDEX_NAME);
+
+ // Add some fault tolerance for non-debug builds.
+ if (fts_doc_index == NULL) {
+ goto check_if_can_drop_indexes;
+ }
+
+ DBUG_ASSERT(!fts_doc_index->to_be_dropped);
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (!my_strcasecmp(
+ system_charset_info,
+ FTS_DOC_ID_INDEX_NAME,
+ table->key_info[i].name)) {
+ /* The index exists in the MySQL
+ data dictionary. Do not drop it,
+ even though it is no longer needed
+ by InnoDB fulltext search. */
+ goto check_if_can_drop_indexes;
+ }
+ }
+
+ drop_index[n_drop_index++] = fts_doc_index;
+ }
+
+check_if_can_drop_indexes:
+ /* Check if the indexes can be dropped. */
+
+ /* Prevent a race condition between DROP INDEX and
+ CREATE TABLE adding FOREIGN KEY constraints. */
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ if (!n_drop_index) {
+ drop_index = NULL;
+ } else {
+ /* Flag all indexes that are to be dropped. */
+ for (ulint i = 0; i < n_drop_index; i++) {
+ ut_ad(!drop_index[i]->to_be_dropped);
+ drop_index[i]->to_be_dropped = 1;
+ }
+ }
+
+ if (prebuilt->trx->check_foreigns) {
+ for (uint i = 0; i < n_drop_index; i++) {
+ dict_index_t* index = drop_index[i];
+
+ if (innobase_check_foreign_key_index(
+ ha_alter_info, index,
+ indexed_table, col_names,
+ prebuilt->trx, drop_fk, n_drop_fk)) {
+ row_mysql_unlock_data_dictionary(
+ prebuilt->trx);
+ prebuilt->trx->error_info = index;
+ print_error(HA_ERR_DROP_INDEX_FK,
+ MYF(0));
+ goto err_exit;
+ }
+ }
+
+ /* If the primary index is being dropped, check whether any
+ dependent foreign key constraints would be affected. */
+ if (drop_primary
+ && innobase_check_foreign_key_index(
+ ha_alter_info, drop_primary,
+ indexed_table, col_names,
+ prebuilt->trx, drop_fk, n_drop_fk)) {
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ print_error(HA_ERR_DROP_INDEX_FK, MYF(0));
+ goto err_exit;
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ } else {
+ drop_index = NULL;
+ }
+
+ n_add_fk = 0;
+
+ if (ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_FOREIGN_KEY) {
+ ut_ad(!prebuilt->trx->check_foreigns);
+
+ add_fk = static_cast<dict_foreign_t**>(
+ mem_heap_zalloc(
+ heap,
+ ha_alter_info->alter_info->key_list.elements
+ * sizeof(dict_foreign_t*)));
+
+ if (!innobase_get_foreign_key_info(
+ ha_alter_info, table_share,
+ prebuilt->table, col_names,
+ drop_index, n_drop_index,
+ add_fk, &n_add_fk, prebuilt->trx)) {
+err_exit:
+ if (n_drop_index) {
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ /* Clear the to_be_dropped flags, which might
+ have been set at this point. */
+ for (ulint i = 0; i < n_drop_index; i++) {
+ DBUG_ASSERT(*drop_index[i]->name
+ != TEMP_INDEX_PREFIX);
+ drop_index[i]->to_be_dropped = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ goto err_exit_no_heap;
+ }
+ }
+
+ if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
+ || (ha_alter_info->handler_flags
+ == Alter_inplace_info::CHANGE_CREATE_OPTION
+ && !innobase_need_rebuild(ha_alter_info))) {
+
+ if (heap) {
+ ha_alter_info->handler_ctx
+ = new ha_innobase_inplace_ctx(
+ prebuilt,
+ drop_index, n_drop_index,
+ drop_fk, n_drop_fk,
+ add_fk, n_add_fk,
+ ha_alter_info->online,
+ heap, indexed_table,
+ col_names, ULINT_UNDEFINED, 0, 0);
+ }
+
+func_exit:
+ DBUG_ASSERT(prebuilt->trx->dict_operation_lock_mode == 0);
+ if (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+ online_retry_drop_indexes(prebuilt->table, user_thd);
+ }
+ DBUG_RETURN(false);
+ }
+
+ /* If we are to build a full-text search index, check whether
+ the table already has a DOC ID column. If not, we will need to
+ add a Doc ID hidden column and rebuild the primary index */
+ if (innobase_fulltext_exist(altered_table)) {
+ ulint doc_col_no;
+
+ if (!innobase_fts_check_doc_id_col(
+ prebuilt->table, altered_table, &fts_doc_col_no)) {
+ fts_doc_col_no = altered_table->s->fields;
+ add_fts_doc_id = true;
+ add_fts_doc_id_idx = true;
+
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ HA_ERR_WRONG_INDEX,
+ "InnoDB rebuilding table to add column "
+ FTS_DOC_ID_COL_NAME);
+ } else if (fts_doc_col_no == ULINT_UNDEFINED) {
+ goto err_exit;
+ }
+
+ switch (innobase_fts_check_doc_id_index(
+ prebuilt->table, altered_table, &doc_col_no)) {
+ case FTS_NOT_EXIST_DOC_ID_INDEX:
+ add_fts_doc_id_idx = true;
+ break;
+ case FTS_INCORRECT_DOC_ID_INDEX:
+ my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0),
+ FTS_DOC_ID_INDEX_NAME);
+ goto err_exit;
+ case FTS_EXIST_DOC_ID_INDEX:
+ DBUG_ASSERT(doc_col_no == fts_doc_col_no
+ || doc_col_no == ULINT_UNDEFINED
+ || (ha_alter_info->handler_flags
+ & (Alter_inplace_info::ALTER_COLUMN_ORDER
+ | Alter_inplace_info::DROP_COLUMN
+ | Alter_inplace_info::ADD_COLUMN)));
+ }
+ }
+
+ /* See if an AUTO_INCREMENT column was added. */
+ uint i = 0;
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+ while (const Create_field* new_field = cf_it++) {
+ const Field* field;
+
+ DBUG_ASSERT(i < altered_table->s->fields);
+
+ for (uint old_i = 0; table->field[old_i]; old_i++) {
+ if (new_field->field == table->field[old_i]) {
+ goto found_col;
+ }
+ }
+
+ /* This is an added column. */
+ DBUG_ASSERT(!new_field->field);
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_COLUMN);
+
+ field = altered_table->field[i];
+
+ DBUG_ASSERT((MTYP_TYPENR(field->unireg_check)
+ == Field::NEXT_NUMBER)
+ == !!(field->flags & AUTO_INCREMENT_FLAG));
+
+ if (field->flags & AUTO_INCREMENT_FLAG) {
+ if (add_autoinc_col_no != ULINT_UNDEFINED) {
+ /* This should have been blocked earlier. */
+ ut_ad(0);
+ my_error(ER_WRONG_AUTO_KEY, MYF(0));
+ goto err_exit;
+ }
+ add_autoinc_col_no = i;
+
+ autoinc_col_max_value = innobase_get_int_col_max_value(
+ field);
+ }
+found_col:
+ i++;
+ }
+
+ DBUG_ASSERT(heap);
+ DBUG_ASSERT(user_thd == prebuilt->trx->mysql_thd);
+ DBUG_ASSERT(!ha_alter_info->handler_ctx);
+
+ ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx(
+ prebuilt,
+ drop_index, n_drop_index,
+ drop_fk, n_drop_fk, add_fk, n_add_fk,
+ ha_alter_info->online,
+ heap, prebuilt->table, col_names,
+ add_autoinc_col_no,
+ ha_alter_info->create_info->auto_increment_value,
+ autoinc_col_max_value);
+
+ DBUG_RETURN(prepare_inplace_alter_table_dict(
+ ha_alter_info, altered_table, table,
+ table_share->table_name.str,
+ flags, flags2,
+ fts_doc_col_no, add_fts_doc_id,
+ add_fts_doc_id_idx));
+}
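+
+/* For orientation (informal summary, not upstream text): the
+server drives an in-place ALTER TABLE through
+check_if_supported_inplace_alter(), prepare_inplace_alter_table(),
+inplace_alter_table() and commit_inplace_alter_table(); the
+ha_innobase_inplace_ctx allocated above is carried between these
+calls in ha_alter_info->handler_ctx. */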
+
+/** Alter the table structure in-place with operations
+specified using Alter_inplace_info.
+The level of concurrency allowed during this operation depends
+on the return value from check_if_supported_inplace_alter().
+
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+
+@retval true Failure
+@retval false Success
+*/
+UNIV_INTERN
+bool
+ha_innobase::inplace_alter_table(
+/*=============================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ dberr_t error;
+
+ DBUG_ENTER("inplace_alter_table");
+ DBUG_ASSERT(!srv_read_only_mode);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+ ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ DEBUG_SYNC(user_thd, "innodb_inplace_alter_table_enter");
+
+ if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)) {
+ok_exit:
+ DEBUG_SYNC(user_thd, "innodb_after_inplace_alter_table");
+ DBUG_RETURN(false);
+ }
+
+ if (ha_alter_info->handler_flags
+ == Alter_inplace_info::CHANGE_CREATE_OPTION
+ && !innobase_need_rebuild(ha_alter_info)) {
+ goto ok_exit;
+ }
+
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ASSERT(ctx);
+ DBUG_ASSERT(ctx->trx);
+ DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+ if (prebuilt->table->ibd_file_missing
+ || dict_table_is_discarded(prebuilt->table)) {
+ goto all_done;
+ }
+
+ /* Read the clustered index of the table and build
+ indexes based on this information using temporary
+ files and merge sort. */
+ DBUG_EXECUTE_IF("innodb_OOM_inplace_alter",
+ error = DB_OUT_OF_MEMORY; goto oom;);
+ error = row_merge_build_indexes(
+ prebuilt->trx,
+ prebuilt->table, ctx->new_table,
+ ctx->online,
+ ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index,
+ altered_table, ctx->add_cols, ctx->col_map,
+ ctx->add_autoinc, ctx->sequence);
+#ifndef DBUG_OFF
+oom:
+#endif /* !DBUG_OFF */
+ if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) {
+ DEBUG_SYNC_C("row_log_table_apply1_before");
+ error = row_log_table_apply(
+ ctx->thr, prebuilt->table, altered_table);
+ }
+
+ DEBUG_SYNC_C("inplace_after_index_build");
+
+ DBUG_EXECUTE_IF("create_index_fail",
+ error = DB_DUPLICATE_KEY;
+ prebuilt->trx->error_key_num = ULINT_UNDEFINED;);
+
+ /* After an error, remove from the dictionary all the
+ index definitions that were created. */
+
+ switch (error) {
+ KEY* dup_key;
+ all_done:
+ case DB_SUCCESS:
+ ut_d(mutex_enter(&dict_sys->mutex));
+ ut_d(dict_table_check_for_dup_indexes(
+ prebuilt->table, CHECK_PARTIAL_OK));
+ ut_d(mutex_exit(&dict_sys->mutex));
+ /* prebuilt->table->n_ref_count can be anything here,
+ given that we hold at most a shared lock on the table. */
+ goto ok_exit;
+ case DB_DUPLICATE_KEY:
+ if (prebuilt->trx->error_key_num == ULINT_UNDEFINED
+ || ha_alter_info->key_count == 0) {
+ /* This should be the hidden index on
+ FTS_DOC_ID, or there is no PRIMARY KEY in the
+ table. Either way, we should be seeing and
+ reporting a bogus duplicate key error. */
+ dup_key = NULL;
+ } else {
+ DBUG_ASSERT(prebuilt->trx->error_key_num
+ < ha_alter_info->key_count);
+ dup_key = &ha_alter_info->key_info_buffer[
+ prebuilt->trx->error_key_num];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ break;
+ case DB_ONLINE_LOG_TOO_BIG:
+ DBUG_ASSERT(ctx->online);
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+ ? FTS_DOC_ID_INDEX_NAME
+ : ha_alter_info->key_info_buffer[
+ prebuilt->trx->error_key_num].name);
+ break;
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ (prebuilt->trx->error_key_num == ULINT_UNDEFINED)
+ ? FTS_DOC_ID_INDEX_NAME
+ : ha_alter_info->key_info_buffer[
+ prebuilt->trx->error_key_num].name);
+ break;
+ default:
+ my_error_innodb(error,
+ table_share->table_name.str,
+ prebuilt->table->flags);
+ }
+
+ /* prebuilt->table->n_ref_count can be anything here, given
+ that we hold at most a shared lock on the table. */
+ prebuilt->trx->error_info = NULL;
+ ctx->trx->error_state = DB_SUCCESS;
+
+ DBUG_RETURN(true);
+}
+
+/** Free the modification log for online table rebuild.
+@param table table that was being rebuilt online */
+static
+void
+innobase_online_rebuild_log_free(
+/*=============================*/
+ dict_table_t* table)
+{
+ dict_index_t* clust_index = dict_table_get_first_index(table);
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_x_lock(&clust_index->lock);
+
+ if (clust_index->online_log) {
+ ut_ad(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_CREATION);
+ clust_index->online_status = ONLINE_INDEX_COMPLETE;
+ row_log_free(clust_index->online_log);
+ DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted");
+ }
+
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+ rw_lock_x_unlock(&clust_index->lock);
+}
+
+/** Roll back secondary index creation: drop the indexes whose
+names carry the temporary index prefix.
+@param user_table InnoDB table
+@param table the TABLE
+@param locked TRUE=table locked, FALSE=may need to do a lazy drop
+@param trx the transaction
+*/
+static __attribute__((nonnull))
+void
+innobase_rollback_sec_index(
+/*========================*/
+ dict_table_t* user_table,
+ const TABLE* table,
+ ibool locked,
+ trx_t* trx)
+{
+ row_merge_drop_indexes(trx, user_table, locked);
+
+ /* Free the table->fts only if there is no FTS_DOC_ID
+ in the table */
+ if (user_table->fts
+ && !DICT_TF2_FLAG_IS_SET(user_table,
+ DICT_TF2_FTS_HAS_DOC_ID)
+ && !innobase_fulltext_exist(table)) {
+ fts_free(user_table);
+ }
+}
+
+/** Roll back the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the storage engine. Note that the
+allowed level of concurrency during this operation will be the same as
+for inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g concurrent writes were blocked
+during prepare, but might not be during commit).
+
+@param ha_alter_info Data used during in-place alter.
+@param table the TABLE
+@param prebuilt the prebuilt struct
+@retval true Failure
+@retval false Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+rollback_inplace_alter_table(
+/*=========================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ row_prebuilt_t* prebuilt)
+{
+ bool fail = false;
+
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ENTER("rollback_inplace_alter_table");
+
+ if (!ctx || !ctx->trx) {
+ /* If we have not started a transaction yet,
+ (almost) nothing has been or needs to be done. */
+ goto func_exit;
+ }
+
+ row_mysql_lock_data_dictionary(ctx->trx);
+
+ if (ctx->need_rebuild()) {
+ dberr_t err;
+ ulint flags = ctx->new_table->flags;
+
+ /* DML threads can access ctx->new_table via the
+ online rebuild log. Free it first. */
+ innobase_online_rebuild_log_free(prebuilt->table);
+
+ /* Since the FTS index specific auxiliary tables have not
+ yet been registered with "table->fts" by fts_add_index(),
+ we must delete them explicitly here. */
+ if (DICT_TF2_FLAG_IS_SET(ctx->new_table, DICT_TF2_FTS)) {
+
+ err = innobase_drop_fts_index_table(
+ ctx->new_table, ctx->trx);
+
+ if (err != DB_SUCCESS) {
+ my_error_innodb(
+ err, table->s->table_name.str,
+ flags);
+ fail = true;
+ }
+ }
+
+ /* Drop the table. */
+ dict_table_close(ctx->new_table, TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+ /* Nobody should have initialized the stats of the
+ newly created table yet. When this is the case, we
+ know that it has not been added for background stats
+ gathering. */
+ ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+ err = row_merge_drop_table(ctx->trx, ctx->new_table);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ default:
+ my_error_innodb(err, table->s->table_name.str,
+ flags);
+ fail = true;
+ }
+ } else {
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & Alter_inplace_info::ADD_PK_INDEX));
+ DBUG_ASSERT(ctx->new_table == prebuilt->table);
+
+ trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+
+ innobase_rollback_sec_index(
+ prebuilt->table, table, FALSE, ctx->trx);
+ }
+
+ trx_commit_for_mysql(ctx->trx);
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ trx_free_for_mysql(ctx->trx);
+
+func_exit:
+#ifndef DBUG_OFF
+ dict_index_t* clust_index = dict_table_get_first_index(
+ prebuilt->table);
+ DBUG_ASSERT(!clust_index->online_log);
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+#endif /* !DBUG_OFF */
+
+ if (ctx) {
+ DBUG_ASSERT(ctx->prebuilt == prebuilt);
+
+ if (ctx->num_to_add_fk) {
+ for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_free(ctx->add_fk[i]);
+ }
+ }
+
+ if (ctx->num_to_drop_index) {
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ /* Clear the to_be_dropped flags
+ in the data dictionary cache.
+ The flags may already have been cleared,
+ in case an error was detected in
+ commit_inplace_alter_table(). */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+
+ index->to_be_dropped = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
+ }
+
+ trx_commit_for_mysql(prebuilt->trx);
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(fail);
+}
+
+/** Drop a FOREIGN KEY constraint from the data dictionary tables.
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@param foreign_id Foreign key constraint identifier
+@retval true Failure
+@retval false Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_drop_foreign_try(
+/*======================*/
+ trx_t* trx,
+ const char* table_name,
+ const char* foreign_id)
+{
+ DBUG_ENTER("innobase_drop_foreign_try");
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Drop the constraint from the data dictionary. */
+ static const char sql[] =
+ "PROCEDURE DROP_FOREIGN_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n"
+ "END;\n";
+
+ dberr_t error;
+ pars_info_t* info;
+
+ info = pars_info_create();
+ pars_info_add_str_literal(info, "id", foreign_id);
+
+ trx->op_info = "dropping foreign key constraint from dictionary";
+ error = que_eval_sql(info, sql, FALSE, trx);
+ trx->op_info = "";
+
+ DBUG_EXECUTE_IF("ib_drop_foreign_error",
+ error = DB_OUT_OF_FILE_SPACE;);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ DBUG_RETURN(true);
+ }
+
+ DBUG_RETURN(false);
+}
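+
+/* Context for the procedure above (informal): SYS_FOREIGN stores
+one row per FOREIGN KEY constraint and SYS_FOREIGN_COLS one row
+per constrained column, both keyed by the constraint ID, so the
+single :id parameter removes the constraint together with all of
+its column entries in one dictionary transaction. */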
+
+/** Rename a column in the data dictionary tables.
+@param user_table InnoDB table that was being altered
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@param nth_col 0-based index of the column
+@param from old column name
+@param to new column name
+@param new_clustered whether the table has been rebuilt
+@retval true Failure
+@retval false Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_rename_column_try(
+/*=======================*/
+ const dict_table_t* user_table,
+ trx_t* trx,
+ const char* table_name,
+ ulint nth_col,
+ const char* from,
+ const char* to,
+ bool new_clustered)
+{
+ pars_info_t* info;
+ dberr_t error;
+
+ DBUG_ENTER("innobase_rename_column_try");
+
+ DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(mutex_own(&dict_sys->mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (new_clustered) {
+ goto rename_foreign;
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "tableid", user_table->id);
+ pars_info_add_int4_literal(info, "nth", nth_col);
+ pars_info_add_str_literal(info, "old", from);
+ pars_info_add_str_literal(info, "new", to);
+
+ trx->op_info = "renaming column in SYS_COLUMNS";
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_COLUMNS_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_COLUMNS SET NAME=:new\n"
+ "WHERE TABLE_ID=:tableid AND NAME=:old\n"
+ "AND POS=:nth;\n"
+ "END;\n",
+ FALSE, trx);
+
+ DBUG_EXECUTE_IF("ib_rename_column_error",
+ error = DB_OUT_OF_FILE_SPACE;);
+
+ if (error != DB_SUCCESS) {
+err_exit:
+ my_error_innodb(error, table_name, 0);
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+ DBUG_RETURN(true);
+ }
+
+ trx->op_info = "renaming column in SYS_FIELDS";
+
+ for (const dict_index_t* index = dict_table_get_first_index(
+ user_table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ for (ulint i = 0; i < dict_index_get_n_fields(index); i++) {
+ if (strcmp(dict_index_get_nth_field(index, i)->name,
+ from)) {
+ continue;
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "indexid", index->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "old", from);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n"
+ "BEGIN\n"
+
+ "UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+ "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n"
+ "AND POS=:nth;\n"
+
+ /* Try again, in case there is a prefix_len
+ encoded in SYS_FIELDS.POS */
+
+ "UPDATE SYS_FIELDS SET COL_NAME=:new\n"
+ "WHERE INDEX_ID=:indexid AND COL_NAME=:old\n"
+ "AND POS>=65536*:nth AND POS<65536*(:nth+1);\n"
+
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+ }
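+
+ /* The second UPDATE above exists because, for index fields
+ with a column prefix, SYS_FIELDS.POS encodes
+ 65536 * position + prefix_len; e.g. (informally) a field at
+ position 1 with a 20-byte prefix is stored as
+ POS = 65536 * 1 + 20 = 65556, inside the matched range
+ [65536, 131072). */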
+
+rename_foreign:
+ trx->op_info = "renaming column in SYS_FOREIGN_COLS";
+
+ std::list<dict_foreign_t*> fk_evict;
+ bool foreign_modified;
+
+ for (dict_foreign_set::iterator it = user_table->foreign_set.begin();
+ it != user_table->foreign_set.end();
+ ++it) {
+
+ dict_foreign_t* foreign = *it;
+ foreign_modified = false;
+
+ for (unsigned i = 0; i < foreign->n_fields; i++) {
+ if (strcmp(foreign->foreign_col_names[i], from)) {
+ continue;
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "old", from);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FOREIGN_COLS\n"
+ "SET FOR_COL_NAME=:new\n"
+ "WHERE ID=:id AND POS=:nth\n"
+ "AND FOR_COL_NAME=:old;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ foreign_modified = true;
+ }
+
+ if (foreign_modified) {
+ fk_evict.push_back(foreign);
+ }
+ }
+
+ for (dict_foreign_set::iterator it
+ = user_table->referenced_set.begin();
+ it != user_table->referenced_set.end();
+ ++it) {
+
+ foreign_modified = false;
+ dict_foreign_t* foreign = *it;
+
+ for (unsigned i = 0; i < foreign->n_fields; i++) {
+ if (strcmp(foreign->referenced_col_names[i], from)) {
+ continue;
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+ pars_info_add_int4_literal(info, "nth", i);
+ pars_info_add_str_literal(info, "old", from);
+ pars_info_add_str_literal(info, "new", to);
+
+ error = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_FOREIGN_COLS\n"
+ "SET REF_COL_NAME=:new\n"
+ "WHERE ID=:id AND POS=:nth\n"
+ "AND REF_COL_NAME=:old;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+ foreign_modified = true;
+ }
+
+ if (foreign_modified) {
+ fk_evict.push_back(foreign);
+ }
+ }
+
+ if (new_clustered) {
+ std::for_each(fk_evict.begin(), fk_evict.end(),
+ dict_foreign_remove_from_cache);
+ }
+
+ trx->op_info = "";
+ DBUG_RETURN(false);
+}
+
+/** Rename columns in the data dictionary tables.
+@param ha_alter_info Data used during in-place alter.
+@param ctx In-place ALTER TABLE context
+@param table the TABLE
+@param trx data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success */
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_rename_columns_try(
+/*========================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* table,
+ trx_t* trx,
+ const char* table_name)
+{
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+ uint i = 0;
+
+ DBUG_ASSERT(ctx);
+ DBUG_ASSERT(ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME);
+
+ for (Field** fp = table->field; *fp; fp++, i++) {
+ if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+ continue;
+ }
+
+ cf_it.rewind();
+ while (Create_field* cf = cf_it++) {
+ if (cf->field == *fp) {
+ if (innobase_rename_column_try(
+ ctx->old_table, trx, table_name, i,
+ cf->field->field_name,
+ cf->field_name,
+ ctx->need_rebuild())) {
+ return(true);
+ }
+ goto processed_field;
+ }
+ }
+
+ ut_error;
+processed_field:
+ continue;
+ }
+
+ return(false);
+}
+
+/** Rename columns in the data dictionary cache
+as part of commit_cache_norebuild().
+@param ha_alter_info Data used during in-place alter.
+@param table the TABLE
+@param user_table InnoDB table that was being altered */
+static __attribute__((nonnull))
+void
+innobase_rename_columns_cache(
+/*==========================*/
+ Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ dict_table_t* user_table)
+{
+ if (!(ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME)) {
+ return;
+ }
+
+ List_iterator_fast<Create_field> cf_it(
+ ha_alter_info->alter_info->create_list);
+ uint i = 0;
+
+ for (Field** fp = table->field; *fp; fp++, i++) {
+ if (!((*fp)->flags & FIELD_IS_RENAMED)) {
+ continue;
+ }
+
+ cf_it.rewind();
+ while (Create_field* cf = cf_it++) {
+ if (cf->field == *fp) {
+ dict_mem_table_col_rename(user_table, i,
+ cf->field->field_name,
+ cf->field_name);
+ goto processed_field;
+ }
+ }
+
+ ut_error;
+processed_field:
+ continue;
+ }
+}
+
+/** Get the auto-increment value of the table on commit.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@return the next auto-increment value (0 if not present) */
+static __attribute__((nonnull, warn_unused_result))
+ulonglong
+commit_get_autoinc(
+/*===============*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* altered_table,
+ const TABLE* old_table)
+{
+ ulonglong max_autoinc;
+
+ DBUG_ENTER("commit_get_autoinc");
+
+ if (!altered_table->found_next_number_field) {
+ /* There is no AUTO_INCREMENT column in the table
+ after the ALTER operation. */
+ max_autoinc = 0;
+ } else if (ctx->add_autoinc != ULINT_UNDEFINED) {
+ /* An AUTO_INCREMENT column was added. Get the last
+ value from the sequence, which may be based on a
+ supplied AUTO_INCREMENT value. */
+ max_autoinc = ctx->sequence.last();
+ } else if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::CHANGE_CREATE_OPTION)
+ && (ha_alter_info->create_info->used_fields
+ & HA_CREATE_USED_AUTO)) {
+ /* An AUTO_INCREMENT value was supplied, but the table was not
+ rebuilt. Get the user-supplied value or the last value from the
+ sequence. */
+ ib_uint64_t max_value_table;
+ dberr_t err;
+
+ Field* autoinc_field =
+ old_table->found_next_number_field;
+
+ dict_index_t* index = dict_table_get_index_on_first_col(
+ ctx->old_table, autoinc_field->field_index);
+
+ max_autoinc = ha_alter_info->create_info->auto_increment_value;
+
+ dict_table_autoinc_lock(ctx->old_table);
+
+ err = row_search_max_autoinc(
+ index, autoinc_field->field_name, &max_value_table);
+
+ if (err != DB_SUCCESS) {
+ ut_ad(0);
+ max_autoinc = 0;
+ } else if (max_autoinc <= max_value_table) {
+ ulonglong col_max_value;
+ ulonglong offset;
+
+ col_max_value = innobase_get_int_col_max_value(
+ old_table->found_next_number_field);
+
+ offset = ctx->prebuilt->autoinc_offset;
+ max_autoinc = innobase_next_autoinc(
+ max_value_table, 1, 1, offset,
+ col_max_value);
+ }
+ dict_table_autoinc_unlock(ctx->old_table);
+ } else {
+ /* An AUTO_INCREMENT value was not specified.
+ Read the old counter value from the table. */
+ ut_ad(old_table->found_next_number_field);
+ dict_table_autoinc_lock(ctx->old_table);
+ max_autoinc = ctx->old_table->autoinc;
+ dict_table_autoinc_unlock(ctx->old_table);
+ }
+
+ DBUG_RETURN(max_autoinc);
+}
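+
+/* Informal example for the CHANGE_CREATE_OPTION branch above: if
+ALTER TABLE supplied AUTO_INCREMENT=100 while the column already
+contains a maximum value of 150, the supplied value is discarded
+and innobase_next_autoinc() computes the next value from 150
+(151 with increment 1 and offset 0), clamped to the column
+maximum. */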
+
+/** Add or drop foreign key constraints in the data dictionary tables,
+but do not touch the data dictionary cache.
+@param ctx In-place ALTER TABLE context
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+static __attribute__((nonnull, warn_unused_result))
+bool
+innobase_update_foreign_try(
+/*========================*/
+ ha_innobase_inplace_ctx*ctx,
+ trx_t* trx,
+ const char* table_name)
+{
+ ulint foreign_id;
+ ulint i;
+
+ DBUG_ENTER("innobase_update_foreign_try");
+ DBUG_ASSERT(ctx);
+
+ foreign_id = dict_table_get_highest_foreign_id(ctx->new_table);
+
+ foreign_id++;
+
+ for (i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_t* fk = ctx->add_fk[i];
+
+ ut_ad(fk->foreign_table == ctx->new_table
+ || fk->foreign_table == ctx->old_table);
+
+ dberr_t error = dict_create_add_foreign_id(
+ &foreign_id, ctx->old_table->name, fk);
+
+ if (error != DB_SUCCESS) {
+ my_error(ER_TOO_LONG_IDENT, MYF(0),
+ fk->id);
+ DBUG_RETURN(true);
+ }
+
+ if (!fk->foreign_index) {
+ fk->foreign_index = dict_foreign_find_index(
+ ctx->new_table, ctx->col_names,
+ fk->foreign_col_names,
+ fk->n_fields, fk->referenced_index, TRUE,
+ fk->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+ if (!fk->foreign_index) {
+ my_error(ER_FK_INCORRECT_OPTION,
+ MYF(0), table_name, fk->id);
+ DBUG_RETURN(true);
+ }
+ }
+
+ /* The fk->foreign_col_names[] uses renamed column
+ names, while the columns in ctx->old_table have not
+ been renamed yet. */
+ error = dict_create_add_foreign_to_dictionary(
+ ctx->old_table->name, fk, trx);
+
+ DBUG_EXECUTE_IF(
+ "innodb_test_cannot_add_fk_system",
+ error = DB_ERROR;);
+
+ if (error != DB_SUCCESS) {
+ my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0),
+ fk->id);
+ DBUG_RETURN(true);
+ }
+ }
+
+ for (i = 0; i < ctx->num_to_drop_fk; i++) {
+ dict_foreign_t* fk = ctx->drop_fk[i];
+
+ DBUG_ASSERT(fk->foreign_table == ctx->old_table);
+
+ if (innobase_drop_foreign_try(trx, table_name, fk->id)) {
+ DBUG_RETURN(true);
+ }
+ }
+
+ DBUG_RETURN(false);
+}
+
+/** Update the foreign key constraint definitions in the data dictionary cache
+after the changes to data dictionary tables were committed.
+@param ctx In-place ALTER TABLE context
+@param user_thd MySQL connection
+@return InnoDB error code (should always be DB_SUCCESS) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+innobase_update_foreign_cache(
+/*==========================*/
+ ha_innobase_inplace_ctx* ctx,
+ THD* user_thd)
+{
+ dict_table_t* user_table;
+ dberr_t err = DB_SUCCESS;
+
+ DBUG_ENTER("innobase_update_foreign_cache");
+
+ user_table = ctx->old_table;
+
+ /* Discard the added foreign keys, because we will
+ load them from the data dictionary. */
+ for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
+ dict_foreign_t* fk = ctx->add_fk[i];
+ dict_foreign_free(fk);
+ }
+
+ if (ctx->need_rebuild()) {
+ /* The rebuilt table is already using the renamed
+ column names. No need to pass col_names or to drop
+ constraints from the data dictionary cache. */
+ DBUG_ASSERT(!ctx->col_names);
+ DBUG_ASSERT(user_table->foreign_set.empty());
+ DBUG_ASSERT(user_table->referenced_set.empty());
+ user_table = ctx->new_table;
+ } else {
+ /* Drop the foreign key constraints if the
+ table was not rebuilt. If the table was rebuilt,
+ there would not be any foreign key constraints for
+ it yet in the data dictionary cache. */
+ for (ulint i = 0; i < ctx->num_to_drop_fk; i++) {
+ dict_foreign_t* fk = ctx->drop_fk[i];
+ dict_foreign_remove_from_cache(fk);
+ }
+ }
+
+ /* Load the old or added foreign keys from the data dictionary
+ and prevent the table from being evicted from the data
+ dictionary cache (work around the lack of WL#6049). */
+ err = dict_load_foreigns(user_table->name,
+ ctx->col_names, false, true,
+ DICT_ERR_IGNORE_NONE);
+
+ if (err == DB_CANNOT_ADD_CONSTRAINT) {
+ /* It is possible that some existing foreign keys were
+ created with foreign_key_checks switched off, so retry
+ the load with the charset check disabled. */
+ err = dict_load_foreigns(user_table->name,
+ ctx->col_names, false, false,
+ DICT_ERR_IGNORE_NONE);
+
+ /* If the load succeeds with the charset check off, warn
+ the user that the foreign keys were loaded despite the
+ mismatched charset. */
+ if (err == DB_SUCCESS) {
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Foreign key constraints for table '%s'"
+ " are loaded with charset check off",
+ user_table->name);
+
+ }
+ }
+
+ DBUG_RETURN(err);
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when rebuilding the table.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param old_table MySQL table as it is before the ALTER operation
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_try_rebuild(
+/*===============*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ TABLE* altered_table,
+ const TABLE* old_table,
+ trx_t* trx,
+ const char* table_name)
+{
+ dict_table_t* rebuilt_table = ctx->new_table;
+ dict_table_t* user_table = ctx->old_table;
+
+ DBUG_ENTER("commit_try_rebuild");
+ DBUG_ASSERT(ctx->need_rebuild());
+ DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & Alter_inplace_info::DROP_FOREIGN_KEY)
+ || ctx->num_to_drop_fk > 0);
+ DBUG_ASSERT(ctx->num_to_drop_fk
+ == ha_alter_info->alter_info->drop_list.elements);
+
+ for (dict_index_t* index = dict_table_get_first_index(rebuilt_table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+ if (dict_index_is_corrupted(index)) {
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ index->name);
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (innobase_update_foreign_try(ctx, trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ dberr_t error;
+
+ /* Clear the to_be_dropped flag in the data dictionary cache
+ of user_table. */
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->table == user_table);
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+ DBUG_ASSERT(index->to_be_dropped);
+ index->to_be_dropped = 0;
+ }
+
+ /* We copied the table. Any indexes that were requested to be
+ dropped were not created in the copy of the table. Apply any
+ last bit of the rebuild log and then rename the tables. */
+
+ if (ctx->online) {
+ DEBUG_SYNC_C("row_log_table_apply2_before");
+ error = row_log_table_apply(
+ ctx->thr, user_table, altered_table);
+ ulint err_key = thr_get_trx(ctx->thr)->error_key_num;
+
+ switch (error) {
+ KEY* dup_key;
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ if (err_key == ULINT_UNDEFINED) {
+ /* This should be the hidden index on
+ FTS_DOC_ID. */
+ dup_key = NULL;
+ } else {
+ DBUG_ASSERT(err_key <
+ ha_alter_info->key_count);
+ dup_key = &ha_alter_info
+ ->key_info_buffer[err_key];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ DBUG_RETURN(true);
+ case DB_ONLINE_LOG_TOO_BIG:
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ ha_alter_info->key_info_buffer[0].name);
+ DBUG_RETURN(true);
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ (err_key == ULINT_UNDEFINED)
+ ? FTS_DOC_ID_INDEX_NAME
+ : ha_alter_info->key_info_buffer[err_key]
+ .name);
+ DBUG_RETURN(true);
+ default:
+ my_error_innodb(error, table_name, user_table->flags);
+ DBUG_RETURN(true);
+ }
+ }
+
+ if ((ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME)
+ && innobase_rename_columns_try(ha_alter_info, ctx, old_table,
+ trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", DBUG_SUICIDE(););
+
+ /* The new table must inherit the flag from the
+ "parent" table. */
+ if (dict_table_is_discarded(user_table)) {
+ rebuilt_table->ibd_file_missing = true;
+ rebuilt_table->flags2 |= DICT_TF2_DISCARDED;
+ }
+
+ /* We can now rename the old table as a temporary table,
+ rename the new temporary table as the old table and drop the
+ old table. First, we only do this in the data dictionary
+ tables. The actual renaming will be performed in
+ commit_cache_rebuild(), once the data dictionary transaction
+ has been successfully committed. */
+
+ error = row_merge_rename_tables_dict(
+ user_table, rebuilt_table, ctx->tmp_name, trx);
+
+ /* We must be still holding a table handle. */
+ DBUG_ASSERT(user_table->n_ref_count >= 1);
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", DBUG_SUICIDE(););
+ DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;);
+
+ if (user_table->n_ref_count > 1) {
+ /* This should only occur when an innodb_memcached
+ connection with innodb_api_enable_mdl=off was started
+ before commit_inplace_alter_table() locked the data
+ dictionary. We must roll back the ALTER TABLE, because
+ we cannot drop a table while it is being used. */
+
+ /* Normally, n_ref_count must be 1, because purge
+ cannot be executing on this very table as we are
+ holding dict_operation_lock X-latch. */
+
+ error = DB_LOCK_WAIT_TIMEOUT;
+ }
+
+ switch (error) {
+ case DB_SUCCESS:
+ DBUG_RETURN(false);
+ case DB_TABLESPACE_EXISTS:
+ ut_a(rebuilt_table->n_ref_count == 1);
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name);
+ DBUG_RETURN(true);
+ case DB_DUPLICATE_KEY:
+ ut_a(rebuilt_table->n_ref_count == 1);
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name);
+ DBUG_RETURN(true);
+ default:
+ my_error_innodb(error, table_name, user_table->flags);
+ DBUG_RETURN(true);
+ }
+}
+
+/** Apply the changes made during commit_try_rebuild()
+to the data dictionary cache and the file system.
+@param ctx In-place ALTER TABLE context */
+inline __attribute__((nonnull))
+void
+commit_cache_rebuild(
+/*=================*/
+ ha_innobase_inplace_ctx* ctx)
+{
+ dberr_t error;
+
+ DBUG_ENTER("commit_cache_rebuild");
+ DBUG_ASSERT(ctx->need_rebuild());
+ DBUG_ASSERT(dict_table_is_discarded(ctx->old_table)
+ == dict_table_is_discarded(ctx->new_table));
+
+ const char* old_name = mem_heap_strdup(
+ ctx->heap, ctx->old_table->name);
+
+ /* We already committed and redo logged the renames,
+ so this must succeed. */
+ error = dict_table_rename_in_cache(
+ ctx->old_table, ctx->tmp_name, FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ error = dict_table_rename_in_cache(
+ ctx->new_table, old_name, FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ DBUG_VOID_RETURN;
+}
+
+/** Commit the changes made during prepare_inplace_alter_table()
+and inplace_alter_table() inside the data dictionary tables,
+when not rebuilding the table.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param old_table MySQL table as it is before the ALTER operation
+@param trx Data dictionary transaction
+@param table_name Table name in MySQL
+@retval true Failure
+@retval false Success
+*/
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_try_norebuild(
+/*=================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* old_table,
+ trx_t* trx,
+ const char* table_name)
+{
+ DBUG_ENTER("commit_try_norebuild");
+ DBUG_ASSERT(!ctx->need_rebuild());
+ DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(!(ha_alter_info->handler_flags
+ & Alter_inplace_info::DROP_FOREIGN_KEY)
+ || ctx->num_to_drop_fk > 0);
+ DBUG_ASSERT(ctx->num_to_drop_fk
+ == ha_alter_info->alter_info->drop_list.elements);
+
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX);
+ if (dict_index_is_corrupted(index)) {
+ /* Report a duplicate key
+ error for the index that was
+ flagged corrupted, most likely
+ because a duplicate value was
+ inserted (directly or by
+ rollback) after
+ ha_innobase::inplace_alter_table()
+ completed.
+ TODO: report this as a corruption
+ with a detailed reason once
+ WL#6379 has been implemented. */
+ my_error(ER_DUP_UNKNOWN_IN_INDEX,
+ MYF(0), index->name + 1);
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (innobase_update_foreign_try(ctx, trx, table_name)) {
+ DBUG_RETURN(true);
+ }
+
+ dberr_t error;
+
+ /* We altered the table in place. */
+ /* Lose the TEMP_INDEX_PREFIX. */
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(*index->name
+ == TEMP_INDEX_PREFIX);
+ error = row_merge_rename_index_to_add(
+ trx, ctx->new_table->id, index->id);
+ if (error != DB_SUCCESS) {
+ sql_print_error(
+ "InnoDB: rename index to add: %lu\n",
+ (ulong) error);
+ DBUG_ASSERT(0);
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "rename index to add");
+ DBUG_RETURN(true);
+ }
+ }
+
+ /* Drop any indexes that were requested to be dropped.
+ Rename them to TEMP_INDEX_PREFIX in the data
+ dictionary first. We do not bother to rename
+ index->name in the dictionary cache, because the index
+ is about to be freed after row_merge_drop_indexes_dict(). */
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+ DBUG_ASSERT(index->table == ctx->new_table);
+ DBUG_ASSERT(index->to_be_dropped);
+
+ error = row_merge_rename_index_to_drop(
+ trx, index->table->id, index->id);
+ if (error != DB_SUCCESS) {
+ sql_print_error(
+ "InnoDB: rename index to drop: %lu\n",
+ (ulong) error);
+ DBUG_ASSERT(0);
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "rename index to drop");
+ DBUG_RETURN(true);
+ }
+ }
+
+ if (!(ha_alter_info->handler_flags
+ & Alter_inplace_info::ALTER_COLUMN_NAME)) {
+ DBUG_RETURN(false);
+ }
+
+ DBUG_RETURN(innobase_rename_columns_try(ha_alter_info, ctx,
+ old_table, trx, table_name));
+}
+
+/** Commit the changes to the data dictionary cache
+after a successful commit_try_norebuild() call.
+@param ctx In-place ALTER TABLE context
+@param table the TABLE before the ALTER
+@param trx Data dictionary transaction object
+(will be started and committed)
+@return whether all replacements were found for dropped indexes */
+inline __attribute__((nonnull, warn_unused_result))
+bool
+commit_cache_norebuild(
+/*===================*/
+ ha_innobase_inplace_ctx*ctx,
+ const TABLE* table,
+ trx_t* trx)
+{
+ DBUG_ENTER("commit_cache_norebuild");
+
+ bool found = true;
+
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(dict_index_get_online_status(index)
+ == ONLINE_INDEX_COMPLETE);
+ DBUG_ASSERT(*index->name == TEMP_INDEX_PREFIX);
+ index->name++;
+ }
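+
+ /* The index->name++ above steps past the one-byte
+ TEMP_INDEX_PREFIX marker, so the cached name now matches the
+ prefix-free name that commit_try_norebuild() wrote to the
+ data dictionary via row_merge_rename_index_to_add(). */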
+
+ if (ctx->num_to_drop_index) {
+ /* Really drop the indexes that were dropped.
+ The transaction had to be committed first
+ (after renaming the indexes), so that in the
+ event of a crash, crash recovery will drop the
+ indexes, because it drops all indexes whose
+ names start with TEMP_INDEX_PREFIX. Once we
+ have started dropping an index tree, there is
+ no way to roll it back. */
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+ DBUG_ASSERT(index->table == ctx->new_table);
+ DBUG_ASSERT(index->to_be_dropped);
+
+ /* Replace the indexes in foreign key
+ constraints if needed. */
+
+ if (!dict_foreign_replace_index(
+ index->table, ctx->col_names, index)) {
+ found = false;
+ }
+
+ /* Mark the index dropped
+ in the data dictionary cache. */
+ rw_lock_x_lock(dict_index_get_lock(index));
+ index->page = FIL_NULL;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ row_merge_drop_indexes_dict(trx, ctx->new_table->id);
+
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(*index->name != TEMP_INDEX_PREFIX);
+ DBUG_ASSERT(index->table == ctx->new_table);
+
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS
+ || (index->type
+ & DICT_CORRUPT));
+ DBUG_ASSERT(index->table->fts);
+ fts_drop_index(index->table, index, trx);
+ }
+
+ dict_index_remove_from_cache(index->table, index);
+ }
+
+ trx_commit_for_mysql(trx);
+ }
+
+ DBUG_RETURN(found);
+}
+
+/** Adjust the persistent statistics after non-rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param ha_alter_info Data used during in-place alter
+@param ctx In-place ALTER TABLE context
+@param altered_table MySQL table that is being altered
+@param table_name Table name in MySQL
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_norebuild(
+/*==================*/
+ Alter_inplace_info* ha_alter_info,
+ ha_innobase_inplace_ctx* ctx,
+ TABLE* altered_table,
+ const char* table_name,
+ THD* thd)
+{
+ ulint i;
+
+ DBUG_ENTER("alter_stats_norebuild");
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ if (!dict_stats_is_persistent_enabled(ctx->new_table)) {
+ DBUG_VOID_RETURN;
+ }
+
+ /* TODO: This will not drop the (unused) statistics for
+ FTS_DOC_ID_INDEX if it was a hidden index, dropped together
+ with the last remaining FULLTEXT index. */
+ for (i = 0; i < ha_alter_info->index_drop_count; i++) {
+ const KEY* key = ha_alter_info->index_drop_buffer[i];
+
+ if (key->flags & HA_FULLTEXT) {
+ /* There are no index cardinality
+ statistics for FULLTEXT indexes. */
+ continue;
+ }
+
+ char errstr[1024];
+
+ if (dict_stats_drop_index(
+ ctx->new_table->name, key->name,
+ errstr, sizeof errstr) != DB_SUCCESS) {
+ push_warning(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_LOCK_WAIT_TIMEOUT, errstr);
+ }
+ }
+
+ for (i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+ DBUG_ASSERT(index->table == ctx->new_table);
+
+ if (!(index->type & DICT_FTS)) {
+ dict_stats_init(ctx->new_table);
+ dict_stats_update_for_index(index);
+ }
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/** Adjust the persistent statistics after rebuilding ALTER TABLE.
+Remove statistics for dropped indexes, add statistics for created indexes
+and rename statistics for renamed indexes.
+@param table InnoDB table that was rebuilt by ALTER TABLE
+@param table_name Table name in MySQL
+@param thd MySQL connection
+*/
+static
+void
+alter_stats_rebuild(
+/*================*/
+ dict_table_t* table,
+ const char* table_name,
+ THD* thd)
+{
+ DBUG_ENTER("alter_stats_rebuild");
+
+ if (dict_table_is_discarded(table)
+ || !dict_stats_is_persistent_enabled(table)) {
+ DBUG_VOID_RETURN;
+ }
+
+ dberr_t ret;
+
+ ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
+
+ if (ret != DB_SUCCESS) {
+ push_warning_printf(
+ thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Error updating stats for table '%s' "
+ "after table rebuild: %s",
+ table_name, ut_strerr(ret));
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+#ifndef DBUG_OFF
+# define DBUG_INJECT_CRASH(prefix, count) \
+do { \
+ char buf[32]; \
+ ut_snprintf(buf, sizeof buf, prefix "_%u", count); \
+ DBUG_EXECUTE_IF(buf, DBUG_SUICIDE();); \
+} while (0)
+#else
+# define DBUG_INJECT_CRASH(prefix, count)
+#endif
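+
+/* The macro above expands its counter into dbug keywords such as
+"ib_commit_inplace_crash_1", "ib_commit_inplace_crash_2", and so
+on. In a debug build a test could enable one of them (for
+example, depending on the dbug interface of the build, with
+SET debug_dbug='+d,ib_commit_inplace_crash_1') to trigger
+DBUG_SUICIDE() at that exact point of the commit sequence. */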
+
+/** Commit or rollback the changes made during
+prepare_inplace_alter_table() and inplace_alter_table() inside
+the storage engine. Note that the allowed level of concurrency
+during this operation will be the same as for
+inplace_alter_table() and thus might be higher than during
+prepare_inplace_alter_table(). (E.g. concurrent writes were
+blocked during prepare, but might not be during commit.)
+@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@param commit true => Commit, false => Rollback.
+@retval true Failure
+@retval false Success
+*/
+UNIV_INTERN
+bool
+ha_innobase::commit_inplace_alter_table(
+/*====================================*/
+ TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info,
+ bool commit)
+{
+ ha_innobase_inplace_ctx* ctx0
+ = static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+#ifndef DBUG_OFF
+ uint crash_inject_count = 1;
+ uint crash_fail_inject_count = 1;
+ uint failure_inject_count = 1;
+#endif
+
+ DBUG_ENTER("commit_inplace_alter_table");
+ DBUG_ASSERT(!srv_read_only_mode);
+ DBUG_ASSERT(!ctx0 || ctx0->prebuilt == prebuilt);
+ DBUG_ASSERT(!ctx0 || ctx0->old_table == prebuilt->table);
+
+ DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter");
+
+ DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait");
+
+ if (!commit) {
+ /* A rollback is being requested. So far we may at
+ most have created some indexes. If any indexes were to
+ be dropped, they would actually be dropped in this
+ method if commit=true. */
+ DBUG_RETURN(rollback_inplace_alter_table(
+ ha_alter_info, table, prebuilt));
+ }
+
+ if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
+ DBUG_ASSERT(!ctx0);
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ ha_alter_info->group_commit_ctx = NULL;
+ DBUG_RETURN(false);
+ }
+
+ DBUG_ASSERT(ctx0);
+
+ inplace_alter_handler_ctx** ctx_array;
+ inplace_alter_handler_ctx* ctx_single[2];
+
+ if (ha_alter_info->group_commit_ctx) {
+ ctx_array = ha_alter_info->group_commit_ctx;
+ } else {
+ ctx_single[0] = ctx0;
+ ctx_single[1] = NULL;
+ ctx_array = ctx_single;
+ }
+
+ DBUG_ASSERT(ctx0 == ctx_array[0]);
+ ut_ad(prebuilt->table == ctx0->old_table);
+ ha_alter_info->group_commit_ctx = NULL;
+
+ /* Free the ctx->trx of other partitions, if any. We will only
+ use the ctx0->trx here. Others may have been allocated in
+ the prepare stage. */
+
+ for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx;
+ pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ if (ctx->trx) {
+ trx_free_for_mysql(ctx->trx);
+ ctx->trx = NULL;
+ }
+ }
+
+ trx_start_if_not_started_xa(prebuilt->trx);
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ DBUG_ASSERT(ctx->prebuilt->trx == prebuilt->trx);
+
+ /* Exclusively lock the table, to ensure that no other
+ transaction is holding locks on the table while we
+ change the table definition. The MySQL meta-data lock
+ should normally guarantee that no conflicting locks
+ exist. However, FOREIGN KEY constraints checks and any
+ transactions collected during crash recovery could be
+ holding InnoDB locks only, not MySQL locks. */
+
+ dberr_t error = row_merge_lock_table(
+ prebuilt->trx, ctx->old_table, LOCK_X);
+
+ if (error != DB_SUCCESS) {
+ my_error_innodb(
+ error, table_share->table_name.str, 0);
+ DBUG_RETURN(true);
+ }
+ }
+
+ DEBUG_SYNC(user_thd, "innodb_alter_commit_after_lock_table");
+
+ const bool new_clustered = ctx0->need_rebuild();
+ trx_t* trx = ctx0->trx;
+ bool fail = false;
+
+ if (new_clustered) {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ DBUG_ASSERT(ctx->need_rebuild());
+
+ if (ctx->old_table->fts) {
+ ut_ad(!ctx->old_table->fts->add_wq);
+ fts_optimize_remove_table(
+ ctx->old_table);
+ }
+
+ if (ctx->new_table->fts) {
+ ut_ad(!ctx->new_table->fts->add_wq);
+ fts_optimize_remove_table(
+ ctx->new_table);
+ }
+ }
+ }
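+
+	/* Both the old and the new copy of every rebuilt table have
+	now been detached from the FTS optimize worker, so that no
+	background optimization touches them while the data
+	dictionary is changed and the tablespaces are renamed
+	below. */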
+
+ if (!trx) {
+ DBUG_ASSERT(!new_clustered);
+ trx = innobase_trx_allocate(user_thd);
+ }
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during the data dictionary operation. */
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Prevent the background statistics collection from accessing
+ the tables. */
+ for (;;) {
+ bool retry = false;
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+ if (new_clustered
+ && !dict_stats_stop_bg(ctx->old_table)) {
+ retry = true;
+ }
+
+ if (!dict_stats_stop_bg(ctx->new_table)) {
+ retry = true;
+ }
+ }
+
+ if (!retry) {
+ break;
+ }
+
+ DICT_STATS_BG_YIELD(trx);
+ }
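+
+	/* dict_stats_stop_bg() fails if the background thread is
+	currently processing the table; DICT_STATS_BG_YIELD() then
+	gives that thread a chance to finish, and the loop retries
+	until every table involved has been detached from background
+	statistics collection. */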
+
+ /* Apply the changes to the data dictionary tables, for all
+ partitions. */
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx && !fail; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+
+ ctx->max_autoinc = commit_get_autoinc(
+ ha_alter_info, ctx, altered_table, table);
+
+ if (ctx->need_rebuild()) {
+ ctx->tmp_name = dict_mem_create_temporary_tablename(
+ ctx->heap, ctx->new_table->name,
+ ctx->new_table->id);
+
+ fail = commit_try_rebuild(
+ ha_alter_info, ctx, altered_table, table,
+ trx, table_share->table_name.str);
+ } else {
+ fail = commit_try_norebuild(
+ ha_alter_info, ctx, table, trx,
+ table_share->table_name.str);
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+#ifndef DBUG_OFF
+ {
+ /* Generate a dynamic dbug text. */
+ char buf[32];
+ ut_snprintf(buf, sizeof buf, "ib_commit_inplace_fail_%u",
+ failure_inject_count++);
+ DBUG_EXECUTE_IF(buf,
+ my_error(ER_INTERNAL_ERROR, MYF(0),
+ "Injected error!");
+ fail = true;
+ );
+ }
+#endif
+ }
+
+ /* Commit or roll back the changes to the data dictionary. */
+
+ if (fail) {
+ trx_rollback_for_mysql(trx);
+ } else if (!new_clustered) {
+ trx_commit_for_mysql(trx);
+ } else {
+ mtr_t mtr;
+ mtr_start(&mtr);
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(ctx->need_rebuild());
+ /* Generate the redo log for the file
+ operations that will be performed in
+ commit_cache_rebuild(). */
+ fil_mtr_rename_log(ctx->old_table->space,
+ ctx->old_table->name,
+ ctx->new_table->space,
+ ctx->new_table->name,
+ ctx->tmp_name, &mtr);
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+
+ /* Test what happens on crash if the redo logs
+ are flushed to disk here. The log records
+ about the rename should not be committed, and
+ the data dictionary transaction should be
+ rolled back, restoring the old table. */
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(!trx->fts_trx);
+ ut_ad(trx->insert_undo || trx->update_undo);
+
+ /* The following call commits the
+ mini-transaction, making the data dictionary
+ transaction committed at mtr.end_lsn. The
+ transaction becomes 'durable' by the time when
+ log_buffer_flush_to_disk() returns. In the
+ logical sense the commit in the file-based
+ data structures happens here. */
+ trx_commit_low(trx, &mtr);
+
+ /* If server crashes here, the dictionary in
+ InnoDB and MySQL will differ. The .ibd files
+ and the .frm files must be swapped manually by
+ the administrator. No loss of data. */
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
+ log_buffer_flush_to_disk();
+ DBUG_SUICIDE(););
+ }
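+
+	/* To summarize the rebuild case: the tablespace renames
+	logged via fil_mtr_rename_log() and the data dictionary
+	changes become durable atomically with the mini-transaction
+	commit in trx_commit_low(). A crash strictly before that
+	point rolls the whole ALTER back; a crash after it leaves
+	the renames committed, with only cleanup left to redo. */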
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ /* At this point, the changes to the persistent storage have
+ been committed or rolled back. What remains to be done is to
+ update the in-memory structures, close some handles, release
+ temporary files, and (unless we rolled back) update persistent
+ statistics. */
+ dberr_t error = DB_SUCCESS;
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ if (new_clustered) {
+ innobase_online_rebuild_log_free(ctx->old_table);
+ }
+
+ if (fail) {
+ if (new_clustered) {
+ dict_table_close(ctx->new_table,
+ TRUE, FALSE);
+
+#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
+ /* Nobody should have initialized the
+ stats of the newly created table
+ yet. When this is the case, we know
+ that it has not been added for
+ background stats gathering. */
+ ut_a(!ctx->new_table->stat_initialized);
+#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+ row_merge_drop_table(trx, ctx->new_table);
+ trx_commit_for_mysql(trx);
+ ctx->new_table = NULL;
+ } else {
+ /* We failed, but did not rebuild the table.
+ Roll back any ADD INDEX, or get rid of garbage
+ ADD INDEX that was left over from a previous
+ ALTER TABLE statement. */
+ trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ innobase_rollback_sec_index(
+ ctx->new_table, table, TRUE, trx);
+ trx_commit_for_mysql(trx);
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+ crash_fail_inject_count++);
+
+ continue;
+ }
+
+ innobase_copy_frm_flags_from_table_share(
+ ctx->new_table, altered_table->s);
+
+ if (new_clustered) {
+ /* We will reload and refresh the
+ in-memory foreign key constraint
+ metadata. This is a rename operation
+ in preparing for dropping the old
+ table. Set the table to_be_dropped bit
+ here, so to make sure DML foreign key
+ constraint check does not use the
+ stale dict_foreign_t. This is done
+ because WL#6049 (FK MDL) has not been
+ implemented yet. */
+ ctx->old_table->to_be_dropped = true;
+
+ /* Rename the tablespace files. */
+ commit_cache_rebuild(ctx);
+
+ error = innobase_update_foreign_cache(ctx, user_thd);
+ if (error != DB_SUCCESS) {
+ goto foreign_fail;
+ }
+ } else {
+ error = innobase_update_foreign_cache(ctx, user_thd);
+
+ if (error != DB_SUCCESS) {
+foreign_fail:
+ /* The data dictionary cache
+ should be corrupted now. The
+ best solution should be to
+ kill and restart the server,
+ but the *.frm file has not
+ been replaced yet. */
+ my_error(ER_CANNOT_ADD_FOREIGN,
+ MYF(0));
+ sql_print_error(
+ "InnoDB: dict_load_foreigns()"
+ " returned %u for %s",
+ (unsigned) error,
+ thd_query_string(user_thd)
+ ->str);
+ ut_ad(0);
+ } else {
+ if (!commit_cache_norebuild(
+ ctx, table, trx)) {
+ ut_a(!prebuilt->trx->check_foreigns);
+ }
+
+ innobase_rename_columns_cache(
+ ha_alter_info, table,
+ ctx->new_table);
+ }
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+
+ /* Invalidate the index translation table. In partitioned
+ tables, there is one TABLE_SHARE (and also only one TABLE)
+ covering all partitions. */
+ share->idx_trans_tbl.index_count = 0;
+
+ if (trx == ctx0->trx) {
+ ctx0->trx = NULL;
+ }
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ if (fail) {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->old_table,
+ CHECK_ABORTED_OK));
+ ut_a(fts_check_cached_index(ctx->old_table));
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
+ crash_fail_inject_count++);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_mysql(trx);
+ DBUG_RETURN(true);
+ }
+
+ /* Release the table locks. */
+ trx_commit_for_mysql(prebuilt->trx);
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", DBUG_SUICIDE(););
+
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
+
+ if (altered_table->found_next_number_field) {
+ dict_table_t* t = ctx->new_table;
+
+ dict_table_autoinc_lock(t);
+ dict_table_autoinc_initialize(t, ctx->max_autoinc);
+ dict_table_autoinc_unlock(t);
+ }
+
+ bool add_fts = false;
+
+ /* Publish the created fulltext index, if any.
+ Note that a fulltext index can be created without
+ creating the clustered index, if there already exists
+ a suitable FTS_DOC_ID column. If not, one will be
+ created, implying new_clustered */
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t* index = ctx->add_index[i];
+
+ if (index->type & DICT_FTS) {
+ DBUG_ASSERT(index->type == DICT_FTS);
+ /* We reset DICT_TF2_FTS here because the bit
+			is left unset when a drop precedes the add. */
+ DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
+ fts_add_index(index, ctx->new_table);
+ add_fts = true;
+ }
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ALL_COMPLETE));
+
+ if (add_fts) {
+ fts_optimize_add_table(ctx->new_table);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(
+ ctx->new_table, CHECK_ABORTED_OK));
+ ut_a(fts_check_cached_index(ctx->new_table));
+
+ if (new_clustered) {
+ /* Since the table has been rebuilt, we remove
+ all persistent statistics corresponding to the
+ old copy of the table (which was renamed to
+ ctx->tmp_name). */
+
+ char errstr[1024];
+
+ DBUG_ASSERT(0 == strcmp(ctx->old_table->name,
+ ctx->tmp_name));
+
+ if (dict_stats_drop_table(
+ ctx->new_table->name,
+ errstr, sizeof(errstr))
+ != DB_SUCCESS) {
+ push_warning_printf(
+ user_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_ALTER_INFO,
+ "Deleting persistent statistics"
+ " for rebuilt table '%s' in"
+ " InnoDB failed: %s",
+ table->s->table_name.str,
+ errstr);
+ }
+
+ DBUG_EXECUTE_IF("ib_ddl_crash_before_commit",
+ DBUG_SUICIDE(););
+
+ trx_t* const user_trx = prebuilt->trx;
+
+ row_prebuilt_free(ctx->prebuilt, TRUE);
+
+ /* Drop the copy of the old table, which was
+ renamed to ctx->tmp_name at the atomic DDL
+ transaction commit. If the system crashes
+ before this is completed, some orphan tables
+ with ctx->tmp_name may be recovered. */
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+ row_merge_drop_table(trx, ctx->old_table);
+ trx_commit_for_mysql(trx);
+
+ /* Rebuild the prebuilt object. */
+ ctx->prebuilt = row_create_prebuilt(
+ ctx->new_table, altered_table->s->reclength);
+ trx_start_if_not_started(user_trx);
+ user_trx->will_lock++;
+ prebuilt->trx = user_trx;
+ }
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_mysql(trx);
+
+ /* TODO: The following code could be executed
+ while allowing concurrent access to the table
+ (MDL downgrade). */
+
+ if (new_clustered) {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(ctx->need_rebuild());
+
+ alter_stats_rebuild(
+ ctx->new_table, table->s->table_name.str,
+ user_thd);
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+ } else {
+ for (inplace_alter_handler_ctx** pctx = ctx_array;
+ *pctx; pctx++) {
+ ha_innobase_inplace_ctx* ctx
+ = static_cast<ha_innobase_inplace_ctx*>
+ (*pctx);
+ DBUG_ASSERT(!ctx->need_rebuild());
+
+ alter_stats_norebuild(
+ ha_alter_info, ctx, altered_table,
+ table->s->table_name.str, user_thd);
+ DBUG_INJECT_CRASH("ib_commit_inplace_crash",
+ crash_inject_count++);
+ }
+ }
+
+ /* TODO: Also perform DROP TABLE and DROP INDEX after
+ the MDL downgrade. */
+
+#ifndef DBUG_OFF
+ dict_index_t* clust_index = dict_table_get_first_index(
+ prebuilt->table);
+ DBUG_ASSERT(!clust_index->online_log);
+ DBUG_ASSERT(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+
+ for (dict_index_t* index = dict_table_get_first_index(
+ prebuilt->table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ DBUG_ASSERT(!index->to_be_dropped);
+ }
+#endif /* DBUG_OFF */
+
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(false);
+}
+
+/**
+Constructor.
+@param thd - the session
+@param start_value - the lower bound
+@param max_value - the upper bound (inclusive) */
+UNIV_INTERN
+ib_sequence_t::ib_sequence_t(
+ THD* thd,
+ ulonglong start_value,
+ ulonglong max_value)
+ :
+ m_max_value(max_value),
+ m_increment(0),
+ m_offset(0),
+ m_next_value(start_value),
+ m_eof(false)
+{
+ if (thd != 0 && m_max_value > 0) {
+
+ thd_get_autoinc(thd, &m_offset, &m_increment);
+
+ if (m_increment > 1 || m_offset > 1) {
+
+ /* If there is an offset or increment specified
+ then we need to work out the exact next value. */
+
+ m_next_value = innobase_next_autoinc(
+ start_value, 1,
+ m_increment, m_offset, m_max_value);
+
+ } else if (start_value == 0) {
+ /* The next value can never be 0. */
+ m_next_value = 1;
+ }
+ } else {
+ m_eof = true;
+ }
+}
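+
+/* Worked example (for illustration only, assuming the usual semantics
+of the auto_increment_offset and auto_increment_increment session
+variables): with m_offset = 3 and m_increment = 5 the sequence consists
+of 3, 8, 13, ...; a start_value of 6 would thus be rounded up by
+innobase_next_autoinc() to 8, the next member of that arithmetic
+progression. */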
+
+/**
+Postfix increment
+@return the next value to insert */
+UNIV_INTERN
+ulonglong
+ib_sequence_t::operator++(int) UNIV_NOTHROW
+{
+ ulonglong current = m_next_value;
+
+ ut_ad(!m_eof);
+ ut_ad(m_max_value > 0);
+
+ m_next_value = innobase_next_autoinc(
+ current, 1, m_increment, m_offset, m_max_value);
+
+ if (m_next_value == m_max_value && current == m_next_value) {
+ m_eof = true;
+ }
+
+ return(current);
+}
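+
+/* Sketch of the intended usage (hypothetical caller code; eof() stands
+for whatever accessor exposes m_eof):
+
+	ib_sequence_t	seq(user_thd, 1, max_value);
+
+	while (!seq.eof()) {
+		ulonglong	v = seq++;
+		... assign v to the next row ...
+	}
+
+The caller must check for eof between increments, because operator++
+asserts !m_eof. */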
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
new file mode 100644
index 00000000000..cfe69274e8e
--- /dev/null
+++ b/storage/innobase/handler/i_s.cc
@@ -0,0 +1,8183 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.cc
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#include <mysqld_error.h>
+#include <sql_acl.h>
+
+#include <m_ctype.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "i_s.h"
+#include <sql_plugin.h>
+#include <mysql/innodb_priv.h>
+
+#include "btr0pcur.h"
+#include "btr0types.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "ibuf0ibuf.h"
+#include "dict0mem.h"
+#include "dict0types.h"
+#include "ha_prototypes.h"
+#include "srv0start.h"
+#include "trx0i_s.h"
+#include "trx0trx.h"
+#include "srv0mon.h"
+#include "fut0fut.h"
+#include "pars0pars.h"
+#include "fts0types.h"
+#include "fts0opt.h"
+#include "fts0priv.h"
+#include "btr0btr.h"
+#include "page0zip.h"
+
+/** A structure that associates a name string with a file page type
+and/or buffer page state. */
+struct buf_page_desc_t{
+	const char*	type_str;	/*!< String explaining the page
+					type/state */
+ ulint type_value; /*!< Page type or page state */
+};
+
+/** Change buffer B-tree page */
+#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 1)
+
+/** Any page type value greater than I_S_PAGE_TYPE_IBUF is treated as
+unknown. */
+
+/** We also define I_S_PAGE_TYPE_INDEX as the Index Page's position
+in the i_s_page_type[] array */
+#define I_S_PAGE_TYPE_INDEX 1
+
+/** Name string for File Page Types */
+static buf_page_desc_t i_s_page_type[] = {
+ {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED},
+ {"INDEX", FIL_PAGE_INDEX},
+ {"UNDO_LOG", FIL_PAGE_UNDO_LOG},
+ {"INODE", FIL_PAGE_INODE},
+ {"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST},
+ {"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP},
+ {"SYSTEM", FIL_PAGE_TYPE_SYS},
+ {"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS},
+ {"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR},
+ {"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES},
+ {"BLOB", FIL_PAGE_TYPE_BLOB},
+ {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB},
+ {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2},
+ {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF},
+ {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN}
+};
+
+/* Check that every page type value fits in a 4-bit field */
+#if I_S_PAGE_TYPE_UNKNOWN >= 1<<4
+# error "i_s_page_type[] is too large"
+#endif
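+
+/* The check above acts as a compile-time assertion: page_type in
+buf_page_info_t below is a 4-bit bitfield, so every value stored in
+i_s_page_type[], including the synthetic IBUF and UNKNOWN entries,
+has to fit in 4 bits. */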
+
+/** This structure defines information we will fetch from pages
+currently cached in the buffer pool. It will be used to populate
+table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */
+struct buf_page_info_t{
+ ulint block_id; /*!< Buffer Pool block ID */
+ unsigned space_id:32; /*!< Tablespace ID */
+ unsigned page_num:32; /*!< Page number/offset */
+ unsigned access_time:32; /*!< Time of first access */
+ unsigned pool_id:MAX_BUFFER_POOLS_BITS;
+ /*!< Buffer Pool ID. Must be less than
+ MAX_BUFFER_POOLS */
+ unsigned flush_type:2; /*!< Flush type */
+ unsigned io_fix:2; /*!< type of pending I/O operation */
+	unsigned	fix_count:19;	/*!< Count of how many times this
+					block is buffer-fixed */
+ unsigned hashed:1; /*!< Whether hash index has been
+ built on this page */
+ unsigned is_old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool->LRU_old */
+ unsigned freed_page_clock:31; /*!< the value of
+ buf_pool->freed_page_clock */
+ unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< Compressed page size */
+ unsigned page_state:BUF_PAGE_STATE_BITS; /*!< Page state */
+ unsigned page_type:4; /*!< Page type */
+ unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
+ /*!< Number of records on Page */
+ unsigned data_size:UNIV_PAGE_SIZE_SHIFT_MAX;
+ /*!< Sum of the sizes of the records */
+ lsn_t newest_mod; /*!< Log sequence number of
+ the youngest modification */
+ lsn_t oldest_mod; /*!< Log sequence number of
+ the oldest modification */
+	index_id_t	index_id;	/*!< Index ID if an index page */
+};
+
+/** Maximum number of buffer page info entries we would cache. */
+#define MAX_BUF_INFO_CACHED 10000
+
+#define OK(expr) \
+ if ((expr) != 0) { \
+ DBUG_RETURN(1); \
+ }
+
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \
+do { \
+ if (!srv_was_started) { \
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \
+ ER_CANT_FIND_SYSTEM_REC, \
+ "InnoDB: SELECTing from " \
+ "INFORMATION_SCHEMA.%s but " \
+ "the InnoDB storage engine " \
+ "is not installed", plugin_name); \
+ DBUG_RETURN(0); \
+ } \
+} while (0)
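+
+/* Note that this macro expects a THD* named thd to be in scope and the
+enclosing function to return int through DBUG_RETURN(). It deliberately
+returns 0 (success) together with a warning, so that a SELECT from the
+table merely comes back empty when InnoDB has not been started. */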
+
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && \
+ !defined __INTEL_COMPILER && !defined __clang__
+#define STRUCT_FLD(name, value) name: value
+#else
+#define STRUCT_FLD(name, value) value
+#endif
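+
+/* For illustration: under GCC the macro uses the pre-C99 designated
+initializer extension, so STRUCT_FLD(field_name, "trx_id") expands to
+field_name: "trx_id"; on other compilers it degrades to the plain
+positional initializer "trx_id", which is why the initializers below
+must always appear in declaration order. */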
+
+/* Don't use a static const variable here, as some C++ compilers (notably
+HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
+#define END_OF_ST_FIELD_INFO \
+ {STRUCT_FLD(field_name, NULL), \
+ STRUCT_FLD(field_length, 0), \
+ STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \
+ STRUCT_FLD(value, 0), \
+ STRUCT_FLD(field_flags, 0), \
+ STRUCT_FLD(old_name, ""), \
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}
+
+/*
+Use the following types mapping:
+
+C type ST_FIELD_INFO::field_type
+---------------------------------
+long MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS)
+
+long unsigned MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+char* MYSQL_TYPE_STRING
+(field_length=n)
+
+float MYSQL_TYPE_FLOAT
+(field_length=0 is ignored)
+
+void* MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+boolean (if else) MYSQL_TYPE_LONG
+(field_length=1)
+
+time_t MYSQL_TYPE_DATETIME
+(field_length=0 ignored)
+---------------------------------
+*/
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ); /*!< in: condition (not used) */
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+ void* p); /*!< in/out: table schema object */
+/*******************************************************************//**
+Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME
+field.
+@return 0 on success */
+static
+int
+field_store_time_t(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ time_t time) /*!< in: value to store */
+{
+ MYSQL_TIME my_time;
+ struct tm tm_time;
+
+ if (time) {
+#if 0
+ /* use this if you are sure that `variables' and `time_zone'
+ are always initialized */
+ thd->variables.time_zone->gmt_sec_to_TIME(
+ &my_time, (my_time_t) time);
+#else
+ localtime_r(&time, &tm_time);
+ localtime_to_TIME(&my_time, &tm_time);
+ my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+#endif
+ } else {
+ memset(&my_time, 0, sizeof(my_time));
+ }
+
+ return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME));
+}
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+static
+int
+field_store_string(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ const char* str) /*!< in: NUL-terminated utf-8 string,
+ or NULL */
+{
+ int ret;
+
+ if (str != NULL) {
+
+ ret = field->store(str, static_cast<uint>(strlen(str)),
+ system_charset_info);
+ field->set_notnull();
+ } else {
+
+ ret = 0; /* success */
+ field->set_null();
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Store the name of an index in a MYSQL_TYPE_VARCHAR field.
+Handles the names of incomplete secondary indexes.
+@return 0 on success */
+static
+int
+field_store_index_name(
+/*===================*/
+ Field* field, /*!< in/out: target field for
+ storage */
+ const char* index_name) /*!< in: NUL-terminated utf-8
+ index name, possibly starting with
+ TEMP_INDEX_PREFIX */
+{
+ int ret;
+
+ ut_ad(index_name != NULL);
+ ut_ad(field->real_type() == MYSQL_TYPE_VARCHAR);
+
+	/* Since TEMP_INDEX_PREFIX is not valid UTF-8, we need to convert
+ it to something else. */
+ if (index_name[0] == TEMP_INDEX_PREFIX) {
+ char buf[NAME_LEN + 1];
+ buf[0] = '?';
+ memcpy(buf + 1, index_name + 1, strlen(index_name));
+ ret = field->store(
+ buf, static_cast<uint>(strlen(buf)),
+ system_charset_info);
+ } else {
+ ret = field->store(
+ index_name, static_cast<uint>(strlen(index_name)),
+ system_charset_info);
+ }
+
+ field->set_notnull();
+
+ return(ret);
+}
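+
+/* The effect is that an incomplete secondary index whose name starts
+with the (non-UTF-8) TEMP_INDEX_PREFIX byte is reported with a leading
+"?" instead, e.g. as "?idx" for a hypothetical index "idx". */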
+
+/*******************************************************************//**
+Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
+If the value is ULINT_UNDEFINED then the field is set to NULL.
+@return 0 on success */
+static
+int
+field_store_ulint(
+/*==============*/
+ Field* field, /*!< in/out: target field for storage */
+ ulint n) /*!< in: value to store */
+{
+ int ret;
+
+ if (n != ULINT_UNDEFINED) {
+
+ ret = field->store(static_cast<double>(n));
+ field->set_notnull();
+ } else {
+
+ ret = 0; /* success */
+ field->set_null();
+ }
+
+ return(ret);
+}
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */
+static ST_FIELD_INFO innodb_trx_fields_info[] =
+{
+#define IDX_TRX_ID 0
+ {STRUCT_FLD(field_name, "trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STATE 1
+ {STRUCT_FLD(field_name, "trx_state"),
+ STRUCT_FLD(field_length, TRX_QUE_STATE_STR_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STARTED 2
+ {STRUCT_FLD(field_name, "trx_started"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_REQUESTED_LOCK_ID 3
+ {STRUCT_FLD(field_name, "trx_requested_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WAIT_STARTED 4
+ {STRUCT_FLD(field_name, "trx_wait_started"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WEIGHT 5
+ {STRUCT_FLD(field_name, "trx_weight"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_MYSQL_THREAD_ID 6
+ {STRUCT_FLD(field_name, "trx_mysql_thread_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_QUERY 7
+ {STRUCT_FLD(field_name, "trx_query"),
+ STRUCT_FLD(field_length, TRX_I_S_TRX_QUERY_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_OPERATION_STATE 8
+ {STRUCT_FLD(field_name, "trx_operation_state"),
+ STRUCT_FLD(field_length, TRX_I_S_TRX_OP_STATE_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_TABLES_IN_USE 9
+ {STRUCT_FLD(field_name, "trx_tables_in_use"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_TABLES_LOCKED 10
+ {STRUCT_FLD(field_name, "trx_tables_locked"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_LOCK_STRUCTS 11
+ {STRUCT_FLD(field_name, "trx_lock_structs"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_LOCK_MEMORY_BYTES 12
+ {STRUCT_FLD(field_name, "trx_lock_memory_bytes"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ROWS_LOCKED 13
+ {STRUCT_FLD(field_name, "trx_rows_locked"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ROWS_MODIFIED 14
+ {STRUCT_FLD(field_name, "trx_rows_modified"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_CONNCURRENCY_TICKETS 15
+ {STRUCT_FLD(field_name, "trx_concurrency_tickets"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ISOLATION_LEVEL 16
+ {STRUCT_FLD(field_name, "trx_isolation_level"),
+ STRUCT_FLD(field_length, TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_UNIQUE_CHECKS 17
+ {STRUCT_FLD(field_name, "trx_unique_checks"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 1),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_FOREIGN_KEY_CHECKS 18
+ {STRUCT_FLD(field_name, "trx_foreign_key_checks"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 1),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_LAST_FOREIGN_KEY_ERROR 19
+ {STRUCT_FLD(field_name, "trx_last_foreign_key_error"),
+ STRUCT_FLD(field_length, TRX_I_S_TRX_FK_ERROR_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ADAPTIVE_HASH_LATCHED 20
+ {STRUCT_FLD(field_name, "trx_adaptive_hash_latched"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_ADAPTIVE_HASH_TIMEOUT 21
+ {STRUCT_FLD(field_name, "trx_adaptive_hash_timeout"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_READ_ONLY 22
+ {STRUCT_FLD(field_name, "trx_is_read_only"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 23
+ {STRUCT_FLD(field_name, "trx_autocommit_non_locking"),
+ STRUCT_FLD(field_length, 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_trx_from_cache(
+/*=======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_trx_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_TRX);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_trx_row_t* row;
+ char trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_trx_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_TRX, i);
+
+ /* trx_id */
+ ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, row->trx_id);
+ OK(field_store_string(fields[IDX_TRX_ID], trx_id));
+
+ /* trx_state */
+ OK(field_store_string(fields[IDX_TRX_STATE],
+ row->trx_state));
+
+ /* trx_started */
+ OK(field_store_time_t(fields[IDX_TRX_STARTED],
+ (time_t) row->trx_started));
+
+ /* trx_requested_lock_id */
+ /* trx_wait_started */
+ if (row->trx_wait_started != 0) {
+
+ OK(field_store_string(
+ fields[IDX_TRX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ lock_id, sizeof(lock_id))));
+			/* field_store_string() sets it to notnull */
+
+ OK(field_store_time_t(
+ fields[IDX_TRX_WAIT_STARTED],
+ (time_t) row->trx_wait_started));
+ fields[IDX_TRX_WAIT_STARTED]->set_notnull();
+ } else {
+
+ fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null();
+ fields[IDX_TRX_WAIT_STARTED]->set_null();
+ }
+
+ /* trx_weight */
+ OK(fields[IDX_TRX_WEIGHT]->store((longlong) row->trx_weight,
+ true));
+
+ /* trx_mysql_thread_id */
+ OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store(
+ static_cast<double>(row->trx_mysql_thread_id)));
+
+ /* trx_query */
+ if (row->trx_query) {
+ /* store will do appropriate character set
+ conversion check */
+ fields[IDX_TRX_QUERY]->store(
+ row->trx_query,
+ static_cast<uint>(strlen(row->trx_query)),
+ row->trx_query_cs);
+ fields[IDX_TRX_QUERY]->set_notnull();
+ } else {
+ fields[IDX_TRX_QUERY]->set_null();
+ }
+
+ /* trx_operation_state */
+ OK(field_store_string(fields[IDX_TRX_OPERATION_STATE],
+ row->trx_operation_state));
+
+ /* trx_tables_in_use */
+ OK(fields[IDX_TRX_TABLES_IN_USE]->store(
+ (longlong) row->trx_tables_in_use, true));
+
+ /* trx_tables_locked */
+ OK(fields[IDX_TRX_TABLES_LOCKED]->store(
+ (longlong) row->trx_tables_locked, true));
+
+ /* trx_lock_structs */
+ OK(fields[IDX_TRX_LOCK_STRUCTS]->store(
+ (longlong) row->trx_lock_structs, true));
+
+ /* trx_lock_memory_bytes */
+ OK(fields[IDX_TRX_LOCK_MEMORY_BYTES]->store(
+ (longlong) row->trx_lock_memory_bytes, true));
+
+ /* trx_rows_locked */
+ OK(fields[IDX_TRX_ROWS_LOCKED]->store(
+ (longlong) row->trx_rows_locked, true));
+
+ /* trx_rows_modified */
+ OK(fields[IDX_TRX_ROWS_MODIFIED]->store(
+ (longlong) row->trx_rows_modified, true));
+
+ /* trx_concurrency_tickets */
+ OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(
+ (longlong) row->trx_concurrency_tickets, true));
+
+ /* trx_isolation_level */
+ OK(field_store_string(fields[IDX_TRX_ISOLATION_LEVEL],
+ row->trx_isolation_level));
+
+ /* trx_unique_checks */
+ OK(fields[IDX_TRX_UNIQUE_CHECKS]->store(
+ static_cast<double>(row->trx_unique_checks)));
+
+ /* trx_foreign_key_checks */
+ OK(fields[IDX_TRX_FOREIGN_KEY_CHECKS]->store(
+ static_cast<double>(row->trx_foreign_key_checks)));
+
+ /* trx_last_foreign_key_error */
+ OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR],
+ row->trx_foreign_key_error));
+
+ /* trx_adaptive_hash_latched */
+ OK(fields[IDX_TRX_ADAPTIVE_HASH_LATCHED]->store(
+ static_cast<double>(row->trx_has_search_latch)));
+
+ /* trx_adaptive_hash_timeout */
+ OK(fields[IDX_TRX_ADAPTIVE_HASH_TIMEOUT]->store(
+ (longlong) row->trx_search_latch_timeout, true));
+
+		/* trx_is_read_only */
+ OK(fields[IDX_TRX_READ_ONLY]->store(
+ (longlong) row->trx_is_read_only, true));
+
+ /* trx_is_autocommit_non_locking */
+ OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store(
+ (longlong) row->trx_is_autocommit_non_locking,
+ true));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_trx
+@return 0 on success */
+static
+int
+innodb_trx_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_trx_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_trx_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+static struct st_mysql_information_schema i_s_info =
+{
+ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_trx =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TRX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB transactions"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_trx_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
+static ST_FIELD_INFO innodb_locks_fields_info[] =
+{
+#define IDX_LOCK_ID 0
+ {STRUCT_FLD(field_name, "lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TRX_ID 1
+ {STRUCT_FLD(field_name, "lock_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_MODE 2
+ {STRUCT_FLD(field_name, "lock_mode"),
+ /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */
+ STRUCT_FLD(field_length, 32),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TYPE 3
+ {STRUCT_FLD(field_name, "lock_type"),
+ STRUCT_FLD(field_length, 32 /* RECORD|TABLE|UNKNOWN */),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TABLE 4
+ {STRUCT_FLD(field_name, "lock_table"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_INDEX 5
+ {STRUCT_FLD(field_name, "lock_index"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_SPACE 6
+ {STRUCT_FLD(field_name, "lock_space"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_PAGE 7
+ {STRUCT_FLD(field_name, "lock_page"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_REC 8
+ {STRUCT_FLD(field_name, "lock_rec"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_DATA 9
+ {STRUCT_FLD(field_name, "lock_data"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_DATA_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_locks_from_cache(
+/*=========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: MySQL client connection */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_locks_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCKS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_locks_row_t* row;
+ char buf[MAX_FULL_NAME_LEN + 1];
+ const char* bufend;
+
+ char lock_trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_locks_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCKS, i);
+
+ /* lock_id */
+ trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+ OK(field_store_string(fields[IDX_LOCK_ID],
+ lock_id));
+
+ /* lock_trx_id */
+ ut_snprintf(lock_trx_id, sizeof(lock_trx_id),
+ TRX_ID_FMT, row->lock_trx_id);
+ OK(field_store_string(fields[IDX_LOCK_TRX_ID], lock_trx_id));
+
+ /* lock_mode */
+ OK(field_store_string(fields[IDX_LOCK_MODE],
+ row->lock_mode));
+
+ /* lock_type */
+ OK(field_store_string(fields[IDX_LOCK_TYPE],
+ row->lock_type));
+
+ /* lock_table */
+ bufend = innobase_convert_name(buf, sizeof(buf),
+ row->lock_table,
+ strlen(row->lock_table),
+ thd, TRUE);
+ OK(fields[IDX_LOCK_TABLE]->store(
+ buf, static_cast<uint>(bufend - buf),
+ system_charset_info));
+
+ /* lock_index */
+ if (row->lock_index != NULL) {
+ OK(field_store_index_name(fields[IDX_LOCK_INDEX],
+ row->lock_index));
+ } else {
+ fields[IDX_LOCK_INDEX]->set_null();
+ }
+
+ /* lock_space */
+ OK(field_store_ulint(fields[IDX_LOCK_SPACE],
+ row->lock_space));
+
+ /* lock_page */
+ OK(field_store_ulint(fields[IDX_LOCK_PAGE],
+ row->lock_page));
+
+ /* lock_rec */
+ OK(field_store_ulint(fields[IDX_LOCK_REC],
+ row->lock_rec));
+
+ /* lock_data */
+ OK(field_store_string(fields[IDX_LOCK_DATA],
+ row->lock_data));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_locks
+@return 0 on success */
+static
+int
+innodb_locks_init(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_locks_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_locks_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_locks =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCKS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_locks_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
+static ST_FIELD_INFO innodb_lock_waits_fields_info[] =
+{
+#define IDX_REQUESTING_TRX_ID 0
+ {STRUCT_FLD(field_name, "requesting_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_REQUESTED_LOCK_ID 1
+ {STRUCT_FLD(field_name, "requested_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_TRX_ID 2
+ {STRUCT_FLD(field_name, "blocking_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_LOCK_ID 3
+ {STRUCT_FLD(field_name, "blocking_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the
+INFORMATION_SCHEMA.innodb_lock_waits table with it.
+@return 0 on success */
+static
+int
+fill_innodb_lock_waits_from_cache(
+/*==============================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_lock_waits_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCK_WAITS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_lock_waits_row_t* row;
+
+ char requesting_trx_id[TRX_ID_MAX_LEN + 1];
+ char blocking_trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_lock_waits_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCK_WAITS, i);
+
+ /* requesting_trx_id */
+ ut_snprintf(requesting_trx_id, sizeof(requesting_trx_id),
+ TRX_ID_FMT, row->requested_lock_row->lock_trx_id);
+ OK(field_store_string(fields[IDX_REQUESTING_TRX_ID],
+ requesting_trx_id));
+
+ /* requested_lock_id */
+ OK(field_store_string(
+ fields[IDX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ requested_lock_id,
+ sizeof(requested_lock_id))));
+
+ /* blocking_trx_id */
+ ut_snprintf(blocking_trx_id, sizeof(blocking_trx_id),
+ TRX_ID_FMT, row->blocking_lock_row->lock_trx_id);
+ OK(field_store_string(fields[IDX_BLOCKING_TRX_ID],
+ blocking_trx_id));
+
+ /* blocking_lock_id */
+ OK(field_store_string(
+ fields[IDX_BLOCKING_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->blocking_lock_row,
+ blocking_lock_id,
+ sizeof(blocking_lock_id))));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+innodb_lock_waits_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_lock_waits_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_lock_waits_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_lock_waits =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_lock_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ const char* table_name;
+ int ret;
+ trx_i_s_cache_t* cache;
+
+ DBUG_ENTER("trx_i_s_common_fill_table");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ /* minimize the number of places where global variables are
+ referenced */
+ cache = trx_i_s_cache;
+
+	/* determine which table we have to fill */
+ table_name = tables->schema_table_name;
+ /* or table_name = tables->schema_table->table_name; */
+
+ RETURN_IF_INNODB_NOT_STARTED(table_name);
+
+ /* update the cache */
+ trx_i_s_cache_start_write(cache);
+ trx_i_s_possibly_fetch_data_into_cache(cache);
+ trx_i_s_cache_end_write(cache);
+
+ if (trx_i_s_cache_is_truncated(cache)) {
+
+ /* XXX show warning to user if possible */
+ fprintf(stderr, "Warning: data in %s truncated due to "
+ "memory limit of %d bytes\n", table_name,
+ TRX_I_S_MEM_LIMIT);
+ }
+
+ ret = 0;
+
+ trx_i_s_cache_start_read(cache);
+
+ if (innobase_strcasecmp(table_name, "innodb_trx") == 0) {
+
+ if (fill_innodb_trx_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) {
+
+ if (fill_innodb_locks_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) {
+
+ if (fill_innodb_lock_waits_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else {
+
+ /* huh! what happened!? */
+ fprintf(stderr,
+ "InnoDB: trx_i_s_common_fill_table() was "
+ "called to fill unknown table: %s.\n"
+ "This function only knows how to fill "
+ "innodb_trx, innodb_locks and "
+ "innodb_lock_waits tables.\n", table_name);
+
+ ret = 1;
+ }
+
+ trx_i_s_cache_end_read(cache);
+
+#if 0
+ DBUG_RETURN(ret);
+#else
+	/* if this function returns something other than 0 then a
+ deadlock occurs between the mysqld server and mysql client,
+ see http://bugs.mysql.com/29900 ; when that bug is resolved
+ we can enable the DBUG_RETURN(ret) above */
+ ret++; // silence a gcc46 warning
+ DBUG_RETURN(0);
+#endif
+}
+
+/* Fields of the dynamic table information_schema.innodb_cmp. */
+static ST_FIELD_INFO i_s_cmp_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_size"),
+ STRUCT_FLD(field_length, 5),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Compressed Page Size"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops_ok"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of"
+ " Successful Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Compressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Decompressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Decompressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE*) tables->table;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
+ page_zip_stat_t* zip_stat = &page_zip_stat[i];
+
+ table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
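+
+		/* With the default UNIV_ZIP_SIZE_MIN of 1024, this
+		reports one row per supported compressed page size:
+		1024, 2048, 4096, 8192 and 16384 bytes. */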
+
+ /* The cumulated counts are not protected by any
+ mutex. Thus, some operation in page0zip.cc could
+ increment a counter between the time we read it and
+ clear it. We could introduce mutex protection, but it
+		could cause a measurable performance hit in
+ page0zip.cc. */
+ table->field[1]->store(
+ static_cast<double>(zip_stat->compressed));
+ table->field[2]->store(
+ static_cast<double>(zip_stat->compressed_ok));
+ table->field[3]->store(
+ static_cast<double>(zip_stat->compressed_usec / 1000000));
+ table->field[4]->store(
+ static_cast<double>(zip_stat->decompressed));
+ table->field[5]->store(
+ static_cast<double>(zip_stat->decompressed_usec / 1000000));
+
+ if (reset) {
+ memset(zip_stat, 0, sizeof *zip_stat);
+ }
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
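+
+/* Usage sketch (illustrative): the ratio compress_ops_ok /
+compress_ops indicates how often page compression succeeds at a
+given page size, e.g.
+
+	SELECT page_size, compress_ops, compress_ops_ok
+	FROM information_schema.innodb_cmp;
+*/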
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill(
+/*=========*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_reset_fill(
+/*===============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp.
+@return 0 on success */
+static
+int
+i_s_cmp_init(
+/*=========*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_reset_init(
+/*===============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic tables
+information_schema.innodb_cmp_per_index and
+information_schema.innodb_cmp_per_index_reset. */
+static ST_FIELD_INFO i_s_cmp_per_index_fields_info[] =
+{
+#define IDX_DATABASE_NAME 0
+ {STRUCT_FLD(field_name, "database_name"),
+ STRUCT_FLD(field_length, 192),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TABLE_NAME 1
+ {STRUCT_FLD(field_name, "table_name"),
+ STRUCT_FLD(field_length, 192),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_INDEX_NAME 2
+ {STRUCT_FLD(field_name, "index_name"),
+ STRUCT_FLD(field_length, 192),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_COMPRESS_OPS 3
+ {STRUCT_FLD(field_name, "compress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_COMPRESS_OPS_OK 4
+ {STRUCT_FLD(field_name, "compress_ops_ok"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_COMPRESS_TIME 5
+ {STRUCT_FLD(field_name, "compress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_UNCOMPRESS_OPS 6
+ {STRUCT_FLD(field_name, "uncompress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_UNCOMPRESS_TIME 7
+ {STRUCT_FLD(field_name, "uncompress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table
+information_schema.innodb_cmp_per_index or
+information_schema.innodb_cmp_per_index_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill_low(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = tables->table;
+ Field** fields = table->field;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_per_index_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* Create a snapshot of the stats so we do not bump into lock
+ order violations with dict_sys->mutex below. */
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index_t snap (page_zip_stat_per_index);
+ mutex_exit(&page_zip_stat_per_index_mutex);
+
+ mutex_enter(&dict_sys->mutex);
+
+ page_zip_stat_per_index_t::iterator iter;
+ ulint i;
+
+ for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) {
+
+ char name[192];
+ dict_index_t* index = dict_index_find_on_id_low(iter->first);
+
+ if (index != NULL) {
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(index->table_name,
+ db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+
+ field_store_string(fields[IDX_DATABASE_NAME], db_utf8);
+ field_store_string(fields[IDX_TABLE_NAME], table_utf8);
+ field_store_index_name(fields[IDX_INDEX_NAME],
+ index->name);
+ } else {
+ /* index not found */
+ ut_snprintf(name, sizeof(name),
+ "index_id:" IB_ID_FMT, iter->first);
+ field_store_string(fields[IDX_DATABASE_NAME],
+ "unknown");
+ field_store_string(fields[IDX_TABLE_NAME],
+ "unknown");
+ field_store_string(fields[IDX_INDEX_NAME],
+ name);
+ }
+
+ fields[IDX_COMPRESS_OPS]->store(
+ static_cast<double>(iter->second.compressed));
+
+ fields[IDX_COMPRESS_OPS_OK]->store(
+ static_cast<double>(iter->second.compressed_ok));
+
+ fields[IDX_COMPRESS_TIME]->store(
+ static_cast<double>(iter->second.compressed_usec / 1000000));
+
+ fields[IDX_UNCOMPRESS_OPS]->store(
+ static_cast<double>(iter->second.decompressed));
+
+ fields[IDX_UNCOMPRESS_TIME]->store(
+ static_cast<double>(iter->second.decompressed_usec / 1000000));
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+
+ /* Release and reacquire the dict mutex to allow other
+ threads to proceed. This could eventually result in the
+ contents of INFORMATION_SCHEMA.innodb_cmp_per_index being
+ inconsistent, but it is an acceptable compromise. */
+ if (i % 1000 == 0) {
+ mutex_exit(&dict_sys->mutex);
+ mutex_enter(&dict_sys->mutex);
+ }
+ }
+
+ mutex_exit(&dict_sys->mutex);
+
+ if (reset) {
+ page_zip_reset_stat_per_index();
+ }
+
+ DBUG_RETURN(status);
+}
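+
+/* Usage sketch (illustrative): per-index statistics are collected
+only while the global variable innodb_cmp_per_index_enabled is ON,
+so a typical session would be
+
+	SET GLOBAL innodb_cmp_per_index_enabled = ON;
+	SELECT * FROM information_schema.innodb_cmp_per_index;
+*/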
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_fill(
+/*===================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_per_index_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_per_index_reset_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index.
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_per_index_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_per_index_reset_init(
+/*=========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_per_index_fields_info;
+ schema->fill_table = i_s_cmp_per_index_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index)"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_per_index_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_PER_INDEX_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression (per index);"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_per_index_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table information_schema.innodb_cmpmem. */
+static ST_FIELD_INFO i_s_cmpmem_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_size"),
+ STRUCT_FLD(field_length, 5),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Buddy Block Size"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "buffer_pool_instance"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Buffer Pool Id"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "pages_used"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Currently in Use"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "pages_free"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Currently Available"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "relocation_ops"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Relocations"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "relocation_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Relocations,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem or
+innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill_low(
+/*================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* , /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ int status = 0;
+ TABLE* table = (TABLE*) tables->table;
+
+ DBUG_ENTER("i_s_cmpmem_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ status = 0;
+
+ buf_pool = buf_pool_from_array(i);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+ buf_buddy_stat_t* buddy_stat;
+
+ buddy_stat = &buf_pool->buddy_stat[x];
+
+ table->field[0]->store(BUF_BUDDY_LOW << x);
+ table->field[1]->store(static_cast<double>(i));
+ table->field[2]->store(static_cast<double>(
+ buddy_stat->used));
+ table->field[3]->store(static_cast<double>(
+ (x < BUF_BUDDY_SIZES)
+ ? UT_LIST_GET_LEN(buf_pool->zip_free[x])
+ : 0));
+ table->field[4]->store(
+ (longlong) buddy_stat->relocated, true);
+ table->field[5]->store(
+ static_cast<double>(buddy_stat->relocated_usec / 1000000));
+
+ if (reset) {
+ /* This is protected by buf_pool->mutex. */
+ buddy_stat->relocated = 0;
+ buddy_stat->relocated_usec = 0;
+ }
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ if (status) {
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
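+
+/* Usage sketch (illustrative): one row is stored per (buddy block
+size, buffer pool instance) pair, so the state of the compressed
+page buddy allocator can be inspected with e.g.
+
+	SELECT page_size, buffer_pool_instance, pages_used, pages_free
+	FROM information_schema.innodb_cmpmem;
+*/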
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill(
+/*============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_reset_fill(
+/*==================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success */
+static
+int
+i_s_cmpmem_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success */
+static
+int
+i_s_cmpmem_reset_init(
+/*==================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */
+static ST_FIELD_INFO innodb_metrics_fields_info[] =
+{
+#define METRIC_NAME 0
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_SUBSYS 1
+ {STRUCT_FLD(field_name, "SUBSYSTEM"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_VALUE_START 2
+ {STRUCT_FLD(field_name, "COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_MAX_VALUE_START 3
+ {STRUCT_FLD(field_name, "MAX_COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_MIN_VALUE_START 4
+ {STRUCT_FLD(field_name, "MIN_COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_AVG_VALUE_START 5
+ {STRUCT_FLD(field_name, "AVG_COUNT"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_VALUE_RESET 6
+ {STRUCT_FLD(field_name, "COUNT_RESET"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_MAX_VALUE_RESET 7
+ {STRUCT_FLD(field_name, "MAX_COUNT_RESET"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_MIN_VALUE_RESET 8
+ {STRUCT_FLD(field_name, "MIN_COUNT_RESET"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_AVG_VALUE_RESET 9
+ {STRUCT_FLD(field_name, "AVG_COUNT_RESET"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_START_TIME 10
+ {STRUCT_FLD(field_name, "TIME_ENABLED"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_STOP_TIME 11
+ {STRUCT_FLD(field_name, "TIME_DISABLED"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_TIME_ELAPSED 12
+ {STRUCT_FLD(field_name, "TIME_ELAPSED"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_RESET_TIME 13
+ {STRUCT_FLD(field_name, "TIME_RESET"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_STATUS 14
+ {STRUCT_FLD(field_name, "STATUS"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_TYPE 15
+ {STRUCT_FLD(field_name, "TYPE"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define METRIC_DESC 16
+ {STRUCT_FLD(field_name, "COMMENT"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
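+
+/* Usage sketch (illustrative): most counters are off by default
+and are switched on through the innodb_monitor_enable variable
+before the table is queried, e.g.
+
+	SET GLOBAL innodb_monitor_enable = 'module_buffer';
+	SELECT name, subsystem, count, status
+	FROM information_schema.innodb_metrics
+	WHERE status = 'enabled';
+*/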
+
+/**********************************************************************//**
+Fill the information schema metrics table.
+@return 0 on success */
+static
+int
+i_s_metrics_fill(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ int count;
+ Field** fields;
+ double time_diff = 0;
+ monitor_info_t* monitor_info;
+ mon_type_t min_val;
+ mon_type_t max_val;
+
+ DBUG_ENTER("i_s_metrics_fill");
+ fields = table_to_fill->field;
+
+ for (count = 0; count < NUM_MONITOR; count++) {
+ monitor_info = srv_mon_get_info((monitor_id_t) count);
+
+ /* A good place to sanity check the Monitor ID */
+ ut_a(count == monitor_info->monitor_id);
+
+		/* If the item refers to a Module, there is nothing
+		to fill; continue. */
+ if ((monitor_info->monitor_type & MONITOR_MODULE)
+ || (monitor_info->monitor_type & MONITOR_HIDDEN)) {
+ continue;
+ }
+
+ /* If this is an existing "status variable", and
+ its corresponding counter is still on, we need
+ to calculate the result from its corresponding
+ counter. */
+ if (monitor_info->monitor_type & MONITOR_EXISTING
+ && MONITOR_IS_ON(count)) {
+ srv_mon_process_existing_counter((monitor_id_t) count,
+ MONITOR_GET_VALUE);
+ }
+
+ /* Fill in counter's basic information */
+ OK(field_store_string(fields[METRIC_NAME],
+ monitor_info->monitor_name));
+
+ OK(field_store_string(fields[METRIC_SUBSYS],
+ monitor_info->monitor_module));
+
+ OK(field_store_string(fields[METRIC_DESC],
+ monitor_info->monitor_desc));
+
+ /* Fill in counter values */
+ OK(fields[METRIC_VALUE_RESET]->store(
+ MONITOR_VALUE(count), FALSE));
+
+ OK(fields[METRIC_VALUE_START]->store(
+ MONITOR_VALUE_SINCE_START(count), FALSE));
+
+ /* If the max value is MAX_RESERVED, counter max
+ value has not been updated. Set the column value
+ to NULL. */
+ if (MONITOR_MAX_VALUE(count) == MAX_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MAX_VALUE_RESET]->set_null();
+ } else {
+ OK(fields[METRIC_MAX_VALUE_RESET]->store(
+ MONITOR_MAX_VALUE(count), FALSE));
+ fields[METRIC_MAX_VALUE_RESET]->set_notnull();
+ }
+
+ /* If the min value is MAX_RESERVED, counter min
+ value has not been updated. Set the column value
+ to NULL. */
+ if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MIN_VALUE_RESET]->set_null();
+ } else {
+ OK(fields[METRIC_MIN_VALUE_RESET]->store(
+ MONITOR_MIN_VALUE(count), FALSE));
+ fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+ }
+
+ /* Calculate the max value since counter started */
+ max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+ if (max_val == MAX_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MAX_VALUE_START]->set_null();
+ } else {
+ OK(fields[METRIC_MAX_VALUE_START]->store(
+ max_val, FALSE));
+ fields[METRIC_MAX_VALUE_START]->set_notnull();
+ }
+
+ /* Calculate the min value since counter started */
+ min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+ if (min_val == MIN_RESERVED
+ || MONITOR_MAX_MIN_NOT_INIT(count)) {
+ fields[METRIC_MIN_VALUE_START]->set_null();
+ } else {
+ OK(fields[METRIC_MIN_VALUE_START]->store(
+ min_val, FALSE));
+
+ fields[METRIC_MIN_VALUE_START]->set_notnull();
+ }
+
+		/* If the monitor has ever been enabled (whether or
+		not it is currently enabled), fill the
+		METRIC_START_TIME and METRIC_TIME_ELAPSED fields */
+ if (MONITOR_FIELD(count, mon_start_time)) {
+ OK(field_store_time_t(fields[METRIC_START_TIME],
+ (time_t)MONITOR_FIELD(count, mon_start_time)));
+ fields[METRIC_START_TIME]->set_notnull();
+
+			/* If the monitor is enabled, TIME_ELAPSED is
+			the difference between the current time and
+			the time when the monitor was enabled.
+			Otherwise, it is the difference between the
+			times when the monitor was enabled and when
+			it was disabled */
+ if (MONITOR_IS_ON(count)) {
+ time_diff = difftime(time(NULL),
+ MONITOR_FIELD(count, mon_start_time));
+ } else {
+ time_diff = difftime(
+ MONITOR_FIELD(count, mon_stop_time),
+ MONITOR_FIELD(count, mon_start_time));
+ }
+
+ OK(fields[METRIC_TIME_ELAPSED]->store(
+ time_diff));
+ fields[METRIC_TIME_ELAPSED]->set_notnull();
+ } else {
+ fields[METRIC_START_TIME]->set_null();
+ fields[METRIC_TIME_ELAPSED]->set_null();
+ time_diff = 0;
+ }
+
+		/* Unless the counter is marked MONITOR_NO_AVERAGE,
+		we need to calculate the average value. If this is
+		a monitor set owner marked by MONITOR_SET_OWNER,
+		divide the value by another counter (the number of
+		calls) designated by monitor_info->monitor_related_id.
+		Otherwise, average the counter value over the time
+		between when the counter was enabled and when it was
+		disabled or sampled. */
+ if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+ && (monitor_info->monitor_type & MONITOR_SET_OWNER)
+ && monitor_info->monitor_related_id) {
+ mon_type_t value_start
+ = MONITOR_VALUE_SINCE_START(
+ monitor_info->monitor_related_id);
+
+ if (value_start) {
+ OK(fields[METRIC_AVG_VALUE_START]->store(
+ MONITOR_VALUE_SINCE_START(count)
+ / value_start, FALSE));
+
+ fields[METRIC_AVG_VALUE_START]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ }
+
+ if (MONITOR_VALUE(monitor_info->monitor_related_id)) {
+ OK(fields[METRIC_AVG_VALUE_RESET]->store(
+ MONITOR_VALUE(count)
+ / MONITOR_VALUE(
+ monitor_info->monitor_related_id),
+ FALSE));
+ } else {
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+ } else if (!(monitor_info->monitor_type & MONITOR_NO_AVERAGE)
+ && !(monitor_info->monitor_type
+ & MONITOR_DISPLAY_CURRENT)) {
+ if (time_diff) {
+ OK(fields[METRIC_AVG_VALUE_START]->store(
+ (double) MONITOR_VALUE_SINCE_START(
+ count) / time_diff));
+ fields[METRIC_AVG_VALUE_START]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ }
+
+ if (MONITOR_FIELD(count, mon_reset_time)) {
+ /* calculate the time difference since last
+ reset */
+ if (MONITOR_IS_ON(count)) {
+ time_diff = difftime(
+ time(NULL), MONITOR_FIELD(
+ count, mon_reset_time));
+ } else {
+ time_diff = difftime(
+ MONITOR_FIELD(count, mon_stop_time),
+ MONITOR_FIELD(count, mon_reset_time));
+ }
+ } else {
+ time_diff = 0;
+ }
+
+ if (time_diff) {
+ OK(fields[METRIC_AVG_VALUE_RESET]->store(
+ static_cast<double>(
+ MONITOR_VALUE(count) / time_diff)));
+ fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+ } else {
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+ } else {
+ fields[METRIC_AVG_VALUE_START]->set_null();
+ fields[METRIC_AVG_VALUE_RESET]->set_null();
+ }
+
+
+ if (MONITOR_IS_ON(count)) {
+			/* If the monitor is on, the stop time is set to NULL */
+ fields[METRIC_STOP_TIME]->set_null();
+
+ /* Display latest Monitor Reset Time only if Monitor
+ counter is on. */
+ if (MONITOR_FIELD(count, mon_reset_time)) {
+ OK(field_store_time_t(
+ fields[METRIC_RESET_TIME],
+ (time_t)MONITOR_FIELD(
+ count, mon_reset_time)));
+ fields[METRIC_RESET_TIME]->set_notnull();
+ } else {
+ fields[METRIC_RESET_TIME]->set_null();
+ }
+
+ /* Display the monitor status as "enabled" */
+ OK(field_store_string(fields[METRIC_STATUS],
+ "enabled"));
+ } else {
+ if (MONITOR_FIELD(count, mon_stop_time)) {
+ OK(field_store_time_t(fields[METRIC_STOP_TIME],
+ (time_t)MONITOR_FIELD(count, mon_stop_time)));
+ fields[METRIC_STOP_TIME]->set_notnull();
+ } else {
+ fields[METRIC_STOP_TIME]->set_null();
+ }
+
+ fields[METRIC_RESET_TIME]->set_null();
+
+ OK(field_store_string(fields[METRIC_STATUS],
+ "disabled"));
+ }
+
+ if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+ OK(field_store_string(fields[METRIC_TYPE],
+ "value"));
+ } else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+ OK(field_store_string(fields[METRIC_TYPE],
+ "status_counter"));
+ } else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+ OK(field_store_string(fields[METRIC_TYPE],
+ "set_owner"));
+ } else if ( monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+ OK(field_store_string(fields[METRIC_TYPE],
+ "set_member"));
+ } else {
+ OK(field_store_string(fields[METRIC_TYPE],
+ "counter"));
+ }
+
+ OK(schema_table_store_record(thd, table_to_fill));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to fill information schema metrics tables.
+@return 0 on success */
+static
+int
+i_s_metrics_fill_table(
+/*===================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ DBUG_ENTER("i_s_metrics_fill_table");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ i_s_metrics_fill(thd, tables->table);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics
+@return 0 on success */
+static
+int
+innodb_metrics_init(
+/*================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_metrics_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_metrics_fields_info;
+ schema->fill_table = i_s_metrics_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_metrics =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_METRICS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Metrics Info"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_metrics_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */
+static ST_FIELD_INFO i_s_stopword_fields_info[] =
+{
+#define STOPWORD_VALUE 0
+ {STRUCT_FLD(field_name, "value"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_stopword_fill(
+/*==============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ Field** fields;
+ ulint i = 0;
+ TABLE* table = (TABLE*) tables->table;
+
+ DBUG_ENTER("i_s_stopword_fill");
+
+ fields = table->field;
+
+	/* Fill the table with the server's default stopword list,
+	stored in the array fts_default_stopword */
+ while (fts_default_stopword[i]) {
+ OK(field_store_string(fields[STOPWORD_VALUE],
+ fts_default_stopword[i]));
+
+ OK(schema_table_store_record(thd, table));
+ i++;
+ }
+
+ DBUG_RETURN(0);
+}
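+
+/* Usage sketch (illustrative): the default stopword list can be
+inspected with
+
+	SELECT value FROM information_schema.innodb_ft_default_stopword;
+*/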
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_ft_default_stopword.
+@return 0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_stopword_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_stopword_fields_info;
+ schema->fill_table = i_s_stopword_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_default_stopword =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_DEFAULT_STOPWORD"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_stopword_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic tables INFORMATION_SCHEMA.INNODB_FT_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
+static ST_FIELD_INFO i_s_fts_doc_fields_info[] =
+{
+#define I_S_FTS_DOC_ID 0
+ {STRUCT_FLD(field_name, "DOC_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: TRUE=BEING_DELETED table */
+{
+ Field** fields;
+ TABLE* table = (TABLE*) tables->table;
+ trx_t* trx;
+ fts_table_t fts_table;
+ fts_doc_ids_t* deleted;
+ dict_table_t* user_table;
+
+ DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if (!fts_internal_tbl_name) {
+ DBUG_RETURN(0);
+ }
+
+ deleted = fts_doc_ids_create();
+
+ user_table = dict_table_open_on_name(
+ fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (!user_table) {
+ DBUG_RETURN(0);
+ }
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "Select for FTS DELETE TABLE";
+
+ FTS_INIT_FTS_TABLE(&fts_table,
+ (being_deleted) ? "BEING_DELETED" : "DELETED",
+ FTS_COMMON_TABLE, user_table);
+
+ fts_table_fetch_doc_ids(trx, &fts_table, deleted);
+
+ fields = table->field;
+
+ for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) {
+ doc_id_t doc_id;
+
+ doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j);
+
+ OK(fields[I_S_FTS_DOC_ID]->store((longlong) doc_id, true));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ trx_free_for_background(trx);
+
+ fts_doc_ids_free(deleted);
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ DBUG_RETURN(0);
+}
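+
+/* Usage sketch (illustrative): fts_internal_tbl_name is set through
+the global variable innodb_ft_aux_table ('db_name/table_name' below
+is a placeholder), so a typical session would be
+
+	SET GLOBAL innodb_ft_aux_table = 'db_name/table_name';
+	SELECT * FROM information_schema.innodb_ft_deleted;
+*/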
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_fill(
+/*=================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ DBUG_ENTER("i_s_fts_deleted_fill");
+
+ DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_deleted_init(
+/*=================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_deleted_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_fts_doc_fields_info;
+ schema->fill_table = i_s_fts_deleted_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_deleted =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_DELETED"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS DELETED TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_deleted_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_being_deleted_fill(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ DBUG_ENTER("i_s_fts_being_deleted_fill");
+
+ DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success */
+static
+int
+i_s_fts_being_deleted_init(
+/*=======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_deleted_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_fts_doc_fields_info;
+ schema->fill_table = i_s_fts_being_deleted_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_being_deleted =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_BEING_DELETED"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS BEING DELETED TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_being_deleted_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic tables INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO i_s_fts_index_fields_info[] =
+{
+#define I_S_FTS_WORD 0
+ {STRUCT_FLD(field_name, "WORD"),
+ STRUCT_FLD(field_length, FTS_MAX_WORD_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define I_S_FTS_FIRST_DOC_ID 1
+ {STRUCT_FLD(field_name, "FIRST_DOC_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define I_S_FTS_LAST_DOC_ID 2
+ {STRUCT_FLD(field_name, "LAST_DOC_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define I_S_FTS_DOC_COUNT 3
+ {STRUCT_FLD(field_name, "DOC_COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define I_S_FTS_ILIST_DOC_ID 4
+ {STRUCT_FLD(field_name, "DOC_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define I_S_FTS_ILIST_DOC_POS 5
+ {STRUCT_FLD(field_name, "POSITION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE for one FTS index on the table.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+ fts_index_cache_t* index_cache, /*!< in: FTS index cache */
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables) /*!< in/out: tables to fill */
+{
+ TABLE* table = (TABLE*) tables->table;
+ Field** fields;
+ CHARSET_INFO* index_charset;
+ const ib_rbt_node_t* rbt_node;
+ fts_string_t conv_str;
+ uint dummy_errors;
+ char* word_str;
+
+ DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+ fields = table->field;
+
+ index_charset = index_cache->charset;
+ conv_str.f_len = system_charset_info->mbmaxlen
+ * FTS_MAX_WORD_LEN_IN_CHAR;
+ conv_str.f_str = static_cast<byte*>(ut_malloc(conv_str.f_len));
+ conv_str.f_n_char = 0;
+
+ /* Go through each word in the index cache */
+ for (rbt_node = rbt_first(index_cache->words);
+ rbt_node;
+ rbt_node = rbt_next(index_cache->words, rbt_node)) {
+ doc_id_t doc_id = 0;
+
+ fts_tokenizer_word_t* word;
+
+ word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+ /* Convert word from index charset to system_charset_info */
+ if (index_charset->cset != system_charset_info->cset) {
+ conv_str.f_n_char = my_convert(
+ reinterpret_cast<char*>(conv_str.f_str),
+ static_cast<uint32>(conv_str.f_len),
+ system_charset_info,
+ reinterpret_cast<char*>(word->text.f_str),
+ static_cast<uint32>(word->text.f_len),
+ index_charset, &dummy_errors);
+ ut_ad(conv_str.f_n_char <= conv_str.f_len);
+ conv_str.f_str[conv_str.f_n_char] = 0;
+ word_str = reinterpret_cast<char*>(conv_str.f_str);
+ } else {
+ word_str = reinterpret_cast<char*>(word->text.f_str);
+ }
+
+		/* Decode the ilist, and display Doc ID and word position */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ fts_node_t* node;
+ byte* ptr;
+ ulint decoded = 0;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, i));
+
+ ptr = node->ilist;
+
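+			/* The ilist is a stream of variable-length
+			encoded integers: a doc id delta, followed by
+			the word's position entries for that doc,
+			terminated by a 0 byte; the pattern repeats
+			until ilist_size bytes have been consumed. */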
+ while (decoded < node->ilist_size) {
+ ulint pos = fts_decode_vlc(&ptr);
+
+ doc_id += pos;
+
+ /* Get position info */
+ while (*ptr) {
+ pos = fts_decode_vlc(&ptr);
+
+ OK(field_store_string(
+ fields[I_S_FTS_WORD],
+ word_str));
+
+ OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+ (longlong) node->first_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+ (longlong) node->last_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_DOC_COUNT]->store(
+ static_cast<double>(node->doc_count)));
+
+ OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+ (longlong) doc_id, true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+ static_cast<double>(pos)));
+
+ OK(schema_table_store_record(
+ thd, table));
+ }
+
+ ++ptr;
+
+ decoded = ptr - (byte*) node->ilist;
+ }
+ }
+ }
+
+ ut_free(conv_str.f_str);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ dict_table_t* user_table;
+ fts_cache_t* cache;
+
+ DBUG_ENTER("i_s_fts_index_cache_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if (!fts_internal_tbl_name) {
+ DBUG_RETURN(0);
+ }
+
+ user_table = dict_table_open_on_name(
+ fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (!user_table) {
+ DBUG_RETURN(0);
+ }
+
+ cache = user_table->fts->cache;
+
+ ut_a(cache);
+
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+ fts_index_cache_t* index_cache;
+
+ index_cache = static_cast<fts_index_cache_t*> (
+ ib_vector_get(cache->indexes, i));
+
+ i_s_fts_index_cache_fill_one_index(index_cache, thd, tables);
+ }
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ DBUG_RETURN(0);
+}
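+
+/* Usage sketch (illustrative): like the other FTS auxiliary views,
+this table requires innodb_ft_aux_table to point at the user table
+('db_name/table_name' is a placeholder), e.g.
+
+	SET GLOBAL innodb_ft_aux_table = 'db_name/table_name';
+	SELECT word, doc_id, position
+	FROM information_schema.innodb_ft_index_cache;
+*/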
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_index_cache_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_fts_index_fields_info;
+ schema->fill_table = i_s_fts_index_cache_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_index_cache =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_INDEX_CACHE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX CACHED"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_index_cache_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/*******************************************************************//**
+Go through an FTS index auxiliary table, fetch its rows and fill the
+FTS word cache structure.
+@return DB_SUCCESS on success, otherwise error code */
+static
+dberr_t
+i_s_fts_index_table_fill_selected(
+/*==============================*/
+ dict_index_t* index, /*!< in: FTS index */
+ ib_vector_t* words, /*!< in/out: vector to hold
+ fetched words */
+ ulint selected, /*!< in: selected FTS index */
+ fts_string_t* word) /*!< in: word to select */
+{
+ pars_info_t* info;
+ fts_table_t fts_table;
+ trx_t* trx;
+ que_t* graph;
+ dberr_t error;
+ fts_fetch_t fetch;
+
+ info = pars_info_create();
+
+ fetch.read_arg = words;
+ fetch.read_record = fts_optimize_index_fetch_node;
+ fetch.total_memory = 0;
+
+ DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+ fts_result_cache_limit = 8192;
+ );
+
+ trx = trx_allocate_for_background();
+
+ trx->op_info = "fetching FTS index nodes";
+
+ pars_info_bind_function(info, "my_func", fetch.read_record, &fetch);
+ pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+ FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected),
+ FTS_INDEX_TABLE, index);
+
+ graph = fts_parse_sql(
+ &fts_table, info,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT word, doc_count, first_doc_id, last_doc_id, "
+ "ilist\n"
+ " FROM %s WHERE word >= :word;\n"
+ "BEGIN\n"
+ "\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;");
+
+ for(;;) {
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+
+ break;
+ } else {
+ fts_sql_rollback(trx);
+
+ ut_print_timestamp(stderr);
+
+ if (error == DB_LOCK_WAIT_TIMEOUT) {
+ fprintf(stderr, " InnoDB: Warning: "
+ "lock wait timeout reading "
+ "FTS index. Retrying!\n");
+
+ trx->error_state = DB_SUCCESS;
+ } else {
+ fprintf(stderr, " InnoDB: Error: %d "
+ "while reading FTS index.\n", error);
+ break;
+ }
+ }
+ }
+
+ mutex_enter(&dict_sys->mutex);
+ que_graph_free(graph);
+ mutex_exit(&dict_sys->mutex);
+
+ trx_free_for_background(trx);
+
+ if (fetch.total_memory >= fts_result_cache_limit) {
+ error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT;
+ }
+
+ return(error);
+}
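+
+/* The DB_FTS_EXCEED_RESULT_CACHE_LIMIT return value drives pagination:
+the caller (i_s_fts_index_table_fill_one_index() below) remembers the
+last word fetched and calls back with it, and the "word >= :word"
+predicate above then resumes the scan from that word. */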
+
+/*******************************************************************//**
+Free words. */
+static
+void
+i_s_fts_index_table_free_one_fetch(
+/*===============================*/
+ ib_vector_t* words) /*!< in: words fetched */
+{
+ for (ulint i = 0; i < ib_vector_size(words); i++) {
+ fts_word_t* word;
+
+ word = static_cast<fts_word_t*>(ib_vector_get(words, i));
+
+ for (ulint j = 0; j < ib_vector_size(word->nodes); j++) {
+ fts_node_t* node;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, j));
+ ut_free(node->ilist);
+ }
+
+ fts_word_free(word);
+ }
+
+ ib_vector_reset(words);
+}
+
+/*******************************************************************//**
+Go through words, fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_fetch(
+/*===============================*/
+ CHARSET_INFO* index_charset, /*!< in: FTS index charset */
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ ib_vector_t* words, /*!< in: words fetched */
+ fts_string_t* conv_str, /*!< in: string for conversion*/
+ bool has_more) /*!< in: has more to fetch */
+{
+ TABLE* table = (TABLE*) tables->table;
+ Field** fields;
+ uint dummy_errors;
+ char* word_str;
+ ulint words_size;
+ int ret = 0;
+
+ DBUG_ENTER("i_s_fts_index_table_fill_one_fetch");
+
+ fields = table->field;
+
+ words_size = ib_vector_size(words);
+ if (has_more) {
+ /* the last word is not fetched completely. */
+ ut_ad(words_size > 1);
+ words_size -= 1;
+ }
+
+ /* Go through each word in the index cache */
+ for (ulint i = 0; i < words_size; i++) {
+ fts_word_t* word;
+
+ word = static_cast<fts_word_t*>(ib_vector_get(words, i));
+
+ word->text.f_str[word->text.f_len] = 0;
+
+ /* Convert word from index charset to system_charset_info */
+ if (index_charset->cset != system_charset_info->cset) {
+ conv_str->f_n_char = my_convert(
+ reinterpret_cast<char*>(conv_str->f_str),
+ static_cast<uint32>(conv_str->f_len),
+ system_charset_info,
+ reinterpret_cast<char*>(word->text.f_str),
+ static_cast<uint32>(word->text.f_len),
+ index_charset, &dummy_errors);
+ ut_ad(conv_str->f_n_char <= conv_str->f_len);
+ conv_str->f_str[conv_str->f_n_char] = 0;
+ word_str = reinterpret_cast<char*>(conv_str->f_str);
+ } else {
+ word_str = reinterpret_cast<char*>(word->text.f_str);
+ }
+
+		/* Decode the ilist, and display Doc ID and word position */
+ for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+ fts_node_t* node;
+ byte* ptr;
+ ulint decoded = 0;
+ doc_id_t doc_id = 0;
+
+ node = static_cast<fts_node_t*> (ib_vector_get(
+ word->nodes, i));
+
+ ptr = node->ilist;
+
+ while (decoded < node->ilist_size) {
+ ulint pos = fts_decode_vlc(&ptr);
+
+ doc_id += pos;
+
+ /* Get position info */
+ while (*ptr) {
+ pos = fts_decode_vlc(&ptr);
+
+ OK(field_store_string(
+ fields[I_S_FTS_WORD],
+ word_str));
+
+ OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+ (longlong) node->first_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+ (longlong) node->last_doc_id,
+ true));
+
+ OK(fields[I_S_FTS_DOC_COUNT]->store(
+ static_cast<double>(node->doc_count)));
+
+ OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+ (longlong) doc_id, true));
+
+ OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+ static_cast<double>(pos)));
+
+ OK(schema_table_store_record(
+ thd, table));
+ }
+
+ ++ptr;
+
+ decoded = ptr - (byte*) node->ilist;
+ }
+ }
+ }
+
+ i_s_fts_index_table_free_one_fetch(words);
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Go through an FTS index and its auxiliary tables, fetch rows in each table
+and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill_one_index(
+/*===============================*/
+ dict_index_t* index, /*!< in: FTS index */
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables) /*!< in/out: tables to fill */
+{
+ ib_vector_t* words;
+ mem_heap_t* heap;
+ fts_string_t word;
+ CHARSET_INFO* index_charset;
+ fts_string_t conv_str;
+ dberr_t error;
+ int ret = 0;
+
+ DBUG_ENTER("i_s_fts_index_table_fill_one_index");
+ DBUG_ASSERT(!dict_index_is_online_ddl(index));
+
+ heap = mem_heap_create(1024);
+
+ words = ib_vector_create(ib_heap_allocator_create(heap),
+ sizeof(fts_word_t), 256);
+
+ word.f_str = NULL;
+ word.f_len = 0;
+ word.f_n_char = 0;
+
+ index_charset = fts_index_get_charset(index);
+ conv_str.f_len = system_charset_info->mbmaxlen
+ * FTS_MAX_WORD_LEN_IN_CHAR;
+ conv_str.f_str = static_cast<byte*>(ut_malloc(conv_str.f_len));
+ conv_str.f_n_char = 0;
+
+ /* Iterate through each auxiliary table as described in
+ fts_index_selector */
+ for (ulint selected = 0; fts_index_selector[selected].value;
+ selected++) {
+ bool has_more = false;
+
+ do {
+ /* Fetch from index */
+ error = i_s_fts_index_table_fill_selected(
+ index, words, selected, &word);
+
+ if (error == DB_SUCCESS) {
+ has_more = false;
+ } else if (error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT) {
+ has_more = true;
+ } else {
+ i_s_fts_index_table_free_one_fetch(words);
+ ret = 1;
+ goto func_exit;
+ }
+
+ if (has_more) {
+ fts_word_t* last_word;
+
+ /* Prepare start point for next fetch */
+ last_word = static_cast<fts_word_t*>(ib_vector_last(words));
+ ut_ad(last_word != NULL);
+ fts_utf8_string_dup(&word, &last_word->text, heap);
+ }
+
+ /* Fill into tables */
+ ret = i_s_fts_index_table_fill_one_fetch(
+ index_charset, thd, tables, words, &conv_str, has_more);
+
+ if (ret != 0) {
+ i_s_fts_index_table_free_one_fetch(words);
+ goto func_exit;
+ }
+ } while (has_more);
+ }
+
+func_exit:
+ ut_free(conv_str.f_str);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_table_fill(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ dict_table_t* user_table;
+ dict_index_t* index;
+
+ DBUG_ENTER("i_s_fts_index_table_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if (!fts_internal_tbl_name) {
+ DBUG_RETURN(0);
+ }
+
+ user_table = dict_table_open_on_name(
+ fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (!user_table) {
+ DBUG_RETURN(0);
+ }
+
+ for (index = dict_table_get_first_index(user_table);
+ index; index = dict_table_get_next_index(index)) {
+ if (index->type & DICT_FTS) {
+ i_s_fts_index_table_fill_one_index(index, thd, tables);
+ }
+ }
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE
+@return 0 on success */
+static
+int
+i_s_fts_index_table_init(
+/*=====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_index_table_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_fts_index_fields_info;
+ schema->fill_table = i_s_fts_index_table_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_index_table =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_INDEX_TABLE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS INDEX TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_index_table_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */
+static ST_FIELD_INFO i_s_fts_config_fields_info[] =
+{
+#define FTS_CONFIG_KEY 0
+ {STRUCT_FLD(field_name, "KEY"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define FTS_CONFIG_VALUE 1
+ {STRUCT_FLD(field_name, "VALUE"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static const char* fts_config_key[] = {
+ FTS_OPTIMIZE_LIMIT_IN_SECS,
+ FTS_SYNCED_DOC_ID,
+ FTS_STOPWORD_TABLE_NAME,
+ FTS_USE_STOPWORD,
+ NULL
+};
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_config_fill(
+/*================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ Field** fields;
+ TABLE* table = (TABLE*) tables->table;
+ trx_t* trx;
+ fts_table_t fts_table;
+ dict_table_t* user_table;
+ ulint i = 0;
+ dict_index_t* index = NULL;
+ unsigned char str[FTS_MAX_CONFIG_VALUE_LEN + 1];
+
+ DBUG_ENTER("i_s_fts_config_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if (!fts_internal_tbl_name) {
+ DBUG_RETURN(0);
+ }
+
+ fields = table->field;
+
+ user_table = dict_table_open_on_name(
+ fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (!user_table) {
+ DBUG_RETURN(0);
+ }
+
+ trx = trx_allocate_for_background();
+	trx->op_info = "Select for FTS CONFIG TABLE";
+
+ FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table);
+
+ if (!ib_vector_is_empty(user_table->fts->indexes)) {
+ index = (dict_index_t*) ib_vector_getp_const(
+ user_table->fts->indexes, 0);
+ DBUG_ASSERT(!dict_index_is_online_ddl(index));
+ }
+
+ while (fts_config_key[i]) {
+ fts_string_t value;
+ char* key_name;
+ ulint allocated = FALSE;
+
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+
+ value.f_str = str;
+
+ if (index
+ && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) {
+ key_name = fts_config_create_index_param_name(
+ fts_config_key[i], index);
+ allocated = TRUE;
+ } else {
+ key_name = (char*) fts_config_key[i];
+ }
+
+ fts_config_get_value(trx, &fts_table, key_name, &value);
+
+ if (allocated) {
+ ut_free(key_name);
+ }
+
+ OK(field_store_string(
+ fields[FTS_CONFIG_KEY], fts_config_key[i]));
+
+ OK(field_store_string(
+ fields[FTS_CONFIG_VALUE], (const char*) value.f_str));
+
+ OK(schema_table_store_record(thd, table));
+
+ i++;
+ }
+
+ fts_sql_commit(trx);
+
+ trx_free_for_background(trx);
+
+ dict_table_close(user_table, FALSE, FALSE);
+
+ DBUG_RETURN(0);
+}
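+
+/* Usage sketch (table name hypothetical):
+
+	SET GLOBAL innodb_ft_aux_table = 'test/articles';
+	SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_CONFIG;
+
+This returns one KEY/VALUE row per entry of fts_config_key[] above,
+with the values read from the table's CONFIG auxiliary table. */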
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG
+@return 0 on success */
+static
+int
+i_s_fts_config_init(
+/*=================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_fts_config_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_fts_config_fields_info;
+ schema->fill_table = i_s_fts_config_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_ft_config =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_FT_CONFIG"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "INNODB AUXILIARY FTS CONFIG TABLE"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_fts_config_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. */
+static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[] =
+{
+#define IDX_BUF_STATS_POOL_ID 0
+ {STRUCT_FLD(field_name, "POOL_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_POOL_SIZE 1
+ {STRUCT_FLD(field_name, "POOL_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FREE_BUFFERS 2
+ {STRUCT_FLD(field_name, "FREE_BUFFERS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_LEN 3
+ {STRUCT_FLD(field_name, "DATABASE_PAGES"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_OLD_LRU_LEN 4
+ {STRUCT_FLD(field_name, "OLD_DATABASE_PAGES"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LIST_LEN 5
+ {STRUCT_FLD(field_name, "MODIFIED_DATABASE_PAGES"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PENDING_ZIP 6
+ {STRUCT_FLD(field_name, "PENDING_DECOMPRESS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PENDING_READ 7
+ {STRUCT_FLD(field_name, "PENDING_READS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LRU 8
+ {STRUCT_FLD(field_name, "PENDING_FLUSH_LRU"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_FLUSH_LIST 9
+ {STRUCT_FLD(field_name, "PENDING_FLUSH_LIST"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_YOUNG 10
+ {STRUCT_FLD(field_name, "PAGES_MADE_YOUNG"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_NOT_YOUNG 11
+ {STRUCT_FLD(field_name, "PAGES_NOT_MADE_YOUNG"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_YOUNG_RATE 12
+ {STRUCT_FLD(field_name, "PAGES_MADE_YOUNG_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13
+ {STRUCT_FLD(field_name, "PAGES_MADE_NOT_YOUNG_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_READ 14
+ {STRUCT_FLD(field_name, "NUMBER_PAGES_READ"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_CREATED 15
+ {STRUCT_FLD(field_name, "NUMBER_PAGES_CREATED"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_WRITTEN 16
+ {STRUCT_FLD(field_name, "NUMBER_PAGES_WRITTEN"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_READ_RATE 17
+ {STRUCT_FLD(field_name, "PAGES_READ_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_CREATE_RATE 18
+ {STRUCT_FLD(field_name, "PAGES_CREATE_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19
+ {STRUCT_FLD(field_name, "PAGES_WRITTEN_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_GET 20
+ {STRUCT_FLD(field_name, "NUMBER_PAGES_GET"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_HIT_RATE 21
+ {STRUCT_FLD(field_name, "HIT_RATE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_MADE_YOUNG_PCT 22
+ {STRUCT_FLD(field_name, "YOUNG_MAKE_PER_THOUSAND_GETS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23
+ {STRUCT_FLD(field_name, "NOT_YOUNG_MAKE_PER_THOUSAND_GETS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHEAD	24
+ {STRUCT_FLD(field_name, "NUMBER_PAGES_READ_AHEAD"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25
+ {STRUCT_FLD(field_name, "NUMBER_READ_AHEAD_EVICTED"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHEAD_RATE 26
+ {STRUCT_FLD(field_name, "READ_AHEAD_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27
+ {STRUCT_FLD(field_name, "READ_AHEAD_EVICTED_RATE"),
+ STRUCT_FLD(field_length, MAX_FLOAT_STR_LENGTH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_FLOAT),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_IO_SUM 28
+ {STRUCT_FLD(field_name, "LRU_IO_TOTAL"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_LRU_IO_CUR 29
+ {STRUCT_FLD(field_name, "LRU_IO_CURRENT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_UNZIP_SUM 30
+ {STRUCT_FLD(field_name, "UNCOMPRESS_TOTAL"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_STATS_UNZIP_CUR 31
+ {STRUCT_FLD(field_name, "UNCOMPRESS_CURRENT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_POOL_STATS for a particular
+buffer pool
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_stats_fill(
+/*==================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ const buf_pool_info_t* info) /*!< in: buffer pool
+ information */
+{
+ TABLE* table;
+ Field** fields;
+
+ DBUG_ENTER("i_s_innodb_stats_fill");
+
+ table = tables->table;
+
+ fields = table->field;
+
+ OK(fields[IDX_BUF_STATS_POOL_ID]->store(
+ static_cast<double>(info->pool_unique_id)));
+
+ OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(
+ static_cast<double>(info->pool_size)));
+
+ OK(fields[IDX_BUF_STATS_LRU_LEN]->store(
+ static_cast<double>(info->lru_len)));
+
+ OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(
+ static_cast<double>(info->old_lru_len)));
+
+ OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store(
+ static_cast<double>(info->free_list_len)));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store(
+ static_cast<double>(info->flush_list_len)));
+
+ OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(
+ static_cast<double>(info->n_pend_unzip)));
+
+ OK(fields[IDX_BUF_STATS_PENDING_READ]->store(
+ static_cast<double>(info->n_pend_reads)));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store(
+ static_cast<double>(info->n_pending_flush_lru)));
+
+ OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store(
+ static_cast<double>(info->n_pending_flush_list)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store(
+ static_cast<double>(info->n_pages_made_young)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store(
+ static_cast<double>(info->n_pages_not_made_young)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store(
+ info->page_made_young_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store(
+ info->page_not_made_young_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_READ]->store(
+ static_cast<double>(info->n_pages_read)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store(
+ static_cast<double>(info->n_pages_created)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store(
+ static_cast<double>(info->n_pages_written)));
+
+ OK(fields[IDX_BUF_STATS_GET]->store(
+ static_cast<double>(info->n_page_gets)));
+
+ OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store(
+ info->pages_read_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store(
+ info->pages_created_rate));
+
+ OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store(
+ info->pages_written_rate));
+
+ if (info->n_page_get_delta) {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(
+ static_cast<double>(
+ 1000 - (1000 * info->page_read_delta
+ / info->n_page_get_delta))));
+
+ OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(
+ static_cast<double>(
+ 1000 * info->young_making_delta
+ / info->n_page_get_delta)));
+
+ OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(
+ static_cast<double>(
+ 1000 * info->not_young_making_delta
+ / info->n_page_get_delta)));
+ } else {
+ OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0));
+ OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0));
+ OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0));
+ }
+
+	OK(fields[IDX_BUF_STATS_READ_AHEAD]->store(
+ static_cast<double>(info->n_ra_pages_read)));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store(
+ static_cast<double>(info->n_ra_pages_evicted)));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store(
+ info->pages_readahead_rate));
+
+ OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store(
+ info->pages_evicted_rate));
+
+ OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(
+ static_cast<double>(info->io_sum)));
+
+ OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(
+ static_cast<double>(info->io_cur)));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(
+ static_cast<double>(info->unzip_sum)));
+
+ OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(
+ static_cast<double>(info->unzip_cur)));
+
+ DBUG_RETURN(schema_table_store_record(thd, table));
+}
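+
+/* Worked example for the HIT_RATE branch above: with
+n_page_get_delta = 10000 and page_read_delta = 200 since the last
+sampling round, HIT_RATE = 1000 - (1000 * 200 / 10000) = 980, i.e.
+980 out of every 1000 page gets were served from the buffer pool.
+The *_PER_THOUSAND_GETS columns use the same per-mille scale. */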
+
+/*******************************************************************//**
+This is the function that loops through each buffer pool and fetches
+buffer pool stats into the information schema table INNODB_BUFFER_POOL_STATS
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_stats_fill_table(
+/*===============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+ buf_pool_info_t* pool_info;
+
+	DBUG_ENTER("i_s_innodb_buffer_stats_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* Only allow the PROCESS privilege holder to access the stats */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ pool_info = (buf_pool_info_t*) mem_zalloc(
+ srv_buf_pool_instances * sizeof *pool_info);
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch individual buffer pool info */
+ buf_stats_get_pool_info(buf_pool, i, pool_info);
+
+ status = i_s_innodb_stats_fill(thd, tables, &pool_info[i]);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+ }
+
+ mem_free(pool_info);
+
+ DBUG_RETURN(status);
+}
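+
+/* Usage sketch: each buffer pool instance contributes one row, e.g.
+
+	SELECT POOL_ID, POOL_SIZE, FREE_BUFFERS, DATABASE_PAGES
+	FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS;
+
+POOL_SIZE is reported in pages, so a 1 GiB instance with 16 KiB pages
+shows POOL_SIZE = 65536. */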
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS.
+@return 0 on success */
+static
+int
+i_s_innodb_buffer_pool_stats_init(
+/*==============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_stats_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = i_s_innodb_buffer_stats_fields_info;
+ schema->fill_table = i_s_innodb_buffer_stats_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_stats =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_STATS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+	STRUCT_FLD(descr, "InnoDB Buffer Pool Statistics Information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_stats_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/* Fields of the dynamic table INNODB_BUFFER_PAGE. */
+static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[] =
+{
+#define IDX_BUFFER_POOL_ID 0
+ {STRUCT_FLD(field_name, "POOL_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_BLOCK_ID 1
+ {STRUCT_FLD(field_name, "BLOCK_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_SPACE 2
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NUM 3
+ {STRUCT_FLD(field_name, "PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_TYPE 4
+ {STRUCT_FLD(field_name, "PAGE_TYPE"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FLUSH_TYPE 5
+ {STRUCT_FLD(field_name, "FLUSH_TYPE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FIX_COUNT 6
+ {STRUCT_FLD(field_name, "FIX_COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_HASHED 7
+ {STRUCT_FLD(field_name, "IS_HASHED"),
+ STRUCT_FLD(field_length, 3),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NEWEST_MOD 8
+ {STRUCT_FLD(field_name, "NEWEST_MODIFICATION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_OLDEST_MOD 9
+ {STRUCT_FLD(field_name, "OLDEST_MODIFICATION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_ACCESS_TIME 10
+ {STRUCT_FLD(field_name, "ACCESS_TIME"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_TABLE_NAME 11
+ {STRUCT_FLD(field_name, "TABLE_NAME"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_INDEX_NAME 12
+ {STRUCT_FLD(field_name, "INDEX_NAME"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_NUM_RECS 13
+ {STRUCT_FLD(field_name, "NUMBER_RECORDS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_DATA_SIZE 14
+ {STRUCT_FLD(field_name, "DATA_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_ZIP_SIZE 15
+ {STRUCT_FLD(field_name, "COMPRESSED_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_STATE 16
+ {STRUCT_FLD(field_name, "PAGE_STATE"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_IO_FIX 17
+ {STRUCT_FLD(field_name, "IO_FIX"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_IS_OLD 18
+ {STRUCT_FLD(field_name, "IS_OLD"),
+ STRUCT_FLD(field_length, 3),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUFFER_PAGE_FREE_CLOCK 19
+ {STRUCT_FLD(field_name, "FREE_PAGE_CLOCK"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE with information
+cached in the buf_page_info_t array
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ const buf_page_info_t* info_array, /*!< in: array cached page
+ info */
+ ulint num_page) /*!< in: number of page info
+ cached */
+{
+ TABLE* table;
+ Field** fields;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_fill");
+
+ table = tables->table;
+
+ fields = table->field;
+
+ /* Iterate through the cached array and fill the I_S table rows */
+ for (ulint i = 0; i < num_page; i++) {
+ const buf_page_info_t* page_info;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+ const char* table_name_end = NULL;
+ const char* state_str;
+ enum buf_page_state state;
+
+ page_info = info_array + i;
+
+ state_str = NULL;
+
+ OK(fields[IDX_BUFFER_POOL_ID]->store(
+ static_cast<double>(page_info->pool_id)));
+
+ OK(fields[IDX_BUFFER_BLOCK_ID]->store(
+ static_cast<double>(page_info->block_id)));
+
+ OK(fields[IDX_BUFFER_PAGE_SPACE]->store(
+ static_cast<double>(page_info->space_id)));
+
+ OK(fields[IDX_BUFFER_PAGE_NUM]->store(
+ static_cast<double>(page_info->page_num)));
+
+ OK(field_store_string(
+ fields[IDX_BUFFER_PAGE_TYPE],
+ i_s_page_type[page_info->page_type].type_str));
+
+ OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(
+ page_info->flush_type));
+
+ OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store(
+ page_info->fix_count));
+
+ if (page_info->hashed) {
+ OK(field_store_string(
+ fields[IDX_BUFFER_PAGE_HASHED], "YES"));
+ } else {
+ OK(field_store_string(
+ fields[IDX_BUFFER_PAGE_HASHED], "NO"));
+ }
+
+ OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store(
+ (longlong) page_info->newest_mod, true));
+
+ OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store(
+ (longlong) page_info->oldest_mod, true));
+
+ OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store(
+ page_info->access_time));
+
+ fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null();
+
+ fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null();
+
+ /* If this is an index page, fetch the index name
+ and table name */
+ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+ const dict_index_t* index;
+
+ mutex_enter(&dict_sys->mutex);
+ index = dict_index_get_if_in_cache_low(
+ page_info->index_id);
+
+ if (index) {
+
+ table_name_end = innobase_convert_name(
+ table_name, sizeof(table_name),
+ index->table_name,
+ strlen(index->table_name),
+ thd, TRUE);
+
+ OK(fields[IDX_BUFFER_PAGE_TABLE_NAME]->store(
+ table_name,
+ static_cast<uint>(table_name_end - table_name),
+ system_charset_info));
+ fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_notnull();
+
+ OK(field_store_index_name(
+ fields[IDX_BUFFER_PAGE_INDEX_NAME],
+ index->name));
+ }
+
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store(
+ page_info->num_recs));
+
+ OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store(
+ page_info->data_size));
+
+ OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store(
+ page_info->zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
+ : 0));
+
+#if BUF_PAGE_STATE_BITS > 3
+# error "BUF_PAGE_STATE_BITS > 3, please ensure that all 1<<BUF_PAGE_STATE_BITS values are checked for"
+#endif
+ state = static_cast<enum buf_page_state>(page_info->page_state);
+
+ switch (state) {
+		/* The first three states are for compressed-only pages
+		and are not states we would see while scanning pages
+		through buffer blocks */
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ state_str = NULL;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ state_str = "NOT_USED";
+ break;
+ case BUF_BLOCK_READY_FOR_USE:
+ state_str = "READY_FOR_USE";
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ state_str = "FILE_PAGE";
+ break;
+ case BUF_BLOCK_MEMORY:
+ state_str = "MEMORY";
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ state_str = "REMOVE_HASH";
+ break;
+ };
+
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_STATE],
+ state_str));
+
+ switch (page_info->io_fix) {
+ case BUF_IO_NONE:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_NONE"));
+ break;
+ case BUF_IO_READ:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_READ"));
+ break;
+ case BUF_IO_WRITE:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_WRITE"));
+ break;
+ case BUF_IO_PIN:
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IO_FIX],
+ "IO_PIN"));
+ break;
+ }
+
+ OK(field_store_string(fields[IDX_BUFFER_PAGE_IS_OLD],
+ (page_info->is_old) ? "YES" : "NO"));
+
+ OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store(
+ page_info->freed_page_clock));
+
+ if (schema_table_store_record(thd, table)) {
+ DBUG_RETURN(1);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
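+
+/* Worked example for COMPRESSED_SIZE above: assuming the usual
+UNIV_ZIP_SIZE_MIN of 1024, the stored size is 512 << zip_ssize, so
+zip_ssize = 4 reports 8192 (a KEY_BLOCK_SIZE=8 page), while
+zip_ssize = 0 means the page is not compressed and 0 is reported. */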
+
+/*******************************************************************//**
+Set the appropriate page type in a buf_page_info_t structure */
+static
+void
+i_s_innodb_set_page_type(
+/*=====================*/
+ buf_page_info_t*page_info, /*!< in/out: structure to fill with
+ scanned info */
+ ulint page_type, /*!< in: page type */
+ const byte* frame) /*!< in: buffer frame */
+{
+ if (page_type == FIL_PAGE_INDEX) {
+ const page_t* page = (const page_t*) frame;
+
+ page_info->index_id = btr_page_get_index_id(page);
+
+		/* FIL_PAGE_INDEX is a bit special: its value is
+		defined as 17855, so we cannot use it to index into
+		the i_s_page_type[] array. Its entry in that array is
+		I_S_PAGE_TYPE_INDEX (1) for ordinary index pages, or
+		I_S_PAGE_TYPE_IBUF for change buffer index pages. */
+ if (page_info->index_id
+ == static_cast<index_id_t>(DICT_IBUF_ID_MIN
+ + IBUF_SPACE_ID)) {
+ page_info->page_type = I_S_PAGE_TYPE_IBUF;
+ } else {
+ page_info->page_type = I_S_PAGE_TYPE_INDEX;
+ }
+
+ page_info->data_size = (ulint)(page_header_get_field(
+ page, PAGE_HEAP_TOP) - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ page_info->num_recs = page_get_n_recs(page);
+ } else if (page_type > FIL_PAGE_TYPE_LAST) {
+ /* Encountered an unknown page type */
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ } else {
+ /* Make sure we get the right index into the
+ i_s_page_type[] array */
+ ut_a(page_type == i_s_page_type[page_type].type_value);
+
+ page_info->page_type = page_type;
+ }
+
+ if (page_info->page_type == FIL_PAGE_TYPE_ZBLOB
+ || page_info->page_type == FIL_PAGE_TYPE_ZBLOB2) {
+ page_info->page_num = mach_read_from_4(
+ frame + FIL_PAGE_OFFSET);
+ page_info->space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+}
+/*******************************************************************//**
+Scans pages in the buffer cache, and collects their general information
+into the buf_page_info_t array, which is zero-filled, so any fields
+that are not initialized in this function default to 0 */
+static
+void
+i_s_innodb_buffer_page_get_info(
+/*============================*/
+ const buf_page_t*bpage, /*!< in: buffer pool page to scan */
+ ulint pool_id, /*!< in: buffer pool id */
+ ulint pos, /*!< in: buffer block position in
+ buffer pool or in the LRU list */
+ buf_page_info_t*page_info) /*!< in: zero filled info structure;
+ out: structure filled with scanned
+ info */
+{
+ ut_ad(pool_id < MAX_BUFFER_POOLS);
+
+ page_info->pool_id = pool_id;
+
+ page_info->block_id = pos;
+
+ page_info->page_state = buf_page_get_state(bpage);
+
+ /* Only fetch information for buffers that map to a tablespace,
+	that is, buffer pages with state BUF_BLOCK_ZIP_PAGE,
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_FILE_PAGE */
+ if (buf_page_in_file(bpage)) {
+ const byte* frame;
+ ulint page_type;
+
+ page_info->space_id = buf_page_get_space(bpage);
+
+ page_info->page_num = buf_page_get_page_no(bpage);
+
+ page_info->flush_type = bpage->flush_type;
+
+ page_info->fix_count = bpage->buf_fix_count;
+
+ page_info->newest_mod = bpage->newest_modification;
+
+ page_info->oldest_mod = bpage->oldest_modification;
+
+ page_info->access_time = bpage->access_time;
+
+ page_info->zip_ssize = bpage->zip.ssize;
+
+ page_info->io_fix = bpage->io_fix;
+
+ page_info->is_old = bpage->old;
+
+ page_info->freed_page_clock = bpage->freed_page_clock;
+
+ switch (buf_page_get_io_fix(bpage)) {
+ case BUF_IO_NONE:
+ case BUF_IO_WRITE:
+ case BUF_IO_PIN:
+ break;
+ case BUF_IO_READ:
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ return;
+ }
+
+ if (page_info->page_state == BUF_BLOCK_FILE_PAGE) {
+ const buf_block_t*block;
+
+ block = reinterpret_cast<const buf_block_t*>(bpage);
+ frame = block->frame;
+ page_info->hashed = (block->index != NULL);
+ } else {
+ ut_ad(page_info->zip_ssize);
+ frame = bpage->zip.data;
+ }
+
+ page_type = fil_page_get_type(frame);
+
+ i_s_innodb_set_page_type(page_info, page_type, frame);
+ } else {
+ page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
+ }
+}
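+
+/* Note on the BUF_IO_READ case above: while a page is being read in,
+its frame contents are undefined, so the page type is reported as
+I_S_PAGE_TYPE_UNKNOWN instead of dereferencing the frame. */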
+
+/*******************************************************************//**
+This is the function that goes through each block of the buffer pool
+and fetches information into the information schema table INNODB_BUFFER_PAGE.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_pool(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ buf_pool_t* buf_pool, /*!< in: buffer pool to scan */
+ const ulint pool_id) /*!< in: buffer pool id */
+{
+ int status = 0;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("i_s_innodb_fill_buffer_pool");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ heap = mem_heap_create(10000);
+
+	/* Go through each chunk of the buffer pool. Currently, there
+	is only a single chunk for each buffer pool */
+ for (ulint n = 0; n < buf_pool->n_chunks; n++) {
+ const buf_block_t* block;
+ ulint n_blocks;
+ buf_page_info_t* info_buffer;
+ ulint num_page;
+ ulint mem_size;
+ ulint chunk_size;
+ ulint num_to_process = 0;
+ ulint block_id = 0;
+
+ /* Get buffer block of the nth chunk */
+ block = buf_get_nth_chunk_block(buf_pool, n, &chunk_size);
+ num_page = 0;
+
+ while (chunk_size > 0) {
+			/* We cache at most MAX_BUF_INFO_CACHED
+			buffer page info structures at a time */
+ num_to_process = ut_min(chunk_size,
+ MAX_BUF_INFO_CACHED);
+
+ mem_size = num_to_process * sizeof(buf_page_info_t);
+
+			/* For each chunk, pre-allocate information
+			structures to cache the page information read from
+			the buffer pool. Do this before obtaining any mutex */
+ info_buffer = (buf_page_info_t*) mem_heap_zalloc(
+ heap, mem_size);
+
+			/* Obtain the appropriate mutex. Since this is a
+			diagnostic buffer pool info printout, we are not
+			required to preserve overall consistency, so we can
+			release the mutex periodically */
+ buf_pool_mutex_enter(buf_pool);
+
+			/* Go through each block in the chunk */
+ for (n_blocks = num_to_process; n_blocks--; block++) {
+ i_s_innodb_buffer_page_get_info(
+ &block->page, pool_id, block_id,
+ info_buffer + num_page);
+ block_id++;
+ num_page++;
+ }
+
+ buf_pool_mutex_exit(buf_pool);
+
+ /* Fill in information schema table with information
+ just collected from the buffer chunk scan */
+ status = i_s_innodb_buffer_page_fill(
+ thd, tables, info_buffer,
+ num_page);
+
+ /* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ chunk_size -= num_to_process;
+ num_page = 0;
+ }
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(status);
+}
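+
+/* Sizing sketch for the batching above: a 1 GiB buffer pool instance
+holds 65536 pages of 16 KiB, so the scan runs in batches of
+MAX_BUF_INFO_CACHED blocks and releases the buffer pool mutex between
+batches; rows from different batches may therefore reflect slightly
+different points in time. */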
+
+/*******************************************************************//**
+Fill page information for pages in the InnoDB buffer pool into the
+dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_fill_table(
+/*==============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_fill_table");
+
+	/* deny access to users without the PROCESS privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch information from pages in this buffer pool,
+ and fill the corresponding I_S table */
+ status = i_s_innodb_fill_buffer_pool(thd, tables, buf_pool, i);
+
+		/* If something goes wrong, break and return */
+ if (status) {
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE.
+@return 0 on success */
+static
+int
+i_s_innodb_buffer_page_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = i_s_innodb_buffer_page_fields_info;
+ schema->fill_table = i_s_innodb_buffer_page_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_PAGE"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Buffer Page Information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_page_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] =
+{
+#define IDX_BUF_LRU_POOL_ID 0
+ {STRUCT_FLD(field_name, "POOL_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_POS 1
+ {STRUCT_FLD(field_name, "LRU_POSITION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_SPACE 2
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NUM 3
+ {STRUCT_FLD(field_name, "PAGE_NUMBER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_TYPE 4
+ {STRUCT_FLD(field_name, "PAGE_TYPE"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5
+ {STRUCT_FLD(field_name, "FLUSH_TYPE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FIX_COUNT 6
+ {STRUCT_FLD(field_name, "FIX_COUNT"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_HASHED 7
+ {STRUCT_FLD(field_name, "IS_HASHED"),
+ STRUCT_FLD(field_length, 3),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NEWEST_MOD 8
+ {STRUCT_FLD(field_name, "NEWEST_MODIFICATION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_OLDEST_MOD 9
+ {STRUCT_FLD(field_name, "OLDEST_MODIFICATION"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_ACCESS_TIME 10
+ {STRUCT_FLD(field_name, "ACCESS_TIME"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_TABLE_NAME 11
+ {STRUCT_FLD(field_name, "TABLE_NAME"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_INDEX_NAME 12
+ {STRUCT_FLD(field_name, "INDEX_NAME"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_NUM_RECS 13
+ {STRUCT_FLD(field_name, "NUMBER_RECORDS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_DATA_SIZE 14
+ {STRUCT_FLD(field_name, "DATA_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_ZIP_SIZE 15
+ {STRUCT_FLD(field_name, "COMPRESSED_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_STATE 16
+ {STRUCT_FLD(field_name, "COMPRESSED"),
+ STRUCT_FLD(field_length, 3),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_IO_FIX 17
+ {STRUCT_FLD(field_name, "IO_FIX"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_IS_OLD 18
+ {STRUCT_FLD(field_name, "IS_OLD"),
+ STRUCT_FLD(field_length, 3),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BUF_LRU_PAGE_FREE_CLOCK 19
+ {STRUCT_FLD(field_name, "FREE_PAGE_CLOCK"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
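+
+/* Illustrative usage, assuming the INNODB_BUFFER_PAGE_LRU plugin defined
+below is loaded; the column names are the ones declared above:
+
+	SELECT POOL_ID, LRU_POSITION, TABLE_NAME, NUMBER_RECORDS
+	FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU
+	WHERE COMPRESSED = 'YES';
+*/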
+
+/*******************************************************************//**
+Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information
+cached in the buf_page_info_t array
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+	const buf_page_info_t*	info_array,	/*!< in: array of cached
+						page info */
+	ulint			num_page)	/*!< in: number of cached
+						page info entries */
+{
+ TABLE* table;
+ Field** fields;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("i_s_innodb_buf_page_lru_fill");
+
+ table = tables->table;
+
+ fields = table->field;
+
+ heap = mem_heap_create(1000);
+
+ /* Iterate through the cached array and fill the I_S table rows */
+ for (ulint i = 0; i < num_page; i++) {
+ const buf_page_info_t* page_info;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+ const char* table_name_end = NULL;
+ const char* state_str;
+ enum buf_page_state state;
+
+ state_str = NULL;
+
+ page_info = info_array + i;
+
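+		/* Each OK() wrapper below (see the OK macro defined
+		earlier in this file) returns 1 from this function via
+		DBUG_RETURN() if the underlying Field::store() fails. */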
+ OK(fields[IDX_BUF_LRU_POOL_ID]->store(
+ static_cast<double>(page_info->pool_id)));
+
+ OK(fields[IDX_BUF_LRU_POS]->store(
+ static_cast<double>(page_info->block_id)));
+
+ OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store(
+ static_cast<double>(page_info->space_id)));
+
+ OK(fields[IDX_BUF_LRU_PAGE_NUM]->store(
+ static_cast<double>(page_info->page_num)));
+
+ OK(field_store_string(
+ fields[IDX_BUF_LRU_PAGE_TYPE],
+ i_s_page_type[page_info->page_type].type_str));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(
+ static_cast<double>(page_info->flush_type)));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store(
+ static_cast<double>(page_info->fix_count)));
+
+ if (page_info->hashed) {
+ OK(field_store_string(
+ fields[IDX_BUF_LRU_PAGE_HASHED], "YES"));
+ } else {
+ OK(field_store_string(
+ fields[IDX_BUF_LRU_PAGE_HASHED], "NO"));
+ }
+
+ OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store(
+ page_info->newest_mod, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store(
+ page_info->oldest_mod, true));
+
+ OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store(
+ page_info->access_time));
+
+ fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null();
+
+ fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null();
+
+ /* If this is an index page, fetch the index name
+ and table name */
+ if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
+ const dict_index_t* index;
+
+ mutex_enter(&dict_sys->mutex);
+ index = dict_index_get_if_in_cache_low(
+ page_info->index_id);
+
+ if (index) {
+
+ table_name_end = innobase_convert_name(
+ table_name, sizeof(table_name),
+ index->table_name,
+ strlen(index->table_name),
+ thd, TRUE);
+
+ OK(fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->store(
+ table_name,
+ static_cast<uint>(table_name_end - table_name),
+ system_charset_info));
+ fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_notnull();
+
+ OK(field_store_index_name(
+ fields[IDX_BUF_LRU_PAGE_INDEX_NAME],
+ index->name));
+ }
+
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store(
+ page_info->num_recs));
+
+ OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store(
+ page_info->data_size));
+
+ OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store(
+ page_info->zip_ssize ?
+ 512 << page_info->zip_ssize : 0));
+
+ state = static_cast<enum buf_page_state>(page_info->page_state);
+
+ switch (state) {
+ /* Compressed page */
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ state_str = "YES";
+ break;
+ /* Uncompressed page */
+ case BUF_BLOCK_FILE_PAGE:
+ state_str = "NO";
+ break;
+		/* We should not see the following states */
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ state_str = NULL;
+ break;
+ };
+
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_STATE],
+ state_str));
+
+ switch (page_info->io_fix) {
+ case BUF_IO_NONE:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_NONE"));
+ break;
+ case BUF_IO_READ:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_READ"));
+ break;
+ case BUF_IO_WRITE:
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IO_FIX],
+ "IO_WRITE"));
+ break;
+ }
+
+ OK(field_store_string(fields[IDX_BUF_LRU_PAGE_IS_OLD],
+ (page_info->is_old) ? "YES" : "NO"));
+
+ OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store(
+ page_info->freed_page_clock));
+
+ if (schema_table_store_record(thd, table)) {
+ mem_heap_free(heap);
+ DBUG_RETURN(1);
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+This is the function that goes through the buffer pool's LRU list
+and fetches information for INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_fill_buffer_lru(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ buf_pool_t* buf_pool, /*!< in: buffer pool to scan */
+ const ulint pool_id) /*!< in: buffer pool id */
+{
+ int status = 0;
+ buf_page_info_t* info_buffer;
+ ulint lru_pos = 0;
+ const buf_page_t* bpage;
+ ulint lru_len;
+
+ DBUG_ENTER("i_s_innodb_fill_buffer_lru");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Obtain the buf_pool mutex before allocating info_buffer, since
+	UT_LIST_GET_LEN(buf_pool->LRU) could change otherwise */
+ buf_pool_mutex_enter(buf_pool);
+
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+	/* MY_WME makes my_malloc() print an error message if it fails */
+ info_buffer = (buf_page_info_t*) my_malloc(
+ lru_len * sizeof *info_buffer, MYF(MY_WME));
+
+ if (!info_buffer) {
+ status = 1;
+ goto exit;
+ }
+
+ memset(info_buffer, 0, lru_len * sizeof *info_buffer);
+
+	/* Walk through the buffer pool's LRU list and collect the
+	buffer page information */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+		/* Use the same function that collects buffer page info
+		for INNODB_BUFFER_PAGE */
+ i_s_innodb_buffer_page_get_info(bpage, pool_id, lru_pos,
+ (info_buffer + lru_pos));
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ lru_pos++;
+ }
+
+ ut_ad(lru_pos == lru_len);
+ ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool->LRU));
+
+exit:
+ buf_pool_mutex_exit(buf_pool);
+
+ if (info_buffer) {
+ status = i_s_innodb_buf_page_lru_fill(
+ thd, tables, info_buffer, lru_len);
+
+ my_free(info_buffer);
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill page information for pages in the InnoDB buffer pool into the
+dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buf_page_lru_fill_table(
+/*===============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (ignored) */
+{
+ int status = 0;
+
+ DBUG_ENTER("i_s_innodb_buf_page_lru_fill_table");
+
+	/* Deny access to any user that does not hold PROCESS_ACL */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ /* Walk through each buffer pool */
+ for (ulint i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+
+ /* Fetch information from pages in this buffer pool's LRU list,
+ and fill the corresponding I_S table */
+ status = i_s_innodb_fill_buffer_lru(thd, tables, buf_pool, i);
+
+		/* If something went wrong, break out and return the error */
+ if (status) {
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_innodb_buffer_page_lru_init(
+/*============================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("i_s_innodb_buffer_page_lru_init");
+
+ schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p);
+
+ schema->fields_info = i_s_innodb_buf_page_lru_fields_info;
+ schema->fill_table = i_s_innodb_buf_page_lru_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_page_lru =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_PAGE_LRU"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB Buffer Page in LRU"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_page_lru_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_common_deinit");
+
+ /* Do nothing */
+
+ DBUG_RETURN(0);
+}
+
+/** SYS_TABLES ***************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */
+static ST_FIELD_INFO innodb_sys_tables_fields_info[] =
+{
+#define SYS_TABLES_ID 0
+ {STRUCT_FLD(field_name, "TABLE_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_FLAG 2
+ {STRUCT_FLD(field_name, "FLAG"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_NUM_COLUMN 3
+ {STRUCT_FLD(field_name, "N_COLS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_SPACE 4
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_FILE_FORMAT 5
+ {STRUCT_FLD(field_name, "FILE_FORMAT"),
+ STRUCT_FLD(field_length, 10),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_ROW_FORMAT 6
+ {STRUCT_FLD(field_name, "ROW_FORMAT"),
+ STRUCT_FLD(field_length, 12),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLES_ZIP_PAGE_SIZE 7
+ {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
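+
+/* Illustrative query, assuming the INNODB_SYS_TABLES plugin defined
+below is loaded; the ROW_FORMAT values come from
+i_s_dict_fill_sys_tables():
+
+	SELECT NAME, ROW_FORMAT, ZIP_PAGE_SIZE
+	FROM INFORMATION_SCHEMA.INNODB_SYS_TABLES
+	WHERE ROW_FORMAT = 'Compressed';
+*/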
+
+/**********************************************************************//**
+Populate information_schema.innodb_sys_tables table with information
+from SYS_TABLES.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tables(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ dict_table_t* table, /*!< in: table */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint compact = DICT_TF_GET_COMPACT(table->flags);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table->flags);
+ ulint zip_size = dict_tf_get_zip_size(table->flags);
+ const char* file_format;
+ const char* row_format;
+
+ file_format = trx_sys_file_format_id_to_name(atomic_blobs);
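+
+	/* Decode ROW_FORMAT from the table flags: no COMPACT flag means
+	Redundant; COMPACT without atomic BLOBs means Compact; atomic
+	BLOBs plus a compressed page size means Compressed; atomic BLOBs
+	alone means Dynamic. */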
+ if (!compact) {
+ row_format = "Redundant";
+ } else if (!atomic_blobs) {
+ row_format = "Compact";
+	} else if (DICT_TF_GET_ZIP_SSIZE(table->flags)) {
+ row_format = "Compressed";
+ } else {
+ row_format = "Dynamic";
+ }
+
+ DBUG_ENTER("i_s_dict_fill_sys_tables");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE));
+
+ OK(field_store_string(fields[SYS_TABLES_NAME], table->name));
+
+ OK(fields[SYS_TABLES_FLAG]->store(table->flags));
+
+ OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols));
+
+ OK(fields[SYS_TABLES_SPACE]->store(table->space));
+
+ OK(field_store_string(fields[SYS_TABLES_FILE_FORMAT], file_format));
+
+ OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format));
+
+ OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(
+ static_cast<double>(zip_size)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_TABLES table, and fill the
+information_schema.innodb_sys_tables table with related table information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tables_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&(dict_sys->mutex));
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
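+
+	/* Scan pattern shared by the SYS_* fill functions in this file:
+	each iteration decodes the current record while dict_sys->mutex
+	is held and a mini-transaction is active, releases both before
+	storing the row into the I_S table, then re-acquires them to
+	fetch the next record. */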
+
+ while (rec) {
+ const char* err_msg;
+ dict_table_t* table_rec;
+
+ /* Create and populate a dict_table_t structure with
+ information from SYS_TABLES row */
+ err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table_rec,
+ DICT_TABLE_LOAD_FROM_RECORD, &mtr);
+
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tables(thd, table_rec, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+		/* Since dict_process_sys_tables_rec_and_mtr_commit()
+		is called with DICT_TABLE_LOAD_FROM_RECORD, table_rec
+		was allocated in dict_process_sys_tables_rec(), so we
+		need to free it here */
+ if (table_rec) {
+ dict_mem_table_free(table_rec);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables
+@return 0 on success */
+static
+int
+innodb_sys_tables_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tables_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_tables_fields_info;
+ schema->fill_table = i_s_sys_tables_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tables_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_TABLESTATS ***********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */
+static ST_FIELD_INFO innodb_sys_tablestats_fields_info[] =
+{
+#define SYS_TABLESTATS_ID 0
+ {STRUCT_FLD(field_name, "TABLE_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_INIT 2
+ {STRUCT_FLD(field_name, "STATS_INITIALIZED"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_NROW 3
+ {STRUCT_FLD(field_name, "NUM_ROWS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_CLUST_SIZE 4
+ {STRUCT_FLD(field_name, "CLUST_INDEX_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_INDEX_SIZE 5
+ {STRUCT_FLD(field_name, "OTHER_INDEX_SIZE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_MODIFIED 6
+ {STRUCT_FLD(field_name, "MODIFIED_COUNTER"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_AUTONINC 7
+ {STRUCT_FLD(field_name, "AUTOINC"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESTATS_TABLE_REF_COUNT 8
+ {STRUCT_FLD(field_name, "REF_COUNT"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
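+
+/* Illustrative query, assuming the INNODB_SYS_TABLESTATS plugin defined
+below is loaded; the STATS_INITIALIZED values are set in
+i_s_dict_fill_sys_tablestats():
+
+	SELECT NAME, NUM_ROWS, MODIFIED_COUNTER
+	FROM INFORMATION_SCHEMA.INNODB_SYS_TABLESTATS
+	WHERE STATS_INITIALIZED = 'Initialized';
+*/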
+
+/**********************************************************************//**
+Populate information_schema.innodb_sys_tablestats table with information
+from SYS_TABLES.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tablestats(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ dict_table_t* table, /*!< in: table */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_tablestats");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
+
+ OK(field_store_string(fields[SYS_TABLESTATS_NAME], table->name));
+
+ dict_table_stats_lock(table, RW_S_LATCH);
+
+ if (table->stat_initialized) {
+ OK(field_store_string(fields[SYS_TABLESTATS_INIT],
+ "Initialized"));
+
+ OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows,
+ TRUE));
+
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
+ static_cast<double>(table->stat_clustered_index_size)));
+
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
+ static_cast<double>(table->stat_sum_of_other_index_sizes)));
+
+ OK(fields[SYS_TABLESTATS_MODIFIED]->store(
+ static_cast<double>(table->stat_modified_counter)));
+ } else {
+ OK(field_store_string(fields[SYS_TABLESTATS_INIT],
+ "Uninitialized"));
+
+ OK(fields[SYS_TABLESTATS_NROW]->store(0, TRUE));
+
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0));
+
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0));
+
+ OK(fields[SYS_TABLESTATS_MODIFIED]->store(0));
+ }
+
+ dict_table_stats_unlock(table, RW_S_LATCH);
+
+ OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, TRUE));
+
+ OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store(
+ static_cast<double>(table->n_ref_count)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to go through each record in the SYS_TABLES table, and fill the
+information_schema.innodb_sys_tablestats table with table statistics
+information
+@return 0 on success */
+static
+int
+i_s_sys_tables_fill_table_stats(
+/*============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tables_fill_table_stats");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+
+ while (rec) {
+ const char* err_msg;
+ dict_table_t* table_rec;
+
+ /* Fetch the dict_table_t structure corresponding to
+ this SYS_TABLES record */
+ err_msg = dict_process_sys_tables_rec_and_mtr_commit(
+ heap, rec, &table_rec,
+ DICT_TABLE_LOAD_FROM_CACHE, &mtr);
+
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tablestats(thd, table_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats
+@return 0 on success */
+static
+int
+innodb_sys_tablestats_init(
+/*=======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tablestats_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_tablestats_fields_info;
+ schema->fill_table = i_s_sys_tables_fill_table_stats;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablestats =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLESTATS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLESTATS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tablestats_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_INDEXES **************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */
+static ST_FIELD_INFO innodb_sysindex_fields_info[] =
+{
+#define SYS_INDEX_ID 0
+ {STRUCT_FLD(field_name, "INDEX_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_TABLE_ID 2
+ {STRUCT_FLD(field_name, "TABLE_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_TYPE 3
+ {STRUCT_FLD(field_name, "TYPE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_NUM_FIELDS 4
+ {STRUCT_FLD(field_name, "N_FIELDS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_PAGE_NO 5
+ {STRUCT_FLD(field_name, "PAGE_NO"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_INDEX_SPACE 6
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
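+
+/* Illustrative query, assuming the INNODB_SYS_INDEXES and
+INNODB_SYS_TABLES plugins are loaded; TABLE_ID joins the two tables:
+
+	SELECT t.NAME AS TABLE_NAME, i.NAME AS INDEX_NAME, i.N_FIELDS
+	FROM INFORMATION_SCHEMA.INNODB_SYS_INDEXES i
+	JOIN INFORMATION_SCHEMA.INNODB_SYS_TABLES t USING (TABLE_ID);
+*/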
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_indexes table with
+collected index information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_indexes(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ table_id_t table_id, /*!< in: table id */
+ dict_index_t* index, /*!< in: populated dict_index_t
+ struct with index info */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_indexes");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_index_name(fields[SYS_INDEX_NAME], index->name));
+
+ OK(fields[SYS_INDEX_ID]->store(longlong(index->id), TRUE));
+
+ OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), TRUE));
+
+ OK(fields[SYS_INDEX_TYPE]->store(index->type));
+
+ OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
+
+ /* FIL_NULL is ULINT32_UNDEFINED */
+ if (index->page == FIL_NULL) {
+ OK(fields[SYS_INDEX_PAGE_NO]->store(-1));
+ } else {
+ OK(fields[SYS_INDEX_PAGE_NO]->store(index->page));
+ }
+
+ OK(fields[SYS_INDEX_SPACE]->store(index->space));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_INDEXES table, and fill the
+information_schema.innodb_sys_indexes table with related index information
+@return 0 on success */
+static
+int
+i_s_sys_indexes_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_indexes_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+	/* Start scanning the SYS_INDEXES table */
+ rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
+
+ /* Process each record in the table */
+ while (rec) {
+ const char* err_msg;
+ table_id_t table_id;
+ dict_index_t index_rec;
+
+ /* Populate a dict_index_t structure with information from
+ a SYS_INDEXES row */
+ err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec,
+ &table_id);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_indexes(thd, table_id, &index_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes
+@return 0 on success */
+static
+int
+innodb_sys_indexes_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_indexes_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sysindex_fields_info;
+ schema->fill_table = i_s_sys_indexes_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_INDEXES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_indexes_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_COLUMNS **************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */
+static ST_FIELD_INFO innodb_sys_columns_fields_info[] =
+{
+#define SYS_COLUMN_TABLE_ID 0
+ {STRUCT_FLD(field_name, "TABLE_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_POSITION 2
+ {STRUCT_FLD(field_name, "POS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_MTYPE 3
+ {STRUCT_FLD(field_name, "MTYPE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN__PRTYPE 4
+ {STRUCT_FLD(field_name, "PRTYPE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_COLUMN_COLUMN_LEN 5
+ {STRUCT_FLD(field_name, "LEN"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
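+
+/* Illustrative query, assuming the plugins are loaded; 'test/t1' is a
+hypothetical InnoDB table name in its db/table form:
+
+	SELECT c.NAME, c.POS, c.MTYPE, c.PRTYPE, c.LEN
+	FROM INFORMATION_SCHEMA.INNODB_SYS_COLUMNS c
+	JOIN INFORMATION_SCHEMA.INNODB_SYS_TABLES t USING (TABLE_ID)
+	WHERE t.NAME = 'test/t1';
+*/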
+
+/**********************************************************************//**
+Function to populate the information_schema.innodb_sys_columns with
+related column information
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_columns(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ table_id_t table_id, /*!< in: table ID */
+ const char* col_name, /*!< in: column name */
+ dict_col_t* column, /*!< in: dict_col_t struct holding
+ more column information */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_columns");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_COLUMN_TABLE_ID]->store(longlong(table_id), TRUE));
+
+ OK(field_store_string(fields[SYS_COLUMN_NAME], col_name));
+
+ OK(fields[SYS_COLUMN_POSITION]->store(column->ind));
+
+ OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype));
+
+ OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype));
+
+ OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to fill information_schema.innodb_sys_columns with information
+collected by scanning the SYS_COLUMNS table.
+@return 0 on success */
+static
+int
+i_s_sys_columns_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const char* col_name;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_columns_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS);
+
+ while (rec) {
+ const char* err_msg;
+ dict_col_t column_rec;
+ table_id_t table_id;
+
+		/* Populate a dict_col_t structure with information from
+ a SYS_COLUMNS row */
+ err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec,
+ &table_id, &col_name);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_columns(thd, table_id, col_name,
+ &column_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns
+@return 0 on success */
+static
+int
+innodb_sys_columns_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_columns_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_columns_fields_info;
+ schema->fill_table = i_s_sys_columns_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_columns =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_COLUMNS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_COLUMNS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_columns_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_FIELDS ***************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */
+static ST_FIELD_INFO innodb_sys_fields_fields_info[] =
+{
+#define SYS_FIELD_INDEX_ID 0
+ {STRUCT_FLD(field_name, "INDEX_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FIELD_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FIELD_POS 2
+ {STRUCT_FLD(field_name, "POS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
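+
+/* Illustrative query, assuming the INNODB_SYS_FIELDS and
+INNODB_SYS_INDEXES plugins are loaded; INDEX_ID joins the two tables:
+
+	SELECT i.NAME AS INDEX_NAME, f.NAME AS FIELD_NAME, f.POS
+	FROM INFORMATION_SCHEMA.INNODB_SYS_FIELDS f
+	JOIN INFORMATION_SCHEMA.INNODB_SYS_INDEXES i USING (INDEX_ID);
+*/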
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_fields with information
+collected by scanning the SYS_FIELDS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_fields(
+/*=====================*/
+ THD* thd, /*!< in: thread */
+ index_id_t index_id, /*!< in: index id for the field */
+	dict_field_t*	field,		/*!< in: index field */
+	ulint		pos,		/*!< in: field position */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_fields");
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_FIELD_INDEX_ID]->store(longlong(index_id), TRUE));
+
+ OK(field_store_string(fields[SYS_FIELD_NAME], field->name));
+
+ OK(fields[SYS_FIELD_POS]->store(static_cast<double>(pos)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_FIELDS table, and fill the
+information_schema.innodb_sys_fields table with related index field
+information
+@return 0 on success */
+static
+int
+i_s_sys_fields_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ index_id_t last_id;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_fields_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+	/* Save the last index id so that we know when we move on to
+	the next index; this is used to calculate the prefix length */
+ last_id = 0;
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FIELDS);
+
+ while (rec) {
+ ulint pos;
+ const char* err_msg;
+ index_id_t index_id;
+ dict_field_t field_rec;
+
+ /* Populate a dict_field_t structure with information from
+ a SYS_FIELDS row */
+ err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec,
+ &pos, &index_id, last_id);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
+ pos, tables->table);
+ last_id = index_id;
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields
+@return 0 on success */
+static
+int
+innodb_sys_fields_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+	DBUG_ENTER("innodb_sys_fields_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_fields_fields_info;
+ schema->fill_table = i_s_sys_fields_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_fields =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FIELDS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FIELDS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_fields_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_FOREIGN ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */
+static ST_FIELD_INFO innodb_sys_foreign_fields_info[] =
+{
+#define SYS_FOREIGN_ID 0
+ {STRUCT_FLD(field_name, "ID"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_FOR_NAME 1
+ {STRUCT_FLD(field_name, "FOR_NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_REF_NAME 2
+ {STRUCT_FLD(field_name, "REF_NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_NUM_COL 3
+ {STRUCT_FLD(field_name, "N_COLS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_TYPE 4
+ {STRUCT_FLD(field_name, "TYPE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
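+
+/* Illustrative query, assuming the INNODB_SYS_FOREIGN plugin defined
+below is loaded:
+
+	SELECT ID, FOR_NAME, REF_NAME, N_COLS, TYPE
+	FROM INFORMATION_SCHEMA.INNODB_SYS_FOREIGN;
+*/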
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign with information
+collected by scanning the SYS_FOREIGN table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign(
+/*======================*/
+ THD* thd, /*!< in: thread */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_foreign");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id));
+
+ OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME],
+ foreign->foreign_table_name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_REF_NAME],
+ foreign->referenced_table_name));
+
+ OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields));
+
+ OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to populate the INFORMATION_SCHEMA.innodb_sys_foreign table.
+Loop through each record in SYS_FOREIGN and extract the foreign key
+information.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_fill_table(
+/*=======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_foreign_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* Deny access to users without the PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN);
+
+ while (rec) {
+ const char* err_msg;
+ dict_foreign_t foreign_rec;
+
+ /* Populate a dict_foreign_t structure with information from
+ a SYS_FOREIGN row */
+ err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_foreign(thd, &foreign_rec,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign
+@return 0 on success */
+static
+int
+innodb_sys_foreign_init(
+/*====================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_foreign_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_foreign_fields_info;
+ schema->fill_table = i_s_sys_foreign_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FOREIGN"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FOREIGN"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_foreign_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_FOREIGN_COLS ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */
+static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[] =
+{
+#define SYS_FOREIGN_COL_ID 0
+ {STRUCT_FLD(field_name, "ID"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_FOR_NAME 1
+ {STRUCT_FLD(field_name, "FOR_COL_NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_REF_NAME 2
+ {STRUCT_FLD(field_name, "REF_COL_NAME"),
+ STRUCT_FLD(field_length, NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_FOREIGN_COL_POS 3
+ {STRUCT_FLD(field_name, "POS"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill information_schema.innodb_sys_foreign_cols with information
+collected by scanning SYS_FOREIGN_COLS table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_foreign_cols(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ const char* name, /*!< in: foreign key constraint name */
+	const char*	for_col_name,	/*!< in: referencing column name */
+ const char* ref_col_name, /*!< in: referenced column
+ name */
+ ulint pos, /*!< in: column position */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_foreign_cols");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name));
+
+ OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name));
+
+ OK(fields[SYS_FOREIGN_COL_POS]->store(static_cast<double>(pos)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop
+through each record in SYS_FOREIGN_COLS, and extract the foreign key column
+information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table.
+@return 0 on success */
+static
+int
+i_s_sys_foreign_cols_fill_table(
+/*============================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_foreign_cols_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS);
+
+ while (rec) {
+ const char* err_msg;
+ const char* name;
+ const char* for_col_name;
+ const char* ref_col_name;
+ ulint pos;
+
+ /* Extract necessary information from a SYS_FOREIGN_COLS row */
+ err_msg = dict_process_sys_foreign_col_rec(
+ heap, rec, &name, &for_col_name, &ref_col_name, &pos);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_foreign_cols(
+ thd, name, for_col_name, ref_col_name, pos,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols
+@return 0 on success */
+static
+int
+innodb_sys_foreign_cols_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_foreign_cols_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_foreign_cols_fields_info;
+ schema->fill_table = i_s_sys_foreign_cols_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_foreign_cols =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_FOREIGN_COLS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_FOREIGN_COLS"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_foreign_cols_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_TABLESPACES ********************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */
+static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[] =
+{
+#define SYS_TABLESPACES_SPACE 0
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_NAME 1
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, MAX_FULL_NAME_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_FLAGS 2
+ {STRUCT_FLD(field_name, "FLAG"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_FILE_FORMAT 3
+ {STRUCT_FLD(field_name, "FILE_FORMAT"),
+ STRUCT_FLD(field_length, 10),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_ROW_FORMAT 4
+ {STRUCT_FLD(field_name, "ROW_FORMAT"),
+ STRUCT_FLD(field_length, 22),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_PAGE_SIZE 5
+ {STRUCT_FLD(field_name, "PAGE_SIZE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_TABLESPACES_ZIP_PAGE_SIZE 6
+ {STRUCT_FLD(field_name, "ZIP_PAGE_SIZE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information
+collected by scanning the SYS_TABLESPACES table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_tablespaces(
+/*==========================*/
+ THD* thd, /*!< in: thread */
+ ulint space, /*!< in: space ID */
+ const char* name, /*!< in: tablespace name */
+ ulint flags, /*!< in: tablespace flags */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+	ulint	page_size = fsp_flags_get_page_size(flags);
+ ulint zip_size = fsp_flags_get_zip_size(flags);
+ const char* file_format;
+ const char* row_format;
+
+ DBUG_ENTER("i_s_dict_fill_sys_tablespaces");
+
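+	/* atomic_blobs is 0 or 1 here; interpreted as a file format id
+	it maps to "Antelope" or "Barracuda" respectively. */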
+ file_format = trx_sys_file_format_id_to_name(atomic_blobs);
+ if (!atomic_blobs) {
+ row_format = "Compact or Redundant";
+	} else if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ row_format = "Compressed";
+ } else {
+ row_format = "Dynamic";
+ }
+
+ fields = table_to_fill->field;
+
+ OK(fields[SYS_TABLESPACES_SPACE]->store(
+ static_cast<double>(space)));
+
+ OK(field_store_string(fields[SYS_TABLESPACES_NAME], name));
+
+ OK(fields[SYS_TABLESPACES_FLAGS]->store(
+ static_cast<double>(flags)));
+
+ OK(field_store_string(fields[SYS_TABLESPACES_FILE_FORMAT],
+ file_format));
+
+ OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT],
+ row_format));
+
+ OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(
+ static_cast<double>(page_size)));
+
+ OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(
+ static_cast<double>(zip_size)));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+Loop through each record in SYS_TABLESPACES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+@return 0 on success */
+static
+int
+i_s_sys_tablespaces_fill_table(
+/*===========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_tablespaces_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
+
+ while (rec) {
+ const char* err_msg;
+ ulint space;
+ const char* name;
+ ulint flags;
+
+ /* Extract necessary information from a SYS_TABLESPACES row */
+ err_msg = dict_process_sys_tablespaces(
+ heap, rec, &space, &name, &flags);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_tablespaces(
+ thd, space, name, flags,
+ tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES
+@return 0 on success */
+static
+int
+innodb_sys_tablespaces_init(
+/*========================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_tablespaces_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_tablespaces_fields_info;
+ schema->fill_table = i_s_sys_tablespaces_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tablespaces =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_TABLESPACES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_TABLESPACES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_tablespaces_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
+
+/** SYS_DATAFILES ************************************************/
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */
+static ST_FIELD_INFO innodb_sys_datafiles_fields_info[] =
+{
+#define SYS_DATAFILES_SPACE 0
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define SYS_DATAFILES_PATH 1
+ {STRUCT_FLD(field_name, "PATH"),
+ STRUCT_FLD(field_length, OS_FILE_MAX_PATH),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/**********************************************************************//**
+Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
+collected by scanning the SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_datafiles(
+/*========================*/
+ THD* thd, /*!< in: thread */
+ ulint space, /*!< in: space ID */
+ const char* path, /*!< in: absolute path */
+ TABLE* table_to_fill) /*!< in/out: fill this table */
+{
+ Field** fields;
+
+ DBUG_ENTER("i_s_dict_fill_sys_datafiles");
+
+ fields = table_to_fill->field;
+
+ OK(field_store_ulint(fields[SYS_DATAFILES_SPACE], space));
+
+ OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
+
+ OK(schema_table_store_record(thd, table_to_fill));
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+Loop through each record in SYS_DATAFILES, and extract the column
+information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
+@return 0 on success */
+static
+int
+i_s_sys_datafiles_fill_table(
+/*=========================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ Item* ) /*!< in: condition (not used) */
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mem_heap_t* heap;
+ mtr_t mtr;
+
+ DBUG_ENTER("i_s_sys_datafiles_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ /* deny access to user without PROCESS_ACL privilege */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ heap = mem_heap_create(1000);
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+
+ rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES);
+
+ while (rec) {
+ const char* err_msg;
+ ulint space;
+ const char* path;
+
+ /* Extract necessary information from a SYS_DATAFILES row */
+ err_msg = dict_process_sys_datafiles(
+ heap, rec, &space, &path);
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+
+ if (!err_msg) {
+ i_s_dict_fill_sys_datafiles(
+ thd, space, path, tables->table);
+ } else {
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_CANT_FIND_SYSTEM_REC, "%s",
+ err_msg);
+ }
+
+ mem_heap_empty(heap);
+
+ /* Get the next record */
+ mutex_enter(&dict_sys->mutex);
+ mtr_start(&mtr);
+ rec = dict_getnext_system(&pcur, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ mutex_exit(&dict_sys->mutex);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES
+@return 0 on success */
+static
+int
+innodb_sys_datafiles_init(
+/*======================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_sys_datafiles_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_sys_datafiles_fields_info;
+ schema->fill_table = i_s_sys_datafiles_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_datafiles =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_SYS_DATAFILES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB SYS_DATAFILES"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_sys_datafiles_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL),
+
+ /* Plugin flags */
+ /* unsigned long */
+ STRUCT_FLD(flags, 0UL),
+};
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
new file mode 100644
index 00000000000..9c8849345f0
--- /dev/null
+++ b/storage/innobase/handler/i_s.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.h
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef i_s_h
+#define i_s_h
+
+const char plugin_author[] = "Oracle Corporation";
+
+extern struct st_mysql_plugin i_s_innodb_trx;
+extern struct st_mysql_plugin i_s_innodb_locks;
+extern struct st_mysql_plugin i_s_innodb_lock_waits;
+extern struct st_mysql_plugin i_s_innodb_cmp;
+extern struct st_mysql_plugin i_s_innodb_cmp_reset;
+extern struct st_mysql_plugin i_s_innodb_cmp_per_index;
+extern struct st_mysql_plugin i_s_innodb_cmp_per_index_reset;
+extern struct st_mysql_plugin i_s_innodb_cmpmem;
+extern struct st_mysql_plugin i_s_innodb_cmpmem_reset;
+extern struct st_mysql_plugin i_s_innodb_metrics;
+extern struct st_mysql_plugin i_s_innodb_ft_default_stopword;
+extern struct st_mysql_plugin i_s_innodb_ft_deleted;
+extern struct st_mysql_plugin i_s_innodb_ft_being_deleted;
+extern struct st_mysql_plugin i_s_innodb_ft_index_cache;
+extern struct st_mysql_plugin i_s_innodb_ft_index_table;
+extern struct st_mysql_plugin i_s_innodb_ft_config;
+extern struct st_mysql_plugin i_s_innodb_buffer_page;
+extern struct st_mysql_plugin i_s_innodb_buffer_page_lru;
+extern struct st_mysql_plugin i_s_innodb_buffer_stats;
+extern struct st_mysql_plugin i_s_innodb_sys_tables;
+extern struct st_mysql_plugin i_s_innodb_sys_tablestats;
+extern struct st_mysql_plugin i_s_innodb_sys_indexes;
+extern struct st_mysql_plugin i_s_innodb_sys_columns;
+extern struct st_mysql_plugin i_s_innodb_sys_fields;
+extern struct st_mysql_plugin i_s_innodb_sys_foreign;
+extern struct st_mysql_plugin i_s_innodb_sys_foreign_cols;
+extern struct st_mysql_plugin i_s_innodb_sys_tablespaces;
+extern struct st_mysql_plugin i_s_innodb_sys_datafiles;
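+
+/* Each of these descriptors is registered with the server as part of
+the handler's plugin declaration (see ha_innodb.cc); that registration
+is what makes the corresponding INFORMATION_SCHEMA tables visible. */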
+
+#endif /* i_s_h */
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
new file mode 100644
index 00000000000..caf1f1a864b
--- /dev/null
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -0,0 +1,5223 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+UNIV_INTERN my_bool srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE must be an even number!"
+#endif
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#ifdef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "row0upd.h"
+#include "sync0sync.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "ha_prototypes.h"
+#include "rem0cmp.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+ field. We store the information which affects the ordering of records, and
+ also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+ is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990s, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+ the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+ store the charset. In the compressed table format of 5.0.x we must add more
+ information here so that we can build a dummy 'index' struct which 5.0.x
+ can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+3 mandatory fields, totaling 4 bytes:
+
+ 1. 2 bytes: Counter field, used to sort records within a (space id, page
+ no) in the order they were added. This is needed so that for example the
+ sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+ correctly.
+
+ 2. 1 byte: Operation type (see ibuf_op_t).
+
+ 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
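+
+/* A minimal sketch of decoding the 4 mandatory metadata bytes at the
+start of the fourth field of a >= 5.5 format record; the function name
+is for illustration only and "metadata" is assumed to point at that
+field. */
+#if 0
+static void
+ibuf_rec_metadata_sketch(const byte* metadata)
+{
+	ulint	counter	= mach_read_from_2(metadata);	  /* bytes 0-1 */
+	ulint	op	= mach_read_from_1(metadata + 2); /* ibuf_op_t */
+	ulint	flags	= mach_read_from_1(metadata + 3);
+	ibool	compact	= (flags & IBUF_REC_COMPACT) != 0;
+}
+#endif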
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
+i/o-handlers for other tablespaces may require access to the insert buffer,
+but because all kinds of latches they need to access there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list
+is more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list. We can then, in a
+separate mini-transaction, take them out of the free list and free them to
+the fsp system.
+
+To avoid deadlocks in the ibuf system, we divide file pages into three levels:
+
+(1) non-ibuf pages,
+(2) ibuf tree pages and the pages in the ibuf tree free list, and
+(3) ibuf bitmap pages.
+
+No OS thread is allowed to access higher level pages if it has latches to
+lower level pages; even if the thread owns a B-tree latch it must not access
+the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
+is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
+exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
+level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
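+
+/* In the code below, ibuf_fixed_addr_page() and ibuf_page_low()
+implement the tests for whether a page is one of these ibuf pages;
+the read-ahead code relies on such predicates to keep ibuf and
+non-ibuf pages in separate batches, as required above. */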
+
+/** Table name for the insert buffer. */
+#define IBUF_TABLE_NAME "SYS_IBUF_TABLE"
+
+/** Operations that can currently be buffered. */
+UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_ALL;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Flag to control insert buffer debugging. */
+UNIV_INTERN uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** The insert buffer control structure */
+UNIV_INTERN ibuf_t* ibuf = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+UNIV_INTERN mysql_pfs_key_t ibuf_mutex_key;
+UNIV_INTERN mysql_pfs_key_t ibuf_bitmap_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/** Number of tablespaces in the ibuf_counts array */
+#define IBUF_COUNT_N_SPACES 4
+/** Number of pages within each tablespace in the ibuf_counts array */
+#define IBUF_COUNT_N_PAGES 130000
+
+/** Buffered entry counts for file pages, used in debugging */
+static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES];
+
+/******************************************************************//**
+Checks that the indexes to ibuf_counts[][] are within limits. */
+UNIV_INLINE
+void
+ibuf_count_check(
+/*=============*/
+ ulint space_id, /*!< in: space identifier */
+ ulint page_no) /*!< in: page number */
+{
+ if (space_id < IBUF_COUNT_N_SPACES && page_no < IBUF_COUNT_N_PAGES) {
+ return;
+ }
+
+ fprintf(stderr,
+ "InnoDB: UNIV_IBUF_COUNT_DEBUG limits space_id and page_no\n"
+ "InnoDB: and breaks crash recovery.\n"
+ "InnoDB: space_id=%lu, should be 0<=space_id<%lu\n"
+ "InnoDB: page_no=%lu, should be 0<=page_no<%lu\n",
+ (ulint) space_id, (ulint) IBUF_COUNT_N_SPACES,
+ (ulint) page_no, (ulint) IBUF_COUNT_N_PAGES);
+ ut_error;
+}
+#endif
+
+/** @name Offsets to the per-page bits in the insert buffer bitmap */
+/* @{ */
+#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
+ amount of free space */
+#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
+ changes for the page */
+#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
+ the ibuf tree, excluding the
+ root page, or is in the free
+ list of the ibuf */
+/* @} */
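+
+/* Example: for an uncompressed page, the bits describing page number
+page_no begin at bit (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE
+of the bitmap; within that group, bits 0-1 hold the 2-bit
+IBUF_BITMAP_FREE value, bit 2 IBUF_BITMAP_BUFFERED and bit 3
+IBUF_BITMAP_IBUF. */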
+
+#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format,
+ the page number. later, the space_id */
+#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker
+ consisting of 1 byte that is 0 */
+#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the
+ page number */
+#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */
+#define IBUF_REC_FIELD_USER 4 /* first user field */
+
+/* Various constants for checking the type of an ibuf record and extracting
+data from it. For details, see the description of the record format at the
+top of this file. */
+
+/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record
+The fourth column in the MySQL 5.5 format contains an operation
+type, counter, and some flags. */
+/* @{ */
+#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
+ the beginning of the fourth field */
+#if IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+# error "IBUF_REC_INFO_SIZE >= DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+/* Offsets for the fields at the beginning of the fourth field */
+#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
+#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
+#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
+
+/* Record flag masks */
+#define IBUF_REC_COMPACT 0x1 /*!< Set in
+ IBUF_REC_OFFSET_FLAGS if the
+ user index is in COMPACT
+ format or later */
+
+
+/** The mutex used to block pessimistic inserts to ibuf trees */
+static ib_mutex_t ibuf_pessimistic_insert_mutex;
+
+/** The mutex protecting the insert buffer structs */
+static ib_mutex_t ibuf_mutex;
+
+/** The mutex protecting the insert buffer bitmaps */
+static ib_mutex_t ibuf_bitmap_mutex;
+
+/** The area in pages from which contract looks for page numbers for merge */
+#define IBUF_MERGE_AREA 8UL
+
+/** Inside the merge area, pages whose buffered entry count is within
+1/IBUF_MERGE_THRESHOLD of the maximum volume that can be buffered for a
+single page are merged along with the page whose buffer became full */
+#define IBUF_MERGE_THRESHOLD 4
+
+/** In ibuf_contract at most this number of pages is read to memory in one
+batch, in order to merge the entries for them in the insert buffer */
+#define IBUF_MAX_N_PAGES_MERGED IBUF_MERGE_AREA
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection to inserts there, using
+non-synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_NON_SYNC 0
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection to inserts there, using
+synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_SYNC 5
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by
+this many pages, we start to contract it using synchronous contract,
+but do not insert */
+#define IBUF_CONTRACT_DO_NOT_INSERT 10
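+
+/* A minimal sketch of how these thresholds could drive the decision
+at insert time (illustration only; the actual policy lives in the
+insert and contract code): */
+#if 0
+	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+		/* contract synchronously and refuse to buffer */
+	} else if (ibuf->size >= ibuf->max_size
+		   + IBUF_CONTRACT_ON_INSERT_SYNC) {
+		/* buffer the entry, then contract synchronously */
+	} else if (ibuf->size >= ibuf->max_size
+		   + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+		/* buffer the entry, then contract non-synchronously */
+	}
+#endif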
+
+/* TODO: how to cope with drop table if there are records in the insert
+buffer for the indexes of the table? Is there actually any problem,
+because ibuf merge is done to a page when it is read in, and it is
+still physically like the index page even if the index would have been
+dropped! So, there seems to be no problem. */
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+inside an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_enter(
+/*=======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(!mtr->inside_ibuf);
+ mtr->inside_ibuf = TRUE;
+}
+
+/******************************************************************//**
+Sets the flag in the current mini-transaction record indicating we're
+exiting an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_exit(
+/*======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->inside_ibuf);
+ mtr->inside_ibuf = FALSE;
+}
+
+/**************************************************************//**
+Commits an insert buffer mini-transaction and sets the persistent
+cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */
+UNIV_INLINE
+void
+ibuf_btr_pcur_commit_specify_mtr(
+/*=============================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_d(ibuf_exit(mtr));
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+}
+
+/******************************************************************//**
+Gets the ibuf header page and x-latches it.
+@return insert buffer header page */
+static
+page_t*
+ibuf_header_page_get(
+/*=================*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ buf_block_t* block;
+
+ ut_ad(!ibuf_inside(mtr));
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Gets the root page and x-latches it.
+@return insert buffer tree root page */
+static
+page_t*
+ibuf_tree_root_get(
+/*===============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* root;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ mtr_x_lock(dict_index_get_lock(ibuf->index), mtr);
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+
+ root = buf_block_get_frame(block);
+
+ ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
+ ut_ad(ibuf->empty == page_is_empty(root));
+
+ return(root);
+}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ ibuf_count_check(space, page_no);
+
+ return(ibuf_counts[space][page_no]);
+}
+
+/******************************************************************//**
+Sets the ibuf count for a given page. */
+static
+void
+ibuf_count_set(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: page number */
+ ulint val) /*!< in: value to set */
+{
+ ibuf_count_check(space, page_no);
+ ut_a(val < UNIV_PAGE_SIZE);
+
+ ibuf_counts[space][page_no] = val;
+}
+#endif
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+UNIV_INTERN
+void
+ibuf_close(void)
+/*============*/
+{
+ mutex_free(&ibuf_pessimistic_insert_mutex);
+ memset(&ibuf_pessimistic_insert_mutex,
+ 0x0, sizeof(ibuf_pessimistic_insert_mutex));
+
+ mutex_free(&ibuf_mutex);
+ memset(&ibuf_mutex, 0x0, sizeof(ibuf_mutex));
+
+ mutex_free(&ibuf_bitmap_mutex);
+	memset(&ibuf_bitmap_mutex, 0x0, sizeof(ibuf_bitmap_mutex));
+
+ mem_free(ibuf);
+ ibuf = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+ const page_t* root, /*!< in: ibuf tree root */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ ibuf->free_list_len = flst_get_len(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, mtr);
+
+ ibuf->height = 1 + btr_page_get_level(root, mtr);
+
+ /* the '1 +' is the ibuf header page */
+ ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
+}
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+ page_t* root;
+ mtr_t mtr;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dict_index_t* index;
+ ulint n_used;
+ page_t* header_page;
+ dberr_t error;
+
+ ibuf = static_cast<ibuf_t*>(mem_zalloc(sizeof(ibuf_t)));
+
+	/* At startup we initialize ibuf to have a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+	buffer pool size. Once the ibuf struct is initialized, this
+	value is updated with the user-supplied size by calling
+	ibuf_max_size_update(). */
+ ibuf->max_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE)
+ * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
+
+ mutex_create(ibuf_pessimistic_insert_mutex_key,
+ &ibuf_pessimistic_insert_mutex,
+ SYNC_IBUF_PESS_INSERT_MUTEX);
+
+ mutex_create(ibuf_mutex_key,
+ &ibuf_mutex, SYNC_IBUF_MUTEX);
+
+ mutex_create(ibuf_bitmap_mutex_key,
+ &ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
+
+ mtr_start(&mtr);
+
+ mutex_enter(&ibuf_mutex);
+
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ &n_used, &mtr);
+ ibuf_enter(&mtr);
+
+ ut_ad(n_used >= 2);
+
+ ibuf->seg_size = n_used;
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ root = buf_block_get_frame(block);
+ }
+
+ ibuf_size_update(root, &mtr);
+ mutex_exit(&ibuf_mutex);
+
+ ibuf->empty = page_is_empty(root);
+ ibuf_mtr_commit(&mtr);
+
+ heap = mem_heap_create(450);
+
+ /* Use old-style record format for the insert buffer. */
+ table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0, 0);
+
+ dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
+
+ table->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
+
+ dict_table_add_to_cache(table, FALSE, heap);
+ mem_heap_free(heap);
+
+ index = dict_mem_index_create(
+ IBUF_TABLE_NAME, "CLUST_IND",
+ IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1);
+
+ dict_mem_index_add_field(index, "DUMMY_COLUMN", 0);
+
+ index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
+
+ error = dict_index_add_to_cache(table, index,
+ FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ ibuf->index = dict_table_get_first_index(table);
+}
+
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+ ulint new_val) /*!< in: new value in terms of
+ percentage of the buffer pool size */
+{
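+	/* Example: with a 128 MiB buffer pool of 16 KiB pages (8192
+	pages) and new_val = 25, the new maximum becomes 2048 pages. */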
+ ulint new_size = ((buf_pool_get_curr_size() / UNIV_PAGE_SIZE)
+ * new_val) / 100;
+ mutex_enter(&ibuf_mutex);
+ ibuf->max_size = new_size;
+ mutex_exit(&ibuf_mutex);
+}
+
+
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ buf_block_t* block, /*!< in: bitmap page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ ulint byte_offset;
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ ut_a(ut_is_2pow(zip_size));
+
+ page = buf_block_get_frame(block);
+ fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP);
+
+ /* Write all zeros to the bitmap */
+
+ if (!zip_size) {
+ byte_offset = UT_BITS_IN_BYTES(UNIV_PAGE_SIZE
+ * IBUF_BITS_PER_PAGE);
+ } else {
+ byte_offset = UT_BITS_IN_BYTES(zip_size * IBUF_BITS_PER_PAGE);
+ }
+
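+	/* Example: with 16 KiB uncompressed pages, this zeroes
+	UT_BITS_IN_BYTES(16384 * 4) = 8192 bytes. */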
+ memset(page + IBUF_BITMAP, 0, byte_offset);
+
+ /* The remaining area (up to the page trailer) is uninitialized. */
+
+#ifndef UNIV_HOTBACKUP
+ mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (block) {
+ ibuf_bitmap_page_init(block, mtr);
+ }
+
+ return(ptr);
+}
+#ifndef UNIV_HOTBACKUP
+# ifdef UNIV_DEBUG
+/** Gets the desired bits for a given page from a bitmap page.
+@param page in: bitmap page
+@param offset in: page whose bits to get
+@param zs in: compressed page size in bytes; 0 for uncompressed pages
+@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param mtr in: mini-transaction holding an x-latch on the bitmap page
+@return value of bits */
+# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \
+ ibuf_bitmap_page_get_bits_low(page, offset, zs, \
+ MTR_MEMO_PAGE_X_FIX, mtr, bit)
+# else /* UNIV_DEBUG */
+/** Gets the desired bits for a given page from a bitmap page.
+@param page in: bitmap page
+@param offset in: page whose bits to get
+@param zs in: compressed page size in bytes; 0 for uncompressed pages
+@param bit in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param mtr in: mini-transaction holding an x-latch on the bitmap page
+@return value of bits */
+# define ibuf_bitmap_page_get_bits(page, offset, zs, bit, mtr) \
+ ibuf_bitmap_page_get_bits_low(page, offset, zs, bit)
+# endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the desired bits for a given page from a bitmap page.
+@return value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits_low(
+/*==========================*/
+ const page_t* page, /*!< in: bitmap page */
+ ulint page_no,/*!< in: page whose bits to get */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+#ifdef UNIV_DEBUG
+ ulint latch_type,
+ /*!< in: MTR_MEMO_PAGE_X_FIX,
+ MTR_MEMO_BUF_FIX, ... */
+ mtr_t* mtr, /*!< in: mini-transaction holding latch_type
+ on the bitmap page */
+#endif /* UNIV_DEBUG */
+ ulint bit) /*!< in: IBUF_BITMAP_FREE,
+ IBUF_BITMAP_BUFFERED, ... */
+{
+ ulint byte_offset;
+ ulint bit_offset;
+ ulint map_byte;
+ ulint value;
+
+ ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, latch_type));
+
+ if (!zip_size) {
+ bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE
+ + bit;
+ } else {
+ bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE
+ + bit;
+ }
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+ map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+ value = ut_bit_get_nth(map_byte, bit_offset);
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+
+ value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
+ }
+
+ return(value);
+}
+
+/********************************************************************//**
+Sets the desired bit for a given page in a bitmap page. */
+static
+void
+ibuf_bitmap_page_set_bits(
+/*======================*/
+ page_t* page, /*!< in: bitmap page */
+ ulint page_no,/*!< in: page whose bits to set */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bit, /*!< in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... */
+ ulint val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr containing an x-latch to the bitmap page */
+{
+ ulint byte_offset;
+ ulint bit_offset;
+ ulint map_byte;
+
+ ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE)
+ || (0 == ibuf_count_get(page_get_space_id(page),
+ page_no)));
+#endif
+ if (!zip_size) {
+ bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE
+ + bit;
+ } else {
+ bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE
+ + bit;
+ }
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+ map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+ ut_ad(val <= 3);
+
+ map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2);
+ map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2);
+ } else {
+ ut_ad(val <= 1);
+ map_byte = ut_bit_set_nth(map_byte, bit_offset, val);
+ }
+
+ mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte,
+ MLOG_1BYTE, mtr);
+}
+
+/********************************************************************//**
+Calculates the bitmap page number for a given page number.
+@return the bitmap page number where the file page is mapped */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_no_calc(
+/*=====================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no) /*!< in: tablespace page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(FSP_IBUF_BITMAP_OFFSET
+ + (page_no & ~(UNIV_PAGE_SIZE - 1)));
+ } else {
+ return(FSP_IBUF_BITMAP_OFFSET
+ + (page_no & ~(zip_size - 1)));
+ }
+}
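+
+/* Example: with 16 KiB uncompressed pages, pages 0..16383 are mapped
+to bitmap page FSP_IBUF_BITMAP_OFFSET, pages 16384..32767 to bitmap
+page 16384 + FSP_IBUF_BITMAP_OFFSET, and so on. */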
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+static
+page_t*
+ibuf_bitmap_get_map_page_func(
+/*==========================*/
+ ulint space, /*!< in: space id of the file page */
+ ulint page_no,/*!< in: page number of the file page */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ block = buf_page_get_gen(space, zip_size,
+ ibuf_bitmap_page_no_calc(zip_size, page_no),
+ RW_X_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
+
+ return(buf_block_get_frame(block));
+}
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched
+@param space in: space id of the file page
+@param page_no in: page number of the file page
+@param zip_size in: compressed page size in bytes; 0 for uncompressed pages
+@param mtr in: mini-transaction */
+#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr) \
+ ibuf_bitmap_get_map_page_func(space, page_no, zip_size, \
+ __FILE__, __LINE__, mtr)
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const buf_block_t* block, /*!< in: index page; free bits are set if
+ the index is non-clustered and page
+ level is 0 */
+ ulint val, /*!< in: value to set: < 4 */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+
+ if (!page_is_leaf(buf_block_get_frame(block))) {
+
+ return;
+ }
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+#ifdef UNIV_IBUF_DEBUG
+# if 0
+ fprintf(stderr,
+ "Setting space %lu page %lu free bits to %lu should be %lu\n",
+ space, page_no, val,
+ ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+ ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, val, mtr);
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val) /*!< in: value to set: < 4 */
+{
+ mtr_t mtr;
+ page_t* page;
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+
+ page = buf_block_get_frame(block);
+
+ if (!page_is_leaf(page)) {
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ zip_size = buf_block_get_zip_size(block);
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
+
+#ifdef UNIV_IBUF_DEBUG
+ if (max_val != ULINT_UNDEFINED) {
+ ulint old_val;
+
+ old_val = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, &mtr);
+# if 0
+ if (old_val != max_val) {
+ fprintf(stderr,
+ "Ibuf: page %lu old val %lu max val %lu\n",
+ page_get_page_no(page),
+ old_val, max_val);
+ }
+# endif
+
+ ut_a(old_val <= max_val);
+ }
+# if 0
+ fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n",
+ page_get_page_no(page), val,
+ ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+ ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, val, &mtr);
+ mtr_commit(&mtr);
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block) /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+{
+ ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint before;
+ ulint after;
+
+ ut_a(!buf_block_get_page_zip(block));
+
+ before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+
+ after = ibuf_index_page_calc_free(0, block);
+
+ /* This approach cannot be used on compressed pages, since the
+ computed value of "before" often does not match the current
+ state of the bitmap. This is because the free space may
+ increase or decrease when a compressed page is reorganized. */
+ if (before != after) {
+ ibuf_set_free_bits_low(0, block, after, mtr);
+ }
+}
+
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+ ulint after;
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ zip_size = buf_block_get_zip_size(block);
+
+ ut_a(page_is_leaf(buf_block_get_frame(block)));
+ ut_a(zip_size);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+
+ after = ibuf_index_page_calc_free_zip(zip_size, block);
+
+ if (after == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent pages for which we can no
+		longer buffer inserts from slipping out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, after, mtr);
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint state;
+
+ /* As we have to x-latch two random bitmap pages, we have to acquire
+ the bitmap mutex to prevent a deadlock with a similar operation
+ performed by another OS thread. */
+
+ mutex_enter(&ibuf_bitmap_mutex);
+
+ state = ibuf_index_page_calc_free(zip_size, block1);
+
+ ibuf_set_free_bits_low(zip_size, block1, state, mtr);
+
+ state = ibuf_index_page_calc_free(zip_size, block2);
+
+ ibuf_set_free_bits_low(zip_size, block2, state, mtr);
+
+ mutex_exit(&ibuf_bitmap_mutex);
+}
+
+/**********************************************************************//**
+Returns TRUE if the page is one of the fixed address ibuf pages.
+@return TRUE if a fixed address ibuf i/o page */
+UNIV_INLINE
+ibool
+ibuf_fixed_addr_page(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ return((space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO)
+ || ibuf_bitmap_page(zip_size, page_no));
+}
+
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page_low(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number */
+#ifdef UNIV_DEBUG
+ ibool x_latch,/*!< in: FALSE if relaxed check
+ (avoid latching the bitmap page) */
+#endif /* UNIV_DEBUG */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr which will contain an
+ x-latch to the bitmap page if the page
+ is not one of the fixed address ibuf
+				pages, or NULL, in which case a new
+				mini-transaction is created and
+				committed internally. */
+{
+ ibool ret;
+ mtr_t local_mtr;
+ page_t* bitmap_page;
+
+ ut_ad(!recv_no_ibuf_operations);
+ ut_ad(x_latch || mtr == NULL);
+
+ if (ibuf_fixed_addr_page(space, zip_size, page_no)) {
+
+ return(TRUE);
+ } else if (space != IBUF_SPACE_ID) {
+
+ return(FALSE);
+ }
+
+ ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE);
+
+#ifdef UNIV_DEBUG
+ if (!x_latch) {
+ mtr_start(&local_mtr);
+
+ /* Get the bitmap page without a page latch, so that
+ we will not be violating the latching order when
+ another bitmap page has already been latched by this
+ thread. The page will be buffer-fixed, and thus it
+ cannot be removed or relocated while we are looking at
+ it. The contents of the page could change, but the
+ IBUF_BITMAP_IBUF bit that we are interested in should
+ not be modified by any other thread. Nobody should be
+ calling ibuf_add_free_page() or ibuf_remove_free_page()
+ while the page is linked to the insert buffer b-tree. */
+
+ bitmap_page = buf_block_get_frame(
+ buf_page_get_gen(
+ space, zip_size,
+ ibuf_bitmap_page_no_calc(zip_size, page_no),
+ RW_NO_LATCH, NULL, BUF_GET_NO_LATCH,
+ file, line, &local_mtr));
+
+ ret = ibuf_bitmap_page_get_bits_low(
+ bitmap_page, page_no, zip_size,
+ MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);
+
+ mtr_commit(&local_mtr);
+ return(ret);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (mtr == NULL) {
+ mtr = &local_mtr;
+ mtr_start(mtr);
+ }
+
+ bitmap_page = ibuf_bitmap_get_map_page_func(space, page_no, zip_size,
+ file, line, mtr);
+
+ ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_IBUF, mtr);
+
+ if (mtr == &local_mtr) {
+ mtr_commit(mtr);
+ }
+
+ return(ret);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec)
+#endif /* UNIV_DEBUG */
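+
+/* In debug builds these accessor macros also pass the mtr, so that the
+_func implementations can assert that the caller still holds a latch on
+the page containing rec; in release builds the mtr argument is compiled
+away. The same pattern is used for the other ibuf_rec_get_* accessors
+below. */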
+
+/********************************************************************//**
+Returns the page number field of an ibuf record.
+@return page number */
+static
+ulint
+ibuf_rec_get_page_no_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
+
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec)
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the space id field of an ibuf record. For < 4.1.x format records
+returns 0.
+@return space id */
+static
+ulint
+ibuf_rec_get_space_func(
+/*====================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+ ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
+ ibuf_rec_get_info_func(rec,op,comp,info_len,counter)
+#endif
+/****************************************************************//**
+Get various information about an ibuf record in >= 4.1.x format. */
+static
+void
+ibuf_rec_get_info_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: ibuf record */
+ ibuf_op_t* op, /*!< out: operation type, or NULL */
+ ibool* comp, /*!< out: compact flag, or NULL */
+ ulint* info_len, /*!< out: length of info fields at the
+ start of the fourth field, or
+ NULL */
+	ulint*		counter)	/*!< out: counter value, or NULL */
+{
+ const byte* types;
+ ulint fields;
+ ulint len;
+
+ /* Local variables to shadow arguments. */
+ ibuf_op_t op_local;
+ ibool comp_local;
+ ulint info_len_local;
+ ulint counter_local;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ fields = rec_get_n_fields_old(rec);
+ ut_a(fields > IBUF_REC_FIELD_USER);
+
+ types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
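+	/* The metadata field consists of info_len_local bytes of info,
+	followed by DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes of type
+	information for each user field. The length of the info prefix
+	identifies the record format: 0 for old-style ROW_FORMAT=REDUNDANT,
+	1 for ROW_FORMAT=COMPACT, and IBUF_REC_INFO_SIZE for records that
+	also carry an operation type and a counter. */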
+ switch (info_len_local) {
+ case 0:
+ case 1:
+ op_local = IBUF_OP_INSERT;
+ comp_local = info_len_local;
+ ut_ad(!counter);
+ counter_local = ULINT_UNDEFINED;
+ break;
+
+ case IBUF_REC_INFO_SIZE:
+ op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+ comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
+ counter_local = mach_read_from_2(
+ types + IBUF_REC_OFFSET_COUNTER);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ ut_a(op_local < IBUF_OP_COUNT);
+ ut_a((len - info_len_local) ==
+ (fields - IBUF_REC_FIELD_USER)
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ if (op) {
+ *op = op_local;
+ }
+
+ if (comp) {
+ *comp = comp_local;
+ }
+
+ if (info_len) {
+ *info_len = info_len_local;
+ }
+
+ if (counter) {
+ *counter = counter_local;
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
+#endif
+
+/****************************************************************//**
+Returns the operation type field of an ibuf record.
+@return operation type */
+static
+ibuf_op_t
+ibuf_rec_get_op_type_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ ulint len;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ if (len > 1) {
+ /* This is a < 4.1.x format record */
+
+ return(IBUF_OP_INSERT);
+ } else {
+ ibuf_op_t op;
+
+ ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
+
+ return(op);
+ }
+}
+
+/****************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it
+can't be read */
+UNIV_INTERN
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* ptr;
+ ulint len;
+
+ if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ if (len >= 2) {
+
+ return(mach_read_from_2(ptr));
+ } else {
+
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/****************************************************************//**
+Add accumulated operation counts to a permanent array. Both arrays must be
+of size IBUF_OP_COUNT. */
+static
+void
+ibuf_add_ops(
+/*=========*/
+ ulint* arr, /*!< in/out: array to modify */
+ const ulint* ops) /*!< in: operation counts */
+{
+ ulint i;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+ ut_ad(mutex_own(&ibuf_mutex));
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+ for (i = 0; i < IBUF_OP_COUNT; i++) {
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_increment_ulint(&arr[i], ops[i]);
+#else /* HAVE_ATOMIC_BUILTINS */
+ arr[i] += ops[i];
+#endif /* HAVE_ATOMIC_BUILTINS */
+ }
+}
+
+/****************************************************************//**
+Print operation counts. The array must be of size IBUF_OP_COUNT. */
+static
+void
+ibuf_print_ops(
+/*===========*/
+ const ulint* ops, /*!< in: operation counts */
+ FILE* file) /*!< in: file where to print */
+{
+ static const char* op_names[] = {
+ "insert",
+ "delete mark",
+ "delete"
+ };
+ ulint i;
+
+ ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT);
+
+ for (i = 0; i < IBUF_OP_COUNT; i++) {
+ fprintf(file, "%s %lu%s", op_names[i],
+ (ulong) ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : "");
+ }
+
+ putc('\n', file);
+}
+
+/********************************************************************//**
+Creates a dummy index for inserting a record to a non-clustered index.
+@return dummy index */
+static
+dict_index_t*
+ibuf_dummy_index_create(
+/*====================*/
+ ulint n, /*!< in: number of fields */
+ ibool comp) /*!< in: TRUE=use compact record format */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+
+ table = dict_mem_table_create("IBUF_DUMMY",
+ DICT_HDR_SPACE, n,
+ comp ? DICT_TF_COMPACT : 0, 0);
+
+ index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+
+ index->table = table;
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ return(index);
+}
+/********************************************************************//**
+Add a column to the dummy index */
+static
+void
+ibuf_dummy_index_add_col(
+/*=====================*/
+ dict_index_t* index, /*!< in: dummy index */
+ const dtype_t* type, /*!< in: the data type of the column */
+ ulint len) /*!< in: length of the column */
+{
+ ulint i = index->table->n_def;
+ dict_mem_table_add_col(index->table, NULL, NULL,
+ dtype_get_mtype(type),
+ dtype_get_prtype(type),
+ dtype_get_len(type));
+ dict_index_add_col(index, index->table,
+ dict_table_get_nth_col(index->table, i), len);
+}
+/********************************************************************//**
+Deallocates a dummy index for inserting a record to a non-clustered index. */
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+ dict_index_t* index) /*!< in, own: dummy index */
+{
+ dict_table_t* table = index->table;
+
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
+ ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
+#else /* UNIV_DEBUG */
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
+ ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
+#endif
+
+/*********************************************************************//**
+Builds the entry used to
+
+1) IBUF_OP_INSERT: insert into a non-clustered index
+
+2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
+ activate
+
+3) IBUF_OP_DELETE: find the record we need to delete
+
+when we have the corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
+ mem_heap_t* heap, /*!< in: heap where built */
+ dict_index_t** pindex) /*!< out, own: dummy index that
+ describes the entry */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ ulint n_fields;
+ const byte* types;
+ const byte* data;
+ ulint len;
+ ulint info_len;
+ ulint i;
+ ulint comp;
+ dict_index_t* index;
+
+ ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+ ut_a(*data == 0);
+ ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
+
+ tuple = dtuple_create(heap, n_fields);
+
+ types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+ ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
+
+ index = ibuf_dummy_index_create(n_fields, comp);
+
+ len -= info_len;
+ types += info_len;
+
+ ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = rec_get_nth_field_old(
+ ibuf_rec, i + IBUF_REC_FIELD_USER, &len);
+
+ dfield_set_data(field, data, len);
+
+ dtype_new_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+ }
+
+ /* Prevent an ut_ad() failure in page_zip_write_rec() by
+ adding system columns to the dummy table pointed to by the
+ dummy secondary index. The insert buffer is only used for
+ secondary indexes, whose records never contain any system
+ columns, such as DB_TRX_ID. */
+ ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+ *pindex = index;
+
+ return(tuple);
+}
+
+/******************************************************************//**
+Get the data size.
+@return size of fields */
+UNIV_INLINE
+ulint
+ibuf_rec_get_size(
+/*==============*/
+ const rec_t* rec, /*!< in: ibuf record */
+ const byte* types, /*!< in: fields */
+ ulint n_fields, /*!< in: number of fields */
+ ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT,
+ nonzero=ROW_FORMAT=COMPACT */
+{
+ ulint i;
+ ulint field_offset;
+ ulint types_offset;
+ ulint size = 0;
+
+ field_offset = IBUF_REC_FIELD_USER;
+ types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
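+	/* Sum the stored data length of each user field; a SQL NULL field
+	contributes its fixed NULL size, which depends on its type and on
+	the row format. */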
+ for (i = 0; i < n_fields; i++) {
+ ulint len;
+ dtype_t dtype;
+
+ rec_get_nth_field_offs_old(rec, i + field_offset, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ size += len;
+ } else {
+ dtype_new_read_for_order_and_null_size(&dtype, types);
+
+ size += dtype_get_sql_null_size(&dtype, comp);
+ }
+
+ types += types_offset;
+ }
+
+ return(size);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec)
+#endif
+
+/********************************************************************//**
+Returns the space taken by a stored non-clustered index entry if converted to
+an index record.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_rec_get_volume_func(
+/*=====================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* ibuf_rec)/*!< in: ibuf record */
+{
+ ulint len;
+ const byte* data;
+ const byte* types;
+ ulint n_fields;
+ ulint data_size;
+ ulint comp;
+ ibuf_op_t op;
+ ulint info_len;
+
+ ut_ad(mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, ibuf_rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
+
+ data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+ ut_a(len == 1);
+ ut_a(*data == 0);
+
+ types = rec_get_nth_field_old(
+ ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+ ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
+
+ if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
+ /* Delete-marking a record doesn't take any
+ additional space, and while deleting a record
+ actually frees up space, we have to play it safe and
+ pretend it takes no additional space (the record
+ might not exist, etc.). */
+
+ return(0);
+ } else if (comp) {
+ dtuple_t* entry;
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+
+ entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
+ heap, &dummy_index);
+
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+
+ types += info_len;
+ n_fields = rec_get_n_fields_old(ibuf_rec)
+ - IBUF_REC_FIELD_USER;
+
+ data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
+
+ return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+}
+
+/*********************************************************************//**
+Builds the tuple to insert to an ibuf tree when we have an entry for a
+non-clustered index.
+
+NOTE that the original entry must be kept because we copy pointers to
+its fields.
+
+@return own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+ ibuf_op_t op, /*!< in: operation type */
+ dict_index_t* index, /*!< in: non-clustered index */
+ const dtuple_t* entry, /*!< in: entry for a non-clustered index */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where entry should
+ be inserted */
+ ulint counter,/*!< in: counter value;
+ ULINT_UNDEFINED=not used */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ const dfield_t* entry_field;
+ ulint n_fields;
+ byte* buf;
+ byte* ti;
+ byte* type_info;
+ ulint i;
+
+ ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
+ ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
+ ut_ad(op < IBUF_OP_COUNT);
+
+ /* We have to build a tuple with the following fields:
+
+ 1-4) These are described at the top of this file.
+
+ 5) The rest of the fields are copied from the entry.
+
+ All fields in the tuple are ordered like the type binary in our
+ insert buffer tree. */
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
+
+ /* 1) Space Id */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* 2) Marker byte */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+ /* We set the marker byte zero */
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* 3) Page number */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ /* 4) Type info, part #1 */
+
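+	/* Here i becomes the length of the info prefix at the start of the
+	metadata field: 0 for old-style ROW_FORMAT=REDUNDANT entries, 1 for
+	ROW_FORMAT=COMPACT, or IBUF_REC_INFO_SIZE when an operation type and
+	a counter are stored. */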
+ if (counter == ULINT_UNDEFINED) {
+ i = dict_table_is_comp(index->table) ? 1 : 0;
+ } else {
+ ut_ad(counter <= 0xFFFF);
+ i = IBUF_REC_INFO_SIZE;
+ }
+
+ ti = type_info = static_cast<byte*>(
+ mem_heap_alloc(
+ heap,
+ i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
+
+ switch (i) {
+ default:
+ ut_error;
+ break;
+ case 1:
+ /* set the flag for ROW_FORMAT=COMPACT */
+ *ti++ = 0;
+ /* fall through */
+ case 0:
+ /* the old format does not allow delete buffering */
+ ut_ad(op == IBUF_OP_INSERT);
+ break;
+ case IBUF_REC_INFO_SIZE:
+ mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
+
+ ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
+ ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
+ ? IBUF_REC_COMPACT : 0;
+ ti += IBUF_REC_INFO_SIZE;
+ break;
+ }
+
+ /* 5+) Fields from the entry */
+
+ for (i = 0; i < n_fields; i++) {
+ ulint fixed_len;
+ const dict_field_t* ifield;
+
+ field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
+ entry_field = dtuple_get_nth_field(entry, i);
+ dfield_copy(field, entry_field);
+
+ ifield = dict_index_get_nth_field(index, i);
+ /* Prefix index columns of fixed-length columns are of
+ fixed length. However, in the function call below,
+ dfield_get_type(entry_field) contains the fixed length
+ of the column in the clustered index. Replace it with
+ the fixed length of the secondary index column. */
+ fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+ if (fixed_len) {
+ /* dict_index_add_col() should guarantee these */
+ ut_ad(fixed_len <= (ulint)
+ dfield_get_type(entry_field)->len);
+ if (ifield->prefix_len) {
+ ut_ad(ifield->prefix_len == fixed_len);
+ } else {
+ ut_ad(fixed_len == (ulint)
+ dfield_get_type(entry_field)->len);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ dtype_new_store_for_order_and_null_size(
+ ti, dfield_get_type(entry_field), fixed_len);
+ ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+ }
+
+ /* 4) Type info, part #2 */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
+
+ dfield_set_data(field, type_info, ti - type_info);
+
+ /* Set all the types in the new tuple binary */
+
+ dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records.
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+
+ tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
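+	/* Only the space id, marker and page number fields are stored, so a
+	PAGE_CUR_GE search on this tuple positions the cursor on the first
+	entry buffered for the given page. */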
+
+ /* Store the space id in tuple */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the new format record marker byte */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+dare to start a pessimistic insert to the insert buffer.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_enough_free_for_insert(void)
+/*==================================*/
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+	/* We want a big margin of free pages, because a B-tree can sometimes
+	grow even when records are deleted from it, as node pointers can
+	change, and we must make sure that we can delete the inserts buffered
+	for pages that we read into the buffer pool, without any risk of
+	running out of free space in the insert buffer. */
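+
+	/* For example, with ibuf->size == 100 pages and a tree height of 3,
+	at least 100 / 2 + 3 * 3 = 59 pages must be on the free list before
+	a pessimistic insert is attempted. */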
+
+ return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+should remove them and free to the file space management.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height);
+}
+
+/*********************************************************************//**
+Allocates a new page from the ibuf file segment and adds it to the free
+list.
+@return TRUE on success, FALSE if no space left */
+static
+ibool
+ibuf_add_free_page(void)
+/*====================*/
+{
+ mtr_t mtr;
+ page_t* header_page;
+ ulint flags;
+ ulint zip_size;
+ buf_block_t* block;
+ page_t* page;
+ page_t* root;
+ page_t* bitmap_page;
+
+ mtr_start(&mtr);
+
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Allocate a new page: NOTE that if the page has been a part of a
+ non-clustered index which has subsequently been dropped, then the
+ page may have buffered inserts in the insert buffer, and these
+ should be deleted from there. These get deleted when the page
+	allocation creates the page in the buffer. Thus the call below may
+	end up calling the insert buffer routines and, as we do not yet hold
+	any latches on insert buffer tree pages, these routines can run
+	without a risk of a deadlock. This is the reason why we created a
+	special ibuf
+ header page apart from the ibuf tree. */
+
+ block = fseg_alloc_free_page(
+ header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+ &mtr);
+
+ if (block == NULL) {
+ mtr_commit(&mtr);
+
+ return(FALSE);
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ibuf_enter(&mtr);
+ mutex_enter(&ibuf_mutex);
+ root = ibuf_tree_root_get(&mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+ page = buf_block_get_frame(block);
+
+ /* Add the page to the free list and update the ibuf size data */
+
+ flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
+ MLOG_2BYTES, &mtr);
+
+ ibuf->seg_size++;
+ ibuf->free_list_len++;
+
+ /* Set the bit indicating that this page is now an ibuf tree page
+ (level 2 page) */
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ IBUF_SPACE_ID, buf_block_get_page_no(block), zip_size, &mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, buf_block_get_page_no(block), zip_size,
+ IBUF_BITMAP_IBUF, TRUE, &mtr);
+
+ ibuf_mtr_commit(&mtr);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system. */
+static
+void
+ibuf_remove_free_page(void)
+/*=======================*/
+{
+ mtr_t mtr;
+ mtr_t mtr2;
+ page_t* header_page;
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ page_t* page;
+ page_t* root;
+ page_t* bitmap_page;
+
+ mtr_start(&mtr);
+
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+ zip_size = fsp_flags_get_zip_size(flags);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Prevent pessimistic inserts to insert buffer trees for a while */
+ ibuf_enter(&mtr);
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_data_too_much_free()) {
+
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ ibuf_mtr_commit(&mtr);
+
+ return;
+ }
+
+ ibuf_mtr_start(&mtr2);
+
+ root = ibuf_tree_root_get(&mtr2);
+
+ mutex_exit(&ibuf_mutex);
+
+ page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ &mtr2).page;
+
+ /* NOTE that we must release the latch on the ibuf tree root
+ because in fseg_free_page we access level 1 pages, and the root
+ is a level 2 page. */
+
+ ibuf_mtr_commit(&mtr2);
+ ibuf_exit(&mtr);
+
+ /* Since pessimistic inserts were prevented, we know that the
+ page is still in the free list. NOTE that also deletes may take
+ pages from the free list, but they take them from the start, and
+ the free list was so long that they cannot have taken the last
+ page from it. */
+
+ fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ IBUF_SPACE_ID, page_no, &mtr);
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+
+ ibuf_enter(&mtr);
+
+ mutex_enter(&ibuf_mutex);
+
+ root = ibuf_tree_root_get(&mtr);
+
+ ut_ad(page_no == flst_get_last(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, &mtr).page);
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ page = buf_block_get_frame(block);
+ }
+
+ /* Remove the page from the free list and update the ibuf size data */
+
+ flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ ibuf->seg_size--;
+ ibuf->free_list_len--;
+
+ /* Set the bit indicating that this page is no more an ibuf tree page
+ (level 2 page) */
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ IBUF_SPACE_ID, page_no, zip_size, &mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+ ibuf_mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void)
+/*========================*/
+{
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL),
+ RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rw_lock_get_x_lock_count(
+ fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1);
+
+ /* NOTE: We require that the thread did not own the latch before,
+ because then we know that we can obey the correct latching order
+ for ibuf latches */
+
+ if (!ibuf) {
+		/* Not yet initialized; not sure if this is possible, but
+		it does no harm to check. */
+
+ return;
+ }
+
+ /* Free at most a few pages at a time, so that we do not delay the
+ requested service too much */
+
+ for (i = 0; i < 4; i++) {
+
+ ibool too_much_free;
+
+ mutex_enter(&ibuf_mutex);
+ too_much_free = ibuf_data_too_much_free();
+ mutex_exit(&ibuf_mutex);
+
+ if (!too_much_free) {
+ return;
+ }
+
+ ibuf_remove_free_page();
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \
+ ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,vers,pages,n_stored)
+#else /* UNIV_DEBUG */
+# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,vers,pages,n_stored) \
+ ibuf_get_merge_page_nos_func(contract,rec,ids,vers,pages,n_stored)
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Reads page numbers from a leaf in an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos_func(
+/*=========================*/
+	ibool		contract,/*!< in: TRUE if this function is called to
+				contract the tree, FALSE if this is called
+				when a single page becomes full and we
+				check whether it pays to also read
+				nearby pages */
+ const rec_t* rec, /*!< in: insert buffer record */
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction holding rec */
+#endif /* UNIV_DEBUG */
+ ulint* space_ids,/*!< in/out: space id's of the pages */
+ ib_int64_t* space_versions,/*!< in/out: tablespace version
+ timestamps; used to prevent reading in old
+ pages after DISCARD + IMPORT tablespace */
+ ulint* page_nos,/*!< in/out: buffer for at least
+ IBUF_MAX_N_PAGES_MERGED many page numbers;
+ the page numbers are in an ascending order */
+ ulint* n_stored)/*!< out: number of page numbers stored to
+ page_nos in this function */
+{
+ ulint prev_page_no;
+ ulint prev_space_id;
+ ulint first_page_no;
+ ulint first_space_id;
+ ulint rec_page_no;
+ ulint rec_space_id;
+ ulint sum_volumes;
+ ulint volume_for_page;
+ ulint rec_volume;
+ ulint limit;
+ ulint n_pages;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ *n_stored = 0;
+
+ limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool_get_curr_size() / 4);
+
+ if (page_rec_is_supremum(rec)) {
+
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (page_rec_is_infimum(rec)) {
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return(0);
+ }
+
+ first_page_no = ibuf_rec_get_page_no(mtr, rec);
+ first_space_id = ibuf_rec_get_space(mtr, rec);
+ n_pages = 0;
+ prev_page_no = 0;
+ prev_space_id = 0;
+
+	/* Go backwards from the first rec until we reach the border of the
+	'merge area', or the page start, or the limit of storable pages is
+	reached */
+
+ while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+ rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+ rec_space_id = ibuf_rec_get_space(mtr, rec);
+
+ if (rec_space_id != first_space_id
+ || (rec_page_no / IBUF_MERGE_AREA)
+ != (first_page_no / IBUF_MERGE_AREA)) {
+
+ break;
+ }
+
+ if (rec_page_no != prev_page_no
+ || rec_space_id != prev_space_id) {
+ n_pages++;
+ }
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ rec = page_rec_get_next_const(rec);
+
+ /* At the loop start there is no prev page; we mark this with a pair
+ of space id, page no (0, 0) for which there can never be entries in
+ the insert buffer */
+
+ prev_page_no = 0;
+ prev_space_id = 0;
+ sum_volumes = 0;
+ volume_for_page = 0;
+
+ while (*n_stored < limit) {
+ if (page_rec_is_supremum(rec)) {
+ /* When no more records available, mark this with
+ another 'impossible' pair of space id, page no */
+ rec_page_no = 1;
+ rec_space_id = 0;
+ } else {
+ rec_page_no = ibuf_rec_get_page_no(mtr, rec);
+ rec_space_id = ibuf_rec_get_space(mtr, rec);
+ /* In the system tablespace, the smallest
+ possible secondary index leaf page number is
+ bigger than IBUF_TREE_ROOT_PAGE_NO (4). In
+ other tablespaces, the clustered index tree is
+ created at page 3, which makes page 4 the
+ smallest possible secondary index leaf page
+ (and that only after DROP INDEX). */
+ ut_ad(rec_page_no
+ > (ulint) IBUF_TREE_ROOT_PAGE_NO
+ - (rec_space_id != 0));
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
+#endif
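+		/* Schedule the previous page for merging: always when
+		contracting the tree, always for the page on which the
+		cursor was positioned, and otherwise only when the buffered
+		volume exceeds a threshold fraction of the page size. */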
+ if ((rec_space_id != prev_space_id
+ || rec_page_no != prev_page_no)
+ && (prev_space_id != 0 || prev_page_no != 0)) {
+
+ if (contract
+ || (prev_page_no == first_page_no
+ && prev_space_id == first_space_id)
+ || (volume_for_page
+ > ((IBUF_MERGE_THRESHOLD - 1)
+ * 4 * UNIV_PAGE_SIZE
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE)
+ / IBUF_MERGE_THRESHOLD)) {
+
+ space_ids[*n_stored] = prev_space_id;
+ space_versions[*n_stored]
+ = fil_space_get_version(prev_space_id);
+ page_nos[*n_stored] = prev_page_no;
+
+ (*n_stored)++;
+
+ sum_volumes += volume_for_page;
+ }
+
+ if (rec_space_id != first_space_id
+ || rec_page_no / IBUF_MERGE_AREA
+ != first_page_no / IBUF_MERGE_AREA) {
+
+ break;
+ }
+
+ volume_for_page = 0;
+ }
+
+ if (rec_page_no == 1 && rec_space_id == 0) {
+ /* Supremum record */
+
+ break;
+ }
+
+ rec_volume = ibuf_rec_get_volume(mtr, rec);
+
+ volume_for_page += rec_volume;
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+#if 0
+ fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
+ *n_stored, sum_volumes);
+#endif
+ return(sum_volumes);
+}
+
+/*******************************************************************//**
+Gets the current user record, moving the cursor forward past any
+non-user records.
+@return current rec, or NULL if there are no more user records */
+static __attribute__((nonnull, warn_unused_result))
+const rec_t*
+ibuf_get_user_rec(
+/*===============*/
+ btr_pcur_t* pcur, /*!< in: the current cursor */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+ do {
+ const rec_t* rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ return(rec);
+ }
+ } while (btr_pcur_move_to_next(pcur, mtr));
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Reads page numbers for a space id from an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static __attribute__((nonnull, warn_unused_result))
+ulint
+ibuf_get_merge_pages(
+/*=================*/
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ ulint space, /*!< in: space for which to merge */
+ ulint limit, /*!< in: max page numbers to read */
+ ulint* pages, /*!< out: pages read */
+ ulint* spaces, /*!< out: spaces read */
+ ib_int64_t* versions,/*!< out: space versions read */
+ ulint* n_pages,/*!< out: number of pages read */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+ const rec_t* rec;
+ ulint volume = 0;
+ ib_int64_t version = fil_space_get_version(space);
+
+ ut_a(space != ULINT_UNDEFINED);
+
+ *n_pages = 0;
+
+ while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
+ && ibuf_rec_get_space(mtr, rec) == space
+ && *n_pages < limit) {
+
+ ulint page_no = ibuf_rec_get_page_no(mtr, rec);
+
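+		/* Records for the same page are adjacent in the ibuf tree,
+		so comparing against the most recently stored page number
+		is enough to avoid storing duplicates. */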
+ if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
+ spaces[*n_pages] = space;
+ pages[*n_pages] = page_no;
+ versions[*n_pages] = version;
+ ++*n_pages;
+ }
+
+ volume += ibuf_rec_get_volume(mtr, rec);
+
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(volume);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_merge_pages(
+/*=============*/
+ ulint* n_pages, /*!< out: number of pages to which merged */
+ bool sync) /*!< in: true if the caller wants to wait for
+ the issued read with the highest tablespace
+ address to complete */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ ulint sum_sizes;
+ ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED];
+
+ *n_pages = 0;
+
+ ibuf_mtr_start(&mtr);
+
+ /* Open a cursor to a randomly chosen leaf of the tree, at a random
+ position within the leaf */
+
+ btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
+ if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ ut_ad(ibuf->empty);
+ ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
+ == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ return(0);
+ }
+
+ sum_sizes = ibuf_get_merge_page_nos(TRUE,
+ btr_pcur_get_rec(&pcur), &mtr,
+ space_ids, space_versions,
+ page_nos, n_pages);
+#if 0 /* defined UNIV_IBUF_DEBUG */
+ fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
+ sync, *n_pages, sum_sizes);
+#endif
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ buf_read_ibuf_merge_pages(
+ sync, space_ids, space_versions, page_nos, *n_pages);
+
+ return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Get the table instance from the table id.
+@return table instance */
+static __attribute__((warn_unused_result))
+dict_table_t*
+ibuf_get_table(
+/*===========*/
+ table_id_t table_id) /*!< in: valid table id */
+{
+ rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__);
+
+ dict_table_t* table = dict_table_open_on_id(
+ table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+ return(table);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages of a given tablespace
+to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_merge_space(
+/*=============*/
+ ulint space, /*!< in: tablespace id to merge */
+ ulint* n_pages)/*!< out: number of pages to which merged */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ mem_heap_t* heap = mem_heap_create(512);
+ dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap);
+
+ ibuf_mtr_start(&mtr);
+
+ /* Position the cursor on the first matching record. */
+
+ btr_pcur_open(
+ ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
+ &mtr);
+
+ mem_heap_free(heap);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
+ ulint sum_sizes = 0;
+ ulint pages[IBUF_MAX_N_PAGES_MERGED];
+ ulint spaces[IBUF_MAX_N_PAGES_MERGED];
+ ib_int64_t versions[IBUF_MAX_N_PAGES_MERGED];
+
+ if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ ut_ad(ibuf->empty);
+ ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
+ == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ } else {
+
+ sum_sizes = ibuf_get_merge_pages(
+ &pcur, space, IBUF_MAX_N_PAGES_MERGED,
+ &pages[0], &spaces[0], &versions[0], n_pages,
+ &mtr);
+
+ ++sum_sizes;
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ btr_pcur_close(&pcur);
+
+ if (sum_sizes > 0) {
+
+ ut_a(*n_pages > 0 || sum_sizes == 1);
+
+#ifdef UNIV_DEBUG
+ ut_ad(*n_pages <= UT_ARR_SIZE(pages));
+
+ for (ulint i = 0; i < *n_pages; ++i) {
+ ut_ad(spaces[i] == space);
+ ut_ad(i == 0 || versions[i] == versions[i - 1]);
+ }
+#endif /* UNIV_DEBUG */
+
+ buf_read_ibuf_merge_pages(
+ true, spaces, versions, pages, *n_pages);
+ }
+
+ return(sum_sizes);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static __attribute__((nonnull, warn_unused_result))
+ulint
+ibuf_merge(
+/*=======*/
+	table_id_t	table_id,	/*!< in: if merge should be
+					done only for a specific
+					table, or 0 to merge for
+					all tables */
+	ulint*		n_pages,	/*!< out: number of pages to
+					which merged */
+	bool		sync)		/*!< in: true if the caller
+					wants to wait for the issued
+					read with the highest
+					tablespace address to complete */
+{
+ dict_table_t* table;
+
+ *n_pages = 0;
+
+ /* We perform a dirty read of ibuf->empty, without latching
+ the insert buffer root page. We trust this dirty read except
+ when a slow shutdown is being executed. During a slow
+ shutdown, the insert buffer merge must be completed. */
+
+ if (ibuf->empty && !srv_shutdown_state) {
+ return(0);
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug) {
+ return(0);
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else if (table_id == 0) {
+ return(ibuf_merge_pages(n_pages, sync));
+ } else if ((table = ibuf_get_table(table_id)) == 0) {
+ /* Table has been dropped. */
+ return(0);
+ }
+
+ ulint volume = ibuf_merge_space(table->space, n_pages);
+
+ dict_table_close(table, FALSE, FALSE);
+
+ return(volume);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_contract(
+/*==========*/
+ ibool sync) /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+{
+ ulint n_pages;
+
+ return(ibuf_merge(0, &n_pages, sync));
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_in_background(
+/*========================*/
+	table_id_t	table_id,	/*!< in: if merge should be done only
+					for a specific table, or 0 to
+					merge for all tables */
+ ibool full) /*!< in: TRUE if the caller wants to
+ do a full contract based on PCT_IO(100).
+ If FALSE then the size of contract
+ batch is determined based on the
+ current size of the ibuf tree. */
+{
+ ulint sum_bytes = 0;
+ ulint sum_pages = 0;
+ ulint n_pag2;
+ ulint n_pages;
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if (srv_ibuf_disable_background_merge && table_id == 0) {
+ return(0);
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ if (full) {
+ /* Caller has requested a full batch */
+ n_pages = PCT_IO(100);
+ } else {
+ /* By default we do a batch of 5% of the io_capacity */
+ n_pages = PCT_IO(5);
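+		/* With innodb_io_capacity at its default of 200, PCT_IO(5)
+		works out to 10 pages per batch. */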
+
+ mutex_enter(&ibuf_mutex);
+
+		/* If ibuf->size is more than half of max_size, then we
+		contract more aggressively. The +1 is to avoid division
+		by zero. */
+ if (ibuf->size > ibuf->max_size / 2) {
+ ulint diff = ibuf->size - ibuf->max_size / 2;
+ n_pages += PCT_IO((diff * 100)
+ / (ibuf->max_size + 1));
+ }
+
+ mutex_exit(&ibuf_mutex);
+ }
+
+ while (sum_pages < n_pages) {
+ ulint n_bytes;
+
+ n_bytes = ibuf_merge(table_id, &n_pag2, FALSE);
+
+ if (n_bytes == 0) {
+ return(sum_bytes);
+ }
+
+ sum_bytes += n_bytes;
+ sum_pages += n_pag2;
+ }
+
+ return(sum_bytes);
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+ ulint entry_size) /*!< in: size of a record which was inserted
+ into an ibuf tree */
+{
+ ibool sync;
+ ulint sum_sizes;
+ ulint size;
+ ulint max_size;
+
+ /* Perform dirty reads of ibuf->size and ibuf->max_size, to
+ reduce ibuf_mutex contention. ibuf->max_size remains constant
+ after ibuf_init_at_db_start(), but ibuf->size should be
+ protected by ibuf_mutex. Given that ibuf->size fits in a
+	machine word, this should be OK; at worst we are doing some
+	excessive ibuf_contract() calls or occasionally skipping an
+	ibuf_contract(). */
+ size = ibuf->size;
+ max_size = ibuf->max_size;
+
+ if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+ return;
+ }
+
+ sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
+
+ /* Contract at least entry_size many bytes */
+ sum_sizes = 0;
+ size = 1;
+
+ do {
+
+ size = ibuf_contract(sync);
+ sum_sizes += size;
+ } while (size > 0 && sum_sizes < entry_size);
+}
+
+/*********************************************************************//**
+Determine if an insert buffer record has been encountered already.
+@return TRUE if a new record, FALSE if possible duplicate */
+static
+ibool
+ibuf_get_volume_buffered_hash(
+/*==========================*/
+ const rec_t* rec, /*!< in: ibuf record in post-4.1 format */
+ const byte* types, /*!< in: fields */
+ const byte* data, /*!< in: start of user record data */
+ ulint comp, /*!< in: 0=ROW_FORMAT=REDUNDANT,
+ nonzero=ROW_FORMAT=COMPACT */
+ ulint* hash, /*!< in/out: hash array */
+ ulint size) /*!< in: number of elements in hash array */
+{
+ ulint len;
+ ulint fold;
+ ulint bitmask;
+
+ len = ibuf_rec_get_size(
+ rec, types,
+ rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER, comp);
+ fold = ut_fold_binary(data, len);
+
+ hash += (fold / (CHAR_BIT * sizeof *hash)) % size;
+ bitmask = static_cast<ulint>(
+ 1 << (fold % (CHAR_BIT * sizeof(*hash))));
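+	/* The hash array acts as a small Bloom filter: fold selects one
+	word and one bit within that word. A collision can make a new
+	record look like a duplicate, which only makes the resulting
+	record count estimate more conservative. */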
+
+ if (*hash & bitmask) {
+
+ return(FALSE);
+ }
+
+ /* We have not seen this record yet. Insert it. */
+ *hash |= bitmask;
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
+ ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs)
+#else /* UNIV_DEBUG */
+# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
+ ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs)
+#endif
+/*********************************************************************//**
+Update the estimate of the number of records on a page, and
+get the space taken by merging the buffered record to the index page.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_get_volume_buffered_count_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint* hash, /*!< in/out: hash array */
+ ulint size, /*!< in: number of elements in hash array */
+ lint* n_recs) /*!< in/out: estimated number of records
+ on the page that rec points to */
+{
+ ulint len;
+ ibuf_op_t ibuf_op;
+ const byte* types;
+ ulint n_fields;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(ibuf_inside(mtr));
+
+ n_fields = rec_get_n_fields_old(rec);
+ ut_ad(n_fields > IBUF_REC_FIELD_USER);
+ n_fields -= IBUF_REC_FIELD_USER;
+
+ rec_get_nth_field_offs_old(rec, 1, &len);
+ /* This function is only invoked when buffering new
+ operations. All pre-4.1 records should have been merged
+ when the database was started up. */
+ ut_a(len == 1);
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ /* This record has been merged already,
+ but apparently the system crashed before
+ the change was discarded from the buffer.
+ Pretend that the record does not exist. */
+ return(0);
+ }
+
+ types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+ switch (UNIV_EXPECT(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
+ IBUF_REC_INFO_SIZE)) {
+ default:
+ ut_error;
+ case 0:
+		/* This ROW_FORMAT=REDUNDANT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+
+ len = ibuf_rec_get_size(rec, types, n_fields, 0);
+
+ return(len
+ + rec_get_converted_extra_size(len, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+ case 1:
+		/* This ROW_FORMAT=COMPACT record does not include an
+ operation counter. Exclude it from the *n_recs,
+ because deletes cannot be buffered if there are
+ old-style inserts buffered for the page. */
+ goto get_volume_comp;
+
+ case IBUF_REC_INFO_SIZE:
+ ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+ break;
+ }
+
+ switch (ibuf_op) {
+ case IBUF_OP_INSERT:
+ /* Inserts can be done by updating a delete-marked record.
+ Because delete-mark and insert operations can be pointing to
+ the same records, we must not count duplicates. */
+ case IBUF_OP_DELETE_MARK:
+ /* There must be a record to delete-mark.
+ See if this record has been already buffered. */
+ if (n_recs && ibuf_get_volume_buffered_hash(
+ rec, types + IBUF_REC_INFO_SIZE,
+ types + len,
+ types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT,
+ hash, size)) {
+ (*n_recs)++;
+ }
+
+ if (ibuf_op == IBUF_OP_DELETE_MARK) {
+ /* Setting the delete-mark flag does not
+ affect the available space on the page. */
+ return(0);
+ }
+ break;
+ case IBUF_OP_DELETE:
+ /* A record will be removed from the page. */
+ if (n_recs) {
+ (*n_recs)--;
+ }
+ /* While deleting a record actually frees up space,
+ we have to play it safe and pretend that it takes no
+ additional space (the record might not exist, etc.). */
+ return(0);
+ default:
+ ut_error;
+ }
+
+ ut_ad(ibuf_op == IBUF_OP_INSERT);
+
+get_volume_comp:
+ {
+ dtuple_t* entry;
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ mtr, rec, heap, &dummy_index);
+
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+}
+
+/*********************************************************************//**
+Gets an upper limit for the combined size of entries buffered in the insert
+buffer for a given page.
+@return upper limit for the volume of buffered inserts for the index
+page, in bytes; UNIV_PAGE_SIZE, if the entries for the index page span
+several pages in the insert buffer */
+static
+ulint
+ibuf_get_volume_buffered(
+/*=====================*/
+ const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an
+ insert buffer tree where we would insert an
+ entry for the index page whose number is
+ page_no, latch mode has to be BTR_MODIFY_PREV
+ or BTR_MODIFY_TREE */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: page number of an index page */
+ lint* n_recs, /*!< in/out: minimum number of records on the
+ page after the buffered changes have been
+ applied, or NULL to disable the counting */
+ mtr_t* mtr) /*!< in: mini-transaction of pcur */
+{
+ ulint volume;
+ const rec_t* rec;
+ const page_t* page;
+ ulint prev_page_no;
+ const page_t* prev_page;
+ ulint next_page_no;
+ const page_t* next_page;
+ /* bitmap of buffered recs */
+ ulint hash_bitmap[128 / sizeof(ulint)];
+
+ ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
+ || (pcur->latch_mode == BTR_MODIFY_TREE));
+
+ /* Count the volume of inserts earlier in the alphabetical order than
+ pcur */
+
+ volume = 0;
+
+ if (n_recs) {
+ memset(hash_bitmap, 0, sizeof hash_bitmap);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ page = page_align(rec);
+ ut_ad(page_validate(page, ibuf->index));
+
+ if (page_rec_is_supremum(rec)) {
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ for (; !page_rec_is_infimum(rec);
+ rec = page_rec_get_prev_const(rec)) {
+ ut_ad(page_align(rec) == page);
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+ /* Look at the previous page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ if (prev_page_no == FIL_NULL) {
+
+ goto count_later;
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH,
+ mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ prev_page = buf_block_get_frame(block);
+ ut_ad(page_validate(prev_page, ibuf->index));
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_page, mtr) == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_supremum_rec(prev_page);
+ rec = page_rec_get_prev_const(rec);
+
+ for (;; rec = page_rec_get_prev_const(rec)) {
+ ut_ad(page_align(rec) == prev_page);
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* We cannot move back yet another page, because we
+ do not have the x-latch on it, and cannot acquire one
+ because of the latching order: we have to give up */
+
+ return(UNIV_PAGE_SIZE);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+count_later:
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_supremum(rec)) {
+ rec = page_rec_get_next_const(rec);
+ }
+
+ for (; !page_rec_is_supremum(rec);
+ rec = page_rec_get_next_const(rec)) {
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+
+ /* Look at the next page */
+
+ next_page_no = btr_page_get_next(page, mtr);
+
+ if (next_page_no == FIL_NULL) {
+
+ return(volume);
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH,
+ mtr);
+
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+
+ next_page = buf_block_get_frame(block);
+ ut_ad(page_validate(next_page, ibuf->index));
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_infimum_rec(next_page);
+ rec = page_rec_get_next_const(rec);
+
+ for (;; rec = page_rec_get_next_const(rec)) {
+ ut_ad(page_align(rec) == next_page);
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* We give up */
+
+ return(UNIV_PAGE_SIZE);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(mtr, rec)
+ || space != ibuf_rec_get_space(mtr, rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_get_volume_buffered_count(
+ mtr, rec,
+ hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+ }
+}
+
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void)
+/*===============================*/
+{
+ ulint max_space_id;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ut_a(!dict_table_is_comp(ibuf->index->table));
+
+ ibuf_mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
+ btr_pcur_move_to_prev(&pcur, &mtr);
+
+ if (btr_pcur_is_before_first_on_page(&pcur)) {
+ /* The tree is empty */
+
+ max_space_id = 0;
+ } else {
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ max_space_id = mach_read_from_4(field);
+ }
+
+ ibuf_mtr_commit(&mtr);
+
+ /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
+
+ fil_set_max_space_id_if_bigger(max_space_id);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
+ ibuf_get_entry_counter_low_func(mtr,rec,space,page_no)
+#else /* UNIV_DEBUG */
+# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
+ ibuf_get_entry_counter_low_func(rec,space,page_no)
+#endif
+/****************************************************************//**
+Helper function for ibuf_get_entry_counter_func. Checks if rec is for
+(space, page_no), and if so, reads the counter value from it and returns
+that + 1.
+@retval ULINT_UNDEFINED if the record does not contain any counter
+@retval 0 if the record is not for (space, page_no)
+@retval 1 + previous counter value, otherwise */
+static
+ulint
+ibuf_get_entry_counter_low_func(
+/*============================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction of rec */
+#endif /* UNIV_DEBUG */
+ const rec_t* rec, /*!< in: insert buffer record */
+ ulint space, /*!< in: space id */
+ ulint page_no) /*!< in: page number */
+{
+ ulint counter;
+ const byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+ ut_a(len == 1);
+
+ /* Check the tablespace identifier. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != space) {
+
+ return(0);
+ }
+
+ /* Check the page offset. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
+ ut_a(len == 4);
+
+ if (mach_read_from_4(field) != page_no) {
+
+ return(0);
+ }
+
+ /* Check if the record contains a counter field. */
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
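+ /* In the new-style format, the first IBUF_REC_INFO_SIZE
+ bytes of this field are info bytes: a 2-byte operation
+ counter at IBUF_REC_OFFSET_COUNTER, the operation type at
+ IBUF_REC_OFFSET_TYPE, and flags at IBUF_REC_OFFSET_FLAGS;
+ the per-field type information follows. Old-style records
+ lack these bytes and therefore carry no counter. */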
+ switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
+ default:
+ ut_error;
+ case 0: /* ROW_FORMAT=REDUNDANT */
+ case 1: /* ROW_FORMAT=COMPACT */
+ return(ULINT_UNDEFINED);
+
+ case IBUF_REC_INFO_SIZE:
+ counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER);
+ ut_a(counter < 0xFFFF);
+ return(counter + 1);
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
+ ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf)
+#else /* UNIV_DEBUG */
+# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
+ ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf)
+#endif
+
+/****************************************************************//**
+Calculate the counter field for an entry based on the current
+last record in ibuf for (space, page_no).
+@return the counter field, or ULINT_UNDEFINED
+if we should abort this insertion to ibuf */
+static
+ulint
+ibuf_get_entry_counter_func(
+/*========================*/
+ ulint space, /*!< in: space id of entry */
+ ulint page_no, /*!< in: page number of entry */
+ const rec_t* rec, /*!< in: the record preceding the
+ insertion point */
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in: mini-transaction */
+#endif /* UNIV_DEBUG */
+ ibool only_leaf) /*!< in: TRUE if this is the only
+ leaf page that can contain entries
+ for (space,page_no), that is, there
+ was no exact match for (space,page_no)
+ in the node pointer */
+{
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_validate(page_align(rec), ibuf->index));
+
+ if (page_rec_is_supremum(rec)) {
+ /* This is just for safety. The record should be a
+ page infimum or a user record. */
+ ut_ad(0);
+ return(ULINT_UNDEFINED);
+ } else if (!page_rec_is_infimum(rec)) {
+ return(ibuf_get_entry_counter_low(mtr, rec, space, page_no));
+ } else if (only_leaf
+ || fil_page_get_prev(page_align(rec)) == FIL_NULL) {
+ /* The parent node pointer did not contain the
+ searched for (space, page_no), which means that the
+ search ended on the correct page regardless of the
+ counter value, and since we're at the infimum record,
+ there are no existing records. */
+ return(0);
+ } else {
+ /* We used to read the previous page here. It would
+ break the latching order, because the caller has
+ buffer-fixed an insert buffer bitmap page. */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/*********************************************************************//**
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible.
+@return DB_SUCCESS, DB_STRONG_FAIL or other error */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+ibuf_insert_low(
+/*============*/
+ ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
+ ibuf_op_t op, /*!< in: operation type */
+ ibool no_counter,
+ /*!< in: TRUE=use 5.0.3 format;
+ FALSE=allow delete buffering */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ ulint entry_size,
+ /*!< in: rec_get_converted_size(index, entry) */
+ dict_index_t* index, /*!< in: index where to insert; must not be
+ unique or clustered */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ big_rec_t* dummy_big_rec;
+ btr_pcur_t pcur;
+ btr_cur_t* cursor;
+ dtuple_t* ibuf_entry;
+ mem_heap_t* offsets_heap = NULL;
+ mem_heap_t* heap;
+ ulint* offsets = NULL;
+ ulint buffered;
+ lint min_n_recs;
+ rec_t* ins_rec;
+ ibool old_bit_value;
+ page_t* bitmap_page;
+ buf_block_t* block;
+ page_t* root;
+ dberr_t err;
+ ibool do_merge;
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED];
+ ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint n_stored;
+ mtr_t mtr;
+ mtr_t bitmap_mtr;
+
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!no_counter || op == IBUF_OP_INSERT);
+ ut_a(op < IBUF_OP_COUNT);
+
+ do_merge = FALSE;
+
+ /* Perform dirty reads of ibuf->size and ibuf->max_size, to
+ reduce ibuf_mutex contention. Given that ibuf->max_size and
+ ibuf->size fit in a machine word, this should be OK; at worst
+ we are doing some excessive ibuf_contract() or occasionally
+ skipping an ibuf_contract(). */
+ if (ibuf->max_size == 0) {
+ return(DB_STRONG_FAIL);
+ }
+
+ if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+ /* Insert buffer is now too big, contract it but do not try
+ to insert */
+
+#ifdef UNIV_IBUF_DEBUG
+ fputs("Ibuf too big\n", stderr);
+#endif
+ /* Use synchronous contract (== TRUE) */
+ ibuf_contract(TRUE);
+
+ return(DB_STRONG_FAIL);
+ }
+
+ heap = mem_heap_create(1024);
+
+ /* Build the entry which contains the space id and the page number
+ as the first fields and the type information for other fields, and
+ which will be inserted to the insert buffer. Using a counter value
+ of 0xFFFF we find the last record for (space, page_no), from which
+ we can then read the counter value N and use N + 1 in the record we
+ insert. (We patch the ibuf_entry's counter field to the correct
+ value just before actually inserting the entry.) */
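+ /* A counter value of 0xFFFF works as a search key because
+ real counters are always smaller (see the assertion in
+ ibuf_get_entry_counter_low_func()): with PAGE_CUR_LE the
+ cursor lands on the last existing record for (space,
+ page_no), if there is one. */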
+
+ ibuf_entry = ibuf_entry_build(
+ op, index, entry, space, page_no,
+ no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
+
+ /* Open a cursor to the insert buffer tree to calculate if we can add
+ the new entry to it without exceeding the free space limit for the
+ page. */
+
+ if (mode == BTR_MODIFY_TREE) {
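+ /* Make sure the insert buffer has enough free pages for
+ a possible tree split. ibuf_add_free_page() performs a
+ page allocation, so both mutexes are released around the
+ call and the check is then retried; on success the loop
+ exits with both mutexes still held. */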
+ for (;;) {
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+ mutex_enter(&ibuf_mutex);
+
+ if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
+
+ break;
+ }
+
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ if (UNIV_UNLIKELY(!ibuf_add_free_page())) {
+
+ mem_heap_free(heap);
+ return(DB_STRONG_FAIL);
+ }
+ }
+ }
+
+ ibuf_mtr_start(&mtr);
+
+ btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+ ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
+
+ /* Find out the volume of already buffered inserts for the same index
+ page */
+ min_n_recs = 0;
+ buffered = ibuf_get_volume_buffered(&pcur, space, page_no,
+ op == IBUF_OP_DELETE
+ ? &min_n_recs
+ : NULL, &mtr);
+
+ if (op == IBUF_OP_DELETE
+ && (min_n_recs < 2
+ || buf_pool_watch_occurred(space, page_no))) {
+ /* The page could become empty after the record is
+ deleted, or the page has been read in to the buffer
+ pool. Refuse to buffer the operation. */
+
+ /* The buffer pool watch is needed for IBUF_OP_DELETE
+ because of latching order considerations. We can
+ check buf_pool_watch_occurred() only after latching
+ the insert buffer B-tree pages that contain buffered
+ changes for the page. We never buffer IBUF_OP_DELETE,
+ unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
+ been previously buffered for the page. Because there
+ are buffered operations for the page, the insert
+ buffer B-tree page latches held by mtr will guarantee
+ that no changes for the user page will be merged
+ before mtr_commit(&mtr). We must not mtr_commit(&mtr)
+ until after the IBUF_OP_DELETE has been buffered. */
+
+fail_exit:
+ if (mode == BTR_MODIFY_TREE) {
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+ }
+
+ err = DB_STRONG_FAIL;
+ goto func_exit;
+ }
+
+ /* After this point, the page could still be loaded to the
+ buffer pool, but we do not have to care about it, since we are
+ holding a latch on the insert buffer leaf page that contains
+ buffered changes for (space, page_no). If the page enters the
+ buffer pool, buf_page_io_complete() for (space, page_no) will
+ have to acquire a latch on the same insert buffer leaf page,
+ which it cannot do until we have buffered the IBUF_OP_DELETE
+ and done mtr_commit(&mtr) to release the latch. */
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((buffered == 0) || ibuf_count_get(space, page_no));
+#endif
+ ibuf_mtr_start(&bitmap_mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+ zip_size, &bitmap_mtr);
+
+ /* We check if the index page is suitable for buffered entries */
+
+ if (buf_page_peek(space, page_no)
+ || lock_rec_expl_exist_on_page(space, page_no)) {
+
+ ibuf_mtr_commit(&bitmap_mtr);
+ goto fail_exit;
+ }
+
+ if (op == IBUF_OP_INSERT) {
+ ulint bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size, IBUF_BITMAP_FREE,
+ &bitmap_mtr);
+
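+ /* The bitmap stores only a coarse estimate of the free
+ space on the page. Check that everything already buffered
+ for the page, plus this entry and one page directory
+ slot, still fits within that estimate; otherwise trigger
+ a merge of the buffered changes instead of buffering
+ more. */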
+ if (buffered + entry_size + page_dir_calc_reserved_space(1)
+ > ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
+ /* Release the bitmap page latch early. */
+ ibuf_mtr_commit(&bitmap_mtr);
+
+ /* It may not fit */
+ do_merge = TRUE;
+
+ ibuf_get_merge_page_nos(FALSE,
+ btr_pcur_get_rec(&pcur), &mtr,
+ space_ids, space_versions,
+ page_nos, &n_stored);
+
+ goto fail_exit;
+ }
+ }
+
+ if (!no_counter) {
+ /* Patch correct counter value to the entry to
+ insert. This can change the insert position, which can
+ result in the need to abort in some cases. */
+ ulint counter = ibuf_get_entry_counter(
+ space, page_no, btr_pcur_get_rec(&pcur), &mtr,
+ btr_pcur_get_btr_cur(&pcur)->low_match
+ < IBUF_REC_FIELD_METADATA);
+ dfield_t* field;
+
+ if (counter == ULINT_UNDEFINED) {
+ ibuf_mtr_commit(&bitmap_mtr);
+ goto fail_exit;
+ }
+
+ field = dtuple_get_nth_field(
+ ibuf_entry, IBUF_REC_FIELD_METADATA);
+ mach_write_to_2(
+ (byte*) dfield_get_data(field)
+ + IBUF_REC_OFFSET_COUNTER, counter);
+ }
+
+ /* Set the bitmap bit denoting that the insert buffer contains
+ buffered entries for this index page, if the bit is not set yet */
+
+ old_bit_value = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, &bitmap_mtr);
+
+ if (!old_bit_value) {
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, TRUE,
+ &bitmap_mtr);
+ }
+
+ ibuf_mtr_commit(&bitmap_mtr);
+
+ cursor = btr_pcur_get_btr_cur(&pcur);
+
+ if (mode == BTR_MODIFY_PREV) {
+ err = btr_cur_optimistic_insert(
+ BTR_NO_LOCKING_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ block = btr_cur_get_block(cursor);
+ ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
+
+ /* If this is the root page, update ibuf->empty. */
+ if (UNIV_UNLIKELY(buf_block_get_page_no(block)
+ == FSP_IBUF_TREE_ROOT_PAGE_NO)) {
+ const page_t* root = buf_block_get_frame(block);
+
+ ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(root)
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ ibuf->empty = page_is_empty(root);
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* We acquire an x-latch to the root page before the insert,
+ because a pessimistic insert releases the tree x-latch,
+ which would cause the x-latching of the root after that to
+ break the latching order. */
+
+ root = ibuf_tree_root_get(&mtr);
+
+ err = btr_cur_optimistic_insert(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
+ cursor, &offsets, &offsets_heap,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ }
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+ ibuf_size_update(root, &mtr);
+ mutex_exit(&ibuf_mutex);
+ ibuf->empty = page_is_empty(root);
+
+ block = btr_cur_get_block(cursor);
+ ut_ad(buf_block_get_space(block) == IBUF_SPACE_ID);
+ }
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block, NULL,
+ thr_get_trx(thr)->id, &mtr);
+ }
+
+func_exit:
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (err == DB_SUCCESS) {
+ fprintf(stderr,
+ "Incrementing ibuf count of space %lu page %lu\n"
+ "from %lu by 1\n", space, page_no,
+ ibuf_count_get(space, page_no));
+
+ ibuf_count_set(space, page_no,
+ ibuf_count_get(space, page_no) + 1);
+ }
+#endif
+
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ mem_heap_free(heap);
+
+ if (err == DB_SUCCESS && mode == BTR_MODIFY_TREE) {
+ ibuf_contract_after_insert(entry_size);
+ }
+
+ if (do_merge) {
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+ buf_read_ibuf_merge_pages(false, space_ids, space_versions,
+ page_nos, n_stored);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible. Does not do it if the index
+is clustered or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+ ibuf_op_t op, /*!< in: operation type */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint entry_size;
+ ibool no_counter;
+ /* Read the settable global variable ibuf_use only once in
+ this function, so that we will have a consistent view of it. */
+ ibuf_use_t use = ibuf_use;
+ DBUG_ENTER("ibuf_insert");
+
+ DBUG_PRINT("ibuf", ("op: %d, space: %ld, page_no: %ld",
+ op, space, page_no));
+
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(ut_is_2pow(zip_size));
+
+ ut_a(!dict_index_is_clust(index));
+
+ no_counter = use <= IBUF_USE_INSERT;
+
+ switch (op) {
+ case IBUF_OP_INSERT:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_DELETE_MARK:
+ DBUG_RETURN(FALSE);
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ goto check_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_DELETE_MARK:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ DBUG_RETURN(FALSE);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto check_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_DELETE:
+ switch (use) {
+ case IBUF_USE_NONE:
+ case IBUF_USE_INSERT:
+ case IBUF_USE_INSERT_DELETE_MARK:
+ DBUG_RETURN(FALSE);
+ case IBUF_USE_DELETE_MARK:
+ case IBUF_USE_DELETE:
+ case IBUF_USE_ALL:
+ ut_ad(!no_counter);
+ goto skip_watch;
+ case IBUF_USE_COUNT:
+ break;
+ }
+ break;
+ case IBUF_OP_COUNT:
+ break;
+ }
+
+ /* unknown op or use */
+ ut_error;
+
+check_watch:
+ /* If a thread attempts to buffer an insert on a page while a
+ purge is in progress on the same page, the purge must not be
+ buffered, because it could remove a record that was
+ re-inserted later. For simplicity, we block the buffering of
+ all operations on a page that has a purge pending.
+
+ We do not check this in the IBUF_OP_DELETE case, because that
+ would always trigger the buffer pool watch during purge and
+ thus prevent the buffering of delete operations. We assume
+ that the issuer of IBUF_OP_DELETE has called
+ buf_pool_watch_set(space, page_no). */
+
+ {
+ buf_page_t* bpage;
+ buf_pool_t* buf_pool = buf_pool_get(space, page_no);
+ bpage = buf_page_get_also_watch(buf_pool, space, page_no);
+
+ if (UNIV_LIKELY_NULL(bpage)) {
+ /* A buffer pool watch has been set or the
+ page has been read into the buffer pool.
+ Do not buffer the request. If a purge operation
+ is being buffered, have this request executed
+ directly on the page in the buffer pool after the
+ buffered entries for this page have been merged. */
+ DBUG_RETURN(FALSE);
+ }
+ }
+
+skip_watch:
+ entry_size = rec_get_converted_size(index, entry, 0);
+
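+ /* Refuse to buffer an entry that takes up half or more of
+ an empty page: such a record could not be guaranteed to
+ fit on the index page when the buffered change is merged. */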
+ if (entry_size
+ >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
+ / 2) {
+
+ DBUG_RETURN(FALSE);
+ }
+
+ err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
+ entry, entry_size,
+ index, space, zip_size, page_no, thr);
+ if (err == DB_FAIL) {
+ err = ibuf_insert_low(BTR_MODIFY_TREE, op, no_counter,
+ entry, entry_size,
+ index, space, zip_size, page_no, thr);
+ }
+
+ if (err == DB_SUCCESS) {
+#ifdef UNIV_IBUF_DEBUG
+ /* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n",
+ page_no, index->name); */
+#endif
+ DBUG_RETURN(TRUE);
+
+ } else {
+ ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD);
+
+ DBUG_RETURN(FALSE);
+ }
+}
+
+/********************************************************************//**
+During merge, inserts a secondary index entry extracted from the insert
+buffer into an index page.
+@return newly inserted record */
+static __attribute__((nonnull))
+rec_t*
+ibuf_insert_to_index_page_low(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: buffered entry to insert */
+ buf_block_t* block, /*!< in/out: index page where the buffered
+ entry should be placed */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ mtr_t* mtr, /*!< in/out: mtr */
+ page_cur_t* page_cur)/*!< in/out: cursor positioned on the record
+ after which to insert the buffered entry */
+{
+ const page_t* page;
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+ const page_t* bitmap_page;
+ ulint old_bits;
+ rec_t* rec;
+ DBUG_ENTER("ibuf_insert_to_index_page_low");
+
+ rec = page_cur_tuple_insert(page_cur, entry, index,
+ offsets, &heap, 0, mtr);
+ if (rec != NULL) {
+ DBUG_RETURN(rec);
+ }
+
+ /* Page reorganization or recompression should already have
+ been attempted by page_cur_tuple_insert(). Besides, per
+ ibuf_index_page_calc_free_zip() the page should not have been
+ recompressed or reorganized. */
+ ut_ad(!buf_block_get_page_zip(block));
+
+ /* If the record did not fit, reorganize */
+
+ btr_page_reorganize(page_cur, index, mtr);
+
+ /* This time the record must fit */
+
+ rec = page_cur_tuple_insert(page_cur, entry, index,
+ offsets, &heap, 0, mtr);
+ if (rec != NULL) {
+ DBUG_RETURN(rec);
+ }
+
+ page = buf_block_get_frame(block);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Insert buffer insert fails;"
+ " page free %lu, dtuple size %lu\n",
+ (ulong) page_get_max_insert_size(page, 1),
+ (ulong) rec_get_converted_size(index, entry, 0));
+ fputs("InnoDB: Cannot insert index record ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\nInnoDB: The table where this index record belongs\n"
+ "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
+ "InnoDB: that table.\n", stderr);
+
+ space = page_get_space_id(page);
+ zip_size = buf_block_get_zip_size(block);
+ page_no = page_get_page_no(page);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+ old_bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, mtr);
+
+ fprintf(stderr,
+ "InnoDB: space %lu, page %lu, zip_size %lu, bitmap bits %lu\n",
+ (ulong) space, (ulong) page_no,
+ (ulong) zip_size, (ulong) old_bits);
+
+ fputs("InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ ut_ad(0);
+ DBUG_RETURN(NULL);
+}
+
+/********************************************************************//**
+During merge, inserts a secondary index entry extracted from the insert
+buffer into an index page. */
+static
+void
+ibuf_insert_to_index_page(
+/*======================*/
+ const dtuple_t* entry, /*!< in: buffered entry to insert */
+ buf_block_t* block, /*!< in/out: index page where the buffered entry
+ should be placed */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec;
+ ulint* offsets;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("ibuf_insert_to_index_page");
+
+ DBUG_PRINT("ibuf", ("page_no: %ld", buf_block_get_page_no(block)));
+ DBUG_PRINT("ibuf", ("index name: %s", index->name));
+ DBUG_PRINT("ibuf", ("online status: %d",
+ dict_index_get_online_status(index)));
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(!buf_block_align(page)->index);
+
+ if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
+ != (ibool)!!page_is_comp(page))) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the 'compact' flag does not match!\n",
+ stderr);
+ goto dump;
+ }
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ if (page_rec_is_supremum(rec)) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the index page is empty!\n",
+ stderr);
+ goto dump;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_fields(rec, index)
+ != dtuple_get_n_fields(entry))) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the number of fields does not match!\n",
+ stderr);
+dump:
+ buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
+
+ dtuple_print(stderr, entry);
+ ut_ad(0);
+
+ fputs("InnoDB: The table where where"
+ " this index record belongs\n"
+ "InnoDB: is now probably corrupt."
+ " Please run CHECK TABLE on\n"
+ "InnoDB: your tables.\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com!\n", stderr);
+
+ DBUG_VOID_RETURN;
+ }
+
+ low_match = page_cur_search(block, index, entry,
+ PAGE_CUR_LE, &page_cur);
+
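+ /* Size the heap for the worst case of the update path
+ below: the offsets array plus an update vector with one
+ upd_field_t per field of the entry. */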
+ heap = mem_heap_create(
+ sizeof(upd_t)
+ + REC_OFFS_HEADER_SIZE * sizeof(*offsets)
+ + dtuple_get_n_fields(entry)
+ * (sizeof(upd_field_t) + sizeof *offsets));
+
+ if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) {
+ upd_t* update;
+ page_zip_des_t* page_zip;
+
+ rec = page_cur_get_rec(&page_cur);
+
+ /* This is based on
+ row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */
+ ut_ad(rec_get_deleted_flag(rec, page_is_comp(page)));
+
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED,
+ &heap);
+ update = row_upd_build_sec_rec_difference_binary(
+ rec, index, offsets, entry, heap);
+
+ page_zip = buf_block_get_page_zip(block);
+
+ if (update->n_fields == 0) {
+ /* The records only differ in the delete-mark.
+ Clear the delete-mark, like we did before
+ Bug #56680 was fixed. */
+ btr_cur_set_deleted_flag_for_ibuf(
+ rec, page_zip, FALSE, mtr);
+ goto updated_in_place;
+ }
+
+ /* Copy the info bits. Clear the delete-mark. */
+ update->info_bits = rec_get_info_bits(rec, page_is_comp(page));
+ update->info_bits &= ~REC_INFO_DELETED_FLAG;
+
+ /* We cannot invoke btr_cur_optimistic_update() here,
+ because we do not have a btr_cur_t or que_thr_t,
+ as the insert buffer merge occurs at a very low level. */
+ if (!row_upd_changes_field_size_or_external(index, offsets,
+ update)
+ && (!page_zip || btr_cur_update_alloc_zip(
+ page_zip, &page_cur, index, offsets,
+ rec_offs_size(offsets), false, mtr))) {
+ /* This is the easy case. Do something similar
+ to btr_cur_update_in_place(). */
+ rec = page_cur_get_rec(&page_cur);
+ row_upd_rec_in_place(rec, index, offsets,
+ update, page_zip);
+
+ /* Log the update in place operation. During recovery
+ MLOG_COMP_REC_UPDATE_IN_PLACE/MLOG_REC_UPDATE_IN_PLACE
+ expects trx_id, roll_ptr for secondary indexes. So we
+ just write dummy trx_id(0), roll_ptr(0) */
+ btr_cur_update_in_place_log(BTR_KEEP_SYS_FLAG, rec,
+ index, update, 0, 0, mtr);
+ DBUG_EXECUTE_IF(
+ "crash_after_log_ibuf_upd_inplace",
+ log_buffer_flush_to_disk();
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Wrote log record for ibuf update in "
+ "place operation");
+ DBUG_SUICIDE();
+ );
+
+ goto updated_in_place;
+ }
+
+ /* btr_cur_update_alloc_zip() may have changed this */
+ rec = page_cur_get_rec(&page_cur);
+
+ /* A collation may identify values that differ in
+ storage length.
+ Some examples (1 or 2 bytes):
+ utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I
+ utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S
+ utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
+
+ latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S
+
+ Examples of a character (3-byte UTF-8 sequence)
+ identified with 2 or 4 characters (1-byte UTF-8 sequences):
+
+ utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO
+ utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN
+ */
+
+ /* Delete the different-length record, and insert the
+ buffered one. */
+
+ lock_rec_store_on_page_infimum(block, rec);
+ page_cur_delete_rec(&page_cur, index, offsets, mtr);
+ page_cur_move_to_prev(&page_cur);
+ rec = ibuf_insert_to_index_page_low(entry, block, index,
+ &offsets, heap, mtr,
+ &page_cur);
+
+ ut_ad(!cmp_dtuple_rec(entry, rec, offsets));
+ lock_rec_restore_from_page_infimum(block, rec, block);
+ } else {
+ offsets = NULL;
+ ibuf_insert_to_index_page_low(entry, block, index,
+ &offsets, heap, mtr,
+ &page_cur);
+ }
+updated_in_place:
+ mem_heap_free(heap);
+
+ DBUG_VOID_RETURN;
+}
+
+/****************************************************************//**
+During merge, sets the delete mark on a record for a secondary index
+entry. */
+static
+void
+ibuf_set_del_mark(
+/*==============*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+
+ low_match = page_cur_search(
+ block, index, entry, PAGE_CUR_LE, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ rec_t* rec;
+ page_zip_des_t* page_zip;
+
+ rec = page_cur_get_rec(&page_cur);
+ page_zip = page_cur_get_page_zip(&page_cur);
+
+ /* Delete mark the old index record. According to a
+ comment in row_upd_sec_index_entry(), it can already
+ have been delete marked if a lock wait occurred in
+ row_ins_sec_index_entry() in a previous invocation of
+ row_upd_sec_index_entry(). */
+
+ if (UNIV_LIKELY
+ (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table)))) {
+ btr_cur_set_deleted_flag_for_ibuf(rec, page_zip,
+ TRUE, mtr);
+ }
+ } else {
+ const page_t* page
+ = page_cur_get_page(&page_cur);
+ const buf_block_t* block
+ = page_cur_get_block(&page_cur);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: unable to find a record to delete-mark\n",
+ stderr);
+ fputs("InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, page_cur_get_rec(&page_cur), index);
+ fprintf(stderr, "\nspace %u offset %u"
+ " (%u records, index id %llu)\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ (unsigned) buf_block_get_space(block),
+ (unsigned) buf_block_get_page_no(block),
+ (unsigned) page_get_n_recs(page),
+ (ulonglong) btr_page_get_index_id(page));
+ ut_ad(0);
+ }
+}
+
+/****************************************************************//**
+During merge, delete a record for a secondary index entry. */
+static
+void
+ibuf_delete(
+/*========*/
+ const dtuple_t* entry, /*!< in: entry */
+ buf_block_t* block, /*!< in/out: block */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in/out: mtr; must be committed
+ before latching any further pages */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(dtuple_check_typed(entry));
+
+ low_match = page_cur_search(
+ block, index, entry, PAGE_CUR_LE, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec = page_cur_get_rec(&page_cur);
+
+ /* TODO: the below should probably be a separate function,
+ it's a bastardized version of btr_cur_optimistic_delete. */
+
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ ulint max_ins_size = 0;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (page_get_n_recs(page) <= 1
+ || !(REC_INFO_DELETED_FLAG
+ & rec_get_info_bits(rec, page_is_comp(page)))) {
+ /* Refuse to purge the last record or a
+ record that has not been marked for deletion. */
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: unable to purge a record\n",
+ stderr);
+ fputs("InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ fprintf(stderr, "\nspace %u offset %u"
+ " (%u records, index id %llu)\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ (unsigned) buf_block_get_space(block),
+ (unsigned) buf_block_get_page_no(block),
+ (unsigned) page_get_n_recs(page),
+ (ulonglong) btr_page_get_index_id(page));
+
+ ut_ad(0);
+ return;
+ }
+
+ lock_update_delete(block, rec);
+
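+ /* For uncompressed pages, sample the maximum insert size
+ before the record is deleted, so that
+ ibuf_update_free_bits_low() below can tell whether the
+ free-space category of the page has changed. */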
+ if (!page_zip) {
+ max_ins_size
+ = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&page_cur, index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_zip) {
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ ibuf_update_free_bits_low(block, max_ins_size, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* The record must have been purged already. */
+ }
+}
+
+/*********************************************************************//**
+Restores the insert buffer tree cursor position.
+@return TRUE if the position was restored; FALSE if not */
+static __attribute__((nonnull))
+ibool
+ibuf_restore_pos(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where the record
+ should belong */
+ const dtuple_t* search_tuple,
+ /*!< in: search tuple for entries of page_no */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
+ position is to be restored */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
+
+ if (btr_pcur_restore_position(mode, pcur, mtr)) {
+
+ return(TRUE);
+ }
+
+ if (fil_space_get_flags(space) == ULINT_UNDEFINED) {
+ /* The tablespace has been dropped. It is possible
+ that another thread has deleted the insert buffer
+ entry. Do not complain. */
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+ } else {
+ fprintf(stderr,
+ "InnoDB: ERROR: Submit the output to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: ibuf cursor restoration fails!\n"
+ "InnoDB: ibuf record inserted to page %lu:%lu\n",
+ (ulong) space, (ulong) page_no);
+ fflush(stderr);
+
+ rec_print_old(stderr, btr_pcur_get_rec(pcur));
+ rec_print_old(stderr, pcur->old_rec);
+ dtuple_print(stderr, search_tuple);
+
+ rec_print_old(stderr,
+ page_rec_get_next(btr_pcur_get_rec(pcur)));
+ fflush(stderr);
+
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+ ut_ad(0);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Deletes from ibuf the record on which pcur is positioned. If we have to
+resort to a pessimistic delete, this function commits mtr and closes
+the cursor.
+@return TRUE if mtr was committed and pcur closed in this operation */
+static __attribute__((warn_unused_result))
+ibool
+ibuf_delete_rec(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number that the record
+ should belong to */
+ btr_pcur_t* pcur, /*!< in: pcur positioned on the record to
+ delete, having latch mode BTR_MODIFY_LEAF */
+ const dtuple_t* search_tuple,
+ /*!< in: search tuple for entries of page_no */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool success;
+ page_t* root;
+ dberr_t err;
+
+ ut_ad(ibuf_inside(mtr));
+ ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+ ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
+ ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ if (ibuf_debug == 2) {
+ /* Inject a fault (crash). We do this before trying
+ optimistic delete, because a pessimistic delete in the
+ change buffer would require a larger test case. */
+
+ /* Flag the buffered record as processed, to avoid
+ an assertion failure after crash recovery. */
+ btr_cur_set_deleted_flag_for_ibuf(
+ btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
+ ibuf_mtr_commit(mtr);
+ log_write_up_to(LSN_MAX, LOG_WAIT_ALL_GROUPS, TRUE);
+ DBUG_SUICIDE();
+ }
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+ success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
+ 0, mtr);
+
+ if (success) {
+ if (page_is_empty(btr_pcur_get_page(pcur))) {
+ /* If a B-tree page is empty, it must be the root page
+ and the whole B-tree must be empty. InnoDB does not
+ allow empty B-tree pages other than the root. */
+ root = btr_pcur_get_page(pcur);
+
+ ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
+ ut_ad(page_get_page_no(root)
+ == FSP_IBUF_TREE_ROOT_PAGE_NO);
+
+ /* ibuf->empty is protected by the root page latch.
+ Before the deletion, it had to be FALSE. */
+ ut_ad(!ibuf->empty);
+ ibuf->empty = true;
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ fprintf(stderr,
+ "Decrementing ibuf count of space %lu page %lu\n"
+ "from %lu by 1\n", space, page_no,
+ ibuf_count_get(space, page_no));
+ ibuf_count_set(space, page_no,
+ ibuf_count_get(space, page_no) - 1);
+#endif
+ return(FALSE);
+ }
+
+ ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+ ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
+ ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
+
+ /* We have to resort to a pessimistic delete from ibuf.
+ Delete-mark the record so that it will not be applied again,
+ in case the server crashes before the pessimistic delete is
+ made persistent. */
+ btr_cur_set_deleted_flag_for_ibuf(
+ btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
+
+ btr_pcur_store_position(pcur, mtr);
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+
+ ibuf_mtr_start(mtr);
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_restore_pos(space, page_no, search_tuple,
+ BTR_MODIFY_TREE, pcur, mtr)) {
+
+ mutex_exit(&ibuf_mutex);
+ ut_ad(mtr->state == MTR_COMMITTED);
+ goto func_exit;
+ }
+
+ root = ibuf_tree_root_get(mtr);
+
+ btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
+ RB_NONE, mtr);
+ ut_a(err == DB_SUCCESS);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1);
+#endif
+ ibuf_size_update(root, mtr);
+ mutex_exit(&ibuf_mutex);
+
+ ibuf->empty = page_is_empty(root);
+ ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
+
+func_exit:
+ ut_ad(mtr->state == MTR_COMMITTED);
+ btr_pcur_close(pcur);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ buf_block_t* block, /*!< in: if page has been read from
+ disk, pointer to the page x-latched,
+ else NULL */
+ ulint space, /*!< in: space id of the index page */
+ ulint page_no,/*!< in: page number of the index page */
+ ulint zip_size,/*!< in: compressed page size in bytes,
+ or 0 */
+ ibool update_ibuf_bitmap)/*!< in: normally this is set
+ to TRUE, but if we have deleted or are
+ deleting the tablespace, then we
+ naturally do not want to update a
+ non-existent bitmap page */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+#ifdef UNIV_IBUF_DEBUG
+ ulint volume = 0;
+#endif
+ page_zip_des_t* page_zip = NULL;
+ ibool tablespace_being_deleted = FALSE;
+ ibool corruption_noticed = FALSE;
+ mtr_t mtr;
+
+ /* Counts for merged & discarded operations. */
+ ulint mops[IBUF_OP_COUNT];
+ ulint dops[IBUF_OP_COUNT];
+
+ ut_ad(!block || buf_block_get_space(block) == space);
+ ut_ad(!block || buf_block_get_page_no(block) == page_no);
+ ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
+ ut_ad(!block || buf_block_get_io_fix(block) == BUF_IO_READ);
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
+ || trx_sys_hdr_page(space, page_no)) {
+ return;
+ }
+
+ /* We cannot refer to zip_size in the following, because
+ zip_size is passed as ULINT_UNDEFINED (it is unknown) when
+ buf_read_ibuf_merge_pages() is merging (discarding) changes
+ for a dropped tablespace. When block != NULL or
+ update_ibuf_bitmap is specified, the zip_size must be known.
+ That is why we will repeat the check below, with zip_size in
+ place of 0. Passing zip_size as 0 assumes that the
+ uncompressed page size always is a power-of-2 multiple of the
+ compressed page size. */
+
+ if (ibuf_fixed_addr_page(space, 0, page_no)
+ || fsp_descr_page(0, page_no)) {
+ return;
+ }
+
+ if (UNIV_LIKELY(update_ibuf_bitmap)) {
+ ut_a(ut_is_2pow(zip_size));
+
+ if (ibuf_fixed_addr_page(space, zip_size, page_no)
+ || fsp_descr_page(zip_size, page_no)) {
+ return;
+ }
+
+ /* If the following returns FALSE, we get the counter
+ incremented, and must decrement it when we leave this
+ function. When the counter is > 0, that prevents the tablespace
+ from being dropped. */
+
+ tablespace_being_deleted = fil_inc_pending_ops(space, true);
+
+ if (UNIV_UNLIKELY(tablespace_being_deleted)) {
+ /* Do not try to read the bitmap page from space;
+ just delete the ibuf records for the page */
+
+ block = NULL;
+ update_ibuf_bitmap = FALSE;
+ } else {
+ page_t* bitmap_page;
+ ulint bitmap_bits;
+
+ ibuf_mtr_start(&mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space, page_no, zip_size, &mtr);
+ bitmap_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr);
+
+ ibuf_mtr_commit(&mtr);
+
+ if (!bitmap_bits) {
+ /* No inserts buffered for this page */
+
+ if (!tablespace_being_deleted) {
+ fil_decr_pending_ops(space);
+ }
+
+ return;
+ }
+ }
+ } else if (block
+ && (ibuf_fixed_addr_page(space, zip_size, page_no)
+ || fsp_descr_page(zip_size, page_no))) {
+
+ return;
+ }
+
+ heap = mem_heap_create(512);
+
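+ /* The search tuple holds just enough of the ibuf record
+ key to identify (space, page_no); positioning the cursor
+ with PAGE_CUR_GE below thus finds the first buffered entry
+ for this index page. */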
+ search_tuple = ibuf_search_tuple_build(space, page_no, heap);
+
+ if (block) {
+ /* Move the ownership of the x-latch on the page to this OS
+ thread, so that we can acquire a second x-latch on it. This
+ is needed for the insert operations to the index page to pass
+ the debug checks. */
+
+ rw_lock_x_lock_move_ownership(&(block->lock));
+ page_zip = buf_block_get_page_zip(block);
+
+ if (UNIV_UNLIKELY(fil_page_get_type(block->frame)
+ != FIL_PAGE_INDEX)
+ || UNIV_UNLIKELY(!page_is_leaf(block->frame))) {
+
+ page_t* bitmap_page;
+
+ corruption_noticed = TRUE;
+
+ ut_print_timestamp(stderr);
+
+ ibuf_mtr_start(&mtr);
+
+ fputs(" InnoDB: Dump of the ibuf bitmap page:\n",
+ stderr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+ zip_size, &mtr);
+ buf_page_print(bitmap_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ ibuf_mtr_commit(&mtr);
+
+ fputs("\nInnoDB: Dump of the page:\n", stderr);
+
+ buf_page_print(block->frame, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+
+ fprintf(stderr,
+ "InnoDB: Error: corruption in the tablespace."
+ " Bitmap shows insert\n"
+ "InnoDB: buffer records to page n:o %lu"
+ " though the page\n"
+ "InnoDB: type is %lu, which is"
+ " not an index leaf page!\n"
+ "InnoDB: We try to resolve the problem"
+ " by skipping the insert buffer\n"
+ "InnoDB: merge for this page."
+ " Please run CHECK TABLE on your tables\n"
+ "InnoDB: to determine if they are corrupt"
+ " after this.\n\n"
+ "InnoDB: Please submit a detailed bug report"
+ " to http://bugs.mysql.com\n\n",
+ (ulong) page_no,
+ (ulong)
+ fil_page_get_type(block->frame));
+ ut_ad(0);
+ }
+ }
+
+ memset(mops, 0, sizeof(mops));
+ memset(dops, 0, sizeof(dops));
+
+loop:
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for this
+ index page */
+ btr_pcur_open_on_user_rec(
+ ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (block) {
+ ibool success;
+
+ success = buf_page_get_known_nowait(
+ RW_X_LATCH, block,
+ BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);
+
+ ut_a(success);
+
+ /* This is a user page (secondary index leaf page),
+ but we pretend that it is a change buffer page in
+ order to obey the latching order. This should be OK,
+ because buffered changes are applied immediately while
+ the block is io-fixed. Other threads must not try to
+ latch an io-fixed block. */
+ buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+ }
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+ goto reset_bit;
+ }
+
+ for (;;) {
+ rec_t* rec;
+
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this index page */
+ if (ibuf_rec_get_page_no(&mtr, rec) != page_no
+ || ibuf_rec_get_space(&mtr, rec) != space) {
+
+ if (block) {
+ page_header_reset_last_insert(
+ block->frame, page_zip, &mtr);
+ }
+
+ goto reset_bit;
+ }
+
+ if (UNIV_UNLIKELY(corruption_noticed)) {
+ fputs("InnoDB: Discarding record\n ", stderr);
+ rec_print_old(stderr, rec);
+ fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
+ } else if (block && !rec_get_deleted_flag(rec, 0)) {
+ /* Now we have at pcur a record which should be
+ applied on the index page; NOTE that the call below
+ copies pointers to fields in rec, and we must
+ keep the latch to the rec page until the
+ insertion is finished! */
+ dtuple_t* entry;
+ trx_id_t max_trx_id;
+ dict_index_t* dummy_index;
+ ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec);
+
+ max_trx_id = page_get_max_trx_id(page_align(rec));
+ page_update_max_trx_id(block, page_zip, max_trx_id,
+ &mtr);
+
+ ut_ad(page_validate(page_align(rec), ibuf->index));
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ &mtr, rec, heap, &dummy_index);
+
+ ut_ad(page_validate(block->frame, dummy_index));
+
+ switch (op) {
+ ibool success;
+ case IBUF_OP_INSERT:
+#ifdef UNIV_IBUF_DEBUG
+ volume += rec_get_converted_size(
+ dummy_index, entry, 0);
+
+ volume += page_dir_calc_reserved_space(1);
+
+ ut_a(volume <= 4 * UNIV_PAGE_SIZE
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+#endif
+ ibuf_insert_to_index_page(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE_MARK:
+ ibuf_set_del_mark(
+ entry, block, dummy_index, &mtr);
+ break;
+
+ case IBUF_OP_DELETE:
+ ibuf_delete(entry, block, dummy_index, &mtr);
+ /* Because ibuf_delete() will latch an
+ insert buffer bitmap page, commit mtr
+ before latching any further pages.
+ Store and restore the cursor position. */
+ ut_ad(rec == btr_pcur_get_rec(&pcur));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(ibuf_rec_get_page_no(&mtr, rec)
+ == page_no);
+ ut_ad(ibuf_rec_get_space(&mtr, rec) == space);
+
+ /* Mark the change buffer record processed,
+ so that it will not be merged again in case
+ the server crashes between the following
+ mtr_commit() and the subsequent mtr_commit()
+ of deleting the change buffer record. */
+
+ btr_cur_set_deleted_flag_for_ibuf(
+ btr_pcur_get_rec(&pcur), NULL,
+ TRUE, &mtr);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ ibuf_mtr_start(&mtr);
+
+ success = buf_page_get_known_nowait(
+ RW_X_LATCH, block,
+ BUF_KEEP_OLD,
+ __FILE__, __LINE__, &mtr);
+ ut_a(success);
+
+ /* This is a user page (secondary
+ index leaf page), but it should be OK
+ to use too low latching order for it,
+ as the block is io-fixed. */
+ buf_block_dbg_add_level(
+ block, SYNC_IBUF_TREE_NODE);
+
+ if (!ibuf_restore_pos(space, page_no,
+ search_tuple,
+ BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
+
+ ut_ad(mtr.state == MTR_COMMITTED);
+ mops[op]++;
+ ibuf_dummy_index_free(dummy_index);
+ goto loop;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+
+ mops[op]++;
+
+ ibuf_dummy_index_free(dummy_index);
+ } else {
+ dops[ibuf_rec_get_op_type(&mtr, rec)]++;
+ }
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+ &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.state == MTR_COMMITTED);
+ goto loop;
+ } else if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto loop;
+ }
+ }
+
+reset_bit:
+ if (UNIV_LIKELY(update_ibuf_bitmap)) {
+ page_t* bitmap_page;
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space, page_no, zip_size, &mtr);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+
+ if (block) {
+ ulint old_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, &mtr);
+
+ ulint new_bits = ibuf_index_page_calc_free(
+ zip_size, block);
+
+ if (old_bits != new_bits) {
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, new_bits, &mtr);
+ }
+ }
+ }
+
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+ mem_heap_free(heap);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_increment_ulint(&ibuf->n_merges, 1);
+ ibuf_add_ops(ibuf->n_merged_ops, mops);
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
+#else /* HAVE_ATOMIC_BUILTINS */
+ /* Protect our statistics keeping from race conditions */
+ mutex_enter(&ibuf_mutex);
+
+ ibuf->n_merges++;
+ ibuf_add_ops(ibuf->n_merged_ops, mops);
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
+
+ mutex_exit(&ibuf_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ if (update_ibuf_bitmap && !tablespace_being_deleted) {
+
+ fil_decr_pending_ops(space);
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, page_no) == 0);
+#endif
+}
+
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space) /*!< in: space id */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+ const rec_t* ibuf_rec;
+ ulint page_no;
+ mtr_t mtr;
+
+ /* Counts for discarded operations. */
+ ulint dops[IBUF_OP_COUNT];
+
+ heap = mem_heap_create(512);
+
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ search_tuple = ibuf_search_tuple_build(space, 0, heap);
+
+ memset(dops, 0, sizeof(dops));
+loop:
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ btr_pcur_open_on_user_rec(
+ ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space */
+ if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+
+ dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+ &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.state == MTR_COMMITTED);
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto loop;
+ }
+ }
+
+leave_loop:
+ ibuf_mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
+#else /* HAVE_ATOMIC_BUILTINS */
+ /* Protect our statistics keeping from race conditions */
+ mutex_enter(&ibuf_mutex);
+ ibuf_add_ops(ibuf->n_discarded_ops, dops);
+ mutex_exit(&ibuf_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ mem_heap_free(heap);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return true if empty */
+UNIV_INTERN
+bool
+ibuf_is_empty(void)
+/*===============*/
+{
+ bool is_empty;
+ const page_t* root;
+ mtr_t mtr;
+
+ ibuf_mtr_start(&mtr);
+
+ mutex_enter(&ibuf_mutex);
+ root = ibuf_tree_root_get(&mtr);
+ mutex_exit(&ibuf_mutex);
+
+ is_empty = page_is_empty(root);
+ ut_a(is_empty == ibuf->empty);
+ ibuf_mtr_commit(&mtr);
+
+ return(is_empty);
+}
+
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ulint i;
+ ulint j;
+#endif
+
+ mutex_enter(&ibuf_mutex);
+
+ fprintf(file,
+ "Ibuf: size %lu, free list len %lu,"
+ " seg size %lu, %lu merges\n",
+ (ulong) ibuf->size,
+ (ulong) ibuf->free_list_len,
+ (ulong) ibuf->seg_size,
+ (ulong) ibuf->n_merges);
+
+ fputs("merged operations:\n ", file);
+ ibuf_print_ops(ibuf->n_merged_ops, file);
+
+ fputs("discarded operations:\n ", file);
+ ibuf_print_ops(ibuf->n_discarded_ops, file);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
+ for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
+ ulint count = ibuf_count_get(i, j);
+
+ if (count > 0) {
+ fprintf(stderr,
+ "Ibuf count for space/page %lu/%lu"
+ " is %lu\n",
+ (ulong) i, (ulong) j, (ulong) count);
+ }
+ }
+ }
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+ mutex_exit(&ibuf_mutex);
+}
+
+/******************************************************************//**
+Checks the insert buffer bitmaps on IMPORT TABLESPACE.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+ibuf_check_bitmap_on_import(
+/*========================*/
+ const trx_t* trx, /*!< in: transaction */
+ ulint space_id) /*!< in: tablespace identifier */
+{
+ ulint zip_size;
+ ulint page_size;
+ ulint size;
+ ulint page_no;
+
+ ut_ad(space_id);
+ ut_ad(trx->mysql_thd);
+
+ zip_size = fil_space_get_zip_size(space_id);
+
+ if (zip_size == ULINT_UNDEFINED) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ size = fil_space_get_size(space_id);
+
+ if (size == 0) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ mutex_enter(&ibuf_mutex);
+
+ page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;
+
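+ /* Each run of page_size pages contains an ibuf bitmap page
+ (at offset FSP_IBUF_BITMAP_OFFSET within the run) that
+ describes the whole run. The outer loop therefore advances
+ one run at a time, and the inner loop checks the bits of
+ every page that the bitmap page describes. */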
+ for (page_no = 0; page_no < size; page_no += page_size) {
+ mtr_t mtr;
+ page_t* bitmap_page;
+ ulint i;
+
+ if (trx_is_interrupted(trx)) {
+ mutex_exit(&ibuf_mutex);
+ return(DB_INTERRUPTED);
+ }
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ ibuf_enter(&mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space_id, page_no, zip_size, &mtr);
+
+ for (i = FSP_IBUF_BITMAP_OFFSET + 1; i < page_size; i++) {
+ const ulint offset = page_no + i;
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page, offset, zip_size,
+ IBUF_BITMAP_IBUF, &mtr)) {
+
+ mutex_exit(&ibuf_mutex);
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Space %u page %u"
+ " is wrongly flagged to belong to the"
+ " insert buffer",
+ (unsigned) space_id,
+ (unsigned) offset);
+
+ return(DB_CORRUPTION);
+ }
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page, offset, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr)) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Buffered changes"
+ " for space %u page %u are lost",
+ (unsigned) space_id,
+ (unsigned) offset);
+
+ /* Tolerate this error, so that
+ slightly corrupted tables can be
+ imported and dumped. Clear the bit. */
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, offset, zip_size,
+ IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+ }
+ }
+
+ ibuf_exit(&mtr);
+ mtr_commit(&mtr);
+ }
+
+ mutex_exit(&ibuf_mutex);
+ return(DB_SUCCESS);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/api0api.h b/storage/innobase/include/api0api.h
new file mode 100644
index 00000000000..d77d691becc
--- /dev/null
+++ b/storage/innobase/include/api0api.h
@@ -0,0 +1,1304 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/api0api.h
+InnoDB Native API
+
+2008-08-01 Created by Sunny Bains.
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+*******************************************************/
+
+#ifndef api0api_h
+#define api0api_h
+
+#include "db0err.h"
+#include <stdio.h>
+
+#ifdef _MSC_VER
+#define strncasecmp _strnicmp
+#define strcasecmp _stricmp
+#endif
+
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+#define UNIV_NO_IGNORE __attribute__ ((warn_unused_result))
+#else
+#define UNIV_NO_IGNORE
+#endif /* __GNUC__ && __GNUC__ > 2 && !__INTEL_COMPILER */
+
+/* See comment about ib_bool_t as to why the two macros are unsigned long. */
+/** The boolean value of "true" used internally within InnoDB */
+#define IB_TRUE 0x1UL
+/** The boolean value of "false" used internally within InnoDB */
+#define IB_FALSE 0x0UL
+
+/* Basic types used by the InnoDB API. */
+/** All InnoDB error codes are represented by ib_err_t */
+typedef enum dberr_t ib_err_t;
+/** Representation of a byte within InnoDB */
+typedef unsigned char ib_byte_t;
+/** Representation of an unsigned long int within InnoDB */
+typedef unsigned long int ib_ulint_t;
+
+/* We assume C99 support except when using Visual Studio. */
+#if !defined(_MSC_VER)
+#include <stdint.h>
+#endif /* _MSC_VER */
+
+/* Integer types used by the API. Microsoft VS defines its own types
+and we use the Microsoft types when building with Visual Studio. */
+#if defined(_MSC_VER)
+/** A signed 8 bit integral type. */
+typedef __int8 ib_i8_t;
+#else
+/** A signed 8 bit integral type. */
+typedef int8_t ib_i8_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 8 bit integral type. */
+typedef unsigned __int8 ib_u8_t;
+#else
+/** An unsigned 8 bit integral type. */
+typedef uint8_t ib_u8_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 16 bit integral type. */
+typedef __int16 ib_i16_t;
+#else
+/** A signed 16 bit integral type. */
+typedef int16_t ib_i16_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 16 bit integral type. */
+typedef unsigned __int16 ib_u16_t;
+#else
+/** An unsigned 16 bit integral type. */
+typedef uint16_t ib_u16_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 32 bit integral type. */
+typedef __int32 ib_i32_t;
+#else
+/** A signed 32 bit integral type. */
+typedef int32_t ib_i32_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 32 bit integral type. */
+typedef unsigned __int32 ib_u32_t;
+#else
+/** An unsigned 32 bit integral type. */
+typedef uint32_t ib_u32_t;
+#endif
+
+#if defined(_MSC_VER)
+/** A signed 64 bit integral type. */
+typedef __int64 ib_i64_t;
+#else
+/** A signed 64 bit integral type. */
+typedef int64_t ib_i64_t;
+#endif
+
+#if defined(_MSC_VER)
+/** An unsigned 64 bit integral type. */
+typedef unsigned __int64 ib_u64_t;
+#else
+/** An unsigned 64 bit integral type. */
+typedef uint64_t ib_u64_t;
+#endif
+
+typedef void* ib_opaque_t;
+typedef ib_opaque_t ib_charset_t;
+typedef ib_ulint_t ib_bool_t;
+typedef ib_u64_t ib_id_u64_t;
+
+/** @enum ib_cfg_type_t Possible types for a configuration variable. */
+typedef enum {
+ IB_CFG_IBOOL, /*!< The configuration parameter is
+ of type ibool */
+
+ /* XXX Can we avoid having different types for ulint and ulong?
+ - On Win64 "unsigned long" is 32 bits
+ - ulong is always defined as "unsigned long"
+ - On Win64 ulint is defined as 64 bit integer
+ => On Win64 ulint != ulong.
+ If we typecast all ulong and ulint variables to the smaller type
+ ulong, then we will cut the range of the ulint variables.
+ This is not a problem for most ulint variables because their max
+ allowed values do not exceed 2^32-1 (e.g. log_groups is ulint
+ but its max allowed value is 10). BUT buffer_pool_size and
+ log_file_size allow up to 2^64-1. */
+
+ IB_CFG_ULINT, /*!< The configuration parameter is
+ of type ulint */
+
+ IB_CFG_ULONG, /*!< The configuration parameter is
+ of type ulong */
+
+ IB_CFG_TEXT, /*!< The configuration parameter is
+ of type char* */
+
+ IB_CFG_CB /*!< The configuration parameter is
+ a callback parameter */
+} ib_cfg_type_t;
+
+/** @enum ib_col_type_t column types that are supported. */
+typedef enum {
+ IB_VARCHAR = 1, /*!< Character varying length. The
+ column is not padded. */
+
+ IB_CHAR = 2, /*!< Fixed length character string. The
+ column is padded to the right. */
+
+ IB_BINARY = 3, /*!< Fixed length binary, similar to
+ IB_CHAR but the column is not padded
+ to the right. */
+
+ IB_VARBINARY = 4, /*!< Variable length binary */
+
+ IB_BLOB = 5, /*!< Binary large object, or
+ a TEXT type */
+
+ IB_INT = 6, /*!< Integer: can be any size
+ from 1 - 8 bytes. If the size is
+ 1, 2, 4 and 8 bytes then you can use
+ the typed read and write functions. For
+ other sizes you will need to use the
+ ib_col_get_value() function and do the
+ conversion yourself. */
+
+ IB_SYS = 8, /*!< System column, this column can
+ be one of DATA_TRX_ID, DATA_ROLL_PTR
+ or DATA_ROW_ID. */
+
+ IB_FLOAT = 9, /*!< C (float) floating point value. */
+
+	IB_DOUBLE = 10,			/*!< C (double) floating point value. */
+
+ IB_DECIMAL = 11, /*!< Decimal stored as an ASCII
+ string */
+
+ IB_VARCHAR_ANYCHARSET = 12, /*!< Any charset, varying length */
+
+ IB_CHAR_ANYCHARSET = 13 /*!< Any charset, fixed length */
+
+} ib_col_type_t;
+
+/** @enum ib_tbl_fmt_t InnoDB table format types */
+typedef enum {
+ IB_TBL_REDUNDANT, /*!< Redundant row format, the column
+ type and length is stored in the row.*/
+
+ IB_TBL_COMPACT, /*!< Compact row format, the column
+ type is not stored in the row. The
+ length is stored in the row but the
+ storage format uses a compact format
+ to store the length of the column data
+ and record data storage format also
+ uses less storage. */
+
+ IB_TBL_DYNAMIC, /*!< Compact row format. BLOB prefixes
+ are not stored in the clustered index */
+
+ IB_TBL_COMPRESSED /*!< Similar to dynamic format but
+ with pages compressed */
+} ib_tbl_fmt_t;
+
+/** @enum ib_col_attr_t InnoDB column attributes */
+typedef enum {
+ IB_COL_NONE = 0, /*!< No special attributes. */
+
+ IB_COL_NOT_NULL = 1, /*!< Column data can't be NULL. */
+
+ IB_COL_UNSIGNED = 2, /*!< Column is IB_INT and unsigned. */
+
+ IB_COL_NOT_USED = 4, /*!< Future use, reserved. */
+
+ IB_COL_CUSTOM1 = 8, /*!< Custom precision type, this is
+ a bit that is ignored by InnoDB and so
+ can be set and queried by users. */
+
+ IB_COL_CUSTOM2 = 16, /*!< Custom precision type, this is
+ a bit that is ignored by InnoDB and so
+ can be set and queried by users. */
+
+ IB_COL_CUSTOM3 = 32 /*!< Custom precision type, this is
+ a bit that is ignored by InnoDB and so
+ can be set and queried by users. */
+} ib_col_attr_t;
+
+/* Note: must match lock0types.h */
+/** @enum ib_lck_mode_t InnoDB lock modes. */
+typedef enum {
+ IB_LOCK_IS = 0, /*!< Intention shared, an intention
+ lock should be used to lock tables */
+
+ IB_LOCK_IX, /*!< Intention exclusive, an intention
+ lock should be used to lock tables */
+
+ IB_LOCK_S, /*!< Shared locks should be used to
+ lock rows */
+
+ IB_LOCK_X, /*!< Exclusive locks should be used to
+ lock rows*/
+
+ IB_LOCK_TABLE_X, /*!< exclusive table lock */
+
+ IB_LOCK_NONE, /*!< This is used internally to note
+ consistent read */
+
+ IB_LOCK_NUM = IB_LOCK_NONE /*!< number of lock modes */
+} ib_lck_mode_t;
+
+typedef enum {
+ IB_CLUSTERED = 1, /*!< clustered index */
+ IB_UNIQUE = 2 /*!< unique index */
+} ib_index_type_t;
+
+/** @enum ib_srch_mode_t InnoDB cursor search modes for ib_cursor_moveto().
+Note: Values must match those found in page0cur.h */
+typedef enum {
+ IB_CUR_G = 1, /*!< If search key is not found then
+ position the cursor on the row that
+ is greater than the search key */
+
+ IB_CUR_GE = 2, /*!< If the search key not found then
+ position the cursor on the row that
+ is greater than or equal to the search
+ key */
+
+ IB_CUR_L = 3, /*!< If search key is not found then
+ position the cursor on the row that
+ is less than the search key */
+
+ IB_CUR_LE = 4 /*!< If search key is not found then
+ position the cursor on the row that
+ is less than or equal to the search
+ key */
+} ib_srch_mode_t;
+
+/** @enum ib_match_mode_t Various match modes used by ib_cursor_moveto() */
+typedef enum {
+ IB_CLOSEST_MATCH, /*!< Closest match possible */
+
+ IB_EXACT_MATCH, /*!< Search using a complete key
+ value */
+
+ IB_EXACT_PREFIX /*!< Search using a key prefix which
+ must match to rows: the prefix may
+ contain an incomplete field (the
+ last field in prefix may be just
+ a prefix of a fixed length column) */
+} ib_match_mode_t;
+
+/** @struct ib_col_meta_t InnoDB column meta data. */
+typedef struct {
+ ib_col_type_t type; /*!< Type of the column */
+
+ ib_col_attr_t attr; /*!< Column attributes */
+
+ ib_u32_t type_len; /*!< Length of type */
+
+ ib_u16_t client_type; /*!< 16 bits of data relevant only to
+ the client. InnoDB doesn't care */
+
+ ib_charset_t* charset; /*!< Column charset */
+} ib_col_meta_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_state_t The transaction state can be queried using the
+ib_trx_state() function. The InnoDB deadlock monitor can roll back a
+transaction and users should be prepared for this, especially where there
+is high contention. The way to determine the state of the transaction is to
+query its state and check.
+typedef enum {
+ IB_TRX_NOT_STARTED, /*!< Has not started yet, the
+					transaction has not been started yet. */
+
+ IB_TRX_ACTIVE, /*!< The transaction is currently
+ active and needs to be either
+ committed or rolled back. */
+
+ IB_TRX_COMMITTED_IN_MEMORY, /*!< Not committed to disk yet */
+
+ IB_TRX_PREPARED /*!< Support for 2PC/XA */
+} ib_trx_state_t;
+
+/* Note: Must be in sync with trx0trx.h */
+/** @enum ib_trx_level_t Transaction isolation levels */
+typedef enum {
+ IB_TRX_READ_UNCOMMITTED = 0, /*!< Dirty read: non-locking SELECTs are
+ performed so that we do not look at a
+ possible earlier version of a record;
+ thus they are not 'consistent' reads
+ under this isolation level; otherwise
+ like level 2 */
+
+ IB_TRX_READ_COMMITTED = 1, /*!< Somewhat Oracle-like isolation,
+ except that in range UPDATE and DELETE
+ we must block phantom rows with
+ next-key locks; SELECT ... FOR UPDATE
+ and ... LOCK IN SHARE MODE only lock
+ the index records, NOT the gaps before
+ them, and thus allow free inserting;
+ each consistent read reads its own
+ snapshot */
+
+ IB_TRX_REPEATABLE_READ = 2, /*!< All consistent reads in the same
+ trx read the same snapshot; full
+ next-key locking used in locking reads
+ to block insertions into gaps */
+
+ IB_TRX_SERIALIZABLE = 3 /*!< All plain SELECTs are converted to
+ LOCK IN SHARE MODE reads */
+} ib_trx_level_t;
+
+/** Generic InnoDB callback prototype. */
+typedef void (*ib_cb_t)(void);
+
+#define IB_CFG_BINLOG_ENABLED 0x1
+#define IB_CFG_MDL_ENABLED 0x2
+#define IB_CFG_DISABLE_ROWLOCK 0x4
+
+/** The first argument to the InnoDB message logging function. By default
+it's set to stderr. You should treat ib_msg_stream_t as a void*, since
+it will probably change in the future. */
+typedef FILE* ib_msg_stream_t;
+
+/** All log messages are written to this function. It should have the same
+behavior as fprintf(3). */
+typedef int (*ib_msg_log_t)(ib_msg_stream_t, const char*, ...);
+
+/* Note: This is to make it easy for API users to have type
+checking for arguments to our functions. Making it ib_opaque_t
+by itself would result in pointer decay, subverting the compiler's
+type checking. */
+
+/** InnoDB tuple handle. This handle can refer to either a cluster index
+tuple or a secondary index tuple. There are two types of tuples for each
+type of index, making a total of four types of tuple handles. There
+is a tuple for reading the entire row contents and another for searching
+on the index key. */
+typedef struct ib_tuple_t* ib_tpl_t;
+
+/** InnoDB transaction handle, all database operations need to be covered
+by transactions. This handle represents a transaction. The handle can be
+created with ib_trx_begin(), you commit your changes with ib_trx_commit()
+and undo your changes using ib_trx_rollback(). If the InnoDB deadlock
+monitor rolls back the transaction then you need to free the transaction
+using the function ib_trx_release(). You can query the state of an InnoDB
+transaction by calling ib_trx_state(). */
+typedef struct trx_t* ib_trx_t;
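+
+/* A minimal sketch of the transaction lifecycle described above
+(illustrative only: the argument values are examples, error handling
+is elided, and detecting a deadlock rollback via IB_TRX_NOT_STARTED
+is an assumption, not a documented contract):
+
+@code
+	ib_trx_t	trx = ib_trx_begin(IB_TRX_REPEATABLE_READ,
+					   IB_TRUE,	// read-write
+					   IB_FALSE);	// no auto commit
+
+	// ... DML through cursors associated with trx ...
+
+	if (ib_trx_state(trx) == IB_TRX_NOT_STARTED) {
+		// rolled back by the deadlock monitor:
+		// just release the handle
+		ib_trx_release(trx);
+	} else {
+		ib_trx_commit(trx);	// also frees the handle
+	}
+@endcode */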
+
+/** InnoDB cursor handle */
+typedef struct ib_cursor_t* ib_crsr_t;
+
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use the client code to compare them.
+
+@param col_meta column meta data
+@param p1 key
+@param p1_len key length
+@param p2 second key
+@param p2_len second key length
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+
+typedef int (*ib_client_cmp_t)(
+ const ib_col_meta_t* col_meta,
+ const ib_byte_t* p1,
+ ib_ulint_t p1_len,
+ const ib_byte_t* p2,
+ ib_ulint_t p2_len);
+
+/* This should be the same as univ.i */
+/** Represents SQL_NULL length */
+#define IB_SQL_NULL 0xFFFFFFFF
+/** The number of system columns in a row. */
+#define IB_N_SYS_COLS 3
+
+/** The maximum length of a text column. */
+#define MAX_TEXT_LEN 4096
+
+/* MySQL uses 3 byte UTF-8 encoding. */
+/** The maximum length of a column name in a table schema. */
+#define IB_MAX_COL_NAME_LEN (64 * 3)
+
+/** The maximum length of a table name (plus database name). */
+#define IB_MAX_TABLE_NAME_LEN (64 * 3) * 2
+
+/*****************************************************************//**
+Start a transaction that's been rolled back. This special function
+exists for the case when InnoDB's deadlock detector has rolled back
+a transaction. While the transaction has been rolled back the handle
+is still valid and can be reused by calling this function. If you
+don't want to reuse the transaction handle then you can free the handle
+by calling ib_trx_release().
+@return innobase txn handle */
+
+ib_err_t
+ib_trx_start(
+/*=========*/
+ ib_trx_t ib_trx, /*!< in: transaction to restart */
+ ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */
+ ib_bool_t read_write, /*!< in: true if read write
+ transaction */
+ ib_bool_t auto_commit, /*!< in: auto commit after each
+ single DML */
+ void* thd); /*!< in: THD */
+
+/*****************************************************************//**
+Begin a transaction. This will allocate a new transaction handle and
+put the transaction in the active state.
+@return innobase txn handle */
+
+ib_trx_t
+ib_trx_begin(
+/*=========*/
+ ib_trx_level_t ib_trx_level, /*!< in: trx isolation level */
+ ib_bool_t read_write, /*!< in: true if read write
+ transaction */
+ ib_bool_t auto_commit); /*!< in: auto commit after each
+ single DML */
+
+/*****************************************************************//**
+Query the transaction's state. This function can be used to check for
+the state of the transaction in case it has been rolled back by the
+InnoDB deadlock detector. Note that when a transaction is selected as
+a victim for rollback, InnoDB will always return an appropriate error
+code indicating this. @see DB_DEADLOCK, @see DB_LOCK_TABLE_FULL and
+@see DB_LOCK_WAIT_TIMEOUT
+@return transaction state */
+
+ib_trx_state_t
+ib_trx_state(
+/*=========*/
+ ib_trx_t ib_trx); /*!< in: trx handle */
+
+/*****************************************************************//**
+Release the resources of the transaction. If the transaction was
+selected as a victim by InnoDB and rolled back then use this function
+to free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_release(
+/*===========*/
+ ib_trx_t ib_trx); /*!< in: trx handle */
+
+/*****************************************************************//**
+Commit a transaction. This function will release the schema latches too.
+It will also free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_commit(
+/*==========*/
+ ib_trx_t ib_trx); /*!< in: trx handle */
+
+/*****************************************************************//**
+Rollback a transaction. This function will release the schema latches too.
+It will also free the transaction handle.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_trx_rollback(
+/*============*/
+ ib_trx_t ib_trx); /*!< in: trx handle */
+
+/*****************************************************************//**
+Open an InnoDB table and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table_using_id(
+/*==========================*/
+ ib_id_u64_t table_id, /*!< in: table id of table to open */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB index and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_id(
+/*==========================*/
+ ib_id_u64_t index_id, /*!< in: index id of index to open */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr); /*!< out: InnoDB cursor */
+
+/*****************************************************************//**
+Open an InnoDB secondary index cursor and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_index_using_name(
+/*============================*/
+ ib_crsr_t ib_open_crsr, /*!< in: open/active cursor */
+ const char* index_name, /*!< in: secondary index name */
+ ib_crsr_t* ib_crsr, /*!< out,own: InnoDB index cursor */
+ int* idx_type, /*!< out: index is cluster index */
+ ib_id_u64_t* idx_id); /*!< out: index id */
+
+/*****************************************************************//**
+Open an InnoDB table by name and return a cursor handle to it.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_open_table(
+/*=================*/
+ const char* name, /*!< in: table name */
+ ib_trx_t ib_trx, /*!< in: Current transaction handle
+ can be NULL */
+ ib_crsr_t* ib_crsr); /*!< out,own: InnoDB cursor */
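+
+/* A minimal open/close sketch for the cursor functions above
+(illustrative: the "test/t1" name and the omission of error handling
+are examples only):
+
+@code
+	ib_crsr_t	crsr;
+
+	if (ib_cursor_open_table("test/t1", ib_trx, &crsr) == DB_SUCCESS) {
+		// ... read or modify rows through crsr ...
+		ib_cursor_close(crsr);
+	}
+@endcode */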
+
+/*****************************************************************//**
+Reset the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_reset(
+/*============*/
+ ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */
+
+
+/*****************************************************************//**
+Set a cursor trx to NULL. */
+
+void
+ib_cursor_clear_trx(
+/*================*/
+ ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close an InnoDB table and free the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close(
+/*============*/
+ ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Close the table and decrement its n_ref_count.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_close_table(
+/*==================*/
+ ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Update the cursor with the new transaction and also reset the cursor.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_new_trx(
+/*==============*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_trx_t ib_trx); /*!< in: transaction */
+
+/*****************************************************************//**
+Commit the transaction in a cursor
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_commit_trx(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_trx_t ib_trx); /*!< in: transaction */
+
+/********************************************************************//**
+Open a table using the table name; if found, increment the table ref count.
+@return table instance if found */
+
+void*
+ib_open_table_by_name(
+/*==================*/
+ const char* name); /*!< in: table name to lookup */
+
+/*****************************************************************//**
+Insert a row to a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_insert_row(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor instance */
+ const ib_tpl_t ib_tpl); /*!< in: tuple to insert */
+
+/*****************************************************************//**
+Update a row in a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_update_row(
+/*=================*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ const ib_tpl_t ib_old_tpl, /*!< in: Old tuple in table */
+ const ib_tpl_t ib_new_tpl); /*!< in: New tuple to update */
+
+/*****************************************************************//**
+Delete a row in a table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_delete_row(
+/*=================*/
+ ib_crsr_t ib_crsr); /*!< in: cursor instance */
+
+/*****************************************************************//**
+Read current row.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_read_row(
+/*===============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_tpl_t ib_tpl, /*!< out: read cols into this tuple */
+ void** row_buf, /*!< in/out: row buffer */
+ ib_ulint_t* row_len); /*!< in/out: row buffer len */
+
+/*****************************************************************//**
+Move cursor to the first record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_first(
+/*============*/
+ ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the last record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_last(
+/*===========*/
+ ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Move cursor to the next record in the table.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_next(
+/*===========*/
+ ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */
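+
+/* A minimal full-scan sketch combining the positioning calls above
+with ib_cursor_read_row() (illustrative: ib_clust_read_tuple_create()
+is declared further below, and passing NULL for the row buffer
+arguments is an assumption):
+
+@code
+	ib_tpl_t	tpl = ib_clust_read_tuple_create(crsr);
+	ib_err_t	err = ib_cursor_first(crsr);
+
+	while (err == DB_SUCCESS) {
+		err = ib_cursor_read_row(crsr, tpl, NULL, NULL);
+		// ... extract column values from tpl ...
+		err = ib_cursor_next(crsr);
+	}
+
+	ib_tuple_delete(tpl);
+@endcode */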
+
+/*****************************************************************//**
+Search for key.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_cursor_moveto(
+/*=============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_tpl_t ib_tpl, /*!< in: Key to search for */
+ ib_srch_mode_t ib_srch_mode); /*!< in: search mode */
+
+/*****************************************************************//**
+Set the match mode for ib_cursor_move(). */
+
+void
+ib_cursor_set_match_mode(
+/*=====================*/
+ ib_crsr_t ib_crsr, /*!< in: Cursor instance */
+ ib_match_mode_t match_mode); /*!< in: ib_cursor_moveto match mode */
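+
+/* A minimal point-lookup sketch using ib_cursor_moveto() with a match
+mode (illustrative: the single-column integer key is an example only):
+
+@code
+	ib_tpl_t	key = ib_clust_search_tuple_create(crsr);
+	ib_u32_t	id = 42;
+
+	ib_col_set_value(key, 0, &id, sizeof(id), IB_TRUE);
+	ib_cursor_set_match_mode(crsr, IB_EXACT_MATCH);
+
+	if (ib_cursor_moveto(crsr, key, IB_CUR_GE) == DB_SUCCESS) {
+		// cursor is positioned on the matching row
+	}
+
+	ib_tuple_delete(key);
+@endcode */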
+
+/*****************************************************************//**
+Set a column of the tuple. Make a copy using the tuple's heap.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_col_set_value(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t col_no, /*!< in: column index in tuple */
+ const void* src, /*!< in: data value */
+ ib_ulint_t len, /*!< in: data value len */
+ ib_bool_t need_cpy); /*!< in: if need memcpy */
+
+
+/*****************************************************************//**
+Get the size of the data available in the column of the tuple.
+@return bytes avail or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_get_len(
+/*===========*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i); /*!< in: column index in tuple */
+
+/*****************************************************************//**
+Copy a column value from the tuple.
+@return bytes copied or IB_SQL_NULL */
+
+ib_ulint_t
+ib_col_copy_value(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: tuple instance */
+ ib_ulint_t i, /*!< in: column index in tuple */
+ void* dst, /*!< out: copied data value */
+ ib_ulint_t len); /*!< in: max data value len to copy */
+
+/*************************************************************//**
+Read a signed int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i8(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i8_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 8 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u8(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u8_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i16(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i16_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 16 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u16(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u16_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 32 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i32(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i32_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 32 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u32(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u32_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read a signed int 64 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_i64(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_i64_t* ival); /*!< out: integer value */
+
+/*************************************************************//**
+Read an unsigned int 64 bit column from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_u64(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_u64_t* ival); /*!< out: integer value */
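+
+/* A minimal sketch of the typed readers above (illustrative: column 0
+is assumed to be a 4-byte unsigned IB_INT):
+
+@code
+	ib_u32_t	val;
+
+	if (ib_col_get_len(tpl, 0) != IB_SQL_NULL
+	    && ib_tuple_read_u32(tpl, 0, &val) == DB_SUCCESS) {
+		// val now holds the column value in host byte order
+	}
+@endcode */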
+
+/*****************************************************************//**
+Get a column value pointer from the tuple.
+@return NULL or pointer to buffer */
+
+const void*
+ib_col_get_value(
+/*=============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i); /*!< in: column number */
+
+/*****************************************************************//**
+Get a column type, length and attributes from the tuple.
+@return len of column data */
+
+ib_ulint_t
+ib_col_get_meta(
+/*============*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t i, /*!< in: column number */
+ ib_col_meta_t* ib_col_meta); /*!< out: column meta data */
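+
+/* A minimal metadata inspection sketch for ib_col_get_meta()
+(illustrative only):
+
+@code
+	ib_col_meta_t	meta;
+	ib_ulint_t	len = ib_col_get_meta(tpl, 0, &meta);
+
+	if (meta.type == IB_VARCHAR && !(meta.attr & IB_COL_NOT_NULL)) {
+		// nullable VARCHAR column; len bytes of data
+	}
+@endcode */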
+
+/*****************************************************************//**
+"Clear" or reset an InnoDB tuple. We free the heap and recreate the tuple.
+@return new tuple, or NULL */
+
+ib_tpl_t
+ib_tuple_clear(
+/*============*/
+ ib_tpl_t ib_tpl); /*!< in: InnoDB tuple */
+
+/*****************************************************************//**
+Create a new cluster key search tuple and copy the contents of the
+secondary index key tuple columns that refer to the cluster index record
+to the cluster key. It does a deep copy of the column data.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_tuple_get_cluster_key(
+/*=====================*/
+ ib_crsr_t ib_crsr, /*!< in: secondary index cursor */
+ ib_tpl_t* ib_dst_tpl, /*!< out,own: destination tuple */
+ const ib_tpl_t ib_src_tpl); /*!< in: source tuple */
+
+/*****************************************************************//**
+Copy the contents of source tuple to destination tuple. The tuples
+must be of the same type and belong to the same table/index.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_tuple_copy(
+/*==========*/
+ ib_tpl_t ib_dst_tpl, /*!< in: destination tuple */
+ const ib_tpl_t ib_src_tpl); /*!< in: source tuple */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for index/table search.
+@return tuple for current index */
+
+ib_tpl_t
+ib_sec_search_tuple_create(
+/*=======================*/
+ ib_crsr_t ib_crsr); /*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for index/table search.
+@return tuple for current index */
+
+ib_tpl_t
+ib_sec_read_tuple_create(
+/*=====================*/
+ ib_crsr_t ib_crsr); /*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple used for table key operations.
+@return tuple for current table */
+
+ib_tpl_t
+ib_clust_search_tuple_create(
+/*=========================*/
+ ib_crsr_t ib_crsr); /*!< in: Cursor instance */
+
+/*****************************************************************//**
+Create an InnoDB tuple for table row operations.
+@return tuple for current table */
+
+ib_tpl_t
+ib_clust_read_tuple_create(
+/*=======================*/
+ ib_crsr_t ib_crsr); /*!< in: Cursor instance */
+
+/*****************************************************************//**
+Return the number of user columns in the tuple definition.
+@return number of user columns */
+
+ib_ulint_t
+ib_tuple_get_n_user_cols(
+/*=====================*/
+ const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Return the number of columns in the tuple definition.
+@return number of columns */
+
+ib_ulint_t
+ib_tuple_get_n_cols(
+/*================*/
+ const ib_tpl_t ib_tpl); /*!< in: Tuple for current table */
+
+/*****************************************************************//**
+Destroy an InnoDB tuple. */
+
+void
+ib_tuple_delete(
+/*============*/
+ ib_tpl_t ib_tpl); /*!< in,own: Tuple instance to delete */
+
+/*****************************************************************//**
+Truncate a table. The cursor handle will be closed and set to NULL
+on success.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_truncate(
+/*===============*/
+ ib_crsr_t* ib_crsr, /*!< in/out: cursor for table
+ to truncate */
+ ib_id_u64_t* table_id); /*!< out: new table id */
+
+/*****************************************************************//**
+Get a table id.
+@return DB_SUCCESS if found */
+
+ib_err_t
+ib_table_get_id(
+/*============*/
+ const char* table_name, /*!< in: table to find */
+ ib_id_u64_t* table_id); /*!< out: table id if found */
+
+/*****************************************************************//**
+Get an index id.
+@return DB_SUCCESS if found */
+
+ib_err_t
+ib_index_get_id(
+/*============*/
+ const char* table_name, /*!< in: find index for this table */
+ const char* index_name, /*!< in: index to find */
+ ib_id_u64_t* index_id); /*!< out: index id if found */
+
+/*****************************************************************//**
+Check if cursor is positioned.
+@return IB_TRUE if positioned */
+
+ib_bool_t
+ib_cursor_is_positioned(
+/*====================*/
+ const ib_crsr_t ib_crsr); /*!< in: InnoDB cursor instance */
+
+/*****************************************************************//**
+Checks if the data dictionary is latched in exclusive mode by a
+user transaction.
+@return TRUE if exclusive latch */
+
+ib_bool_t
+ib_schema_lock_is_exclusive(
+/*========================*/
+ const ib_trx_t ib_trx); /*!< in: transaction */
+
+/*****************************************************************//**
+Lock an InnoDB cursor/table.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_lock(
+/*===========*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Lock an InnoDB table using the table id.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_lock(
+/*===========*/
+ ib_trx_t ib_trx, /*!< in/out: transaction */
+ ib_id_u64_t table_id, /*!< in: table id */
+ ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set the lock mode of the cursor.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_cursor_set_lock_mode(
+/*====================*/
+ ib_crsr_t ib_crsr, /*!< in/out: InnoDB cursor */
+ ib_lck_mode_t ib_lck_mode); /*!< in: InnoDB lock mode */
+
+/*****************************************************************//**
+Set the "need to access clustered index record" flag. */
+
+void
+ib_cursor_set_cluster_access(
+/*=========================*/
+ ib_crsr_t ib_crsr); /*!< in/out: InnoDB cursor */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i8_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i16(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i16_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i32(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i32_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_i64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_i64_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u8(
+/*==============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u8_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u16(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u16_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u32(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u32_t val); /*!< in: value to write */
+
+/*****************************************************************//**
+Write an integer value to a column. Integers are stored in big-endian
+format and will need to be converted from the host format.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_u64(
+/*===============*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ ib_u64_t val); /*!< in: value to write */
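+
+/* A minimal insert sketch using the typed writers above (illustrative:
+the two-column row shape is an example and error handling is elided):
+
+@code
+	ib_tpl_t	row = ib_clust_read_tuple_create(crsr);
+
+	ib_tuple_write_u32(row, 0, 42);			// IB_INT column
+	ib_col_set_value(row, 1, "abc", 3, IB_TRUE);	// IB_VARCHAR column
+
+	ib_cursor_insert_row(crsr, row);
+	ib_tuple_delete(row);
+@endcode */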
+
+/*****************************************************************//**
+Inform the cursor that it's the start of an SQL statement. */
+
+void
+ib_cursor_stmt_begin(
+/*=================*/
+ ib_crsr_t ib_crsr); /*!< in: cursor */
+
+/*****************************************************************//**
+Write a double value to a column.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_double(
+/*==================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ int col_no, /*!< in: column number */
+ double val); /*!< in: value to write */
+
+/*************************************************************//**
+Read a double column value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_double(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ double* dval); /*!< out: double value */
+
+/*****************************************************************//**
+Write a float value to a column.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_write_float(
+/*=================*/
+ ib_tpl_t ib_tpl, /*!< in/out: tuple to write to */
+ int col_no, /*!< in: column number */
+ float val); /*!< in: value to write */
+
+/*************************************************************//**
+Read a float value from an InnoDB tuple.
+@return DB_SUCCESS or error */
+
+ib_err_t
+ib_tuple_read_float(
+/*================*/
+ ib_tpl_t ib_tpl, /*!< in: InnoDB tuple */
+ ib_ulint_t col_no, /*!< in: column number */
+ float* fval); /*!< out: float value */
+
+/*****************************************************************//**
+Get a column name from the cursor.
+@return name of the column */
+
+const char*
+ib_col_get_name(
+/*============*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_ulint_t i); /*!< in: column index in tuple */
+
+/*****************************************************************//**
+Get an index field name from the cursor.
+@return name of the field */
+
+const char*
+ib_get_idx_field_name(
+/*==================*/
+ ib_crsr_t ib_crsr, /*!< in: InnoDB cursor instance */
+ ib_ulint_t i); /*!< in: column index in tuple */
+
+/*****************************************************************//**
+Truncate a table.
+@return DB_SUCCESS or error code */
+
+ib_err_t
+ib_table_truncate(
+/*==============*/
+ const char* table_name, /*!< in: table name */
+ ib_id_u64_t* table_id); /*!< out: new table id */
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return DB_SUCCESS or error number */
+
+ib_err_t
+ib_close_thd(
+/*=========*/
+ void* thd); /*!< in: handle to the MySQL
+ thread of the user whose resources
+ should be free'd */
+
+/*****************************************************************//**
+Get the generic configuration status.
+@return configuration status */
+
+int
+ib_cfg_get_cfg();
+/*============*/
+
+/*****************************************************************//**
+Increase/decrease the memcached sync count of a table to synchronize
+memcached DML with SQL DDL.
+@return DB_SUCCESS or error number */
+ib_err_t
+ib_cursor_set_memcached_sync(
+/*=========================*/
+ ib_crsr_t ib_crsr, /*!< in: cursor */
+ ib_bool_t flag); /*!< in: true for increasing */
+
+/*****************************************************************//**
+Check whether the table name conforms to our requirements. Currently
+we only do a simple check for the presence of a '/'.
+@return DB_SUCCESS or err code */
+
+ib_err_t
+ib_table_name_check(
+/*================*/
+ const char* name); /*!< in: table name to check */
+
+/*****************************************************************//**
+Return the isolation level set by "innodb_api_trx_level".
+@return trx isolation level */
+
+ib_trx_state_t
+ib_cfg_trx_level();
+/*==============*/
+
+/*****************************************************************//**
+Return the configured value for the background commit interval (in seconds).
+@return background commit interval (in seconds) */
+
+ib_ulint_t
+ib_cfg_bk_commit_interval();
+/*=======================*/
+
+/*****************************************************************//**
+Get a trx start time.
+@return trx start_time */
+
+ib_u64_t
+ib_trx_get_start_time(
+/*==================*/
+ ib_trx_t ib_trx); /*!< in: transaction */
+
+#endif /* api0api_h */
diff --git a/storage/innobase/include/api0misc.h b/storage/innobase/include/api0misc.h
new file mode 100644
index 00000000000..fcd748390d1
--- /dev/null
+++ b/storage/innobase/include/api0misc.h
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 2008, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/api0misc.h
+InnoDB Native API
+
+3/20/2011 Jimmy Yang extracted from Embedded InnoDB
+2008 Created by Sunny Bains
+*******************************************************/
+
+#ifndef api0misc_h
+#define api0misc_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "que0que.h"
+#include "trx0trx.h"
+
+/** Whether binlog is enabled for applications using InnoDB APIs */
+extern my_bool ib_binlog_enabled;
+
+/** Whether MySQL MDL is enabled for applications using InnoDB APIs */
+extern my_bool ib_mdl_enabled;
+
+/** Whether InnoDB row lock is disabled for applications using InnoDB APIs */
+extern my_bool ib_disable_row_lock;
+
+/** Configured value for the transaction isolation level */
+extern ulong ib_trx_level_setting;
+
+/** Configured value for the background commit interval (in seconds) */
+extern ulong ib_bk_commit_interval;
+
+/********************************************************************
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running
+the query thread */
+UNIV_INTERN
+ibool
+ib_handle_errors(
+/*=============*/
+ dberr_t* new_err, /*!< out: possible new error
+ encountered in lock wait, or if
+ no new error, the value of
+ trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept); /*!< in: savepoint or NULL */
+
+/*************************************************************************
+Sets a lock on a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+ib_trx_lock_table_with_retry(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode); /*!< in: lock mode */
+
+#endif /* api0misc_h */
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000000..305acf7e322
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,773 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS 100
+
+/** Latching modes for btr_cur_search_to_nth_level(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 33,
+ /** Continue modifying the entire B-tree. */
+ BTR_CONT_MODIFY_TREE = 34,
+ /** Search the previous record. */
+ BTR_SEARCH_PREV = 35,
+ /** Modify the previous record. */
+ BTR_MODIFY_PREV = 36
+};
+
+/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually exclusive. */
+
+/** If this is ORed to btr_latch_mode, it means that the search tuple
+will be inserted to the index, at the searched position.
+When the record is not in the buffer pool, try to use the insert buffer. */
+#define BTR_INSERT 512
+
+/** This flag ORed to btr_latch_mode says that we do the search in query
+optimization */
+#define BTR_ESTIMATE 1024
+
+/** This flag ORed to BTR_INSERT says that we can ignore possible
+UNIQUE definition on secondary indexes when we decide if we can use
+the insert buffer to speed up inserts */
+#define BTR_IGNORE_SEC_UNIQUE 2048
+
+/** Try to delete mark the record at the searched position using the
+insert/delete buffer when the record is not in the buffer pool. */
+#define BTR_DELETE_MARK 4096
+
+/** Try to purge the record at the searched position using the insert/delete
+buffer when the record is not in the buffer pool. */
+#define BTR_DELETE 8192
+
+/** In the case of BTR_SEARCH_LEAF or BTR_MODIFY_LEAF, the caller is
+already holding an S latch on the index tree */
+#define BTR_ALREADY_S_LATCHED 16384
+
+#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
+ ((latch_mode) & ~(BTR_INSERT \
+ | BTR_DELETE_MARK \
+ | BTR_DELETE \
+ | BTR_ESTIMATE \
+ | BTR_IGNORE_SEC_UNIQUE \
+ | BTR_ALREADY_S_LATCHED))
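+
+/* For example, a caller may OR hint flags into a latch mode and this
+macro recovers the base mode (a sketch based on the definitions above):
+
+@code
+	ulint	mode = BTR_MODIFY_LEAF | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE;
+
+	ut_ad(BTR_LATCH_MODE_WITHOUT_FLAGS(mode) == BTR_MODIFY_LEAF);
+@endcode */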
+#endif /* UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Report that an index page is corrupted. */
+UNIV_INTERN
+void
+btr_corruption_report(
+/*==================*/
+ const buf_block_t* block, /*!< in: corrupted block */
+ const dict_index_t* index) /*!< in: index tree */
+ UNIV_COLD __attribute__((nonnull));
+
+/** Assert that a B-tree page is not corrupted.
+@param block buffer block containing a B-tree page
+@param index the B-tree index */
+#define btr_assert_not_corrupted(block, index) \
+ if ((ibool) !!page_is_comp(buf_block_get_frame(block)) \
+ != dict_table_is_comp((index)->table)) { \
+ btr_corruption_report(block, index); \
+ ut_error; \
+ }
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_BLOB_DEBUG
+# include "ut0rbt.h"
+/** An index->blobs entry for keeping track of off-page column references */
+struct btr_blob_dbg_t
+{
+ unsigned blob_page_no:32; /*!< first BLOB page number */
+ unsigned ref_page_no:32; /*!< referring page number */
+ unsigned ref_heap_no:16; /*!< referring heap number */
+ unsigned ref_field_no:10; /*!< referring field number */
+ unsigned owner:1; /*!< TRUE if BLOB owner */
+ unsigned always_owner:1; /*!< TRUE if always
+ has been the BLOB owner;
+ reset to TRUE on B-tree
+ page splits and merges */
+ unsigned del:1; /*!< TRUE if currently
+ delete-marked */
+};
+
+/**************************************************************//**
+Add a reference to an off-page column to the index->blobs map. */
+UNIV_INTERN
+void
+btr_blob_dbg_add_blob(
+/*==================*/
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint field_no, /*!< in: number of off-page column */
+ ulint page_no, /*!< in: start page of the column */
+ dict_index_t* index, /*!< in/out: index tree */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Display the references to off-page columns.
+This function is to be called from a debugger,
+for example when a breakpoint on ut_dbg_assertion_failed is hit. */
+UNIV_INTERN
+void
+btr_blob_dbg_print(
+/*===============*/
+ const dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull));
+/**************************************************************//**
+Check that there are no references to off-page columns from or to
+the given page. Invoked when freeing or clearing a page.
+@return TRUE when no orphan references exist */
+UNIV_INTERN
+ibool
+btr_blob_dbg_is_empty(
+/*==================*/
+ dict_index_t* index, /*!< in: index */
+ ulint page_no) /*!< in: page number */
+ __attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Modify the 'deleted' flag of a record. */
+UNIV_INTERN
+void
+btr_blob_dbg_set_deleted_flag(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+ ibool del) /*!< in: TRUE=deleted, FALSE=exists */
+ __attribute__((nonnull));
+/**************************************************************//**
+Change the ownership of an off-page column. */
+UNIV_INTERN
+void
+btr_blob_dbg_owner(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint i, /*!< in: ith field in rec */
+ ibool own) /*!< in: TRUE=owned, FALSE=disowned */
+ __attribute__((nonnull));
+/** Assert that there are no BLOB references to or from the given page. */
+# define btr_blob_dbg_assert_empty(index, page_no) \
+ ut_a(btr_blob_dbg_is_empty(index, page_no))
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_blob(rec, field_no, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_set_deleted_flag(rec, index, offsets, del)((void) 0)
+# define btr_blob_dbg_owner(rec, index, offsets, i, val) ((void) 0)
+# define btr_blob_dbg_assert_empty(index, page_no) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+ const dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
+@return error code, or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+btr_root_adjust_on_import(
+/*======================*/
+ const dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull, warn_unused_result));
+
+/**************************************************************//**
+Gets the height of the B-tree (the level of the root, when the leaf
+level is assumed to be 0). The caller must hold an S or X latch on
+the index.
+@return tree height (level of the root) */
+UNIV_INTERN
+ulint
+btr_height_get(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get_func(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+# ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree, may be NULL
+ if it is not an insert buffer tree */
+# endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+# ifdef UNIV_SYNC_DEBUG
+/** Gets a buffer page and declares its latching order level.
+@param space tablespace identifier
+@param zip_size compressed page size in bytes or 0 for uncompressed pages
+@param page_no page number
+@param mode latch mode
+@param index index tree, may be NULL if not the insert buffer tree
+@param mtr mini-transaction handle
+@return the block descriptor */
+# define btr_block_get(space,zip_size,page_no,mode,index,mtr) \
+ btr_block_get_func(space,zip_size,page_no,mode, \
+ __FILE__,__LINE__,index,mtr)
+# else /* UNIV_SYNC_DEBUG */
+/** Gets a buffer page and declares its latching order level.
+@param space tablespace identifier
+@param zip_size compressed page size in bytes or 0 for uncompressed pages
+@param page_no page number
+@param mode latch mode
+@param idx index tree, may be NULL if not the insert buffer tree
+@param mtr mini-transaction handle
+@return the block descriptor */
+# define btr_block_get(space,zip_size,page_no,mode,idx,mtr) \
+ btr_block_get_func(space,zip_size,page_no,mode,__FILE__,__LINE__,mtr)
+# endif /* UNIV_SYNC_DEBUG */
+/** Gets a buffer page and declares its latching order level.
+@param space tablespace identifier
+@param zip_size compressed page size in bytes or 0 for uncompressed pages
+@param page_no page number
+@param mode latch mode
+@param idx index tree, may be NULL if not the insert buffer tree
+@param mtr mini-transaction handle
+@return the uncompressed page frame */
+# define btr_page_get(space,zip_size,page_no,mode,idx,mtr) \
+ buf_block_get_frame(btr_block_get(space,zip_size,page_no,mode,idx,mtr))
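+/* A minimal usage sketch for the macros above (illustrative only, not
+part of the original interface): fetch and x-latch one tree page inside
+a mini-transaction, then read its level:
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	page_t*	page = btr_page_get(space, zip_size, page_no,
+				    RW_X_LATCH, index, &mtr);
+	ulint	level = btr_page_get_level(page, &mtr);
+	mtr_commit(&mtr);
+*/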
+#endif /* !UNIV_HOTBACKUP */
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+ __attribute__((nonnull, pure, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page) /*!< in: index page */
+ __attribute__((nonnull, pure, warn_unused_result));
+#define btr_page_get_level(page, mtr) btr_page_get_level_low(page)
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+ needed, also to the previous page */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets pointer to the next user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+ needed, also to the next page */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+ ulint type, /*!< in: type of the index */
+ ulint space, /*!< in: space where created */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ index_id_t index_id,/*!< in: index id */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+ __attribute__((nonnull));
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no); /*!< in: root page number */
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already have been freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no, /*!< in: root page number */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed;
+we cannot reverse it, so enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ ulint** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize_low(
+/*====================*/
+ bool recovery,/*!< in: true if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@retval true if the operation was successful
+@retval false if it is a compressed page, and recompression failed */
+UNIV_INTERN
+bool
+btr_page_reorganize(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
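+/* A sketch of honoring the IMPORTANT note above (illustrative only;
+block is assumed to be the buffer block under the page cursor, and only
+a compressed secondary index leaf page needs the free-bits fixup):
+
+	if (btr_page_reorganize(cursor, index, &mtr)
+	    && !dict_index_is_clust(index)
+	    && buf_block_get_page_zip(block)) {
+		ibuf_reset_free_bits(block);
+	}
+
+	mtr_commit(&mtr);
+*/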
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to left.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec)/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to right.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec)/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Splits an index page in halves and inserts the tuple. It is assumed
+that mtr holds an x-latch on the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it, so enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ ulint** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull, warn_unused_result));
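+/* A sketch of the free-space precondition noted above, modelled on the
+pessimistic insert path (illustrative only): reserve extents before
+splitting, and release any unused reservation afterwards:
+
+	ulint	n_reserved = 0;
+	ulint	n_extents = cursor->tree_height / 16 + 3;
+
+	if (!fsp_reserve_free_extents(&n_reserved, index->space,
+				      n_extents, FSP_NORMAL, mtr)) {
+		return(DB_OUT_OF_FILE_SPACE);
+	}
+
+	rec = btr_page_split_and_insert(flags, cursor, offsets, heap,
+					tuple, n_ext, mtr);
+
+	if (n_reserved > 0) {
+		fil_space_release_free_extents(index->space, n_reserved);
+	}
+*/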
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+# define btr_insert_on_non_leaf_level(f,i,l,t,m) \
+ btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /*!< in/out: record */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page whose node pointer is deleted */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first with its immediate left brother, if such a
+brother exists and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
+ or lift; the page must not be empty:
+ when deleting records, use btr_discard_page()
+ if the page would become empty */
+	ibool		adjust,	/*!< in: TRUE if the cursor position should
+				be adjusted even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+ __attribute__((nonnull(1,2), warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ dict_index_t* index, /*!< in: record descriptor */
+ bool compressed,/*!< in: true if compressed page */
+ buf_block_t* block, /*!< in: page to be reorganized, or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+ __attribute__((nonnull(1,2,3), warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr, /*!< in/out: mini-transaction
+ for the allocation */
+ mtr_t* init_mtr) /*!< in/out: mini-transaction
+ for x-latching and initializing
+ the page */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for BLOB
+external storage pages, because page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ ulint level, /*!< in: page level */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+ __attribute__((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+ __attribute__((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return TRUE if ok */
+UNIV_INTERN
+bool
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or 0 */
+ __attribute__((nonnull(1), warn_unused_result));
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
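+/* A sketch of reading the tree size under an index s-latch, modelled
+on internal statistics code (illustrative only):
+
+	mtr_t	mtr;
+	ulint	n_pages;
+
+	mtr_start(&mtr);
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+	n_pages = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+	mtr_commit(&mtr);
+*/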
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
new file mode 100644
index 00000000000..00f50b5dcaf
--- /dev/null
+++ b/storage/innobase/include/btr0btr.ic
@@ -0,0 +1,290 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
+ (not really a hard limit).
+ Used in debug assertions
+ in btr_page_set_level and
+ btr_page_get_level_low */
+
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get_func(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+#ifdef UNIV_SYNC_DEBUG
+ const dict_index_t* index, /*!< in: index tree, may be NULL
+ if it is not an insert buffer tree */
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ buf_block_t* block;
+
+ block = buf_page_get_gen(space, zip_size, page_no, mode,
+ NULL, BUF_GET, file, line, mtr);
+
+ if (mode != RW_NO_LATCH) {
+
+ buf_block_dbg_add_level(
+ block, index != NULL && dict_index_is_ibuf(index)
+ ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
+ }
+
+ return(block);
+}
+
+/**************************************************************//**
+Sets the index id field of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+ page_t* page, /*!< in: page to be created */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ index_id_t id, /*!< in: index id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (page_zip) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_INDEX_ID),
+ 8, mtr);
+ } else {
+ mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint level;
+
+ ut_ad(page);
+
+ level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
+
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ return(level);
+}
+
+/********************************************************//**
+Sets the node level field in an index page. */
+UNIV_INLINE
+void
+btr_page_set_level(
+/*===============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint level, /*!< in: level, leaf level == 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ if (page_zip) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LEVEL),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level,
+ MLOG_2BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
+
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/********************************************************//**
+Sets the next index page field. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint next, /*!< in: next page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (page_zip) {
+ mach_write_to_4(page + FIL_PAGE_NEXT, next);
+ page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************//**
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint prev, /*!< in: previous page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (page_zip) {
+ mach_write_to_4(page + FIL_PAGE_PREV, prev);
+ page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+ }
+}
+
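+/* The pages of each tree level are chained into a doubly linked list
+through FIL_PAGE_PREV and FIL_PAGE_NEXT. A sketch of walking one level
+to the right (illustrative only; latching discipline is left to the
+caller):
+
+	ulint	next_page_no = btr_page_get_next(page, &mtr);
+
+	while (next_page_no != FIL_NULL) {
+		block = btr_block_get(space, zip_size, next_page_no,
+				      RW_S_LATCH, index, &mtr);
+		page = buf_block_get_frame(block);
+		next_page_no = btr_page_get_next(page, &mtr);
+	}
+*/
+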
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+ ulint page_no;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ page_no = mach_read_from_4(field);
+
+ if (page_no == 0) {
+ fprintf(stderr,
+ "InnoDB: a nonsensical page number 0"
+ " in a node ptr record at offset %lu\n",
+ (ulong) page_offset(rec));
+ buf_page_print(page_align(rec), 0, 0);
+ ut_ad(0);
+ }
+
+ return(page_no);
+}
+
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+
+ mtr_memo_release(mtr, block,
+ latch_mode == BTR_SEARCH_LEAF
+ ? MTR_MEMO_PAGE_S_FIX
+ : MTR_MEMO_PAGE_X_FIX);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000000..f1e4406fcf7
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,937 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+
+/** Mode flags for btr_cur operations; these can be ORed */
+enum {
+ /** do no undo logging */
+ BTR_NO_UNDO_LOG_FLAG = 1,
+ /** do no record lock checking */
+ BTR_NO_LOCKING_FLAG = 2,
+ /** sys fields will be found in the update vector or inserted
+ entry */
+ BTR_KEEP_SYS_FLAG = 4,
+ /** btr_cur_pessimistic_update() must keep cursor position
+ when moving columns to big_rec */
+ BTR_KEEP_POS_FLAG = 8,
+ /** the caller is creating the index or wants to bypass the
+ index->info.online creation log */
+ BTR_CREATE_FLAG = 16,
+ /** the caller of btr_cur_optimistic_update() or
+ btr_cur_update_in_place() will take care of
+ updating IBUF_BITMAP_FREE */
+ BTR_KEEP_IBUF_BITMAP = 32
+};
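+/* The flags are bit values and can be combined with bitwise OR; for
+example, an insert buffer merge would pass something like the following
+(illustrative only):
+
+	ulint	flags = BTR_NO_UNDO_LOG_FLAG
+		| BTR_NO_LOCKING_FLAG
+		| BTR_KEEP_SYS_FLAG;
+*/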
+
+#ifndef UNIV_HOTBACKUP
+#include "que0types.h"
+#include "row0types.h"
+#include "ha0ha.h"
+
+#define BTR_CUR_ADAPT
+#define BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+# define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
+# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@param cursor b-tree cursor
+@return index */
+#define btr_cur_get_index(cursor) ((cursor)->index)
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor);/*!< in: cursor */
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be PAGE_CUR_LE,
+ not PAGE_CUR_GE, as the latter may end up on
+ the previous page of the record! Inserts
+ should always be made using PAGE_CUR_LE to
+ search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ at most one of BTR_INSERT, BTR_DELETE_MARK,
+ BTR_DELETE, or BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+				NOTE that if has_search_latch
+				is != 0, we may not have a latch set
+				on the cursor page; we assume that
+				the caller uses the search latch
+				to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
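+/* A typical leaf-level positioning sketch (illustrative only): search
+with PAGE_CUR_LE, as recommended above for inserts:
+
+	btr_cur_t	cursor;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				    BTR_MODIFY_LEAF, &cursor, 0,
+				    __FILE__, __LINE__, &mtr);
+	... use btr_cur_get_rec(&cursor) here ...
+	mtr_commit(&mtr);
+*/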
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in/out: cursor */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \
+ btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m) \
+ btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in/out: mini-transaction;
+ if this function returns DB_SUCCESS on
+ a leaf page of a secondary index in a
+ compressed tablespace, the caller must
+ mtr_commit(mtr) before latching
+ any further pages */
+ __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
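+/* Callers typically try the optimistic path first and fall back to the
+pessimistic one (which requires the tree x-latch and reserved extents)
+when the page would have to be split (illustrative sketch only):
+
+	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
+					entry, &rec, &big_rec, n_ext,
+					thr, &mtr);
+	if (err == DB_FAIL) {
+		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
+						 &heap, entry, &rec,
+						 &big_rec, n_ext, thr,
+						 &mtr);
+	}
+*/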
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull(2,3,4,5,6,7,10), warn_unused_result));
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+
+@retval false if out of space; IBUF_BITMAP_FREE will be reset
+outside mtr if the page was recompressed
+@retval true if there is enough space.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
+a secondary index leaf page. This has to be done either within the
+same mini-transaction, or by invoking ibuf_reset_free_bits() before
+mtr_commit(mtr). */
+UNIV_INTERN
+bool
+btr_cur_update_alloc_zip_func(
+/*==========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ page_cur_t* cursor, /*!< in/out: B-tree page cursor */
+ dict_index_t* index, /*!< in: the index corresponding to cursor */
+#ifdef UNIV_DEBUG
+ ulint* offsets,/*!< in/out: offsets of the cursor record */
+#endif /* UNIV_DEBUG */
+ ulint length, /*!< in: size needed */
+ bool create, /*!< in: true=delete-and-insert,
+ false=update-in-place */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr)
+#else /* UNIV_DEBUG */
+# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr)
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return locking or undo log related error code, or
+@retval DB_SUCCESS on success
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
+UNIV_INTERN
+dberr_t
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ __attribute__((warn_unused_result, nonnull));
+/***********************************************************//**
+Writes a redo log record of updating a record in-place. */
+UNIV_INTERN
+void
+btr_cur_update_in_place_log(
+/*========================*/
+ ulint flags, /*!< in: flags */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr, /*!< in: roll ptr */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return error code, including
+@retval DB_SUCCESS on success
+@retval DB_OVERFLOW if the updated record does not fit
+@retval DB_UNDERFLOW if the page would become too empty
+@retval DB_ZIP_OVERFLOW if there is not enough space left
+on the compressed page */
+UNIV_INTERN
+dberr_t
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; if this
+ is a secondary index, the caller must
+ mtr_commit(mtr) before latching any
+ further pages */
+ __attribute__((warn_unused_result, nonnull));
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
+ cursor may become invalid if *big_rec == NULL
+ || !(flags & BTR_KEEP_POS_FLAG) */
+ ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ mem_heap_t* entry_heap,
+ /*!< in/out: memory heap for allocating
+ big_rec and the index tuple */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is also allowed
+				to contain trx id and roll ptr fields, but
+				the values in the update vector have no effect */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction; must be committed
+ before latching any further pages */
+ __attribute__((warn_unused_result, nonnull));
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ buf_block_t* block, /*!< in/out: buffer block of the record */
+ rec_t* rec, /*!< in/out: record */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+dberr_t
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ ulint flags, /*!< in: locking flag */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+	ibool		adjust,	/*!< in: TRUE if the cursor position should
+				be adjusted even if compression occurs */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete_func(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+# ifdef UNIV_DEBUG
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+# endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+ __attribute__((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+# define btr_cur_optimistic_delete(cursor, flags, mtr) \
+ btr_cur_optimistic_delete_func(cursor, flags, mtr)
+# else /* UNIV_DEBUG */
+# define btr_cur_optimistic_delete(cursor, flags, mtr) \
+ btr_cur_optimistic_delete_func(cursor, mtr)
+# endif /* UNIV_DEBUG */
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+			extents so that it is known that the
+			operation will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page, or NULL */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */
+ ulint mode1, /*!< in: search mode for range start */
+ const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */
+ ulint mode2); /*!< in: search mode for range end */
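+/* A sketch of a range estimate, e.g. for answering an optimizer
+records-in-range question (illustrative only):
+
+	ib_int64_t	n_rows;
+
+	n_rows = btr_estimate_n_rows_in_range(index,
+					      range_start, PAGE_CUR_GE,
+					      range_end, PAGE_CUR_LE);
+*/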
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in the
+array index->stat_n_non_null_key_vals[]. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ dict_index_t* index); /*!< in: index */
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in] rec record
+@param[in] offsets array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+
+ulint
+btr_rec_get_externally_stored_len(
+ const rec_t* rec,
+ const ulint* offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge, only the owner of an externally stored field is
+allowed to free the field. */
+UNIV_INTERN
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+ /** Store off-page columns for a freshly inserted record */
+ BTR_STORE_INSERT = 0,
+ /** Store off-page columns for an insert by update */
+ BTR_STORE_INSERT_UPDATE,
+ /** Store off-page columns for an update */
+ BTR_STORE_UPDATE
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+ __attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree
+ MUST be X-latched */
+ buf_block_t* rec_block, /*!< in/out: block containing rec */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets
+ will not correspond to rec when
+ this function returns */
+ const big_rec_t*big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* btr_mtr, /*!< in: mtr containing the
+ latches to the clustered index */
+	enum blob_op	op)	/*!< in: operation code */
+ __attribute__((nonnull, warn_unused_result));
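+/* A sketch of the usual call sequence when an insert or update returns
+a non-NULL big_rec (illustrative only; assumes the tree x-latch is
+still held by the mini-transaction passed as btr_mtr):
+
+	if (big_rec != NULL) {
+		err = btr_store_big_rec_extern_fields(
+			index, btr_cur_get_block(cursor), rec, offsets,
+			big_rec, &mtr, BTR_STORE_INSERT);
+		dtuple_big_rec_free(big_rec);
+	}
+*/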
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /*!< in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* local_mtr); /*!< in: mtr containing the latch to
+				data and an X-latch to the index
+ tree */
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+ byte* buf, /*!< out: the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint local_len);/*!< in: length of data, in bytes */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap. The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+UNIV_INTERN
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+ ulint* len, /*!< out: length of the whole field */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint local_len,/*!< in: length of data */
+ mem_heap_t* heap); /*!< in: mem heap */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ const rec_t* rec, /*!< in: record in a clustered index;
+ must be protected by a lock or a page latch */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint no, /*!< in: field number */
+ ulint* len, /*!< out: length of the field */
+ mem_heap_t* heap); /*!< in: mem heap */
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const upd_t* update, /*!< in: update vector */
+ mem_heap_t* heap) /*!< in: memory heap */
+ __attribute__((nonnull));
+/***********************************************************//**
+Sets a secondary index record's delete mark to the given value. This
+function is only used by the insert buffer merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_set_deleted_flag_for_ibuf(
+/*==============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+
+struct btr_path_t{
+ ulint nth_rec; /*!< index of the record
+ where the page cursor stopped on
+ this level (index in alphabetical
+ order); value ULINT_UNDEFINED
+ denotes array end */
+ ulint n_recs; /*!< number of records on the page */
+ ulint page_no; /*!< no of the page containing the record */
+	ulint	page_level;	/*!< level of the page, if later we fetch
+				the page under page_no and it is on a
+				different level then we know that the
+				tree has been reorganized */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
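+
+/* A sketch of walking a filled path array (illustrative only; within
+InnoDB the consumer is btr_estimate_n_rows_in_range(), and "path" here
+stands for any btr_path_t[BTR_PATH_ARRAY_N_SLOTS] that a search has
+filled in):
+
+	for (ulint i = 0; i < BTR_PATH_ARRAY_N_SLOTS; i++) {
+		const btr_path_t*	slot = &path[i];
+
+		if (slot->nth_rec == ULINT_UNDEFINED) {
+			break;	// array end reached
+		}
+		// the cursor stopped on record slot->nth_rec out of
+		// slot->n_recs on the page at this tree level
+	}
+*/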
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+ BTR_CUR_HASH = 1, /*!< successful shortcut using
+ the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and might need to be
+				updated */
+ BTR_CUR_BINARY, /*!< success using the binary search */
+ BTR_CUR_INSERT_TO_IBUF, /*!< performed the intended insert to
+ the insert buffer */
+ BTR_CUR_DEL_MARK_IBUF, /*!< performed the intended delete
+ mark in the insert/delete buffer */
+ BTR_CUR_DELETE_IBUF, /*!< performed the intended delete in
+ the insert/delete buffer */
+ BTR_CUR_DELETE_REF /*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+ dict_index_t* index; /*!< index where positioned */
+ page_cur_t page_cur; /*!< page cursor */
+ purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
+ buf_block_t* left_block; /*!< this field is used to store
+ a pointer to the left neighbor
+ page, in the cases
+ BTR_SEARCH_PREV and
+ BTR_MODIFY_PREV */
+ /*------------------------------*/
+ que_thr_t* thr; /*!< this field is only used
+ when btr_cur_search_to_nth_level
+ is called for an index entry
+ insertion: the calling query
+ thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /** The following fields are used in
+ btr_cur_search_to_nth_level to pass information: */
+ /* @{ */
+ enum btr_cur_method flag; /*!< Search method used */
+ ulint tree_height; /*!< Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
+				the number of matched fields to the
+				first user record to the right of
+ the cursor record after
+ btr_cur_search_to_nth_level;
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_error_in_clust.) */
+ ulint up_bytes; /*!< number of matched bytes to the
+				right at the time the cursor was positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /*!< if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after
+ btr_cur_search_to_nth_level;
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /*!< number of matched bytes to the
+				right at the time the cursor was positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /*!< prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /*!< hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /*!< fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /* @} */
+ btr_path_t* path_arr; /*!< in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+};
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES 100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME 50000
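+
+/* A sketch of the retry pattern these constants support (illustrative
+only; the real loops live in the rollback and purge callers):
+
+	ulint	n_tries = 0;
+retry:
+	// ... attempt the pessimistic delete, yielding err ...
+	if (err == DB_OUT_OF_FILE_SPACE
+	    && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+		n_tries++;
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+		goto retry;
+	}
+*/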
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID 0 /*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO 4 /*!< page no where stored */
+#define BTR_EXTERN_OFFSET 8 /*!< offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12 /*!< 8 bytes containing the
+ length of the externally
+ stored part of the BLOB.
+ The 2 highest bits are
+					reserved for the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG 128
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG 64
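+
+/* A sketch of decoding such a 20-byte reference (illustrative only;
+"ref" stands for a pointer to the reference at the end of the locally
+stored part of the field; the real readers use the same mach_read_*
+primitives):
+
+	ulint		space_id = mach_read_from_4(ref + BTR_EXTERN_SPACE_ID);
+	ulint		page_no  = mach_read_from_4(ref + BTR_EXTERN_PAGE_NO);
+	ulint		offset   = mach_read_from_4(ref + BTR_EXTERN_OFFSET);
+	ib_uint64_t	len	 = mach_read_from_8(ref + BTR_EXTERN_LEN)
+		& ~(((ib_uint64_t) (BTR_EXTERN_OWNER_FLAG
+				    | BTR_EXTERN_INHERITED_FLAG)) << 56);
+*/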
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_non_sea;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_non_sea_old;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_sea_old;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/* Flag to limit optimistic insert records */
+extern uint btr_cur_limit_optimistic_insert_debug;
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic
new file mode 100644
index 00000000000..43ee3304c0e
--- /dev/null
+++ b/storage/innobase/include/btr0cur.ic
@@ -0,0 +1,223 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\
+if (btr_cur_limit_optimistic_insert_debug > 1\
+ && (NREC) >= (ulint)btr_cur_limit_optimistic_insert_debug) {\
+ CODE;\
+}
+#else
+# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(&((btr_cur_t*) cursor)->page_cur);
+}
+
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ page_cur_invalidate(&(cursor->page_cur));
+}
+
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor) /*!< out: cursor */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+
+ cursor->index = index;
+}
+
+/*********************************************************************//**
+Checks if compressing an index page where a btr cursor is placed makes
+sense.
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2,
+ return(FALSE));
+
+ if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ return(dict_index_get_page(cursor->index)
+ != page_get_page_no(page));
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or, recommended).
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))
+ || (page_get_n_recs(page) < 2)) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ return(dict_index_get_page(cursor->index)
+ == page_get_page_no(page));
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+ enum blob_op op) /*!< in: operation */
+{
+ switch (op) {
+ case BTR_STORE_INSERT:
+ return(FALSE);
+ case BTR_STORE_INSERT_UPDATE:
+ case BTR_STORE_UPDATE:
+ return(TRUE);
+ }
+
+ ut_ad(0);
+ return(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000000..cfbaacf4de3
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,548 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "btr0types.h"
+
+/* Relative positions for a stored cursor position */
+#define BTR_PCUR_ON 1
+#define BTR_PCUR_BEFORE 2
+#define BTR_PCUR_AFTER 3
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */
+#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+UNIV_INTERN
+void
+btr_pcur_reset(
+/*===========*/
+ btr_pcur_t* cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor); /*!< in, own: persistent cursor */
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is
+ copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the btree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m) \
+ btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
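+
+/* A minimal usage sketch (illustrative only; "index" and "tuple" are
+assumed to have been set up by the caller): open a cursor, scan the
+matching records forward, then close.
+
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+	btr_pcur_open(index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+		      &pcur, &mtr);
+
+	while (btr_pcur_is_on_user_rec(&pcur)) {
+		const rec_t*	rec = btr_pcur_get_rec(&pcur);
+		// ... process rec ...
+		if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+			break;
+		}
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+*/
+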
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we maybe do not acquire a latch on the cursor
+ page, but assume that the caller uses his
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ bool init_pcur, /*!< in: whether to initialize pcur */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition; in the case of PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
+ btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
+ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr) \
+ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
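+
+/* A sketch of the store/restore pattern (illustrative only; this is
+how callers keep a logical position across a mini-transaction boundary):
+
+	btr_pcur_store_position(&pcur, &mtr);
+	btr_pcur_commit_specify_mtr(&pcur, &mtr);	// cursor detached
+
+	// ... do work that must not hold the page latch ...
+
+	mtr_start(&mtr);
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+*/
+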
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr to commit */
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record
+of the page. Releases the latch on the current page, and bufferunfixes
+it. Note that to prevent a possible deadlock, the operation first
+stores the position of the cursor, releases the leaf latch, acquires
+necessary latches and restores the cursor position again before returning.
+The alphabetical position of the cursor is guaranteed to be sensible
+on return, but it may happen that the cursor is not positioned on the
+last record of any page, because the structure of the tree may have
+changed while the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the
+ first record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+#else /* UNIV_DEBUG */
+# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame)
+# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block)
+# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+ /** The persistent cursor is not positioned. */
+ BTR_PCUR_NOT_POSITIONED = 0,
+ /** The persistent cursor was previously positioned.
+ TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+ though it really should be BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation to commit the cursor with
+ mtr; similarly latch_mode may be out of date. This can
+ lead to problems if btr_pcur is not used the right way;
+ all current code should be ok. */
+ BTR_PCUR_WAS_POSITIONED,
+ /** The persistent cursor is positioned by optimistic get to the same
+ record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+ It may need adjustment depending on previous/current search direction
+ and rel_pos. */
+ BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+ /** The persistent cursor is positioned by index search.
+ Or optimistic get for rel_pos == BTR_PCUR_ON. */
+ BTR_PCUR_IS_POSITIONED
+};
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t{
+ btr_cur_t btr_cur; /*!< a B-tree cursor */
+ ulint latch_mode; /*!< see TODO note below!
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
+ BTR_MODIFY_TREE, or BTR_NO_LATCHES,
+ depending on the latching state of
+ the page and tree where the cursor is
+ positioned; BTR_NO_LATCHES means that
+ the cursor is not currently positioned:
+ we say then that the cursor is
+ detached; it can be restored to
+ attached if the old position was
+ stored in old_rec */
+ ulint old_stored; /*!< BTR_PCUR_OLD_STORED
+ or BTR_PCUR_OLD_NOT_STORED */
+ rec_t* old_rec; /*!< if cursor position is stored,
+ contains an initial segment of the
+ latest record cursor was positioned
+ either on, before, or after */
+ ulint old_n_fields; /*!< number of fields in old_rec */
+ ulint rel_pos; /*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or
+ BTR_PCUR_AFTER, depending on whether
+ cursor was on, before, or after the
+ old_rec record */
+	buf_block_t*	block_when_stored;/*!< buffer block when the position
+					was stored */
+ ib_uint64_t modify_clock; /*!< the modify clock value of the
+ buffer block when the cursor position
+ was stored */
+ enum pcur_pos_t pos_state; /*!< btr_pcur_store_position() and
+ btr_pcur_restore_position() state. */
+ ulint search_mode; /*!< PAGE_CUR_G, ... */
+ trx_t* trx_if_known; /*!< the transaction, if we know it;
+ otherwise this field is not defined;
+ can ONLY BE USED in error prints in
+ fatal assertion failures! */
+ /*-----------------------------*/
+ /* NOTE that the following fields may possess dynamically allocated
+ memory which should be freed if not needed anymore! */
+
+ byte* old_rec_buf; /*!< NULL, or a dynamically allocated
+ buffer for old_rec */
+ ulint buf_size; /*!< old_rec_buf size if old_rec_buf
+ is not NULL */
+};
+
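+/* Magic values for btr_pcur_t::old_stored; presumably arbitrary
+sentinels chosen so that stray or uninitialized values are easy to
+spot (their exact choice is not documented). */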
+#define BTR_PCUR_OLD_STORED 908467085
+#define BTR_PCUR_OLD_NOT_STORED 122766467
+
+#ifndef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0pcur.ic b/storage/innobase/include/btr0pcur.ic
new file mode 100644
index 00000000000..7e355d3709d
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.ic
@@ -0,0 +1,606 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(cursor->rel_pos);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cur = &cursor->btr_cur;
+ return((btr_cur_t*) btr_cur);
+}
+
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_pcur_is_before_first_on_page(cursor)
+ || btr_pcur_is_after_last_on_page(cursor)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_after_last(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ } else {
+ btr_pcur_move_to_next_on_page(cursor);
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_next_on_page(cursor);
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr to commit */
+{
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level in the btree */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
+ btr_cursor, 0, file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we maybe do not acquire a latch on the cursor
+ page, but assume that the caller uses his
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, has_search_latch,
+ file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ bool from_left, /*!< in: true if open to the low end,
+ false if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in/out: cursor */
+ bool init_pcur, /*!< in: whether to initialize pcur */
+ ulint level, /*!< in: level to search for
+ (0=leaf) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
+
+ if (init_pcur) {
+ btr_pcur_init(pcur);
+ }
+
+ btr_cur_open_at_index_side(from_left, index, latch_mode,
+ btr_pcur_get_btr_cur(pcur), level, mtr);
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ pcur->trx_if_known = NULL;
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* Initialize the cursor */
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = PAGE_CUR_G;
+
+ btr_pcur_init(cursor);
+
+ btr_cur_open_at_rnd_pos_func(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor),
+ file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+release the latch manually, either by calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by committing the mini-transaction right after btr_pcur_close().
+A subsequent attempt to crawl the same page in the same mtr would cause
+an assertion failure. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec = NULL;
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->btr_cur.page_cur.block = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
+
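+/* Editorial sketch, not part of the original source: one way to satisfy
+the warning above btr_pcur_close(), assuming pcur is positioned on an
+S-latched leaf page inside the mini-transaction mtr (an mtr_t*).
+Committing the mtr right after closing releases the page latch:
+
+	btr_pcur_close(&pcur);
+	mtr_commit(mtr);
+
+Alternatively, release the leaf page latch explicitly while the cursor
+still remembers it, and only then close:
+
+	btr_leaf_page_release(btr_pcur_get_block(&pcur),
+			      pcur.latch_mode, mtr);
+	btr_pcur_close(&pcur);
+*/
+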
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_before_first(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
new file mode 100644
index 00000000000..848bde451a0
--- /dev/null
+++ b/storage/innobase/include/btr0sea.h
@@ -0,0 +1,288 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "univ.i"
+
+#include "rem0rec.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+#include "mtr0mtr.h"
+#include "ha0ha.h"
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size); /*!< in: hash index hash table size */
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void);
+/*=====================*/
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void);
+/*====================*/
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void);
+/*====================*/
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull));
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: heap where created */
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+ btr_search_t* info); /*!< in: search info. */
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ btr_search_t* info, /*!< in: index search info */
+ const dtuple_t* tuple, /*!< in: logical record */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< out: tree cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_page is already hashed,
+then the hash index for page, if any, is dropped. If new_page is not hashed,
+and page is hashed, then a new hash index is built to new_page with the same
+parameters as page (this often happens when a page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ buf_block_t* new_block, /*!< in: records are copied
+ to this page */
+ buf_block_t* block, /*!< in: index page from which
+ records were copied, and the
+ copied records will be deleted
+ from this page */
+ dict_index_t* index); /*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ buf_block_t* block); /*!< in: block containing index page,
+ s- or x-latched, or an index page
+ for which we know that
+ block->buf_fix_count == 0 */
+/********************************************************************//**
+Drops a possible page hash index when a page is evicted from the buffer pool
+or freed in a file segment. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no); /*!< in: page number */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void);
+/*======================*/
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/** The search info struct in an index */
+struct btr_search_t{
+ ulint ref_count; /*!< Number of blocks in this index tree
+ that have a search index built,
+ i.e., block->index points to this index.
+ Protected by btr_search_latch except
+ during initialization in
+ btr_search_info_create(). */
+
+ /* @{ The following fields are not protected by any latch.
+ Unfortunately, this means that they must be aligned to
+ the machine word, i.e., they cannot be turned into bit-fields. */
+ buf_block_t* root_guess;/*!< the root page frame when it was last time
+ fetched, or NULL */
+ ulint hash_analysis; /*!< when this exceeds
+ BTR_SEARCH_HASH_ANALYSIS, the hash
+ analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /*!< TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;
+ /*!< number of consecutive searches
+ which would have succeeded, or did succeed,
+ using the hash index;
+ the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+ /* @} */
+ /*---------------------- @{ */
+ ulint n_fields; /*!< recommended prefix length for hash search:
+ number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes in
+ an incomplete field
+ @see BTR_PAGE_MAX_REC_SIZE */
+ ibool left_side; /*!< TRUE or FALSE, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+ ulint n_hash_succ; /*!< number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /*!< number of failed hash searches */
+ ulint n_patt_succ; /*!< number of successful pattern searches thus
+ far */
+ ulint n_searches; /*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N 1112765
+#endif /* UNIV_DEBUG */
+};
+
+/** The hash index system */
+struct btr_search_sys_t{
+ hash_table_t* hash_index; /*!< the adaptive hash index,
+ mapping dtuple_fold values
+ to rec_t pointers on index pages */
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t* btr_search_sys;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope of building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+wait this many searches again before retrying. This is to reduce
+contention. */
+#define BTR_SEA_TIMEOUT 10000
+
+#ifndef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/btr0sea.ic b/storage/innobase/include/btr0sea.ic
new file mode 100644
index 00000000000..0bd869be136
--- /dev/null
+++ b/storage/innobase/include/btr0sea.ic
@@ -0,0 +1,82 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ return(index->search_info);
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ btr_search_t* info;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
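+
+/* Editorial note, not part of the original source: a worked example of
+the check above. With BTR_SEARCH_HASH_ANALYSIS defined as 17, the first
+16 calls to btr_search_info_update() after info->hash_analysis is reset
+return without doing anything; only from the 17th call onwards is
+btr_search_info_update_slow() invoked. */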
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000000..c1a4531f861
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "univ.i"
+
+#include "rem0types.h"
+#include "page0types.h"
+#include "sync0rw.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifndef UNIV_HOTBACKUP
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t* btr_search_latch_temp;
+
+#endif /* UNIV_HOTBACKUP */
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch (*btr_search_latch_temp)
+
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch. */
+extern char btr_search_enabled;
+
+#ifdef UNIV_BLOB_DEBUG
+# include "buf0types.h"
+/** An index->blobs entry for keeping track of off-page column references */
+struct btr_blob_dbg_t;
+
+/** Insert to index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_insert(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Remove from index->blobs a reference to an off-page column.
+@param index the index tree
+@param b the reference
+@param ctx context (for logging) */
+UNIV_INTERN
+void
+btr_blob_dbg_rbt_delete(
+/*====================*/
+ dict_index_t* index, /*!< in/out: index tree */
+ const btr_blob_dbg_t* b, /*!< in: the reference */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Add to index->blobs any references to off-page columns from a record.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add_rec(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Remove from index->blobs any references to off-page columns from a record.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove_rec(
+/*====================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in/out: index */
+ const ulint* offsets,/*!< in: offsets */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and add to index->blobs any references to off-page columns
+from records on a page.
+@return number of references added */
+UNIV_INTERN
+ulint
+btr_blob_dbg_add(
+/*=============*/
+ const page_t* page, /*!< in: rewritten page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Count and remove from index->blobs any references to off-page columns
+from records on a page.
+Used when reorganizing a page, before copying the records.
+@return number of references removed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_remove(
+/*================*/
+ const page_t* page, /*!< in: b-tree page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+/**************************************************************//**
+Restore in index->blobs any references to off-page columns
+Used when page reorganize fails due to compressed page overflow. */
+UNIV_INTERN
+void
+btr_blob_dbg_restore(
+/*=================*/
+ const page_t* npage, /*!< in: page that failed to compress */
+ const page_t* page, /*!< in: copy of original page */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx) /*!< in: context (for logging) */
+ __attribute__((nonnull));
+
+/** Operation that processes the BLOB references of an index record
+@param[in] rec record on index page
+@param[in,out] index the index tree of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@param[in] ctx context (for logging)
+@return number of BLOB references processed */
+typedef ulint (*btr_blob_dbg_op_f)
+(const rec_t* rec,dict_index_t* index,const ulint* offsets,const char* ctx);
+
+/**************************************************************//**
+Count and process all references to off-page columns on a page.
+@return number of references processed */
+UNIV_INTERN
+ulint
+btr_blob_dbg_op(
+/*============*/
+ const page_t* page, /*!< in: B-tree leaf page */
+ const rec_t* rec, /*!< in: record to start from
+ (NULL to process the whole page) */
+ dict_index_t* index, /*!< in/out: index */
+ const char* ctx, /*!< in: context (for logging) */
+ const btr_blob_dbg_op_f op) /*!< in: operation on records */
+ __attribute__((nonnull(1,3,4,5)));
+#else /* UNIV_BLOB_DEBUG */
+# define btr_blob_dbg_add_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_add(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_remove_rec(rec, index, offsets, ctx) ((void) 0)
+# define btr_blob_dbg_remove(page, index, ctx) ((void) 0)
+# define btr_blob_dbg_restore(npage, page, index, ctx) ((void) 0)
+# define btr_blob_dbg_op(page, rec, index, ctx, op) ((void) 0)
+#endif /* UNIV_BLOB_DEBUG */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define BTR_EXTERN_FIELD_REF_SIZE 20
+
+/** A BLOB field reference full of zero, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
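+
+/* Editorial sketch, not part of the original source: field_ref_zero is
+typically compared against a 20-byte external field reference to detect
+one that has not been written yet. Here field_ref is a hypothetical
+const byte* pointing at the reference inside a record:
+
+	if (!memcmp(field_ref, field_ref_zero,
+		    BTR_EXTERN_FIELD_REF_SIZE)) {
+		... the BLOB reference is still all zero, i.e. unset ...
+	}
+*/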
+
+#endif
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000000..fab9a4b828b
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "univ.i"
+#include "buf0types.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool->mutex and must not hold buf_pool->zip_mutex or any
+block->mutex. The buf_pool->mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return allocated block, never NULL */
+UNIV_INLINE
+byte*
+buf_buddy_alloc(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which
+ the page resides */
+ ulint size, /*!< in: compressed page size
+ (between UNIV_ZIP_SIZE_MIN and
+ UNIV_PAGE_SIZE) */
+ ibool* lru) /*!< in: pointer to a variable
+ that will be assigned TRUE if
+ storage was allocated from the
+ LRU list and buf_pool->mutex was
+ temporarily released */
+ __attribute__((malloc, nonnull));
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which
+ the block resides */
+ void* buf, /*!< in: block to be freed, must not
+ be pointed to by the buffer pool */
+ ulint size) /*!< in: block size,
+ up to UNIV_PAGE_SIZE */
+ __attribute__((nonnull));
+
+#ifndef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buddy.ic b/storage/innobase/include/buf0buddy.ic
new file mode 100644
index 00000000000..be2f950162d
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.ic
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.ic
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "buf0buf.h"
+#include "buf0buddy.h"
+#include "ut0ut.h"
+#include "sync0sync.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex.
+The buf_pool->mutex may be released and reacquired.
+@return allocated block, never NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru) /*!< in: pointer to a variable that
+ will be assigned TRUE if storage was
+ allocated from the LRU list and
+ buf_pool->mutex was temporarily
+ released */
+ __attribute__((malloc, nonnull));
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i) /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Get the index of buf_pool->zip_free[] for a given block size.
+@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */
+UNIV_INLINE
+ulint
+buf_buddy_get_slot(
+/*===============*/
+ ulint size) /*!< in: block size */
+{
+ ulint i;
+ ulint s;
+
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+
+ for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+ }
+
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ return(i);
+}
+
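+/* Editorial note, not part of the original source: a worked example of
+buf_buddy_get_slot(), assuming BUF_BUDDY_LOW is 1024 bytes. For
+size == 4096 the loop visits s = 1024, 2048, 4096 and increments i
+twice, so slot 2 is returned; size == BUF_BUDDY_LOW returns slot 0. */
+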
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool->mutex and must not hold buf_pool->zip_mutex or any
+block->mutex. The buf_pool->mutex may be released and reacquired.
+This function should only be used for allocating compressed page frames.
+@return allocated block, never NULL */
+UNIV_INLINE
+byte*
+buf_buddy_alloc(
+/*============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which
+ the page resides */
+ ulint size, /*!< in: compressed page size
+ (between UNIV_ZIP_SIZE_MIN and
+ UNIV_PAGE_SIZE) */
+ ibool* lru) /*!< in: pointer to a variable
+ that will be assigned TRUE if
+ storage was allocated from the
+ LRU list and buf_pool->mutex was
+ temporarily released */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size),
+ lru));
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which
+ the block resides */
+ void* buf, /*!< in: block to be freed, must not
+ be pointed to by the buffer pool */
+ ulint size) /*!< in: block size,
+ up to UNIV_PAGE_SIZE */
+{
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ buf_buddy_free_low(buf_pool, buf, buf_buddy_get_slot(size));
+}
+
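+/* Editorial sketch, not part of the original source: a simplified
+pairing of the two calls above. Both must be made under buf_pool->mutex
+with the same power-of-2 size; lru is a hypothetical local that is set
+to TRUE if the allocation had to take storage from the LRU list:
+
+	ibool	lru = FALSE;
+	byte*	frame;
+
+	buf_pool_mutex_enter(buf_pool);
+	frame = buf_buddy_alloc(buf_pool, zip_size, &lru);
+	... use the frame for a compressed page ...
+	buf_buddy_free(buf_pool, frame, zip_size);
+	buf_pool_mutex_exit(buf_pool);
+*/
+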
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000000..b669bd203e0
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2179 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+#include "univ.i"
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0rbt.h"
+#include "os0proc.h"
+#include "log0log.h"
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET 10 /*!< get always */
+#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
+ the block young in the LRU list */
+#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
+ set no latch; we have
+ separated this case, because
+ it is error-prone programming
+ not to set a latch, and it
+ should be used with care */
+#define BUF_GET_IF_IN_POOL_OR_WATCH 15
+ /*!< Get the page only if it's in the
+ buffer pool, if not then set a watch
+ on the page. */
+#define BUF_GET_POSSIBLY_FREED 16
+ /*!< Like BUF_GET, but do not mind
+ if the file page has been freed. */
+/* @} */
+/** @name Modes for buf_page_get_known_nowait */
+/* @{ */
+#define BUF_MAKE_YOUNG 51 /*!< Move the block to the
+ start of the LRU list if there
+ is a danger that the block
+ would drift out of the buffer
+ pool */
+#define BUF_KEEP_OLD 52 /*!< Preserve the current LRU
+ position of the block. */
+/* @} */
+
+#define MAX_BUFFER_POOLS_BITS 6 /*!< Number of bits used to represent
+ a buffer pool ID */
+
+#define MAX_BUFFER_POOLS (1 << MAX_BUFFER_POOLS_BITS)
+ /*!< The maximum number of buffer
+ pools that can be defined */
+
+#define BUF_POOL_WATCH_SIZE (srv_n_purge_threads + 1)
+ /*!< Maximum number of concurrent
+ buffer pool watches */
+#define MAX_PAGE_HASH_LOCKS 1024 /*!< The maximum number of
+ page_hash locks */
+
+extern buf_pool_t* buf_pool_ptr; /*!< The buffer pools
+ of the database */
+#ifdef UNIV_DEBUG
+extern ibool buf_debug_prints;/*!< If this is set TRUE, the program
+ prints info whenever read or flush
+ occurs */
+#endif /* UNIV_DEBUG */
+extern ulint srv_buf_pool_instances;
+extern ulint srv_buf_pool_curr_size;
+#else /* !UNIV_HOTBACKUP */
+extern buf_block_t* back_block1; /*!< first block, for --apply-log */
+extern buf_block_t* back_block2; /*!< second block, for page reorganize */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+/** @brief States of a control block
+@see buf_page_t
+
+The enumeration values must be 0..7. */
+enum buf_page_state {
+ BUF_BLOCK_POOL_WATCH, /*!< a sentinel for the buffer pool
+ watch, element of buf_pool->watch[] */
+ BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
+ compressed page */
+ BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
+ page that is in the
+ buf_pool->flush_list */
+
+ BUF_BLOCK_NOT_USED, /*!< is in the free list;
+ must be after the BUF_BLOCK_ZIP_
+ constants for compressed-only pages
+ @see buf_block_state_valid() */
+ BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block
+ returns a block, it is in this state */
+ BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */
+ BUF_BLOCK_MEMORY, /*!< contains some main memory
+ object */
+ BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed
+ before putting to the free list */
+};
+
+
+/** This structure defines information we will fetch from each buffer pool. It
+will be used to print table IO stats */
+struct buf_pool_info_t{
+ /* General buffer pool info */
+ ulint pool_unique_id; /*!< Buffer Pool ID */
+ ulint pool_size; /*!< Buffer Pool size in pages */
+ ulint lru_len; /*!< Length of buf_pool->LRU */
+ ulint old_lru_len; /*!< buf_pool->LRU_old_len */
+ ulint free_list_len; /*!< Length of buf_pool->free list */
+ ulint flush_list_len; /*!< Length of buf_pool->flush_list */
+ ulint n_pend_unzip; /*!< buf_pool->n_pend_unzip, pages
+ pending decompress */
+ ulint n_pend_reads; /*!< buf_pool->n_pend_reads, pages
+ pending read */
+ ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */
+ ulint n_pending_flush_single_page;/*!< Pages pending to be
+ flushed as part of single page
+ flushes issued by various user
+ threads */
+ ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH
+ LIST */
+ ulint n_pages_made_young; /*!< number of pages made young */
+ ulint n_pages_not_made_young; /*!< number of pages not made young */
+ ulint n_pages_read; /*!< buf_pool->n_pages_read */
+ ulint n_pages_created; /*!< buf_pool->n_pages_created */
+ ulint n_pages_written; /*!< buf_pool->n_pages_written */
+ ulint n_page_gets; /*!< buf_pool->n_page_gets */
+ ulint n_ra_pages_read_rnd; /*!< buf_pool->n_ra_pages_read_rnd,
+ number of pages read by
+ random read-ahead */
+ ulint n_ra_pages_read; /*!< buf_pool->n_ra_pages_read, number
+ of pages read by linear read-ahead */
+ ulint n_ra_pages_evicted; /*!< buf_pool->n_ra_pages_evicted,
+ number of readahead pages evicted
+ without access */
+ ulint n_page_get_delta; /*!< num of buffer pool page gets since
+ last printout */
+
+ /* Buffer pool access stats */
+ double page_made_young_rate; /*!< page made young rate in pages
+ per second */
+ double page_not_made_young_rate;/*!< page not made young rate
+ in pages per second */
+ double pages_read_rate; /*!< num of pages read per second */
+ double pages_created_rate; /*!< num of pages created per second */
+ double pages_written_rate; /*!< num of pages written per second */
+ ulint page_read_delta; /*!< num of pages read since last
+ printout */
+ ulint young_making_delta; /*!< num of pages made young since
+ last printout */
+ ulint not_young_making_delta; /*!< num of pages not made young since
+ last printout */
+
+ /* Statistics about read ahead algorithm. */
+ double pages_readahead_rnd_rate;/*!< random readahead rate in pages per
+ second */
+ double pages_readahead_rate; /*!< readahead rate in pages per
+ second */
+ double pages_evicted_rate; /*!< rate of readahead page evicted
+ without access, in pages per second */
+
+ /* Stats about LRU eviction */
+ ulint unzip_lru_len; /*!< length of buf_pool->unzip_LRU
+ list */
+ /* Counters for LRU policy */
+ ulint io_sum; /*!< buf_LRU_stat_sum.io */
+ ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO
+ for current interval */
+ ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */
+ ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num
+ pages decompressed in current
+ interval */
+};
+
+/** The occupied bytes of lists in all buffer pools */
+struct buf_pools_list_size_t {
+ ulint LRU_bytes; /*!< LRU size in bytes */
+ ulint unzip_LRU_bytes; /*!< unzip_LRU size in bytes */
+ ulint flush_list_bytes; /*!< flush_list size in bytes */
+};
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Acquire mutex on all buffer pool instances */
+UNIV_INLINE
+void
+buf_pool_mutex_enter_all(void);
+/*===========================*/
+
+/********************************************************************//**
+Release mutex on all buffer pool instances */
+UNIV_INLINE
+void
+buf_pool_mutex_exit_all(void);
+/*==========================*/
+
+/********************************************************************//**
+Creates the buffer pool.
+@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
+UNIV_INTERN
+dberr_t
+buf_pool_init(
+/*=========*/
+ ulint size, /*!< in: Size of the total pool in bytes */
+ ulint n_instances); /*!< in: Number of instances */
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(
+/*==========*/
+ ulint n_instances); /*!< in: number of instances to free */
+
+/********************************************************************//**
+Clears the adaptive hash index on all pages in the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_clear_hash_index(void);
+/*===========================*/
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets the current size of the buffer pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+/*********************************************************************//**
+Gets the current size of the buffer pool in frames.
+@return size in pages */
+UNIV_INLINE
+ulint
+buf_pool_get_n_pages(void);
+/*=======================*/
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INTERN
+lsn_t
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed. In case
+of failure we assert in this function. */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+ __attribute__((malloc));
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+ __attribute__((nonnull));
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INTERN
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ buf_pool_t* buf_pool); /*!< in: buffer pool instance,
+ or NULL for round-robin selection
+ of the buffer pool */
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block); /*!< in, own: block to be freed */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame); /*!< in: buffer frame */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+in LA! */
+#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\
+ SP, ZS, OF, LA, NULL,\
+ BUF_GET, __FILE__, __LINE__, MTR)
+/**************************************************************//**
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
+ SP, ZS, OF, RW_NO_LATCH, NULL,\
+ BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
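+/* Editorial sketch, not part of the original source: typical use of the
+buf_page_get() macro inside a mini-transaction; space, zip_size and
+page_no are hypothetical values identifying the page:
+
+	mtr_t		mtr;
+	buf_block_t*	block;
+
+	mtr_start(&mtr);
+	block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, &mtr);
+	... access the frame via buf_block_get_frame(block) ...
+	mtr_commit(&mtr);
+*/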
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/*******************************************************************//**
+Given a tablespace id and page number, tries to get that page. If the
+page is not in the buffer pool, it is not loaded and NULL is returned.
+Suitable for use when holding the lock_sys_t::mutex. */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/** Tries to get a page. If the page is not in the buffer pool it is
+not loaded. Suitable for use when holding the lock_sys_t::mutex.
+@param space_id in: tablespace id
+@param page_no in: page number
+@param mtr in: mini-transaction
+@return the page if in buffer pool, NULL if not */
+#define buf_page_try_get(space_id, page_no, mtr) \
+ buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr)
+
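+/* Editorial sketch, not part of the original source: buf_page_try_get()
+returns NULL when the page is not resident, so the caller must check the
+result; space_id and page_no are hypothetical, and mtr is an active
+mini-transaction:
+
+	const buf_block_t*	block;
+
+	block = buf_page_try_get(space_id, page_no, &mtr);
+
+	if (block != NULL) {
+		... the page was found in the buffer pool ...
+	}
+*/
+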
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block, or NULL if not compressed */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH or
+ BUF_GET_IF_IN_POOL_OR_WATCH */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+/********************************************************************//**
+Initializes a page in the buffer pool. The page is usually not read
+from a file, even if it cannot be found in the buffer pool. This is one
+of the functions which perform the state transition NOT_USED =>
+FILE_PAGE on a block (the other is buf_page_get_gen).
+@return pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a page in the buffer pool, for use in mysqlbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block); /*!< in: block to init */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage); /*!< in: buffer block */
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage); /*!< in: buffer block of a file page */
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset); /*!< in: page number */
+#endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+ __attribute__((pure));
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+
+/********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve the buffer
+pool mutex.
+@return TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+ const buf_page_t* bpage); /*!< in: block */
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage); /*!< in: block to make younger */
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+lsn_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage); /*!< in: block containing the
+ page frame */
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either
+(1) own buf_pool->mutex while the block bufferfix count is zero, or
+(2) own an x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+# ifdef UNIV_SYNC_DEBUG
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line */
+# endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+ __attribute__((nonnull));
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_fix(
+/*===========*/
+ buf_block_t* block); /*!< in/out: block to bufferfix */
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_unfix(
+/*===========*/
+ buf_block_t* block); /*!< in/out: block to bufferunfix */
+
+# ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+# else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+# endif /* UNIV_SYNC_DEBUG */
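+/* Editorial sketch, not part of the original source: call sites pass
+their location explicitly, e.g.
+
+	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+
+so that UNIV_SYNC_DEBUG builds can record where the bufferfix was
+requested, while other builds discard the two extra arguments. */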
+#else /* !UNIV_HOTBACKUP */
+# define buf_block_modify_clock_inc(block) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ bool check_lsn, /*!< in: true if we need to check
+ and complain about the LSN */
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Checks if a page is all zeroes.
+@return TRUE if the page is all zeroes */
+bool
+buf_page_is_zeroes(
+/*===============*/
+ const byte* read_buf, /*!< in: a database page */
+ const ulint zip_size); /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr); /*!< out: page offset and byte offset */
+/**********************************************************************//**
+Gets the hash value of a block. This can be used in searches in the
+lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ const void* data); /*!< in: pointer to compressed page */
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+FIXME_FTS: Gets the frame the pointer is pointing to.
+@return pointer to frame */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+ byte* ptr); /*!< in: pointer to a frame */
+
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void);
+/*==============*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer pool data structure. */
+UNIV_INTERN
+void
+buf_print(void);
+/*============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+enum buf_page_print_flags {
+ /** Do not crash at the end of buf_page_print(). */
+ BUF_PAGE_PRINT_NO_CRASH = 1,
+ /** Do not print the full page dump. */
+ BUF_PAGE_PRINT_NO_FULL = 2
+};
+
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size, /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+ ulint flags) /*!< in: 0 or
+ BUF_PAGE_PRINT_NO_CRASH or
+ BUF_PAGE_PRINT_NO_FULL */
+ UNIV_COLD __attribute__((nonnull));
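+/* Usage sketch (a hypothetical call site; the two flag values are bit
+flags and may be combined):
+@code
+ buf_page_print(read_buf, zip_size,
+ BUF_PAGE_PRINT_NO_CRASH | BUF_PAGE_PRINT_NO_FULL);
+@endcode */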
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check); /*!< in: TRUE=verify the page checksum */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void);
+/*==============================*/
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Returns the number of pending buf pool read ios.
+@return number of pending read I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_read_ios(void);
+/*============================*/
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/*******************************************************************//**
+Collects stats information for a buffer pool instance. Also
+records aggregated stats if there is more than one buffer pool
+in the server. */
+UNIV_INTERN
+void
+buf_stats_get_pool_info(
+/*====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint pool_id, /*!< in: buffer pool ID */
+ buf_pool_info_t* all_pool_info); /*!< in/out: buffer pool info
+ to fill */
+/*********************************************************************//**
+Returns the percentage of modified (dirty) pages relative to all
+database pages in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void);
+/*============================*/
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(
+/*=================*/
+ buf_pool_t* buf_pool); /*!< buffer pool instance */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats_all(void);
+/*==========================*/
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void);
+/*===============*/
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return number of pending i/o operations */
+UNIV_INTERN
+ulint
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void);
+/*=====================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level); /*!< in: latching order level */
+#else /* UNIV_SYNC_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage); /*!< in: pointer to the control block */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+ib_mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+buf_flush_t
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+ __attribute__((pure));
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ buf_flush_t flush_type); /*!< in: flush type */
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no);/*!< in: page number */
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage); /*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage); /*!< in/out: control block */
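+/* Sketch of the intended set/unset pattern (illustrative only; per the
+comments above, both calls are made while holding buf_pool->mutex and
+the block mutex):
+@code
+ buf_page_set_sticky(bpage);
+ // release buf_pool->mutex and the block mutex, do slow work,
+ // then reacquire both mutexes
+ buf_page_unset_sticky(bpage);
+@endcode */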
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old); /*!< in: old */
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((nonnull, pure));
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL. Note: even though bpage is not declared
+const, we don't update its value. It is safe to make this pure.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+ __attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+#else /* UNIV_DEBUG */
+# define buf_block_get_frame(block) (block)->frame
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+ ((block)->page.zip.data ? &(block)->page.zip : NULL)
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr); /*!< in: pointer to a frame */
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it.
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr); /*!< in: pointer not
+ dereferenced */
+/** Find out if a pointer corresponds to a buf_block_t::mutex.
+@param m in: mutex candidate
+@return TRUE if m is a buf_block_t::mutex */
+#define buf_pool_is_block_mutex(m) \
+ buf_pointer_is_block_field((const void*)(m))
+/** Find out if a pointer corresponds to a buf_block_t::lock.
+@param l in: rw-lock candidate
+@return TRUE if l is a buf_block_t::lock */
+#define buf_pool_is_block_lock(l) \
+ buf_pointer_is_block_field((const void*)(l))
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr); /*!< in: pointer to the page */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/********************************************************************//**
+Initializes a page for reading into the buffer pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ dberr_t* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset);/*!< in: page number */
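+/* Hypothetical caller sketch (the local variable names are assumptions;
+the mode constants come from buf0rea.h):
+@code
+ dberr_t err;
+ buf_page_t* bpage = buf_page_init_for_read(
+ &err, BUF_READ_ANY_PAGE, space, zip_size, FALSE,
+ tablespace_version, offset);
+ if (bpage == NULL) {
+ // page already cached, ibuf-only mode mismatch, or
+ // tablespace dropped; err tells the reason
+ }
+@endcode */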
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool.
+@return true if successful */
+UNIV_INTERN
+bool
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+ __attribute__((const));
+/********************************************************************//**
+Calculates the index of a buffer pool to the buf_pool[] array.
+@return the position of the buffer pool in buf_pool[] */
+UNIV_INLINE
+ulint
+buf_pool_index(
+/*===========*/
+ const buf_pool_t* buf_pool) /*!< in: buffer pool */
+ __attribute__((nonnull, const));
+/******************************************************************//**
+Returns the buffer pool instance given a page instance
+@return buf_pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_bpage(
+/*================*/
+ const buf_page_t* bpage); /*!< in: buffer pool page */
+/******************************************************************//**
+Returns the buffer pool instance given a block instance
+@return buf_pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_block(
+/*================*/
+ const buf_block_t* block); /*!< in: block */
+/******************************************************************//**
+Returns the buffer pool instance given space and offset of page
+@return buffer pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: offset of the page within space */
+/******************************************************************//**
+Returns the buffer pool instance given its array index
+@return buffer pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_array(
+/*================*/
+ ulint index); /*!< in: array index to get
+ buffer pool instance from */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get_low(
+/*==================*/
+ buf_pool_t* buf_pool,/*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space */
+ ulint fold); /*!< in: buf_page_address_fold(space, offset) */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return block, NULL if not found, or watch sentinel (if watch is true) */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get_locked(
+/*=====================*/
+ /*!< out: pointer to the bpage,
+ or NULL; if NULL, hash_lock
+ is also NULL. */
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ rw_lock_t** lock, /*!< in/out: lock of the page
+ hash acquired if bpage is
+ found. NULL otherwise. If NULL
+ is passed then the hash_lock
+ is released by this function */
+ ulint lock_mode, /*!< in: RW_LOCK_EX or
+ RW_LOCK_SHARED. Ignored if
+ lock == NULL */
+ bool watch = false); /*!< in: if true, return watch
+ sentinel also. */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode. Otherwise,
+mode value is ignored. It is up to the caller to release the
+lock. If the block is found and the lock is NULL then the page_hash
+lock is released by this function.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get_locked(
+/*=====================*/
+ /*!< out: pointer to the bpage,
+ or NULL; if NULL, hash_lock
+ is also NULL. */
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ rw_lock_t** lock, /*!< in/out: lock of the page
+ hash acquired if bpage is
+ found. NULL otherwise. If NULL
+ is passed then the hash_lock
+ is released by this function */
+ ulint lock_mode); /*!< in: RW_LOCK_EX or
+ RW_LOCK_SHARED. Ignored if
+ lock == NULL */
+/* There are four different ways we can try to get a bpage or block
+from the page hash:
+1) Caller already holds the appropriate page hash lock: in that case call
+the buf_page_hash_get_low() function.
+2) Caller wants to hold page hash lock in x-mode
+3) Caller wants to hold page hash lock in s-mode
+4) Caller doesn't want to hold page hash lock */
+#define buf_page_hash_get_s_locked(b, s, o, l) \
+ buf_page_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_page_hash_get_x_locked(b, s, o, l) \
+ buf_page_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_page_hash_get(b, s, o) \
+ buf_page_hash_get_locked(b, s, o, NULL, 0)
+#define buf_page_get_also_watch(b, s, o) \
+ buf_page_hash_get_locked(b, s, o, NULL, 0, true)
+
+#define buf_block_hash_get_s_locked(b, s, o, l) \
+ buf_block_hash_get_locked(b, s, o, l, RW_LOCK_SHARED)
+#define buf_block_hash_get_x_locked(b, s, o, l) \
+ buf_block_hash_get_locked(b, s, o, l, RW_LOCK_EX)
+#define buf_block_hash_get(b, s, o) \
+ buf_block_hash_get_locked(b, s, o, NULL, 0)
+
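+/* Illustrative lookup using the s-locked variant above (a sketch; the
+caller releases the acquired page_hash lock when done):
+@code
+ rw_lock_t* hash_lock;
+ buf_page_t* bpage = buf_page_hash_get_s_locked(
+ buf_pool, space, offset, &hash_lock);
+ if (bpage != NULL) {
+ // ... inspect bpage ...
+ rw_lock_s_unlock(hash_lock);
+ }
+@endcode */
+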
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+
+/********************************************************************//**
+Determine if a block is a sentinel for a buffer pool watch.
+@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
+UNIV_INTERN
+ibool
+buf_pool_watch_is_sentinel(
+/*=======================*/
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ const buf_page_t* bpage) /*!< in: block */
+ __attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+Add watch for the given page to be read in. Caller must hold the
+appropriate page_hash x-lock for the page.
+@return NULL if watch set, block if the page is in the buffer pool */
+UNIV_INTERN
+buf_page_t*
+buf_pool_watch_set(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+ __attribute__((warn_unused_result));
+/****************************************************************//**
+Stop watching if the page has been read in.
+buf_pool_watch_set(space,offset) must have returned NULL before. */
+UNIV_INTERN
+void
+buf_pool_watch_unset(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/****************************************************************//**
+Check if the page has been read in.
+This may only be called after buf_pool_watch_set(space,offset)
+has returned NULL and before invoking buf_pool_watch_unset(space,offset).
+@return FALSE if the given page was not read in, TRUE if it was */
+UNIV_INTERN
+ibool
+buf_pool_watch_occurred(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+ __attribute__((warn_unused_result));
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_list_len(
+/*===================*/
+ ulint* LRU_len, /*!< out: length of all LRU lists */
+ ulint* free_len, /*!< out: length of all free lists */
+ ulint* flush_list_len);/*!< out: length of all flush lists */
+/********************************************************************//**
+Get total list size in bytes from all buffer pools. */
+UNIV_INTERN
+void
+buf_get_total_list_size_in_bytes(
+/*=============================*/
+ buf_pools_list_size_t* buf_pools_list_size); /*!< out: list sizes
+ in all buffer pools */
+/********************************************************************//**
+Get total buffer pool statistics. */
+UNIV_INTERN
+void
+buf_get_total_stat(
+/*===============*/
+ buf_pool_stat_t*tot_stat); /*!< out: buffer pool stats */
+/*********************************************************************//**
+Get the nth chunk's buffer block in the specified buffer pool.
+@return the nth chunk's buffer block. */
+UNIV_INLINE
+buf_block_t*
+buf_get_nth_chunk_block(
+/*====================*/
+ const buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint n, /*!< in: nth chunk in the buffer pool */
+ ulint* chunk_size); /*!< out: chunk size */
+
+/********************************************************************//**
+Calculates the checksum of a page of a compressed table and updates the page. */
+UNIV_INTERN
+void
+buf_flush_update_zip_checksum(
+/*==========================*/
+ buf_frame_t* page, /*!< in/out: Page to update */
+ ulint zip_size, /*!< in: Compressed page size */
+ lsn_t lsn); /*!< in: Lsn to stamp on the page */
+
+#endif /* !UNIV_HOTBACKUP */
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+/** Number of bits used for buffer page states. */
+#define BUF_PAGE_STATE_BITS 3
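+/* A debug-build sanity sketch (assumes BUF_BLOCK_REMOVE_HASH is the
+largest buf_page_state value):
+@code
+ ut_ad(BUF_BLOCK_REMOVE_HASH < (1 << BUF_PAGE_STATE_BITS));
+@endcode */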
+
+struct buf_page_t{
+ /** @name General fields
+ None of these bit-fields must be modified without holding
+ buf_page_get_mutex() [buf_block_t::mutex or
+ buf_pool->zip_mutex], since they can be stored in the same
+ machine word. Some of these fields are additionally protected
+ by buf_pool->mutex. */
+ /* @{ */
+
+ ib_uint32_t space; /*!< tablespace id; also protected
+ by buf_pool->mutex. */
+ ib_uint32_t offset; /*!< page number; also protected
+ by buf_pool->mutex. */
+ /** count of how many times this block is currently bufferfixed */
+#ifdef PAGE_ATOMIC_REF_COUNT
+ ib_uint32_t buf_fix_count;
+
+ /** type of pending I/O operation; also protected by
+ buf_pool->mutex for writes only @see enum buf_io_fix */
+ byte io_fix;
+
+ byte state;
+#else
+ unsigned buf_fix_count:19;
+
+ /** type of pending I/O operation; also protected by
+ buf_pool->mutex for writes only @see enum buf_io_fix */
+ unsigned io_fix:2;
+
+ /*!< state of the control block; also protected by buf_pool->mutex.
+ State transitions from BUF_BLOCK_READY_FOR_USE to BUF_BLOCK_MEMORY
+ need not be protected by buf_page_get_mutex(). @see enum buf_page_state.
+ State changes that are relevant to page_hash are additionally protected
+ by the appropriate page_hash mutex i.e.: if a page is in page_hash or
+ is being added to/removed from page_hash then the corresponding changes
+ must also be protected by page_hash mutex. */
+ unsigned state:BUF_PAGE_STATE_BITS;
+
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+#ifndef UNIV_HOTBACKUP
+ unsigned flush_type:2; /*!< if this block is currently being
+ flushed to disk, this tells the
+ flush_type.
+ @see buf_flush_t */
+ unsigned buf_pool_index:6;/*!< index number of the buffer pool
+ that this block belongs to */
+# if MAX_BUFFER_POOLS > 64
+# error "MAX_BUFFER_POOLS > 64; redefine buf_pool_index:6"
+# endif
+ /* @} */
+#endif /* !UNIV_HOTBACKUP */
+ page_zip_des_t zip; /*!< compressed page; zip.data
+ (but not the data it points to) is
+ also protected by buf_pool->mutex;
+ state == BUF_BLOCK_ZIP_PAGE and
+ zip.data == NULL means an active
+ buf_pool->watch */
+#ifndef UNIV_HOTBACKUP
+ buf_page_t* hash; /*!< node used in chaining to
+ buf_pool->page_hash or
+ buf_pool->zip_hash */
+#ifdef UNIV_DEBUG
+ ibool in_page_hash; /*!< TRUE if in buf_pool->page_hash */
+ ibool in_zip_hash; /*!< TRUE if in buf_pool->zip_hash */
+#endif /* UNIV_DEBUG */
+
+ /** @name Page flushing fields
+ All these are protected by buf_pool->mutex. */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) list;
+ /*!< based on state, this is a
+ list node, protected either by
+ buf_pool->mutex or by
+ buf_pool->flush_list_mutex,
+ in one of the following lists in
+ buf_pool:
+
+ - BUF_BLOCK_NOT_USED: free
+ - BUF_BLOCK_FILE_PAGE: flush_list
+ - BUF_BLOCK_ZIP_DIRTY: flush_list
+ - BUF_BLOCK_ZIP_PAGE: zip_clean
+
+ If bpage is part of flush_list
+ then the node pointers are
+ covered by buf_pool->flush_list_mutex.
+ Otherwise these pointers are
+ protected by buf_pool->mutex.
+
+ The contents of the list node
+ is undefined if !in_flush_list
+ && state == BUF_BLOCK_FILE_PAGE,
+ or if state is one of
+ BUF_BLOCK_MEMORY,
+ BUF_BLOCK_REMOVE_HASH or
+ BUF_BLOCK_READY_FOR_USE. */
+
+#ifdef UNIV_DEBUG
+ ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
+ when buf_pool->flush_list_mutex is
+ free, the following should hold:
+ in_flush_list
+ == (state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_ZIP_DIRTY)
+ Writes to this field must be
+ covered by both block->mutex
+ and buf_pool->flush_list_mutex. Hence
+ reads can happen while holding
+ any one of the two mutexes */
+ ibool in_free_list; /*!< TRUE if in buf_pool->free; when
+ buf_pool->mutex is free, the following
+ should hold: in_free_list
+ == (state == BUF_BLOCK_NOT_USED) */
+#endif /* UNIV_DEBUG */
+ lsn_t newest_modification;
+ /*!< log sequence number of
+ the youngest modification to
+ this block, zero if not
+ modified. Protected by block
+ mutex */
+ lsn_t oldest_modification;
+ /*!< log sequence number of
+ the START of the log entry
+ written of the oldest
+ modification to this block
+ which has not yet been flushed
+ on disk; zero if all
+ modifications are on disk.
+ Writes to this field must be
+ covered by both block->mutex
+ and buf_pool->flush_list_mutex. Hence
+ reads can happen while holding
+ any one of the two mutexes */
+ /* @} */
+ /** @name LRU replacement algorithm fields
+ These fields are protected by buf_pool->mutex only (not
+ buf_pool->zip_mutex or buf_block_t::mutex). */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) LRU;
+ /*!< node of the LRU list */
+#ifdef UNIV_DEBUG
+ ibool in_LRU_list; /*!< TRUE if the page is in
+ the LRU list; used in
+ debugging */
+#endif /* UNIV_DEBUG */
+ unsigned old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool->LRU_old */
+ unsigned freed_page_clock:31;/*!< the value of
+ buf_pool->freed_page_clock
+ when this block was the last
+ time put to the head of the
+ LRU list; a thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ /* @} */
+ unsigned access_time; /*!< time of first access, or
+ 0 if the block was never accessed
+ in the buffer pool. Protected by
+ block mutex */
+# if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ibool file_page_was_freed;
+ /*!< this is set to TRUE when
+ fsp frees a page in buffer pool;
+ protected by buf_pool->zip_mutex
+ or buf_block_t::mutex. */
+# endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+ /** @name General fields */
+ /* @{ */
+
+ buf_page_t page; /*!< page information; this must
+ be the first field, so that
+ buf_pool->page_hash can point
+ to buf_page_t or buf_block_t */
+ byte* frame; /*!< pointer to buffer frame which
+ is of size UNIV_PAGE_SIZE, and
+ aligned to an address divisible by
+ UNIV_PAGE_SIZE */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+ /*!< node of the decompressed LRU list;
+ a block is in the unzip_LRU list
+ if page.state == BUF_BLOCK_FILE_PAGE
+ and page.zip.data != NULL */
+#ifdef UNIV_DEBUG
+ ibool in_unzip_LRU_list;/*!< TRUE if the page is in the
+ decompressed LRU list;
+ used in debugging */
+#endif /* UNIV_DEBUG */
+ ib_mutex_t mutex; /*!< mutex protecting this block:
+ state (also protected by the buffer
+ pool mutex), io_fix, buf_fix_count,
+ and accessed; we introduce this new
+ mutex in InnoDB-5.1 to relieve
+ contention on the buffer pool mutex */
+ rw_lock_t lock; /*!< read-write lock of the buffer
+ frame */
+ unsigned lock_hash_val:32;/*!< hashed value of the page address
+ in the record lock hash table;
+ protected by buf_block_t::lock
+ (or buf_block_t::mutex, buf_pool->mutex
+ in buf_page_get_gen(),
+ buf_page_init_for_read()
+ and buf_page_create()) */
+ ibool check_index_page_at_flush;
+ /*!< TRUE if we know that this is
+ an index page, and want the database
+ to check its consistency before flush;
+ note that there may be pages in the
+ buffer pool which are index pages,
+ but this flag is not set because
+ we do not keep track of all pages;
+ NOT protected by any mutex */
+ /* @} */
+ /** @name Optimistic search field */
+ /* @{ */
+
+ ib_uint64_t modify_clock; /*!< this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+ /* @} */
+ /** @name Hash search fields (unprotected)
+ NOTE that these fields are NOT protected by any semaphore! */
+ /* @{ */
+
+ ulint n_hash_helps; /*!< counter which controls building
+ of a new hash index for the page */
+ ulint n_fields; /*!< recommended prefix length for hash
+ search: number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes
+ in an incomplete field */
+ ibool left_side; /*!< TRUE or FALSE, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ /* @} */
+
+ /** @name Hash search fields
+ These 5 fields may only be modified when we have
+ an x-latch on btr_search_latch AND
+ - we are holding an s-latch or x-latch on buf_block_t::lock or
+ - we know that buf_block_t::buf_fix_count == 0.
+
+ An exception to this is when we init or create a page
+ in the buffer pool in buf0buf.cc.
+
+ Another exception is that assigning block->index = NULL
+ is allowed whenever holding an x-latch on btr_search_latch. */
+
+ /* @{ */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ulint n_pointers; /*!< used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+ number of full fields */
+ unsigned curr_n_bytes:15;/*!< number of bytes in hash
+ indexing */
+ unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+ dict_index_t* index; /*!< Index for which the
+ adaptive hash index has been
+ created, or NULL if the page
+ does not exist in the
+ index. Note that it does not
+ guarantee that the index is
+ complete, though: there may
+ have been hash collisions,
+ record deletions, etc. */
+ /* @} */
+# ifdef UNIV_SYNC_DEBUG
+ /** @name Debug fields */
+ /* @{ */
+ rw_lock_t debug_latch; /*!< in the debug version, each thread
+ which bufferfixes the block acquires
+ an s-latch here; so we can use the
+ debug utilities in sync0rw */
+ /* @} */
+# endif
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** Check if a buf_block_t object is in a valid state
+@param block buffer block
+@return TRUE if valid */
+#define buf_block_state_valid(block) \
+(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \
+ && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
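+
+/* Debug-build usage sketch of the check above (illustrative):
+@code
+ ut_ad(buf_block_state_valid(block));
+@endcode */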
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool->zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) ((ulint) (ptr) / UNIV_PAGE_SIZE)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+ union {
+ ulint size; /*!< size of the block */
+ byte bytes[FIL_PAGE_DATA];
+ /*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+ == BUF_BUDDY_FREE_STAMP denotes a free
+ block. If the space_id field of buddy
+ block != BUF_BUDDY_FREE_STAMP, the block
+ is not in any zip_free list. If the
+ space_id is BUF_BUDDY_FREE_STAMP then
+ stamp[0] will contain the
+ buddy block size. */
+ } stamp;
+
+ buf_page_t bpage; /*!< Embedded bpage descriptor */
+ UT_LIST_NODE_T(buf_buddy_free_t) list;
+ /*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure. */
+struct buf_pool_stat_t{
+ ulint n_page_gets; /*!< number of page gets performed;
+ also successful searches through
+ the adaptive hash index are
+ counted as page gets; this field
+ is NOT protected by the buffer
+ pool mutex */
+ ulint n_pages_read; /*!< number of read operations */
+ ulint n_pages_written;/*!< number of write operations */
+ ulint n_pages_created;/*!< number of pages created
+ in the pool with no read */
+ ulint n_ra_pages_read_rnd;/*!< number of pages read in
+ as part of random read ahead */
+ ulint n_ra_pages_read;/*!< number of pages read in
+ as part of read ahead */
+ ulint n_ra_pages_evicted;/*!< number of read ahead
+ pages that are evicted without
+ being accessed */
+ ulint n_pages_made_young; /*!< number of pages made young, in
+ calls to buf_LRU_make_block_young() */
+ ulint n_pages_not_made_young; /*!< number of pages not made
+ young because the first access
+ was not long enough ago, in
+ buf_page_peek_if_too_old() */
+ ulint LRU_bytes; /*!< LRU size in bytes */
+ ulint flush_list_bytes;/*!< flush_list size in bytes */
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+ /** Number of blocks allocated from the buddy system. */
+ ulint used;
+ /** Number of blocks relocated by the buddy system. */
+ ib_uint64_t relocated;
+ /** Total duration of block relocations, in microseconds. */
+ ib_uint64_t relocated_usec;
+};
+
+/** @brief The buffer pool structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_t{
+
+ /** @name General fields */
+ /* @{ */
+ ib_mutex_t mutex; /*!< Buffer pool mutex of this
+ instance */
+ ib_mutex_t zip_mutex; /*!< Zip mutex of this buffer
+ pool instance, protects compressed
+ only pages (of type buf_page_t, not
+ buf_block_t) */
+ ulint instance_no; /*!< Array index of this buffer
+ pool instance */
+ ulint old_pool_size; /*!< Old pool size in bytes */
+ ulint curr_pool_size; /*!< Current pool size in bytes */
+ ulint LRU_old_ratio; /*!< Reserve this much of the buffer
+ pool for "old" blocks */
+#ifdef UNIV_DEBUG
+ ulint buddy_n_frames; /*!< Number of frames allocated from
+ the buffer pool to the buddy system */
+#endif
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ulint mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ buf_chunk_t* chunks; /*!< buffer pool chunks */
+ ulint curr_size; /*!< current pool size in pages */
+ hash_table_t* page_hash; /*!< hash table of buf_page_t or
+ buf_block_t file pages,
+ buf_page_in_file() == TRUE,
+ indexed by (space_id, offset).
+ page_hash is protected by an
+ array of mutexes.
+ Changes in page_hash are protected
+ by buf_pool->mutex and the relevant
+ page_hash mutex. Lookups can happen
+ while holding the buf_pool->mutex or
+ the relevant page_hash mutex. */
+ hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks
+ whose frames are allocated to the
+ zip buddy system,
+ indexed by block->frame */
+ ulint n_pend_reads; /*!< number of pending read
+ operations */
+ ulint n_pend_unzip; /*!< number of pending decompressions */
+
+ time_t last_printout_time;
+ /*!< when buf_print_io was last time
+ called */
+ buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+ /*!< Statistics of buddy system,
+ indexed by block size */
+ buf_pool_stat_t stat; /*!< current statistics */
+ buf_pool_stat_t old_stat; /*!< old statistics */
+
+ /* @} */
+
+ /** @name Page flushing algorithm fields */
+
+ /* @{ */
+
+ ib_mutex_t flush_list_mutex;/*!< mutex protecting the
+ flush list access. This mutex
+ protects flush_list, flush_rbt
+ and bpage::list pointers when
+ the bpage is on flush_list. It
+ also protects writes to
+ bpage::oldest_modification and
+ flush_list_hp */
+ const buf_page_t* flush_list_hp;/*!< "hazard pointer"
+ used during scan of flush_list
+ while doing flush list batch.
+ Protected by flush_list_mutex */
+ UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+ /*!< base node of the modified block
+ list */
+ ibool init_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is TRUE when a flush of the
+ given type is being initialized */
+ ulint n_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is the number of pending
+ writes in the given flush type */
+ os_event_t no_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is in the set state
+ when there is no flush batch
+ of the given type running */
+ ib_rbt_t* flush_rbt; /*!< a red-black tree is used
+ exclusively during recovery to
+ speed up insertions in the
+ flush_list. This tree contains
+ blocks in order of
+ oldest_modification LSN and is
+ kept in sync with the
+ flush_list.
+ Each member of the tree MUST
+ also be on the flush_list.
+ This tree is relevant only in
+ recovery and is set to NULL
+ once the recovery is over.
+ Protected by flush_list_mutex */
+ ulint freed_page_clock;/*!< a sequence number used
+ to count the number of buffer
+ blocks removed from the end of
+ the LRU list; NOTE that this
+ counter may wrap around at 4
+ billion! A thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ ibool try_LRU_scan; /*!< Set to FALSE when an LRU
+ scan for free block fails. This
+ flag is used to avoid repeated
+ scans of LRU list when we know
+ that there is no free block
+ available in the scan depth for
+ eviction. Set to TRUE whenever
+ we flush a batch from the
+ buffer pool. Protected by the
+ buf_pool->mutex */
+ /* @} */
+
+ /** @name LRU replacement algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) free;
+ /*!< base node of the free
+ block list */
+ UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+ /*!< base node of the LRU list */
+ buf_page_t* LRU_old; /*!< pointer to the first of the
+ approximately LRU_old_ratio /
+ BUF_LRU_OLD_RATIO_DIV oldest
+ blocks in the LRU list;
+ NULL if LRU length less than
+ BUF_LRU_OLD_MIN_LEN;
+ NOTE: when LRU_old != NULL, its length
+ should always equal LRU_old_len */
+ ulint LRU_old_len; /*!< length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.cc for the restrictions
+ on this value; 0 if LRU_old == NULL;
+ NOTE: LRU_old_len must be adjusted
+ whenever LRU_old shrinks or grows! */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+ /*!< base node of the
+ unzip_LRU list */
+
+ /* @} */
+ /** @name Buddy allocator fields
+ The buddy allocator is used for allocating compressed page
+ frames and buf_page_t descriptors of blocks that exist
+ in the buffer pool only in compressed form. */
+ /* @{ */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ UT_LIST_BASE_NODE_T(buf_page_t) zip_clean;
+ /*!< unmodified compressed pages */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+ /*!< buddy free lists */
+
+ buf_page_t* watch;
+ /*!< Sentinel records for buffer
+ pool watches. Protected by
+ buf_pool->mutex. */
+
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+ /* @} */
+};
+
+/** @name Accessors for buf_pool->mutex.
+Use these instead of accessing buf_pool->mutex directly. */
+/* @{ */
+
+/** Test if a buffer pool mutex is owned. */
+#define buf_pool_mutex_own(b) mutex_own(&b->mutex)
+/** Acquire a buffer pool mutex. */
+#define buf_pool_mutex_enter(b) do { \
+ ut_ad(!mutex_own(&b->zip_mutex)); \
+ mutex_enter(&b->mutex); \
+} while (0)
+
+/** Test if flush list mutex is owned. */
+#define buf_flush_list_mutex_own(b) mutex_own(&b->flush_list_mutex)
+
+/** Acquire the flush list mutex. */
+#define buf_flush_list_mutex_enter(b) do { \
+ mutex_enter(&b->flush_list_mutex); \
+} while (0)
+/** Release the flush list mutex. */
+#define buf_flush_list_mutex_exit(b) do { \
+ mutex_exit(&b->flush_list_mutex); \
+} while (0)
+
+/** Test if block->mutex is owned. */
+#define buf_block_mutex_own(b) mutex_own(&(b)->mutex)
+
+/** Acquire the block->mutex. */
+#define buf_block_mutex_enter(b) do { \
+ mutex_enter(&(b)->mutex); \
+} while (0)
+
+/** Release the block->mutex. */
+#define buf_block_mutex_exit(b) do { \
+ mutex_exit(&(b)->mutex); \
+} while (0)
+
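+/* Usage sketch for the accessors above (illustrative):
+@code
+ buf_pool_mutex_enter(buf_pool);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ // ... manipulate the LRU or free lists ...
+ buf_pool_mutex_exit(buf_pool);
+@endcode */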
+
+/** Get appropriate page_hash_lock. */
+# define buf_page_hash_lock_get(b, f) \
+ hash_get_lock(b->page_hash, f)
+
+#ifdef UNIV_SYNC_DEBUG
+/** Test if page_hash lock is held in s-mode. */
+# define buf_page_hash_lock_held_s(b, p) \
+ rw_lock_own(buf_page_hash_lock_get(b, \
+ buf_page_address_fold(p->space, \
+ p->offset)), \
+ RW_LOCK_SHARED)
+
+/** Test if page_hash lock is held in x-mode. */
+# define buf_page_hash_lock_held_x(b, p) \
+ rw_lock_own(buf_page_hash_lock_get(b, \
+ buf_page_address_fold(p->space, \
+ p->offset)), \
+ RW_LOCK_EX)
+
+/** Test if page_hash lock is held in x or s-mode. */
+# define buf_page_hash_lock_held_s_or_x(b, p) \
+ (buf_page_hash_lock_held_s(b, p) \
+ || buf_page_hash_lock_held_x(b, p))
+
+# define buf_block_hash_lock_held_s(b, p) \
+ buf_page_hash_lock_held_s(b, &(p->page))
+
+# define buf_block_hash_lock_held_x(b, p) \
+ buf_page_hash_lock_held_x(b, &(p->page))
+
+# define buf_block_hash_lock_held_s_or_x(b, p) \
+ buf_page_hash_lock_held_s_or_x(b, &(p->page))
+#else /* UNIV_SYNC_DEBUG */
+# define buf_page_hash_lock_held_s(b, p) (TRUE)
+# define buf_page_hash_lock_held_x(b, p) (TRUE)
+# define buf_page_hash_lock_held_s_or_x(b, p) (TRUE)
+# define buf_block_hash_lock_held_s(b, p) (TRUE)
+# define buf_block_hash_lock_held_x(b, p) (TRUE)
+# define buf_block_hash_lock_held_s_or_x(b, p) (TRUE)
+#endif /* UNIV_SYNC_DEBUG */
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid(b) do { \
+ ut_ad(buf_pool_mutex_own(b)); \
+ b->mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow(b) do { \
+ ut_ad(buf_pool_mutex_own(b)); \
+ ut_a(b->mutex_exit_forbidden); \
+ b->mutex_exit_forbidden--; \
+} while (0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit(b) do { \
+ ut_a(!b->mutex_exit_forbidden); \
+ mutex_exit(&b->mutex); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid(b) ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow(b) ((void) 0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex)
+#endif
+#endif /* !UNIV_HOTBACKUP */
+/* @} */
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not in LRU list, not in flush list, nor
+ page hash table
+READY_FOR_USE: is not in free list, LRU list, or flush list, nor page
+ hash table
+MEMORY: is not in free list, LRU list, or flush list, nor page
+ hash table
+FILE_PAGE: space and offset are defined, is in page hash table
+ if io_fix == BUF_IO_WRITE,
+ pool: no_flush[flush_type] is in reset state,
+ pool: n_flush[flush_type] > 0
+
+ (1) if buf_fix_count == 0, then
+ is in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ is x-locked,
+ if and only if io_fix == BUF_IO_READ
+ is s-locked,
+ if and only if io_fix == BUF_IO_WRITE
+
+ (2) if buf_fix_count > 0, then
+ is not in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ if io_fix == BUF_IO_READ,
+ is x-locked
+ if io_fix == BUF_IO_WRITE,
+ is s-locked
+
+State transitions:
+
+NOT_USED => READY_FOR_USE
+READY_FOR_USE => MEMORY
+READY_FOR_USE => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
+ (1) buf_fix_count == 0,
+ (2) oldest_modification == 0, and
+ (3) io_fix == 0.
+*/
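+
+/* A legal transition sequence per the table above (a sketch; callers
+hold the mutexes required by buf_page_set_state(), see buf0buf.ic):
+@code
+ buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); // from NOT_USED
+ buf_block_set_state(block, BUF_BLOCK_FILE_PAGE); // claim for a file page
+@endcode */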
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Functor to validate the LRU list. */
+struct CheckInLRUList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_LRU_list);
+ }
+};
+
+/** Functor to validate the LRU list. */
+struct CheckInFreeList {
+ void operator()(const buf_page_t* elem) const
+ {
+ ut_a(elem->in_free_list);
+ }
+};
+
+struct CheckUnzipLRUAndLRUList {
+ void operator()(const buf_block_t* elem) const
+ {
+ ut_a(elem->page.in_LRU_list);
+ ut_a(elem->in_unzip_LRU_list);
+ }
+};
+#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic
new file mode 100644
index 00000000000..56616c6deeb
--- /dev/null
+++ b/storage/innobase/include/buf0buf.ic
@@ -0,0 +1,1460 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+
+/** A chunk of buffers. The buffer pool is allocated in chunks. */
+struct buf_chunk_t{
+ ulint mem_size; /*!< allocated size of the chunk */
+ ulint size; /*!< size of frames[] and blocks[] */
+ void* mem; /*!< pointer to the memory area which
+ was allocated for the frames */
+ buf_block_t* blocks; /*!< array of buffer control blocks */
+};
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+{
+ return(srv_buf_pool_curr_size);
+}
+
+/********************************************************************//**
+Calculates the index of a buffer pool to the buf_pool[] array.
+@return the position of the buffer pool in buf_pool[] */
+UNIV_INLINE
+ulint
+buf_pool_index(
+/*===========*/
+ const buf_pool_t* buf_pool) /*!< in: buffer pool */
+{
+ ulint i = buf_pool - buf_pool_ptr;
+ ut_ad(i < MAX_BUFFER_POOLS);
+ ut_ad(i < srv_buf_pool_instances);
+ return(i);
+}
+
+/******************************************************************//**
+Returns the buffer pool instance given a page instance
+@return buf_pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_bpage(
+/*================*/
+ const buf_page_t* bpage) /*!< in: buffer pool page */
+{
+ ulint i;
+ i = bpage->buf_pool_index;
+ ut_ad(i < srv_buf_pool_instances);
+ return(&buf_pool_ptr[i]);
+}
+
+/******************************************************************//**
+Returns the buffer pool instance given a block instance
+@return buf_pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_block(
+/*================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(buf_pool_from_bpage(&block->page));
+}
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in pages.
+@return size in pages */
+UNIV_INLINE
+ulint
+buf_pool_get_n_pages(void)
+/*======================*/
+{
+ return(buf_pool_get_curr_size() / UNIV_PAGE_SIZE);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* This is sometimes read without holding buf_pool->mutex. */
+ return(bpage->freed_page_clock);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(buf_page_get_freed_page_clock(&block->page));
+}
+
+/********************************************************************//**
+Tells if a block is still close enough to the MRU end of the LRU list,
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve the buffer
+pool mutex.
+@return TRUE if block is close to MRU end of LRU */
+UNIV_INLINE
+ibool
+buf_page_peek_if_young(
+/*===================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ /* FIXME: bpage->freed_page_clock is 31 bits */
+ return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+ < ((ulint) bpage->freed_page_clock
+ + (buf_pool->curr_size
+ * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio)
+ / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
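+
+/* Worked example of the window above (pure arithmetic, illustrative):
+if LRU_old_ratio = 3 * BUF_LRU_OLD_RATIO_DIV / 8, then
+(BUF_LRU_OLD_RATIO_DIV - LRU_old_ratio) / (BUF_LRU_OLD_RATIO_DIV * 4)
+= (1 - 3/8) / 4 = 5/32, so a block still counts as young while fewer
+than about 15.6% of curr_size pages have been evicted since it was
+last moved to the LRU head. */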
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage) /*!< in: block to make younger */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ if (buf_pool->freed_page_clock == 0) {
+ /* If eviction has not started yet, do not update the
+ statistics or move blocks in the LRU list. This is
+ either the warm-up phase or an in-memory workload. */
+ return(FALSE);
+ } else if (buf_LRU_old_threshold_ms && bpage->old) {
+ unsigned access_time = buf_page_is_accessed(bpage);
+
+ if (access_time > 0
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
+ >= buf_LRU_old_threshold_ms) {
+ return(TRUE);
+ }
+
+ buf_pool->stat.n_pages_not_made_young++;
+ return(FALSE);
+ } else {
+ return(!buf_page_peek_if_young(bpage));
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ enum buf_page_state state = (enum buf_page_state) bpage->state;
+
+#ifdef UNIV_DEBUG
+ switch (state) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ return(state);
+}
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_state(&block->page));
+}
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+#ifdef UNIV_DEBUG
+ enum buf_page_state old_state = buf_page_get_state(bpage);
+
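+	/* The switch below asserts that the requested transition is a
+	legal edge in the block state machine, e.g. NOT_USED ->
+	READY_FOR_USE -> FILE_PAGE -> REMOVE_HASH -> MEMORY ->
+	NOT_USED. */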
+ switch (old_state) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(state == BUF_BLOCK_ZIP_DIRTY);
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_a(state == BUF_BLOCK_ZIP_PAGE);
+ break;
+ case BUF_BLOCK_NOT_USED:
+ ut_a(state == BUF_BLOCK_READY_FOR_USE);
+ break;
+ case BUF_BLOCK_READY_FOR_USE:
+ ut_a(state == BUF_BLOCK_MEMORY
+ || state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_MEMORY:
+ ut_a(state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(state == BUF_BLOCK_NOT_USED
+ || state == BUF_BLOCK_REMOVE_HASH);
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_a(state == BUF_BLOCK_MEMORY);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ bpage->state = state;
+ ut_ad(buf_page_get_state(bpage) == state);
+}
+
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+ buf_page_set_state(&block->page, state);
+}
+
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_FILE_PAGE:
+ return(TRUE);
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->zip.data
+ && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+}
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+ib_mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_POOL_WATCH:
+ ut_error;
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY: {
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+
+ return(&buf_pool->zip_mutex);
+ }
+ default:
+ return(&((buf_block_t*) bpage)->mutex);
+ }
+}
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+buf_flush_t
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+{
+ buf_flush_t flush_type = (buf_flush_t) bpage->flush_type;
+
+#ifdef UNIV_DEBUG
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_LIST:
+ case BUF_FLUSH_SINGLE_PAGE:
+ return(flush_type);
+ case BUF_FLUSH_N_TYPES:
+ ut_error;
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(flush_type);
+}
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ buf_flush_t flush_type) /*!< in: flush type */
+{
+ bpage->flush_type = flush_type;
+ ut_ad(buf_page_get_flush_type(bpage) == flush_type);
+}
+
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no)/*!< in: page number */
+{
+ buf_block_set_state(block, BUF_BLOCK_FILE_PAGE);
+ block->page.space = static_cast<ib_uint32_t>(space);
+ block->page.offset = static_cast<ib_uint32_t>(page_no);
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage != NULL);
+
+ enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix;
+#ifdef UNIV_DEBUG
+ switch (io_fix) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ case BUF_IO_WRITE:
+ case BUF_IO_PIN:
+ return(io_fix);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(io_fix);
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*=================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_io_fix(&block->page));
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ bpage->io_fix = io_fix;
+ ut_ad(buf_page_get_io_fix(bpage) == io_fix);
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+ buf_page_set_io_fix(&block->page, io_fix);
+}
+
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+
+ bpage->io_fix = BUF_IO_PIN;
+}
+
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+
+ bpage->io_fix = BUF_IO_NONE;
+}
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ return(buf_page_get_io_fix(bpage) == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+}
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->old);
+}
+
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old) /*!< in: old */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+#endif /* UNIV_DEBUG */
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(buf_pool_mutex_own(buf_pool));
+ ut_ad(bpage->in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+ ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL));
+ /* If a block is flagged "old", the LRU_old list must exist. */
+ ut_a(!old || buf_pool->LRU_old);
+
+ if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) {
+ const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next = UT_LIST_GET_NEXT(LRU, bpage);
+ if (prev->old == next->old) {
+ ut_a(prev->old == old);
+ } else {
+ ut_a(!prev->old);
+ ut_a(buf_pool->LRU_old == (old ? bpage : next));
+ }
+ }
+#endif /* UNIV_LRU_DEBUG */
+
+ bpage->old = old;
+}
+
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->access_time);
+}
+
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+#endif /* UNIV_DEBUG */
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (bpage->access_time == 0) {
+ /* Make this the time of the first access. */
+ bpage->access_time = static_cast<uint>(ut_time_ms());
+ }
+}
+
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists; otherwise returns NULL.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+{
+ if (bpage != NULL) {
+ ut_ad(buf_page_in_file(bpage));
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ return((buf_block_t*) bpage);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+# ifndef UNIV_HOTBACKUP
+ ut_a(block->page.buf_fix_count > 0);
+# endif /* !UNIV_HOTBACKUP */
+ /* fall through */
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ goto ok;
+ }
+ ut_error;
+ok:
+ return((buf_frame_t*) block->frame);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->space);
+}
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.space);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->offset);
+}
+/***********************************************************************//**
+FIXME_FTS Gets the frame the pointer is pointing to.
+@return pointer to frame */
+UNIV_INLINE
+buf_frame_t*
+buf_frame_align(
+/*============*/
+	byte*	ptr)	/*!< in: pointer to a frame */
+{
+ buf_frame_t* frame;
+
+ ut_ad(ptr);
+
+ frame = (buf_frame_t*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+
+ return(frame);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.offset);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
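+	/* zip.ssize is a shift-size code; assuming UNIV_ZIP_SIZE_MIN
+	== 1024, ssize == 1 yields 1024 bytes, ssize == 2 yields 2048,
+	and so on, while ssize == 0 means the page is not compressed. */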
+ return(bpage->zip.ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << bpage->zip.ssize : 0);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(block->page.zip.ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << block->page.zip.ssize : 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr) /*!< in: pointer to the page */
+{
+ return(buf_block_get_page_zip(buf_block_align(ptr)));
+}
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr) /*!< out: page offset and byte offset */
+{
+ const page_t* page = (const page_t*) ut_align_down(ptr,
+ UNIV_PAGE_SIZE);
+
+ *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
+ addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the hash value of the given block's page. This can be used in
+searches in the lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ ut_ad(block);
+ ut_ad(buf_page_in_file(&block->page));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE)
+ || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(block->lock_hash_val);
+}
+
+/********************************************************************//**
+Allocates a buf_page_t descriptor. This function must succeed; in case
+of failure we assert in this function.
+@return the allocated descriptor */
+UNIV_INLINE
+buf_page_t*
+buf_page_alloc_descriptor(void)
+/*===========================*/
+{
+ buf_page_t* bpage;
+
+ bpage = (buf_page_t*) ut_malloc(sizeof *bpage);
+ ut_d(memset(bpage, 0, sizeof *bpage));
+ UNIV_MEM_ALLOC(bpage, sizeof *bpage);
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Free a buf_page_t descriptor. */
+UNIV_INLINE
+void
+buf_page_free_descriptor(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: bpage descriptor to free. */
+{
+ ut_free(bpage);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block) /*!< in, own: block to be freed */
+{
+ buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block);
+
+ buf_pool_mutex_enter(buf_pool);
+
+ mutex_enter(&block->mutex);
+
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+ buf_LRU_block_free_non_file_page(block);
+
+ mutex_exit(&block->mutex);
+
+ buf_pool_mutex_exit(buf_pool);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame) /*!< in: buffer frame */
+{
+ ut_ad(buf && frame);
+
+ ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+ return(buf);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
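+	/* Multiply the space id by 2^20 + 1 and add the offset; e.g.
+	fold(space = 1, offset = 3) == (1 << 20) + 1 + 3 == 1048580. */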
+ return((space << 20) + space + offset);
+}
+
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if the block is not a file page or no modification has
+occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+lsn_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage) /*!< in: block containing the
+ page frame */
+{
+ lsn_t lsn;
+ ib_mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ mutex_enter(block_mutex);
+
+ if (buf_page_in_file(bpage)) {
+ lsn = bpage->newest_modification;
+ } else {
+ lsn = 0;
+ }
+
+ mutex_exit(block_mutex);
+
+ return(lsn);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either
+(1) own the buf_pool mutex while the block bufferfix count is zero, or
+(2) own an x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block);
+
+ ut_ad((buf_pool_mutex_own(buf_pool)
+ && (block->page.buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ return(block->modify_clock);
+}
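+
+/* A sketch (hypothetical caller, not code from this file) of the
+optimistic pattern the modify clock supports: remember the clock while
+holding a latch, release the latch, and compare later to detect
+intervening modifications:
+
+	ib_uint64_t	saved = buf_block_get_modify_clock(block);
+	... release and later re-acquire the block s-latch ...
+	if (buf_block_get_modify_clock(block) != saved) {
+		... the block was modified in between ...
+	}
+*/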
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_fix(
+/*===========*/
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+{
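+	/* With PAGE_ATOMIC_REF_COUNT the reference count is updated
+	with an atomic instruction; otherwise the block mutex
+	serializes the update. */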
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(&block->page.buf_fix_count, 1);
+#else
+ ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page);
+
+ mutex_enter(block_mutex);
+ ++block->page.buf_fix_count;
+ mutex_exit(block_mutex);
+#endif /* PAGE_ATOMIC_REF_COUNT */
+}
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_SYNC_DEBUG
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line */
+#endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ibool ret;
+
+ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line);
+ ut_a(ret);
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_increment_uint32(&block->page.buf_fix_count, 1);
+#else
+ ut_ad(mutex_own(&block->mutex));
+
+ ++block->page.buf_fix_count;
+#endif /* PAGE_ATOMIC_REF_COUNT */
+}
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_unfix(
+/*============*/
+ buf_block_t* block) /*!< in/out: block to bufferunfix */
+{
+ ut_ad(block->page.buf_fix_count > 0);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_decrement_uint32(&block->page.buf_fix_count, 1);
+#else
+ ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page);
+
+ mutex_enter(block_mutex);
+ --block->page.buf_fix_count;
+ mutex_exit(block_mutex);
+#endif /* PAGE_ATOMIC_REF_COUNT */
+}
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_dec(
+/*==================*/
+ buf_block_t* block) /*!< in/out: block to bufferunfix */
+{
+ ut_ad(block->page.buf_fix_count > 0);
+
+#ifdef PAGE_ATOMIC_REF_COUNT
+ os_atomic_decrement_uint32(&block->page.buf_fix_count, 1);
+#else
+ mutex_enter(&block->mutex);
+ --block->page.buf_fix_count;
+ mutex_exit(&block->mutex);
+#endif /* PAGE_ATOMIC_REF_COUNT */
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif
+}
+
+/******************************************************************//**
+Returns the buffer pool instance given the space id and page offset
+@return buffer pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
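+	/* All pages within one 64-page read-ahead area map to the
+	same buffer pool instance, hence the low 6 bits of the page
+	number are ignored below. */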
+ ulint fold;
+ ulint index;
+ ulint ignored_offset;
+
+	ignored_offset = offset >> 6;	/* 2log of BUF_READ_AHEAD_AREA (64) */
+ fold = buf_page_address_fold(space, ignored_offset);
+ index = fold % srv_buf_pool_instances;
+ return(&buf_pool_ptr[index]);
+}
+
+/******************************************************************//**
+Returns the buffer pool instance given its array index
+@return buffer pool */
+UNIV_INLINE
+buf_pool_t*
+buf_pool_from_array(
+/*================*/
+ ulint index) /*!< in: array index to get
+ buffer pool instance from */
+{
+ ut_ad(index < MAX_BUFFER_POOLS);
+ ut_ad(index < srv_buf_pool_instances);
+ return(&buf_pool_ptr[index]);
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get_low(
+/*==================*/
+ buf_pool_t* buf_pool,/*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space */
+ ulint fold) /*!< in: buf_page_address_fold(space, offset) */
+{
+ buf_page_t* bpage;
+
+#ifdef UNIV_SYNC_DEBUG
+ ulint hash_fold;
+ rw_lock_t* hash_lock;
+
+ hash_fold = buf_page_address_fold(space, offset);
+ ut_ad(hash_fold == fold);
+
+ hash_lock = hash_get_lock(buf_pool->page_hash, fold);
+ ut_ad(rw_lock_own(hash_lock, RW_LOCK_EX)
+ || rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Look for the page in the hash table */
+
+ HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
+ && buf_page_in_file(bpage)),
+ bpage->space == space && bpage->offset == offset);
+ if (bpage) {
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(bpage->in_page_hash);
+ ut_ad(!bpage->in_zip_hash);
+ }
+
+ return(bpage);
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode; it is then up
+to the caller to release it. Otherwise, the lock_mode value is
+ignored. If the block is found and lock is NULL then the page_hash
+lock is released by this function.
+@return block, NULL if not found, or watch sentinel (if watch is true) */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get_locked(
+/*=====================*/
+ /*!< out: pointer to the bpage,
+ or NULL; if NULL, hash_lock
+ is also NULL. */
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ rw_lock_t** lock, /*!< in/out: lock of the page
+ hash acquired if bpage is
+ found. NULL otherwise. If NULL
+ is passed then the hash_lock
+ is released by this function */
+ ulint lock_mode, /*!< in: RW_LOCK_EX or
+ RW_LOCK_SHARED. Ignored if
+ lock == NULL */
+ bool watch) /*!< in: if true, return watch
+ sentinel also. */
+{
+ buf_page_t* bpage = NULL;
+ ulint fold;
+ rw_lock_t* hash_lock;
+ ulint mode = RW_LOCK_SHARED;
+
+ if (lock != NULL) {
+ *lock = NULL;
+ ut_ad(lock_mode == RW_LOCK_EX
+ || lock_mode == RW_LOCK_SHARED);
+ mode = lock_mode;
+ }
+
+ fold = buf_page_address_fold(space, offset);
+ hash_lock = hash_get_lock(buf_pool->page_hash, fold);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(hash_lock, RW_LOCK_EX)
+ && !rw_lock_own(hash_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (mode == RW_LOCK_SHARED) {
+ rw_lock_s_lock(hash_lock);
+ } else {
+ rw_lock_x_lock(hash_lock);
+ }
+
+ bpage = buf_page_hash_get_low(buf_pool, space, offset, fold);
+
+ if (!bpage || buf_pool_watch_is_sentinel(buf_pool, bpage)) {
+ if (!watch) {
+ bpage = NULL;
+ }
+ goto unlock_and_exit;
+ }
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(offset == bpage->offset);
+ ut_ad(space == bpage->space);
+
+ if (lock == NULL) {
+ /* The caller wants us to release the page_hash lock */
+ goto unlock_and_exit;
+ } else {
+ /* To be released by the caller */
+ *lock = hash_lock;
+ goto exit;
+ }
+
+unlock_and_exit:
+ if (mode == RW_LOCK_SHARED) {
+ rw_lock_s_unlock(hash_lock);
+ } else {
+ rw_lock_x_unlock(hash_lock);
+ }
+exit:
+ return(bpage);
+}
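+
+/* Typical usage (a sketch under assumed caller conventions, not code
+from this file): pass a lock slot and release the page_hash latch once
+done with the returned bpage:
+
+	rw_lock_t*	hash_lock;
+	buf_page_t*	bpage = buf_page_hash_get_locked(
+		buf_pool, space, offset, &hash_lock, RW_LOCK_SHARED, false);
+	if (bpage != NULL) {
+		... use bpage ...
+		rw_lock_s_unlock(hash_lock);
+	}
+*/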
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+If the block is found and lock is not NULL then the appropriate
+page_hash lock is acquired in the specified lock mode; it is then up
+to the caller to release it. Otherwise, the lock_mode value is
+ignored. If the block is found and lock is NULL then the page_hash
+lock is released by this function.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get_locked(
+/*=====================*/
+ /*!< out: pointer to the bpage,
+ or NULL; if NULL, hash_lock
+ is also NULL. */
+ buf_pool_t* buf_pool, /*!< buffer pool instance */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page number */
+ rw_lock_t** lock, /*!< in/out: lock of the page
+ hash acquired if bpage is
+ found. NULL otherwise. If NULL
+ is passed then the hash_lock
+ is released by this function */
+ ulint lock_mode) /*!< in: RW_LOCK_EX or
+ RW_LOCK_SHARED. Ignored if
+ lock == NULL */
+{
+ buf_page_t* bpage = buf_page_hash_get_locked(buf_pool,
+ space,
+ offset,
+ lock,
+ lock_mode);
+ buf_block_t* block = buf_page_get_block(bpage);
+
+ if (block) {
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!lock || rw_lock_own(*lock, lock_mode));
+#endif /* UNIV_SYNC_DEBUG */
+ return(block);
+ } else if (bpage) {
+ /* It is not a block. Just a bpage */
+ ut_ad(buf_page_in_file(bpage));
+
+ if (lock) {
+ if (lock_mode == RW_LOCK_SHARED) {
+ rw_lock_s_unlock(*lock);
+ } else {
+ rw_lock_x_unlock(*lock);
+ }
+ }
+ *lock = NULL;
+ return(NULL);
+ }
+
+ ut_ad(!bpage);
+	ut_ad(lock == NULL || *lock == NULL);
+ return(NULL);
+}
+
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that the page may not yet have been read from disk, though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_pool_t* buf_pool = buf_pool_get(space, offset);
+
+ return(buf_page_hash_get(buf_pool, space, offset) != NULL);
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage) /*!< in: buffer block */
+{
+ buf_block_t* block;
+
+ block = (buf_block_t*) bpage;
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+ /* Fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ buf_block_unfix(block);
+ return;
+
+ case BUF_BLOCK_POOL_WATCH:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+{
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&(block->debug_latch));
+#endif
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else if (rw_latch == RW_X_LATCH) {
+ rw_lock_x_unlock(&(block->lock));
+ }
+
+ buf_block_unfix(block);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level) /*!< in: latching order level */
+{
+ sync_thread_add_level(&block->lock, level, FALSE);
+}
+
+#endif /* UNIV_SYNC_DEBUG */
+/********************************************************************//**
+Acquire mutex on all buffer pool instances. */
+UNIV_INLINE
+void
+buf_pool_mutex_enter_all(void)
+/*==========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ buf_pool_mutex_enter(buf_pool);
+ }
+}
+
+/********************************************************************//**
+Release mutex on all buffer pool instances. */
+UNIV_INLINE
+void
+buf_pool_mutex_exit_all(void)
+/*=========================*/
+{
+ ulint i;
+
+ for (i = 0; i < srv_buf_pool_instances; i++) {
+ buf_pool_t* buf_pool;
+
+ buf_pool = buf_pool_from_array(i);
+ buf_pool_mutex_exit(buf_pool);
+ }
+}
+/*********************************************************************//**
+Get the nth chunk's buffer block in the specified buffer pool.
+@return the nth chunk's buffer block. */
+UNIV_INLINE
+buf_block_t*
+buf_get_nth_chunk_block(
+/*====================*/
+ const buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ulint n, /*!< in: nth chunk in the buffer pool */
+	ulint*		chunk_size)	/*!< out: chunk size */
+{
+ const buf_chunk_t* chunk;
+
+ chunk = buf_pool->chunks + n;
+ *chunk_size = chunk->size;
+ return(chunk->blocks);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000000..cd21781dc6e
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0checksum_h
+#define buf0checksum_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "buf0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Calculates a page CRC32 which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ib_uint32_t
+buf_calc_page_crc32(
+/*================*/
+ const byte* page); /*!< in: buffer page */
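+
+/* A sketch (hypothetical verification code, assuming the standard
+page layout macros) of checking a page read from disk against the
+stored checksum field:
+
+	ib_uint32_t	crc = buf_calc_page_crc32(page);
+	if (crc != mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM)) {
+		... the page may be corrupt, or it may have been
+		written with a different checksum algorithm ...
+	}
+*/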
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug: the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Return a printable string describing the checksum algorithm.
+@return algorithm name */
+UNIV_INTERN
+const char*
+buf_checksum_algorithm_name(
+/*========================*/
+ srv_checksum_algorithm_t algo); /*!< in: algorithm */
+
+extern ulong srv_checksum_algorithm;
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif /* buf0checksum_h */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
new file mode 100644
index 00000000000..a62a6400d97
--- /dev/null
+++ b/storage/innobase/include/buf0dblwr.h
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dblwr.h
+Doublewrite buffer module
+
+Created 2011/12/19 Inaam Rana
+*******************************************************/
+
+#ifndef buf0dblwr_h
+#define buf0dblwr_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "log0log.h"
+#include "log0recv.h"
+
+#ifndef UNIV_HOTBACKUP
+
+/** Doublewrite system */
+extern buf_dblwr_t* buf_dblwr;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool buf_dblwr_being_created;
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+buf_dblwr_create(void);
+/*==================*/
+
+/****************************************************************//**
+At database startup, initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in crash
+recovery, this function loads the pages from the doublewrite buffer into
+memory. */
+void
+buf_dblwr_init_or_load_pages(
+/*=========================*/
+ os_file_t file,
+ char* path,
+ bool load_corrupt_pages);
+
+/****************************************************************//**
+Process the double write buffer pages. */
+void
+buf_dblwr_process(void);
+/*===================*/
+
+/****************************************************************//**
+Frees the doublewrite buffer. */
+UNIV_INTERN
+void
+buf_dblwr_free(void);
+/*================*/
+/********************************************************************//**
+Updates the doublewrite buffer when an IO request is completed. */
+UNIV_INTERN
+void
+buf_dblwr_update(
+/*=============*/
+ const buf_page_t* bpage, /*!< in: buffer block descriptor */
+ buf_flush_t flush_type);/*!< in: flush type */
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+buf_dblwr_page_inside(
+/*==================*/
+ ulint page_no); /*!< in: page number */
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_dblwr_flush_buffered_writes and waits for free
+space to appear. */
+UNIV_INTERN
+void
+buf_dblwr_add_to_batch(
+/*====================*/
+ buf_page_t* bpage); /*!< in: buffer block to write */
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+UNIV_INTERN
+void
+buf_dblwr_flush_buffered_writes(void);
+/*=================================*/
+/********************************************************************//**
+Writes a page to the doublewrite buffer on disk, syncs it, then writes
+the page to the data file and syncs the data file. This function is used
+for single page flushes. If all the buffers allocated for single page
+flushes in the doublewrite buffer are in use, we wait here for one to
+become free. We are guaranteed that a slot will become free because any
+thread that is using a slot must also release the slot before leaving
+this function. */
+UNIV_INTERN
+void
+buf_dblwr_write_single_page(
+/*========================*/
+ buf_page_t* bpage, /*!< in: buffer block to write */
+ bool sync); /*!< in: true if sync IO requested */
+
+/** Doublewrite control struct */
+struct buf_dblwr_t{
+ ib_mutex_t mutex; /*!< mutex protecting the first_free
+ field and write_buf */
+ ulint block1; /*!< the page number of the first
+ doublewrite block (64 pages) */
+ ulint block2; /*!< page number of the second block */
+ ulint first_free;/*!< first free position in write_buf
+ measured in units of UNIV_PAGE_SIZE */
+ ulint b_reserved;/*!< number of slots currently reserved
+ for batch flush. */
+ os_event_t b_event;/*!< event where threads wait for a
+ batch flush to end. */
+ ulint s_reserved;/*!< number of slots currently
+ reserved for single page flushes. */
+ os_event_t s_event;/*!< event where threads wait for a
+ single page flush slot. */
+ bool* in_use; /*!< flag used to indicate if a slot is
+ in use. Only used for single page
+ flushes. */
+ bool batch_running;/*!< set to TRUE if currently a batch
+ is being written from the doublewrite
+ buffer. */
+ byte* write_buf;/*!< write buffer used in writing to the
+ doublewrite buffer, aligned to an
+ address divisible by UNIV_PAGE_SIZE
+ (which is required by Windows aio) */
+ byte* write_buf_unaligned;/*!< pointer to write_buf,
+ but unaligned */
+ buf_page_t** buf_block_arr;/*!< array to store pointers to
+ the buffer blocks which have been
+ cached to write_buf */
+};
+
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif /* buf0dblwr_h */
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000000..c704a8e97e0
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+#include "univ.i"
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a dump. This function is called by MySQL code via buffer_pool_dump_now()
+and it should return immediately because all of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_dump_start();
+/*============*/
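+
+/* Reached from SQL (assuming the standard system variable name), e.g.:
+
+	SET GLOBAL innodb_buffer_pool_dump_now = ON;
+*/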
+
+/*****************************************************************//**
+Wakes up the buffer pool dump/load thread and instructs it to start
+a load. This function is called by MySQL code via buffer_pool_load_now()
+and it should return immediately because all of MySQL is frozen during
+its execution. */
+UNIV_INTERN
+void
+buf_load_start();
+/*============*/
+
+/*****************************************************************//**
+Aborts a currently running buffer pool load. This function is called by
+MySQL code via buffer_pool_load_abort() and it should return immediately
+because all of MySQL is frozen during its execution. */
+UNIV_INTERN
+void
+buf_load_abort();
+/*============*/
+
+/*****************************************************************//**
+This is the main thread for buffer pool dump/load. It waits for an
+event and, when woken up, performs either a dump or a load, then
+sleeps again.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_dump_thread)(
+/*============================*/
+ void* arg); /*!< in: a dummy parameter
+ required by os_thread_create */
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000000..f116720574b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "log0log.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0types.h"
+#include "buf0types.h"
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern ibool buf_page_cleaner_is_active;
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ lsn_t newest_lsn); /*!< in: newest modification lsn
+ to the page */
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: buf_pool->mutex and block->mutex must be held upon entering this
+function, and they will be released by this function after flushing.
+This is loosely based on buf_flush_batch() and buf_flush_page().
+@return TRUE if the page was flushed and the mutexes released */
+UNIV_INTERN
+ibool
+buf_flush_page_try(
+/*===============*/
+ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
+ buf_block_t* block) /*!< in/out: buffer control block */
+ __attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the flush list of
+all buffer pool instances.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if a batch was queued successfully for each buffer pool
+instance, false if another batch of the same type was already running
+in at least one of the buffer pool instances */
+UNIV_INTERN
+bool
+buf_flush_list(
+/*===========*/
+	ulint		min_n,		/*!< in: desired minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	lsn_t		lsn_limit,	/*!< in: in the case of BUF_FLUSH_LIST,
+					all blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+ ulint* n_processed); /*!< out: the number of pages
+ which were processed is passed
+ back to caller. Ignored if NULL */
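+
+/* A sketch of a typical call (hypothetical): flush all blocks older
+than lsn_limit from every instance, with no cap on the page count:
+
+	ulint	n_flushed;
+	if (buf_flush_list(ULINT_MAX, lsn_limit, &n_flushed)) {
+		... every instance accepted the flush batch ...
+	}
+*/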
+/******************************************************************//**
+This function picks up a single dirty page from the tail of the LRU
+list, flushes it, removes it from the page_hash and the LRU list and
+puts it on the free list. It is called from user threads when they are
+unable to find a replaceable page at the tail of the LRU list, i.e.
+when the background LRU flushing in the page_cleaner thread is not
+fast enough to keep pace with the workload.
+@return TRUE on success. */
+UNIV_INTERN
+ibool
+buf_flush_single_page_from_LRU(
+/*===========================*/
+ buf_pool_t* buf_pool); /*!< in/out: buffer pool instance */
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_flush_t type); /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+/******************************************************************//**
+Waits until a flush batch of the given type ends. This is called by
+a thread that only wants to wait for a flush to end but doesn't do
+any flushing itself. */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end_wait_only(
+/*===============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_flush_t type); /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block on the list of modified blocks, if it is
+not already there. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ lsn_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ lsn_t end_lsn); /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage); /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+/******************************************************************//**
+page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. Currently there is only one instance of this thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(buf_flush_page_cleaner_thread)(
+/*==========================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*********************************************************************//**
+Clears up the tail of the LRU lists:
+* Puts replaceable pages at the tail of the LRU on the free list
+* Flushes dirty pages at the tail of the LRU to disk
+The depth to which we scan each buffer pool is controlled by the
+dynamic config parameter innodb_LRU_scan_depth.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+buf_flush_LRU_tail(void);
+/*====================*/
+/*********************************************************************//**
+Wait for any possible LRU flushes that are in progress to end. */
+UNIV_INTERN
+void
+buf_flush_wait_LRU_batch_end(void);
+/*==============================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(
+/*===============*/
+ buf_pool_t* buf_pool);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during the recovery process. Should be called at the start of recovery,
+before any page has been read or written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
+held upon entering this function, and they will be released by this
+function if it returns true.
+@return TRUE if the page was flushed */
+UNIV_INTERN
+bool
+buf_flush_page(
+/*===========*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_page_t* bpage, /*!< in: buffer control block */
+ buf_flush_t flush_type, /*!< in: type of flush */
+ bool sync); /*!< in: true if sync IO request */
+/********************************************************************//**
+Returns true if the block is modified and ready for flushing.
+@return true if can flush immediately */
+UNIV_INTERN
+bool
+buf_flush_ready_for_flush(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) */
+ buf_flush_t flush_type)/*!< in: type of flush */
+ __attribute__((warn_unused_result));
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush
+list in a particular buffer pool.
+@return number of dirty pages present in a single buffer pool */
+UNIV_INTERN
+ulint
+buf_pool_get_dirty_pages_count(
+/*===========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool */
+ ulint id); /*!< in: space id to check */
+/******************************************************************//**
+Check if there are any dirty pages that belong to a space id in the flush list.
+@return count of dirty pages present in all the buffer pools */
+UNIV_INTERN
+ulint
+buf_flush_get_dirty_pages_count(
+/*============================*/
+ ulint id); /*!< in: space id to check */
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic
new file mode 100644
index 00000000000..a763cd115fe
--- /dev/null
+++ b/storage/innobase/include/buf0flu.ic
@@ -0,0 +1,139 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.ic
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+#include "srv0srv.h"
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn); /*!< in: oldest modification */
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ buf_block_t* block, /*!< in/out: block which is modified */
+ lsn_t lsn); /*!< in: oldest modification */
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+ ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
+
+ ut_ad(mtr->start_lsn != 0);
+ ut_ad(mtr->modifications);
+
+ mutex_enter(&block->mutex);
+ ut_ad(block->page.newest_modification <= mtr->end_lsn);
+
+ block->page.newest_modification = mtr->end_lsn;
+
+ if (!block->page.oldest_modification) {
+ ut_a(mtr->made_dirty);
+ ut_ad(log_flush_order_mutex_own());
+ buf_flush_insert_into_flush_list(
+ buf_pool, block, mtr->start_lsn);
+ } else {
+ ut_ad(block->page.oldest_modification <= mtr->start_lsn);
+ }
+
+ mutex_exit(&block->mutex);
+
+ srv_stats.buf_pool_write_requests.inc();
+}
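+
+/* A minimal standalone sketch of the rule implemented above, assuming
+toy stand-ins for the InnoDB types (toy_page, toy_note_modification and
+toy_flush_list are invented names for illustration): a page always
+advances its newest modification LSN, but joins the flush list only on
+its first modification, keyed by the oldest LSN. */
+
+#include <cassert>
+#include <cstdint>
+#include <list>
+
+struct toy_page {
+	uint64_t oldest_modification = 0;	/* 0 = clean */
+	uint64_t newest_modification = 0;
+};
+
+/* Kept in descending order of oldest_modification: since LSNs only
+grow, pushing newly dirtied pages at the front preserves the order. */
+static std::list<toy_page*> toy_flush_list;
+
+inline void toy_note_modification(toy_page* page,
+				  uint64_t start_lsn, uint64_t end_lsn)
+{
+	assert(page->newest_modification <= end_lsn);
+	page->newest_modification = end_lsn;
+
+	if (page->oldest_modification == 0) {
+		/* First modification: enter the flush list. */
+		page->oldest_modification = start_lsn;
+		toy_flush_list.push_front(page);
+	} else {
+		/* Already dirty: only the newest LSN advances. */
+		assert(page->oldest_modification <= start_lsn);
+	}
+}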
+
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ lsn_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ lsn_t end_lsn) /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+{
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!buf_pool_mutex_own(buf_pool));
+ ut_ad(!buf_flush_list_mutex_own(buf_pool));
+ ut_ad(log_flush_order_mutex_own());
+
+ ut_ad(start_lsn != 0);
+ ut_ad(block->page.newest_modification <= end_lsn);
+
+ mutex_enter(&block->mutex);
+ block->page.newest_modification = end_lsn;
+
+ if (!block->page.oldest_modification) {
+ buf_flush_insert_sorted_into_flush_list(
+ buf_pool, block, start_lsn);
+ } else {
+ ut_ad(block->page.oldest_modification <= start_lsn);
+ }
+
+ mutex_exit(&block->mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000000..ecdaef685a1
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,310 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "buf0types.h"
+
+// Forward declaration
+struct trx_t;
+
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool is available. This can be
+used in heuristics to prevent huge transactions eating up the whole buffer
+pool for their locks.
+@return TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void);
+/*==============================*/
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */
+
+/******************************************************************//**
+Flushes all dirty pages or removes all pages belonging
+to a given tablespace. A PROBLEM: if readahead is being started, what
+guarantees that it will not try to read in pages after this operation
+has completed? */
+UNIV_INTERN
+void
+buf_LRU_flush_or_remove_pages(
+/*==========================*/
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove, /*!< in: remove or flush strategy */
+ const trx_t* trx); /*!< in: to check if the operation
+ must be interrupted */
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/******************************************************************//**
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns true, it will temporarily
+release buf_pool->mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool->mutex and must not hold any
+buf_page_get_mutex() when calling this function.
+@return true if freed, false otherwise. */
+UNIV_INTERN
+bool
+buf_LRU_free_page(
+/*==============*/
+ buf_page_t* bpage, /*!< in: block to be freed */
+ bool zip) /*!< in: true if should remove also the
+ compressed page of an uncompressed page */
+ __attribute__((nonnull));
+/******************************************************************//**
+Try to free a replaceable block.
+@return TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_scan_and_free_block(
+/*========================*/
+ buf_pool_t* buf_pool, /*!< in: buffer pool instance */
+ ibool scan_all) /*!< in: scan whole LRU list
+ if TRUE, otherwise scan only
+ 'old' blocks. */
+ __attribute__((nonnull,warn_unused_result));
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, returns NULL.
+@return a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(
+/*==================*/
+ buf_pool_t* buf_pool); /*!< in: buffer pool instance */
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a replaceable page in
+the LRU scan, we first put it on the free list and then take it from there.
+* iteration 0:
+ * get a block from free list, success:done
+ * if there is an LRU flush batch in progress:
+ * wait for batch to end: retry free list
+ * if buf_pool->try_LRU_scan is set
+ * scan LRU up to srv_LRU_scan_depth to find a clean block
+ * the above will put the block on free list
+ * success:retry the free list
+ * flush one dirty page from tail of LRU to disk
+ * the above will put the block on free list
+ * success: retry the free list
+* iteration 1:
+ * same as iteration 0 except:
+ * scan whole LRU list
+ * scan LRU list even if buf_pool->try_LRU_scan is not set
+* iteration > 1:
+ * same as iteration 1 but sleep 100ms
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
+ __attribute__((nonnull,warn_unused_result));
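+/* A hedged standalone sketch of the retry policy documented above; the
+toy_pool fields and helpers are invented names, and dirty-page flushing
+is reduced to moving blocks from the LRU tail to the free list. */
+
+#include <chrono>
+#include <cstddef>
+#include <thread>
+#include <vector>
+
+struct toy_pool {
+	std::vector<int>	free_list;	/* blocks ready for reuse */
+	std::vector<int>	lru;		/* back() = oldest block */
+	bool			try_lru_scan = true;
+	std::size_t		lru_scan_depth = 1024;
+};
+
+/* Move up to 'depth' blocks (or all of them when 'whole_list' is set)
+from the LRU tail to the free list. */
+inline void toy_scan_lru(toy_pool& pool, std::size_t depth,
+			 bool whole_list)
+{
+	std::size_t n = whole_list ? pool.lru.size() : depth;
+	while (n-- > 0 && !pool.lru.empty()) {
+		pool.free_list.push_back(pool.lru.back());
+		pool.lru.pop_back();
+	}
+}
+
+inline int toy_get_free_block(toy_pool& pool)
+{
+	for (int iteration = 0; ; ++iteration) {
+		if (!pool.free_list.empty()) {
+			int block = pool.free_list.back();
+			pool.free_list.pop_back();
+			return block;	/* iteration 0: the common case */
+		}
+		/* Iteration 0 scans only up to lru_scan_depth and only
+		if try_lru_scan is set; later iterations scan the whole
+		list, and from iteration 2 on we also back off. */
+		if (iteration > 0 || pool.try_lru_scan) {
+			toy_scan_lru(pool, pool.lru_scan_depth,
+				     iteration > 0);
+		}
+		if (iteration > 1) {
+			std::this_thread::sleep_for(
+				std::chrono::milliseconds(100));
+		}
+	}
+}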
+/******************************************************************//**
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list.
+@return TRUE if should use unzip_LRU */
+UNIV_INTERN
+ibool
+buf_LRU_evict_from_unzip_LRU(
+/*=========================*/
+ buf_pool_t* buf_pool);
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block); /*!< in: block, must not contain a file page */
+/******************************************************************//**
+Adds a block to the LRU list. Please make sure that the zip_size is
+already set in the page zip when invoking this function, so that we
+can get the correct zip_size from the buffer page when adding the
+block to the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the
+ start; if the LRU list is very short, added to
+ the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: control block */
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage); /*!< in: control block */
+/**********************************************************************//**
+Updates buf_pool->LRU_old_ratio.
+@return updated old_pct */
+UNIV_INTERN
+ulint
+buf_LRU_old_ratio_update(
+/*=====================*/
+ uint old_pct,/*!< in: Reserve this percentage of
+ the buffer pool for "old" blocks. */
+ ibool adjust);/*!< in: TRUE=adjust the LRU list;
+ FALSE=just assign buf_pool->LRU_old_ratio
+ during the initialization of InnoDB */
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void);
+/*=====================*/
+
+/******************************************************************//**
+Remove one page from LRU list and put it to free list */
+UNIV_INTERN
+void
+buf_LRU_free_one_page(
+/*==================*/
+ buf_page_t* bpage) /*!< in/out: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+ __attribute__((nonnull));
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void);
+/*==================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void);
+/*===============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** The denominator of buf_pool->LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV 1024
+/** Maximum value of buf_pool->LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool->LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_pool->LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_pool->LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN 51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
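+
+/* Worked example of the ratio arithmetic: the "old" sublist length is
+LRU_len * LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, so an LRU list of
+8192 pages with a ratio of 512 keeps about 4096 pages "old", while the
+minimum ratio of 51 keeps roughly 51/1024, i.e. about 5%. A standalone
+equivalent, with the 1024 divisor written out: */
+
+inline unsigned long toy_lru_old_len(unsigned long lru_len,
+				     unsigned long old_ratio)
+{
+	return lru_len * old_ratio / 1024;	/* BUF_LRU_OLD_RATIO_DIV */
+}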
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+extern uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics we decide
+if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */
+struct buf_LRU_stat_t
+{
+ ulint io; /**< Counter of buffer pool I/O operations. */
+ ulint unzip; /**< Counter of page_zip_decompress operations. */
+};
+
+/** Current operation counters. Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool->mutex. */
+extern buf_LRU_stat_t buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
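+
+/* A sketch of how such counters could feed the unzip_LRU/LRU choice
+made by buf_LRU_evict_from_unzip_LRU(). The rule below is illustrative
+only, not InnoDB's exact policy: when decompressions dominate recent
+activity, uncompressed copies are cheap to rebuild and are reclaimed
+first. */
+
+struct toy_lru_stat {
+	unsigned long	io;	/* buffer pool I/O operations */
+	unsigned long	unzip;	/* page_zip_decompress() calls */
+};
+
+inline bool toy_evict_from_unzip_lru(const toy_lru_stat& sum)
+{
+	return sum.unzip > sum.io;
+}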
+
+#ifndef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/buf0lru.ic b/storage/innobase/include/buf0lru.ic
new file mode 100644
index 00000000000..6e0da7a2588
--- /dev/null
+++ b/storage/innobase/include/buf0lru.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.ic
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
new file mode 100644
index 00000000000..d2a1f264ff5
--- /dev/null
+++ b/storage/innobase/include/buf0rea.h
@@ -0,0 +1,177 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "buf0types.h"
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page_async(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Applies a random read-ahead in buf_pool if there are at least a threshold
+value of accessed pages from the random read-ahead area. Does not read any
+page, not even the one at the position (space, offset), if the read-ahead
+mechanism is not activated. NOTE 1: the calling thread may own latches on
+pages: to avoid deadlocks this function must be written such that it cannot
+end up waiting for these latches! NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+UNIV_INTERN
+ulint
+buf_read_ahead_random(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes,
+ or 0 */
+ ulint offset, /*!< in: page number of a page which
+ the current thread wants to access */
+ ibool inside_ibuf); /*!< in: TRUE if we are inside ibuf
+ routine */
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number; see NOTE 3 above */
+ ibool inside_ibuf); /*!< in: TRUE if we are inside ibuf routine */
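+/* A standalone sketch of the border test behind linear read-ahead,
+assuming a 64-page read-ahead area; all names here are invented for
+illustration. Read-ahead of an adjacent area is considered only when
+the accessed page is the first or last page of its aligned area. */
+
+#include <climits>
+
+static const unsigned long TOY_AREA = 64;	/* pages per area */
+
+/* Returns the first page of the area to read ahead, or ULONG_MAX when
+'offset' is not a border page and no read-ahead applies. */
+inline unsigned long toy_linear_target(unsigned long offset)
+{
+	unsigned long low = offset - offset % TOY_AREA;
+
+	if (offset == low) {			/* low border */
+		return low >= TOY_AREA ? low - TOY_AREA : ULONG_MAX;
+	}
+	if (offset == low + TOY_AREA - 1) {	/* high border */
+		return low + TOY_AREA;
+	}
+	return ULONG_MAX;
+}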
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ bool sync, /*!< in: true if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ const ulint* space_ids, /*!< in: array of space ids */
+ const ib_int64_t* space_versions,/*!< in: the spaces must have
+ this version number
+ (timestamp), otherwise we
+ discard the read; we use this
+ to cancel reads if DISCARD +
+ IMPORT may have changed the
+ tablespace size */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of elements
+ in the arrays */
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in
+ bytes, or 0 */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of page numbers
+ in the array */
+
+/** The size in pages of the area which the read-ahead algorithms read if
+invoked */
+#define BUF_READ_AHEAD_AREA(b) \
+ ut_min(64, ut_2_power_up((b)->curr_size / 32))
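+
+/* Worked example of the macro above, assuming ut_2_power_up() rounds
+up to the next power of two: an 8192-page pool gives 8192 / 32 = 256,
+capped at 64 pages, while a 512-page pool gives 512 / 32 = 16 pages. A
+standalone equivalent: */
+
+inline unsigned long toy_pow2_up(unsigned long n)
+{
+	unsigned long p = 1;
+	while (p < n) {
+		p <<= 1;
+	}
+	return p;
+}
+
+inline unsigned long toy_read_ahead_area(unsigned long pool_pages)
+{
+	unsigned long area = toy_pow2_up(pool_pages / 32);
+	return area < 64 ? area : 64;	/* ut_min(64, ...) */
+}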
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+/** read any page */
+#define BUF_READ_ANY_PAGE 132
+/** read any page, but ignore (return an error) if a page does not exist
+instead of crashing like BUF_READ_ANY_PAGE does */
+#define BUF_READ_IGNORE_NONEXISTENT_PAGES 1024
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000000..11bbc9b5c8a
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+#if defined(INNODB_PAGE_ATOMIC_REF_COUNT) && defined(HAVE_ATOMIC_BUILTINS)
+#define PAGE_ATOMIC_REF_COUNT
+#endif /* INNODB_PAGE_ATOMIC_REF_COUNT && HAVE_ATOMIC_BUILTINS */
+
+/** Buffer page (uncompressed or compressed) */
+struct buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool chunk comprising buf_block_t */
+struct buf_chunk_t;
+/** Buffer pool comprising buf_chunk_t */
+struct buf_pool_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+/** Doublewrite memory struct */
+struct buf_dblwr_t;
+
+/** A buffer frame. @see page_t */
+typedef byte buf_frame_t;
+
+/** Flags for flush types */
+enum buf_flush_t {
+ BUF_FLUSH_LRU = 0, /*!< flush via the LRU list */
+ BUF_FLUSH_LIST, /*!< flush via the flush list
+ of dirty blocks */
+ BUF_FLUSH_SINGLE_PAGE, /*!< flush via the LRU list
+ but only a single page */
+ BUF_FLUSH_N_TYPES /*!< index of last element + 1 */
+};
+
+/** Algorithm to remove the pages for a tablespace from the buffer pool.
+See buf_LRU_flush_or_remove_pages(). */
+enum buf_remove_t {
+ BUF_REMOVE_ALL_NO_WRITE, /*!< Remove all pages from the buffer
+ pool, don't write or sync to disk */
+ BUF_REMOVE_FLUSH_NO_WRITE, /*!< Remove only from the flush list,
+ don't write or sync to disk */
+ BUF_REMOVE_FLUSH_WRITE /*!< Flush dirty pages to disk only,
+ don't remove from the buffer pool */
+};
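+
+/* A hedged sketch of the three strategies as a decision over a single
+page; toy_remove_action and its result names are invented stand-ins
+for the real flush and eviction steps behind
+buf_LRU_flush_or_remove_pages(). */
+
+enum toy_action {
+	TOY_EVICT,			/* drop the page from the pool */
+	TOY_DROP_FROM_FLUSH_LIST,	/* forget dirtiness, no I/O */
+	TOY_FLUSH,			/* write out, keep in the pool */
+	TOY_KEEP			/* nothing to do */
+};
+
+inline toy_action toy_remove_action(buf_remove_t mode, bool dirty)
+{
+	switch (mode) {
+	case BUF_REMOVE_ALL_NO_WRITE:
+		return TOY_EVICT;	/* even if dirty */
+	case BUF_REMOVE_FLUSH_NO_WRITE:
+		return dirty ? TOY_DROP_FROM_FLUSH_LIST : TOY_KEEP;
+	case BUF_REMOVE_FLUSH_WRITE:
+		return dirty ? TOY_FLUSH : TOY_KEEP;
+	}
+	return TOY_KEEP;
+}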
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+ BUF_IO_NONE = 0, /**< no pending I/O */
+ BUF_IO_READ, /**< read pending */
+ BUF_IO_WRITE, /**< write pending */
+ BUF_IO_PIN /**< disallow relocation of the
+ block and its removal from
+ the flush_list */
+};
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+ SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32
+ when reading */
+ SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow
+ innodb when reading */
+ SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32,
+ innodb or none when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_NONE /*!< Write none, allow none
+ when reading */
+};
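+
+/* The enum above pairs each write setting with a strict variant. A
+hedged sketch of the read-side acceptance rule it encodes (simplified;
+the real page validation lives elsewhere in these sources): non-strict
+settings accept any of the three algorithms on read, strict settings
+accept only their own. */
+
+inline bool toy_checksum_accept(srv_checksum_algorithm_t setting,
+				srv_checksum_algorithm_t found)
+{
+	switch (setting) {
+	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+		return found == SRV_CHECKSUM_ALGORITHM_CRC32;
+	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+		return found == SRV_CHECKSUM_ALGORITHM_INNODB;
+	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+		return found == SRV_CHECKSUM_ALGORITHM_NONE;
+	default:
+		/* crc32, innodb and none all accept crc32, innodb
+		or none when reading. */
+		return found == SRV_CHECKSUM_ALGORITHM_CRC32
+			|| found == SRV_CHECKSUM_ALGORITHM_INNODB
+			|| found == SRV_CHECKSUM_ALGORITHM_NONE;
+	}
+}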
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW (1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX \
+ - BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to UNIV_PAGE_SIZE */
+#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
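+
+/* Worked example of the buddy parameters, assuming the common defaults
+UNIV_ZIP_SIZE_SHIFT_MIN = 10 (1KiB smallest compressed page) and
+UNIV_PAGE_SIZE_SHIFT = 14 (16KiB page): BUF_BUDDY_LOW is 1024,
+BUF_BUDDY_SIZES is 14 - 10 = 4, so the buddy block sizes form the
+chain 1K, 2K, 4K, 8K, and BUF_BUDDY_HIGH = 1K << 4 = 16K, which equals
+UNIV_PAGE_SIZE as the comment above requires. */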
+
+#endif /* buf0types.h */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
new file mode 100644
index 00000000000..a548c7b89b3
--- /dev/null
+++ b/storage/innobase/include/data0data.h
@@ -0,0 +1,536 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "univ.i"
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field) /*!< in: SQL data field */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field) /*!< in: field */
+ __attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dfield_get_type(field) (&(field)->type)
+# define dfield_get_data(field) ((field)->data)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type) /*!< in: pointer to data type struct */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field) /*!< in: field */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field) /*!< in/out: field */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+ __attribute__((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n); /*!< in: index of field */
+#else /* UNIV_DEBUG */
+# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n))
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+ __attribute__((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields) \
+ (sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/**********************************************************//**
+Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+/*===================*/
+ void* buf, /*!< in, out: buffer to use */
+ ulint buf_size, /*!< in: buffer size */
+ ulint n_fields) /*!< in: number of fields */
+ __attribute__((nonnull, warn_unused_result));
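+
+/* A hedged usage sketch of the pair above: size a raw buffer with
+DTUPLE_EST_ALLOC and hand it to dtuple_create_from_mem; for two fields
+the estimate is sizeof(dtuple_t) + 2 * sizeof(dfield_t). The buffer
+must outlive the tuple; the wrapper name is invented. */
+
+inline dtuple_t*
+toy_tuple_in_buffer(byte* buf, ulint buf_size)
+{
+	/* buf_size must be at least DTUPLE_EST_ALLOC(2). */
+	return(dtuple_create_from_mem(buf, buf_size, 2));
+}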
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields)/*!< in: number of fields */
+ __attribute__((nonnull, malloc));
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+ __attribute__((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull));
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, -1 if tuple1 is greater, equal, less, respectively,
+than tuple2 */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+ const dtuple_t* tuple1, /*!< in: tuple 1 */
+ const dtuple_t* tuple2) /*!< in: tuple 2 */
+ __attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ index_id_t tree_id)/*!< in: index tree id */
+ __attribute__((nonnull, pure, warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+ __attribute__((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+ __attribute__((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+ __attribute__((nonnull));
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+ __attribute__((nonnull, malloc, warn_unused_result));
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ __attribute__((nonnull));
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+ __attribute__((nonnull));
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_t{
+ void* data; /*!< pointer to data */
+ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */
+ unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /*!< type of data */
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_t {
+ ulint info_bits; /*!< info bits of an index record:
+ the default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /*!< number of fields in dtuple */
+ ulint n_fields_cmp; /*!< number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /*!< fields */
+ UT_LIST_NODE_T(dtuple_t) tuple_list;
+ /*!< data tuples can be linked into a
+ list using this field */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number, used in
+ debug assertions */
+/** Value of dtuple_t::magic_n */
+# define DATA_TUPLE_MAGIC_N 65478679
+#endif /* UNIV_DEBUG */
+};
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+ ulint field_no; /*!< field number in record */
+ ulint len; /*!< stored data length, in bytes */
+ const void* data; /*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+ mem_heap_t* heap; /*!< memory heap from which
+ allocated */
+ ulint n_fields; /*!< number of stored fields */
+ big_rec_field_t*fields; /*!< stored fields */
+};
+
+#ifndef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/data0data.ic b/storage/innobase/include/data0data.ic
new file mode 100644
index 00000000000..6937d55d211
--- /dev/null
+++ b/storage/innobase/include/data0data.ic
@@ -0,0 +1,649 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+extern byte data_error;
+
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field) /*!< in: SQL data field */
+{
+ ut_ad(field);
+
+ return((dtype_t*) &(field->type));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ const dtype_t* type) /*!< in: pointer to data type struct */
+{
+ ut_ad(field && type);
+
+ field->type = *type;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return((void*) field->data);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return(field->len);
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(field->len == UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(field->ext);
+}
+
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field) /*!< in/out: field */
+{
+ ut_ad(field);
+
+ field->ext = 1;
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
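+
+/* A minimal usage sketch (illustrative only; the string contents are
+made up). Note that dfield_set_data() stores only the pointer, not a
+copy of the data:
+
+	dfield_t		field;
+	static const char	str[] = "abc";
+
+	dfield_set_data(&field, str, sizeof(str) - 1);
+	ut_ad(dfield_get_len(&field) == 3);
+
+	dfield_set_null(&field);
+	ut_ad(dfield_is_null(&field));
+*/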
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 && field2);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ UNIV_MEM_ASSERT_RW(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+{
+ ulint len2 = len;
+
+ if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) {
+ len = field1->len;
+ }
+
+ if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) {
+ len2 = field2->len;
+ }
+
+ return(len == len2
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
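+
+/* For example, if field1 holds "abcdef" (len 6) and field2 holds
+"abcxyz" (len 6), then dfield_datas_are_binary_equal(field1, field2, 3)
+returns TRUE, because only the 3-byte prefixes are compared, whereas
+passing len == 0 compares all 6 bytes and returns FALSE. */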
+
+/*********************************************************************//**
+Tests if the data length and content of a dfield are equal to the given ones.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+{
+ return(len == dfield_get_len(field)
+ && (len == UNIV_SQL_NULL
+ || !memcmp(dfield_get_data(field), data, len)));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+{
+ ut_ad(tuple);
+
+ tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(tuple);
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n) /*!< in: index of field */
+{
+ ut_ad(tuple);
+ ut_ad(n < tuple->n_fields);
+
+ return((dfield_t*) tuple->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for the number of fields used in record comparisons
+for this tuple is n_fields.
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+/*===================*/
+ void* buf, /*!< in, out: buffer to use */
+ ulint buf_size, /*!< in: buffer size */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dtuple_t* tuple;
+
+ ut_ad(buf != NULL);
+ ut_a(buf_size >= DTUPLE_EST_ALLOC(n_fields));
+
+ tuple = (dtuple_t*) buf;
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_t* field;
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ UNIV_MEM_ASSERT_W(tuple->fields, n_fields * sizeof *tuple->fields);
+ UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
+ return(tuple);
+}
+
+/**********************************************************//**
+Creates a data tuple in a memory heap. The default value for the number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields) /*!< in: number of fields */
+{
+ void* buf;
+ ulint buf_size;
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ buf_size = DTUPLE_EST_ALLOC(n_fields);
+ buf = mem_heap_alloc(heap, buf_size);
+
+ tuple = dtuple_create_from_mem(buf, buf_size, n_fields);
+
+ return(tuple);
+}
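+
+/* A minimal usage sketch (illustrative only), building a one-field
+tuple on a caller-owned heap:
+
+	mem_heap_t*	heap = mem_heap_create(256);
+	dtuple_t*	tuple = dtuple_create(heap, 1);
+	dfield_t*	field = dtuple_get_nth_field(tuple, 0);
+
+	dtype_set(dfield_get_type(field), DATA_VARCHAR, DATA_ENGLISH, 3);
+	dfield_set_data(field, "abc", 3);
+	...
+	mem_heap_free(heap);	(this also frees the tuple)
+*/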
+
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+{
+ ulint n_fields = dtuple_get_n_fields(tuple);
+ dtuple_t* new_tuple = dtuple_create(heap, n_fields);
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_copy(dtuple_get_nth_field(new_tuple, i),
+ dtuple_get_nth_field(tuple, i));
+ }
+
+ return(new_tuple);
+}
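+
+/* Note that the copy shares the data pointers with the original
+tuple. A deep copy (illustrative sketch) duplicates the data into the
+heap as well:
+
+	dtuple_t*	copy = dtuple_copy(tuple, heap);
+	ulint		i;
+
+	for (i = 0; i < dtuple_get_n_fields(copy); i++) {
+		dfield_dup(dtuple_get_nth_field(copy, i), heap);
+	}
+*/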
+
+/**********************************************************//**
+The following function returns the sum of the data lengths of a tuple. The
+space occupied by the field structs or the tuple struct is not counted, nor
+is the possible space taken up by externally stored parts of the fields.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field),
+ comp);
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_ext = 0;
+ ulint n_fields = tuple->n_fields;
+ ulint i;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ for (i = 0; i < n_fields; i++) {
+ n_ext += dtuple_get_nth_field(tuple, i)->ext;
+ }
+
+ return(n_ext);
+}
+
+/*******************************************************************//**
+Sets the types of the first n fields in a tuple to binary. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0);
+ }
+}
+
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ index_id_t tree_id)/*!< in: index tree id */
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
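+
+/* For example, a search tuple holding the fields ("aa", "bbbb") can be
+folded on a prefix of one complete field plus two bytes of the second:
+dtuple_fold(tuple, 1, 2, tree_id) folds tree_id, then "aa", then the
+two bytes "bb". This is how the adaptive hash index hashes a column
+prefix of a key value. */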
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000000..111664b0b52
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,544 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/* SQL LIKE operator comparison types */
+enum ib_like_t {
+	IB_LIKE_EXACT,	/* e.g. STRING */
+	IB_LIKE_PREFIX,	/* e.g. STRING% */
+	IB_LIKE_SUFFIX,	/* e.g. %STRING */
+	IB_LIKE_SUBSTR,	/* e.g. %STRING% */
+	IB_LIKE_REGEXP	/* Future */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING 0 /* missing column */
+#define DATA_VARCHAR	1	/* character varying of the
+				latin1_swedish_ci charset-collation; note
+				that the MySQL format for this type, as for
+				DATA_BINARY and DATA_VARMYSQL, is also
+				affected by whether the 'precise type'
+				contains DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB	5	/* binary large object, or a TEXT type;
+				if (prtype & DATA_BINARY_TYPE) == 0, then
+				this is actually a TEXT column (or a BLOB
+				created with < 4.0.14; since column prefix
+				indexes came only in 4.0.14, the missing
+				flag in BLOBs created before that does not
+				cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+#define DATA_MTYPE_MAX	63	/* dtype_store_for_order_and_null_size()
+				requires the values to be <= 63 */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
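+
+/* Worked example (illustrative): a utf8_general_ci VARCHAR(30) column
+declared NOT NULL and created with >= 5.0.3 would have
+mtype = DATA_VARMYSQL and
+
+	prtype = DATA_MYSQL_TRUE_VARCHAR	(MySQL type code 15)
+		 | DATA_NOT_NULL		(flag 256)
+		 | (33 << 16)			(charset-collation code)
+	       = 0x21010F
+
+so that dtype_get_mysql_type() extracts the low byte (15) and
+dtype_get_charset_coll() the third byte (33). */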
+
+#define DATA_ENGLISH	4	/* English language character string: this
+				is a relic from pre-MySQL time and is only
+				used for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a 48-bit integer */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED	512	/* this is ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+/*-------------------------------------------*/
+
+/* We need this many bytes to store the type information that determines
+the alphabetical ordering of a single field and the storage size of an
+SQL NULL */
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+/* Maximum multi-byte character length in bytes, plus 1 */
+#define DATA_MBMAX 5
+
+/* Pack mbminlen, mbmaxlen to mbminmaxlen. */
+#define DATA_MBMINMAXLEN(mbminlen, mbmaxlen) \
+ ((mbmaxlen) * DATA_MBMAX + (mbminlen))
+/* Get mbminlen from mbminmaxlen. Cast the result of UNIV_EXPECT to ulint
+because in GCC it returns a long. */
+#define DATA_MBMINLEN(mbminmaxlen) ((ulint) \
+ UNIV_EXPECT(((mbminmaxlen) % DATA_MBMAX), \
+ 1))
+/* Get mbmaxlen from mbminmaxlen. */
+#define DATA_MBMAXLEN(mbminmaxlen) ((ulint) ((mbminmaxlen) / DATA_MBMAX))
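+
+/* For example, utf8 has mbminlen = 1 and mbmaxlen = 3, so
+DATA_MBMINMAXLEN(1, 3) = 3 * 5 + 1 = 16; conversely
+DATA_MBMINLEN(16) = 16 % 5 = 1 and DATA_MBMAXLEN(16) = 16 / 5 = 3. */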
+
+/* We now support 15-bit (up to 32767) collation numbers */
+#define MAX_CHAR_COLL_NUM 32767
+
+/* Mask to get the Charset Collation number (0x7fff) */
+#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type); /*!< in: type struct */
+/*********************************************************************//**
+Determines how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+that the characters in the string occupy.
+@return length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminmaxlen, /*!< in: minimum and maximum length of
+ a multi-byte character */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str); /*!< in: the string whose prefix
+ length is being determined */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+ ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2); /*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type); /*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen); /*!< out: maximum length of a
+ multi-byte character */
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /*!< in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll); /*!< in: MySQL charset-collation code */
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives if further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Sets the minimum and maximum length of a character, in bytes. */
+UNIV_INLINE
+void
+dtype_set_mbminmaxlen(
+/*==================*/
+ dtype_t* type, /*!< in/out: type */
+ ulint mbminlen, /*!< in: minimum length of a char,
+ in bytes, or 0 if this is not
+ a character type */
+ ulint mbmaxlen); /*!< in: maximum length of a char,
+ in bytes, or 0 if this is not
+ a character type */
+/*********************************************************************//**
+Gets the padding character code for the type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype); /*!< in: precise type */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminmaxlen, /*!< in: minimum and maximum length of a
+ multibyte character, in bytes */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminmaxlen); /*!< in: minimum and maximum length of a
+ multibyte character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads into a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len);/*!< in: prefix length to
+ replace type->len, or 0 */
+/**********************************************************************//**
+Reads into a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for stored type order info */
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype, prtype and len.
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+ unsigned mtype, /*!< in: mtype */
+ unsigned prtype, /*!< in: prtype */
+ unsigned len, /*!< in: len */
+ char* name, /*!< out: SQL name */
+ unsigned name_sz);/*!< in: size of the name buffer */
+
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type); /*!< in: type struct to validate */
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+ const dtype_t* type); /*!< in: type */
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_t{
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+#ifndef UNIV_HOTBACKUP
+ unsigned mbminmaxlen:5; /*!< minimum and maximum length of a
+ character, in bytes;
+ DATA_MBMINMAXLEN(mbminlen,mbmaxlen);
+ mbminlen=DATA_MBMINLEN(mbminmaxlen);
+					mbmaxlen=DATA_MBMAXLEN(mbminmaxlen) */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/data0type.ic b/storage/innobase/include/data0type.ic
new file mode 100644
index 00000000000..d489bef89a8
--- /dev/null
+++ b/storage/innobase/include/data0type.ic
@@ -0,0 +1,711 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include <string.h> /* strlen() */
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype) /*!< in: precise data type */
+{
+ return((prtype >> 16) & CHAR_COLL_MASK);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives if further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c. */
+ switch (dtype_get_charset_coll(prtype)) {
+ case 11: /* ascii_general_ci */
+ case 65: /* ascii_bin */
+ case 33: /* utf8_general_ci */
+ case 83: /* utf8_bin */
+ case 254: /* utf8_general_cs */
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type) /*!< in: type struct */
+{
+ return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen) /*!< out: maximum length of a
+ multi-byte character */
+{
+ if (dtype_is_string_type(mtype)) {
+ innobase_get_cset_width(dtype_get_charset_coll(prtype),
+ mbminlen, mbmaxlen);
+ ut_ad(*mbminlen <= *mbmaxlen);
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/*********************************************************************//**
+Sets the minimum and maximum length of a character, in bytes. */
+UNIV_INLINE
+void
+dtype_set_mbminmaxlen(
+/*==================*/
+ dtype_t* type, /*!< in/out: type */
+ ulint mbminlen, /*!< in: minimum length of a char,
+ in bytes, or 0 if this is not
+ a character type */
+ ulint mbmaxlen) /*!< in: maximum length of a char,
+ in bytes, or 0 if this is not
+ a character type */
+{
+ ut_ad(mbminlen < DATA_MBMAX);
+ ut_ad(mbmaxlen < DATA_MBMAX);
+ ut_ad(mbminlen <= mbmaxlen);
+
+ type->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen);
+}
+
+/*********************************************************************//**
+Computes and sets the mbminlen and mbmaxlen members of a data type
+structure. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+ dtype_t* type) /*!< in/out: type */
+{
+ ulint mbminlen;
+ ulint mbmaxlen;
+
+ dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
+ dtype_set_mbminmaxlen(type, mbminlen, mbmaxlen);
+
+ ut_ad(dtype_validate(type));
+}
+#else /* !UNIV_HOTBACKUP */
+# define dtype_set_mblen(type) (void) 0
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = mtype;
+ type->prtype = prtype;
+ type->len = len;
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(DATA_MBMINLEN(type->mbminmaxlen));
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(DATA_MBMAXLEN(type->mbminmaxlen));
+}
+
+/*********************************************************************//**
+Gets the padding character code for a type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype) /*!< in: precise type */
+{
+ switch (mtype) {
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ if (dtype_get_charset_coll(prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL) {
+ /* Starting from 5.0.18, do not pad
+ VARBINARY or BINARY columns. */
+ return(ULINT_UNDEFINED);
+ }
+ /* Fall through */
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+ /* Space is the padding character for all char and binary
+ strings, and starting from 5.0.3, also for TEXT strings. */
+
+ return(0x20);
+ case DATA_BLOB:
+ if (!(prtype & DATA_BINARY_TYPE)) {
+ return(0x20);
+ }
+ /* Fall through */
+ default:
+ /* No padding specified */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MYSQL);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] |= 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] |= 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads into a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE
+# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads into a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ ulint charset_coll;
+
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
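+
+/* A round-trip sketch (illustrative only): for a string type in the
+supported mtype range, storing the 6-byte info and reading it back
+should preserve mtype, len and the charset-collation:
+
+	byte	buf[DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE];
+	dtype_t	t1;
+	dtype_t	t2;
+
+	dtype_set(&t1, DATA_VARMYSQL,
+		  DATA_MYSQL_TRUE_VARCHAR | DATA_NOT_NULL | (33UL << 16),
+		  30);
+	dtype_new_store_for_order_and_null_size(buf, &t1, 0);
+	dtype_new_read_for_order_and_null_size(&t2, buf);
+	ut_ad(t2.mtype == t1.mtype && t2.len == t1.len);
+*/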
+
+/*********************************************************************//**
+Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype, prtype and len.
+@return the SQL type name */
+UNIV_INLINE
+char*
+dtype_sql_name(
+/*===========*/
+ unsigned mtype, /*!< in: mtype */
+ unsigned prtype, /*!< in: prtype */
+ unsigned len, /*!< in: len */
+ char* name, /*!< out: SQL name */
+ unsigned name_sz)/*!< in: size of the name buffer */
+{
+
+#define APPEND_UNSIGNED() \
+ do { \
+ if (prtype & DATA_UNSIGNED) { \
+ ut_snprintf(name + strlen(name), \
+ name_sz - strlen(name), \
+ " UNSIGNED"); \
+ } \
+ } while (0)
+
+ ut_snprintf(name, name_sz, "UNKNOWN");
+
+ switch (mtype) {
+ case DATA_INT:
+ switch (len) {
+ case 1:
+ ut_snprintf(name, name_sz, "TINYINT");
+ break;
+ case 2:
+ ut_snprintf(name, name_sz, "SMALLINT");
+ break;
+ case 3:
+ ut_snprintf(name, name_sz, "MEDIUMINT");
+ break;
+ case 4:
+ ut_snprintf(name, name_sz, "INT");
+ break;
+ case 8:
+ ut_snprintf(name, name_sz, "BIGINT");
+ break;
+ }
+ APPEND_UNSIGNED();
+ break;
+ case DATA_FLOAT:
+ ut_snprintf(name, name_sz, "FLOAT");
+ APPEND_UNSIGNED();
+ break;
+ case DATA_DOUBLE:
+ ut_snprintf(name, name_sz, "DOUBLE");
+ APPEND_UNSIGNED();
+ break;
+ case DATA_FIXBINARY:
+ ut_snprintf(name, name_sz, "BINARY(%u)", len);
+ break;
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ ut_snprintf(name, name_sz, "CHAR(%u)", len);
+ break;
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ ut_snprintf(name, name_sz, "VARCHAR(%u)", len);
+ break;
+ case DATA_BINARY:
+ ut_snprintf(name, name_sz, "VARBINARY(%u)", len);
+ break;
+ case DATA_BLOB:
+ switch (len) {
+ case 9:
+ ut_snprintf(name, name_sz, "TINYBLOB");
+ break;
+ case 10:
+ ut_snprintf(name, name_sz, "BLOB");
+ break;
+ case 11:
+ ut_snprintf(name, name_sz, "MEDIUMBLOB");
+ break;
+ case 12:
+ ut_snprintf(name, name_sz, "LONGBLOB");
+ break;
+ }
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ ut_snprintf(name + strlen(name),
+ name_sz - strlen(name),
+ " NOT NULL");
+ }
+
+ return(name);
+}
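+
+/* Usage sketch (illustrative): for an 8-byte unsigned integer column
+declared NOT NULL,
+
+	char	name[64];
+
+	dtype_sql_name(DATA_INT, DATA_UNSIGNED | DATA_NOT_NULL, 8,
+		       name, sizeof(name));
+
+fills name with "BIGINT UNSIGNED NOT NULL". */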
+
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminmaxlen, /*!< in: minimum and maximum length of
+ a multibyte character, in bytes */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+#ifndef UNIV_HOTBACKUP
+ if (prtype & DATA_BINARY_TYPE) {
+ return(len);
+ } else if (!comp) {
+ return(len);
+ } else {
+#ifdef UNIV_DEBUG
+ ulint i_mbminlen, i_mbmaxlen;
+
+ innobase_get_cset_width(
+ dtype_get_charset_coll(prtype),
+ &i_mbminlen, &i_mbmaxlen);
+
+ ut_ad(DATA_MBMINMAXLEN(i_mbminlen, i_mbmaxlen)
+ == mbminmaxlen);
+#endif /* UNIV_DEBUG */
+ if (DATA_MBMINLEN(mbminmaxlen)
+ == DATA_MBMAXLEN(mbminmaxlen)) {
+ return(len);
+ }
+ }
+#else /* !UNIV_HOTBACKUP */
+ return(len);
+#endif /* !UNIV_HOTBACKUP */
+ /* fall through for variable-length charsets */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminmaxlen) /*!< in: minimum and maximum length of a
+ multi-byte character */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return(len);
+ } else {
+ ulint mbminlen = DATA_MBMINLEN(mbminmaxlen);
+ ulint mbmaxlen = DATA_MBMAXLEN(mbminmaxlen);
+
+ if (mbminlen == mbmaxlen) {
+ return(len);
+ }
+
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return(len * mbminlen / mbmaxlen);
+ }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
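+
+/* Example: a MySQL CHAR(10) column in utf8 (mbminlen = 1, mbmaxlen = 3)
+has type length len = 30 bytes. Its minimum size here is
+30 * 1 / 3 = 10 bytes (all characters single-byte), while
+dtype_get_max_size_low() below returns the full 30 bytes; in
+ROW_FORMAT=COMPACT such a column is stored as a variable-length field,
+as dtype_get_fixed_size_low() above returns 0 for it. */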
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len) /*!< in: length */
+{
+ switch (mtype) {
+ case DATA_SYS:
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ return(len);
+ case DATA_BLOB:
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ULINT_MAX);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+#ifndef UNIV_HOTBACKUP
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ type->mbminmaxlen, comp));
+#else /* !UNIV_HOTBACKUP */
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ 0, 0));
+#endif /* !UNIV_HOTBACKUP */
+}
diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h
new file mode 100644
index 00000000000..bd2bb577611
--- /dev/null
+++ b/storage/innobase/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+struct dfield_t;
+
+/* SQL data tuple struct */
+struct dtuple_t;
+
+#endif
+
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
new file mode 100644
index 00000000000..1e87ce3fdb8
--- /dev/null
+++ b/storage/innobase/include/db0err.h
@@ -0,0 +1,161 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum dberr_t {
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+ DB_SUCCESS = 10,
+
+ /* The following are error codes */
+ DB_ERROR,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_QUE_THR_SUSPENDED,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+	DB_TABLESPACE_NOT_FOUND,	/*!< Attempt to delete a tablespace
+ instance that was not found in the
+ tablespace hash table */
+ DB_LOCK_TABLE_FULL, /*!< lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /*!< foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+	DB_TOO_MANY_CONCURRENT_TRXS,	/*!< when InnoDB runs out of the
+					preconfigured undo slots; this can
+					only happen when there are too many
+					concurrent transactions */
+ DB_UNSUPPORTED, /*!< when InnoDB sees any artefact or
+					a feature that it cannot recognize or
+					work with, e.g. FT indexes created by
+					a later version of the engine. */
+
+ DB_INVALID_NULL, /*!< a NOT NULL column was found to
+ be NULL during table rebuild */
+
+	DB_STATS_DO_NOT_EXIST,		/*!< an operation that requires the
+					persistent storage used for recording
+					table and index statistics was
+					requested, but either this storage
+					itself or the stats for the given
+					table do not exist */
+ DB_FOREIGN_EXCEED_MAX_CASCADE, /*!< Foreign key constraint related
+ cascading delete/update exceeds
+ maximum allowed depth */
+ DB_CHILD_NO_INDEX, /*!< the child (foreign) table does
+ not have an index that contains the
+ foreign keys as its prefix columns */
+ DB_PARENT_NO_INDEX, /*!< the parent table does not
+ have an index that contains the
+ foreign keys as its prefix columns */
+ DB_TOO_BIG_INDEX_COL, /*!< index column size exceeds
+ maximum limit */
+ DB_INDEX_CORRUPT, /*!< we have corrupted index */
+ DB_UNDO_RECORD_TOO_BIG, /*!< the undo log record is too big */
+ DB_READ_ONLY, /*!< Update operation attempted in
+ a read-only transaction */
+	DB_FTS_INVALID_DOCID,		/*!< FTS Doc ID cannot be zero */
+	DB_TABLE_IN_FK_CHECK,		/*!< table is being used in foreign
+					key check */
+ DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big
+ during online index creation */
+
+ DB_IO_ERROR, /*!< Generic IO error */
+ DB_IDENTIFIER_TOO_LONG, /*!< Identifier name too long */
+ DB_FTS_EXCEED_RESULT_CACHE_LIMIT, /*!< FTS query memory
+ exceeds result cache limit */
+ DB_TEMP_FILE_WRITE_FAILURE, /*!< Temp file write failure */
+	DB_FTS_TOO_MANY_WORDS_IN_PHRASE,
+					/*!< Too many words in a phrase */
+	DB_TOO_BIG_FOR_REDO,		/*!< Record length greater than 10%
+					of redo log */
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX,
+	DB_DICT_CHANGED,		/*!< Some part of the table dictionary
+					has changed, e.g. an index or a
+					foreign key was dropped */
+
+
+ /* The following are API only error codes. */
+	DB_DATA_MISMATCH = 2000,	/*!< Column update or read failed
+					because the types do not match */
+
+ DB_SCHEMA_NOT_LOCKED, /*!< If an API function expects the
+ schema to be locked in exclusive mode
+ and if it's not then that API function
+ will return this error code */
+
+ DB_NOT_FOUND /*!< Generic error code for "Not found"
+ type of errors */
+};
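+
+/* Illustrative usage sketch (an assumption, not part of this header):
+callers typically treat DB_SUCCESS and DB_SUCCESS_LOCKED_REC as success
+and map everything else to an error path, e.g.
+
+	dberr_t	err = some_operation();		// hypothetical call
+
+	switch (err) {
+	case DB_SUCCESS:
+	case DB_SUCCESS_LOCKED_REC:
+		break;				// proceed normally
+	case DB_LOCK_WAIT_TIMEOUT:
+		break;				// retry or roll back
+	default:
+		break;				// report the error
+	}
+*/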
+
+#endif
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
new file mode 100644
index 00000000000..a994c9d8ff1
--- /dev/null
+++ b/storage/innobase/include/dict0boot.h
@@ -0,0 +1,342 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+
+typedef byte dict_hdr_t;
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+ table_id_t* table_id, /*!< out: table id
+ (not assigned if NULL) */
+ index_id_t* index_id, /*!< out: index id
+ (not assigned if NULL) */
+ ulint* space_id); /*!< out: space id
+ (not assigned if NULL) */
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+row_id_t
+dict_sys_get_new_row_id(void);
+/*=========================*/
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+row_id_t
+dict_sys_read_row_id(
+/*=================*/
+ const byte* field); /*!< in: record field */
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ row_id_t row_id);/*!< in: row id */
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+dict_boot(void)
+/*===========*/
+ __attribute__((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+dict_create(void)
+/*=============*/
+ __attribute__((warn_unused_result));
+
+/*********************************************************************//**
+Check if a table id belongs to system table.
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+ table_id_t id) /*!< in: table id to check */
+ __attribute__((warn_unused_result));
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID 1
+#define DICT_COLUMNS_ID 2
+#define DICT_INDEXES_ID 3
+#define DICT_FIELDS_ID 4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID 5
+
+#define DICT_HDR_FIRST_ID	10	/* the ids for tables etc. start
+					from this number, except for basic
+					system tables and their above defined
+					indexes; ibuf tables and indexes are
+					assigned the id DICT_IBUF_ID_MIN
+					plus the space id */
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id, or 0 */
+#define DICT_HDR_MIX_ID_LOW	28	/* Obsolete, always DICT_HDR_FIRST_ID */
+#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
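+
+/* Illustrative sketch (an assumption, not normative): the offsets above
+are relative to the DICT_HDR position on the dictionary header page, so
+e.g. the latest assigned table id could be read under a mini-transaction
+roughly as
+
+	dict_hdr_t*	dict_hdr = dict_hdr_get(mtr);
+	ib_uint64_t	id = mtr_read_ull(
+		dict_hdr + DICT_HDR_TABLE_ID, MLOG_8BYTES, mtr);
+*/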
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+ DICT_COL__SYS_TABLES__NAME = 0,
+ DICT_COL__SYS_TABLES__ID = 1,
+ DICT_COL__SYS_TABLES__N_COLS = 2,
+ DICT_COL__SYS_TABLES__TYPE = 3,
+ DICT_COL__SYS_TABLES__MIX_ID = 4,
+ DICT_COL__SYS_TABLES__MIX_LEN = 5,
+ DICT_COL__SYS_TABLES__CLUSTER_ID = 6,
+ DICT_COL__SYS_TABLES__SPACE = 7,
+ DICT_NUM_COLS__SYS_TABLES = 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+ DICT_FLD__SYS_TABLES__NAME = 0,
+ DICT_FLD__SYS_TABLES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLES__ID = 3,
+ DICT_FLD__SYS_TABLES__N_COLS = 4,
+ DICT_FLD__SYS_TABLES__TYPE = 5,
+ DICT_FLD__SYS_TABLES__MIX_ID = 6,
+ DICT_FLD__SYS_TABLES__MIX_LEN = 7,
+ DICT_FLD__SYS_TABLES__CLUSTER_ID = 8,
+ DICT_FLD__SYS_TABLES__SPACE = 9,
+ DICT_NUM_FIELDS__SYS_TABLES = 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+ DICT_FLD__SYS_TABLE_IDS__ID = 0,
+ DICT_FLD__SYS_TABLE_IDS__NAME = 1,
+ DICT_NUM_FIELDS__SYS_TABLE_IDS = 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+ DICT_COL__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_COL__SYS_COLUMNS__POS = 1,
+ DICT_COL__SYS_COLUMNS__NAME = 2,
+ DICT_COL__SYS_COLUMNS__MTYPE = 3,
+ DICT_COL__SYS_COLUMNS__PRTYPE = 4,
+ DICT_COL__SYS_COLUMNS__LEN = 5,
+ DICT_COL__SYS_COLUMNS__PREC = 6,
+ DICT_NUM_COLS__SYS_COLUMNS = 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+ DICT_FLD__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_FLD__SYS_COLUMNS__POS = 1,
+ DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_COLUMNS__NAME = 4,
+ DICT_FLD__SYS_COLUMNS__MTYPE = 5,
+ DICT_FLD__SYS_COLUMNS__PRTYPE = 6,
+ DICT_FLD__SYS_COLUMNS__LEN = 7,
+ DICT_FLD__SYS_COLUMNS__PREC = 8,
+ DICT_NUM_FIELDS__SYS_COLUMNS = 9
+};
+/* The columns in SYS_INDEXES */
+enum dict_col_sys_indexes_enum {
+ DICT_COL__SYS_INDEXES__TABLE_ID = 0,
+ DICT_COL__SYS_INDEXES__ID = 1,
+ DICT_COL__SYS_INDEXES__NAME = 2,
+ DICT_COL__SYS_INDEXES__N_FIELDS = 3,
+ DICT_COL__SYS_INDEXES__TYPE = 4,
+ DICT_COL__SYS_INDEXES__SPACE = 5,
+ DICT_COL__SYS_INDEXES__PAGE_NO = 6,
+ DICT_NUM_COLS__SYS_INDEXES = 7
+};
+/* The field numbers in the SYS_INDEXES clustered index */
+enum dict_fld_sys_indexes_enum {
+ DICT_FLD__SYS_INDEXES__TABLE_ID = 0,
+ DICT_FLD__SYS_INDEXES__ID = 1,
+ DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2,
+ DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_INDEXES__NAME = 4,
+ DICT_FLD__SYS_INDEXES__N_FIELDS = 5,
+ DICT_FLD__SYS_INDEXES__TYPE = 6,
+ DICT_FLD__SYS_INDEXES__SPACE = 7,
+ DICT_FLD__SYS_INDEXES__PAGE_NO = 8,
+ DICT_NUM_FIELDS__SYS_INDEXES = 9
+};
+/* The columns in SYS_FIELDS */
+enum dict_col_sys_fields_enum {
+ DICT_COL__SYS_FIELDS__INDEX_ID = 0,
+ DICT_COL__SYS_FIELDS__POS = 1,
+ DICT_COL__SYS_FIELDS__COL_NAME = 2,
+ DICT_NUM_COLS__SYS_FIELDS = 3
+};
+/* The field numbers in the SYS_FIELDS clustered index */
+enum dict_fld_sys_fields_enum {
+ DICT_FLD__SYS_FIELDS__INDEX_ID = 0,
+ DICT_FLD__SYS_FIELDS__POS = 1,
+ DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FIELDS__COL_NAME = 4,
+ DICT_NUM_FIELDS__SYS_FIELDS = 5
+};
+/* The columns in SYS_FOREIGN */
+enum dict_col_sys_foreign_enum {
+ DICT_COL__SYS_FOREIGN__ID = 0,
+ DICT_COL__SYS_FOREIGN__FOR_NAME = 1,
+ DICT_COL__SYS_FOREIGN__REF_NAME = 2,
+ DICT_COL__SYS_FOREIGN__N_COLS = 3,
+ DICT_NUM_COLS__SYS_FOREIGN = 4
+};
+/* The field numbers in the SYS_FOREIGN clustered index */
+enum dict_fld_sys_foreign_enum {
+ DICT_FLD__SYS_FOREIGN__ID = 0,
+ DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1,
+ DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_FOREIGN__FOR_NAME = 3,
+ DICT_FLD__SYS_FOREIGN__REF_NAME = 4,
+ DICT_FLD__SYS_FOREIGN__N_COLS = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN = 6
+};
+/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */
+enum dict_fld_sys_foreign_for_name_enum {
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0,
+ DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1,
+ DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2
+};
+/* The columns in SYS_FOREIGN_COLS */
+enum dict_col_sys_foreign_cols_enum {
+ DICT_COL__SYS_FOREIGN_COLS__ID = 0,
+ DICT_COL__SYS_FOREIGN_COLS__POS = 1,
+ DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2,
+ DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3,
+ DICT_NUM_COLS__SYS_FOREIGN_COLS = 4
+};
+/* The field numbers in the SYS_FOREIGN_COLS clustered index */
+enum dict_fld_sys_foreign_cols_enum {
+ DICT_FLD__SYS_FOREIGN_COLS__ID = 0,
+ DICT_FLD__SYS_FOREIGN_COLS__POS = 1,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4,
+ DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5,
+ DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6
+};
+/* The columns in SYS_TABLESPACES */
+enum dict_col_sys_tablespaces_enum {
+ DICT_COL__SYS_TABLESPACES__SPACE = 0,
+ DICT_COL__SYS_TABLESPACES__NAME = 1,
+ DICT_COL__SYS_TABLESPACES__FLAGS = 2,
+ DICT_NUM_COLS__SYS_TABLESPACES = 3
+};
+/* The field numbers in the SYS_TABLESPACES clustered index */
+enum dict_fld_sys_tablespaces_enum {
+ DICT_FLD__SYS_TABLESPACES__SPACE = 0,
+ DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLESPACES__NAME = 3,
+ DICT_FLD__SYS_TABLESPACES__FLAGS = 4,
+ DICT_NUM_FIELDS__SYS_TABLESPACES = 5
+};
+/* The columns in SYS_DATAFILES */
+enum dict_col_sys_datafiles_enum {
+ DICT_COL__SYS_DATAFILES__SPACE = 0,
+ DICT_COL__SYS_DATAFILES__PATH = 1,
+ DICT_NUM_COLS__SYS_DATAFILES = 2
+};
+/* The field numbers in the SYS_DATAFILES clustered index */
+enum dict_fld_sys_datafiles_enum {
+ DICT_FLD__SYS_DATAFILES__SPACE = 0,
+ DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_DATAFILES__PATH = 3,
+ DICT_NUM_FIELDS__SYS_DATAFILES = 4
+};
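+
+/* Note (explanatory): in each of the clustered-index field enums above,
+the fields are the table's columns with the system columns DB_TRX_ID and
+DB_ROLL_PTR inserted right after the primary key column(s); hence each
+DICT_NUM_FIELDS__* value of a clustered index equals the corresponding
+DICT_NUM_COLS__* value plus 2. */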
+
+/* A number of the columns above occur in multiple tables. These are the
+lengths of those fields. */
+#define DICT_FLD_LEN_SPACE 4
+#define DICT_FLD_LEN_FLAGS 4
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0boot.ic b/storage/innobase/include/dict0boot.ic
new file mode 100644
index 00000000000..2b156a4f672
--- /dev/null
+++ b/storage/innobase/include/dict0boot.ic
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+row_id_t
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+ row_id_t id;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ id = dict_sys->row_id;
+
+ if (0 == (id % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+ dict_hdr_flush_row_id();
+ }
+
+ dict_sys->row_id++;
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(id);
+}
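+
+/* Note (explanatory, an assumption based on the code above): the row id
+counter is flushed to the dictionary header only once per
+DICT_HDR_ROW_ID_WRITE_MARGIN assignments; after a crash, the counter can
+then be resumed safely by rounding the last flushed value up to the next
+margin boundary, so no assigned row id is ever reused. */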
+
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+row_id_t
+dict_sys_read_row_id(
+/*=================*/
+ const byte* field) /*!< in: record field */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ return(mach_read_from_6(field));
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ row_id_t row_id) /*!< in: row id */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ mach_write_to_6(field, row_id);
+}
+
+/*********************************************************************//**
+Check if a table id belongs to system table.
+@return true if the table id belongs to a system table. */
+UNIV_INLINE
+bool
+dict_is_sys_table(
+/*==============*/
+ table_id_t id) /*!< in: table id to check */
+{
+ return(id < DICT_HDR_FIRST_ID);
+}
+
+
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
new file mode 100644
index 00000000000..67eab9058da
--- /dev/null
+++ b/storage/innobase/include/dict0crea.h
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ bool commit);/*!< in: true if the commit node should be
+ added to the query graph */
+/*********************************************************************//**
+Creates an index create graph.
+@return own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ dict_index_t* index, /*!< in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap, /*!< in: heap where created */
+ bool commit);/*!< in: true if the commit node should be
+ added to the query graph */
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ dict_table_t* table, /*!< in: the table the index belongs to */
+ ulint space, /*!< in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr); /*!< in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /*!< in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr); /*!< in: mtr having the latch on the record page */
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+/********************************************************************//**
+Generates a foreign key constraint name when one was not specified by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id generation;
+ incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign)/*!< in/out: foreign key */
+ __attribute__((nonnull));
+
+/** Adds the given set of foreign key objects to the dictionary tables
+in the database. This function does not modify the dictionary cache. The
+caller must ensure that all foreign key objects contain a valid constraint
+name in foreign->id.
+@param[in] local_fk_set set of foreign key objects, to be added to
+the dictionary tables
+@param[in] table table to which the foreign key objects in
+local_fk_set belong to
+@param[in,out] trx transaction
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ const dict_foreign_set& local_fk_set,
+ const dict_table_t* table,
+ trx_t* trx)
+ __attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+Creates the tablespaces and datafiles system tables inside InnoDB
+at server bootstrap or server start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_tablespace(void);
+/*=====================================*/
+/********************************************************************//**
+Add a single tablespace definition to the data dictionary tables in the
+database.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_tablespace_to_dictionary(
+/*=====================================*/
+ ulint space, /*!< in: tablespace id */
+ const char* name, /*!< in: tablespace name */
+ ulint flags, /*!< in: tablespace flags */
+ const char* path, /*!< in: tablespace path */
+ trx_t* trx, /*!< in: transaction */
+ bool commit); /*!< in: if true then commit the
+ transaction */
+/********************************************************************//**
+Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+/* Table create node structure */
+struct tab_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /*!< table to create, built as a memory data
+ structure with dict_mem_... functions */
+	ins_node_t*	tab_def;	/*!< child node which does the insert
+					of the table definition; the row to
+					be inserted is built by the parent
+					node */
+	ins_node_t*	col_def;	/*!< child node which does the inserts
+					of the column definitions; the row to
+					be inserted is built by the parent
+					node */
+	commit_node_t*	commit_node;
+					/*!< child node which performs a commit
+					after a successful table creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint col_no; /*!< next column definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_COMMIT_WORK 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
+
+/* Index create node struct */
+
+struct ind_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /*!< index to create, built as a memory data
+ structure with dict_mem_... functions */
+	ins_node_t*	ind_def;	/*!< child node which does the insert
+					of the index definition; the row to
+					be inserted is built by the parent
+					node */
+	ins_node_t*	field_def;	/*!< child node which does the inserts
+					of the field definitions; the row to
+					be inserted is built by the parent
+					node */
+	commit_node_t*	commit_node;
+					/*!< child node which performs a commit
+					after a successful index creation */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	ulint		page_no;/*!< root page number of the index */
+	dict_table_t*	table;	/*!< table which owns the index */
+	dtuple_t*	ind_row;/*!< index definition row built */
+	ulint		field_no;/*!< next field definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_COMMIT_WORK 4
+#define INDEX_ADD_TO_CACHE 5
+
+#ifndef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0crea.ic b/storage/innobase/include/dict0crea.ic
new file mode 100644
index 00000000000..2d0d9dcb858
--- /dev/null
+++ b/storage/innobase/include/dict0crea.ic
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mem0mem.h"
+
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return true if temporary table */
+UNIV_INTERN
+bool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ const char* name) __attribute__((warn_unused_result));
+ /*!< in: table name in the form
+ 'database/tablename' */
+
+
+/********************************************************************//**
+Generates a foreign key constraint name when one was not specified by the user.
+A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
+where the numbers start from 1, and are given locally for this table, that is,
+the number is not global, as it used to be before MySQL 4.0.18. */
+UNIV_INLINE
+dberr_t
+dict_create_add_foreign_id(
+/*=======================*/
+ ulint* id_nr, /*!< in/out: number to use in id generation;
+ incremented if used */
+ const char* name, /*!< in: table name */
+ dict_foreign_t* foreign)/*!< in/out: foreign key */
+{
+ if (foreign->id == NULL) {
+ /* Generate a new constraint id */
+ ulint namelen = strlen(name);
+ char* id = static_cast<char*>(
+ mem_heap_alloc(foreign->heap,
+ namelen + 20));
+
+ if (row_is_mysql_tmp_table_name(name)) {
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", name,
+ (ulong) (*id_nr)++);
+ } else {
+ char table_name[MAX_TABLE_NAME_LEN + 20] = "";
+ uint errors = 0;
+
+ strncpy(table_name, name,
+ MAX_TABLE_NAME_LEN + 20);
+
+ innobase_convert_to_system_charset(
+ strchr(table_name, '/') + 1,
+ strchr(name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ strncpy(table_name, name,
+ MAX_TABLE_NAME_LEN + 20);
+ }
+
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", table_name,
+ (ulong) (*id_nr)++);
+
+ if (innobase_check_identifier_length(
+ strchr(id,'/') + 1)) {
+ return(DB_IDENTIFIER_TOO_LONG);
+ }
+ }
+ foreign->id = id;
+ }
+
+ return(DB_SUCCESS);
+}
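+
+/* Example (illustrative): for a table named "test/child" with *id_nr == 1,
+the generated constraint id would be "test/child_ibfk_1", and *id_nr is
+incremented to 2 for the next unnamed constraint of the same table. */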
+
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
new file mode 100644
index 00000000000..dd61e5becc1
--- /dev/null
+++ b/storage/innobase/include/dict0dict.h
@@ -0,0 +1,1841 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "db0err.h"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+ char* a) /*!< in/out: string to put in lower case */
+ __attribute__((nonnull));
+/********************************************************************//**
+Get the database name length in a table name.
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Opens a table from its database and table name; this is currently used by
+the foreign constraint parser to get the referenced table.
+@return complete table name with database and table name, allocated from
+heap memory passed in */
+UNIV_INTERN
+char*
+dict_get_referenced_table(
+/*======================*/
+ const char* name, /*!< in: foreign key table name */
+ const char* database_name, /*!< in: table db name */
+ ulint database_name_len,/*!< in: db name length */
+ const char* table_name, /*!< in: table name */
+ ulint table_name_len, /*!< in: table name length */
+ dict_table_t** table, /*!< out: table object or NULL */
+ mem_heap_t* heap); /*!< in: heap memory */
+/*********************************************************************//**
+Frees a foreign key struct. */
+
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign); /*!< in, own: foreign key struct */
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+UNIV_INTERN
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table); /*!< in: table in the dictionary
+ memory cache */
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+UNIV_INTERN
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+ __attribute__((nonnull, warn_unused_result));
+
+/** Operation to perform when opening a table */
+enum dict_table_op_t {
+ /** Expect the tablespace to exist. */
+ DICT_TABLE_OP_NORMAL = 0,
+ /** Drop any orphan indexes after an aborted online index creation */
+ DICT_TABLE_OP_DROP_ORPHAN,
+ /** Silently load the tablespace if it does not exist,
+ and do not load the definitions of incomplete indexes. */
+ DICT_TABLE_OP_LOAD_TABLESPACE
+};
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ dict_table_op_t table_op) /*!< in: operation to perform */
+ __attribute__((warn_unused_result));
+/********************************************************************//**
+Decrements the count of open handles to a table. */
+UNIV_INTERN
+void
+dict_table_close(
+/*=============*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ ibool try_drop) /*!< in: TRUE=try to drop any orphan
+ indexes after an aborted online
+ index creation */
+ __attribute__((nonnull));
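+
+/* Illustrative usage sketch (an assumption): dict_table_open_on_id()
+increments the open handle count of the table, so each successful open
+is paired with a dict_table_close():
+
+	dict_table_t*	table = dict_table_open_on_id(
+		table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+	if (table != NULL) {
+		// ... use the table ...
+		dict_table_close(table, FALSE, FALSE);
+	}
+*/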
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void);
+/*===========*/
+/********************************************************************//**
+Gets the space id of every table in the data dictionary and adds them to
+a linear list and a hash table in the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+UNIV_INTERN
+void
+dict_load_space_id_list(void);
+/*=========================*/
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets the minimum and maximum number of bytes per character. */
+UNIV_INLINE
+void
+dict_col_set_mbminmaxlen(
+/*=====================*/
+ dict_col_t* col, /*!< in/out: column */
+ ulint mbminlen, /*!< in: minimum multi-byte
+ character size, in bytes */
+	ulint		mbmaxlen)	/*!< in: maximum multi-byte
+					character size, in bytes */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+ __attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: next value to assign to a row */
+ __attribute__((nonnull));
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: value which was assigned to a row */
+ __attribute__((nonnull));
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
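+
+/* Illustrative usage sketch (an assumption): the autoinc counter is
+protected by the table's autoinc mutex, so reads and updates are
+bracketed by the lock/unlock pair:
+
+	dict_table_autoinc_lock(table);
+
+	ib_uint64_t	next = dict_table_autoinc_read(table);
+	// ... assign 'next' to the row being inserted ...
+	dict_table_autoinc_update_if_greater(table, next + 1);
+
+	dict_table_autoinc_unlock(table);
+*/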
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ ibool can_be_evicted, /*!< in: TRUE if can be evicted*/
+ mem_heap_t* heap) /*!< in: temporary heap */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table) /*!< in, own: table */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Renames a table object.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ ibool rename_also_foreigns)
+ /*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ table_id_t new_id) /*!< in: new id to set */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+UNIV_INTERN
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in
+the cache. At least one of the foreign table and the referenced table
+must already be in the dictionary cache!
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign,
+ /*!< in, own: foreign key constraint */
+ const char** col_names,
+ /*!< in: column names, or NULL to use
+ foreign->foreign_table->col_names */
+ bool check_charsets,
+ /*!< in: whether to check charset
+ compatibility */
+ dict_err_ignore_t ignore_err)
+ /*!< in: error to be ignored */
+ __attribute__((nonnull(1), warn_unused_result));
+/*********************************************************************//**
+Check if the index is referenced by a foreign key; if so, return the
+matching instance, NULL otherwise.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table) /*!< in: InnoDB table */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the
+foreign key lists of the table.
+@return whether all replacements were found */
+UNIV_INTERN
+bool
+dict_foreign_replace_index(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const dict_index_t* index) /*!< in: index to be replaced */
+ __attribute__((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Determines whether a string starts with the specified keyword.
+@return TRUE if str starts with keyword */
+UNIV_INTERN
+ibool
+dict_str_starts_with_keyword(
+/*=========================*/
+ THD* thd, /*!< in: MySQL thread handle */
+ const char* str, /*!< in: string to scan for keyword */
+ const char* keyword) /*!< in: keyword to look for */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key
+or is a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_foreign_constraints(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES
+ table2(c, d), table2 can be written
+ also with the database
+ name before it: test.table2; the
+					default database is the database of
+ parameter name */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+dberr_t
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_open_on_name(
+/*====================*/
+ const char* table_name, /*!< in: table name */
+ ibool dict_locked, /*!< in: TRUE=data dictionary locked */
+ ibool try_drop, /*!< in: TRUE=try to drop any orphan
+ indexes after an aborted online
+ index creation */
+ dict_err_ignore_t
+ ignore_err) /*!< in: error to be ignored when
+ loading the table */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array,
+in the same order, and which is not marked for deletion and is not the
+same as types_idx.
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ __attribute__((nonnull(1,3), warn_unused_result));
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_nr) /*!< in: column number */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Prints a table data. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline) /*!< in: whether to add a newline */
+ __attribute__((nonnull(1,3)));
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+ FILE* file, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to print */
+ __attribute__((nonnull(1,3)));
+/*********************************************************************//**
+Checks whether the given index qualifies as a foreign key index for the
+given array of columns, i.e. whether its first fields are the columns in
+the array, in the same order, and it is not marked for deletion and is
+not the same as types_idx.
+@return true if the index qualifies, false otherwise */
+UNIV_INTERN
+bool
+dict_foreign_qualify_index(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const char** col_names,
+ /*!< in: column names, or NULL
+ to use table->col_names */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ const dict_index_t* index, /*!< in: index to check */
+ const dict_index_t* types_idx,
+ /*!< in: NULL or an index
+ whose types the column types
+ must match */
+ bool check_charsets,
+ /*!< in: whether to check
+ charsets. only has an effect
+ if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of
+ the columns must be declared
+ NOT NULL */
+ __attribute__((nonnull(1,3), warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/* Skip corrupted index */
+#define dict_table_skip_corrupt_index(index) \
+ while (index && dict_index_is_corrupted(index)) { \
+ index = dict_table_get_next_index(index); \
+ }
+
+/* Get the next non-corrupt index */
+#define dict_table_next_uncorrupted_index(index) \
+do { \
+ index = dict_table_get_next_index(index); \
+ dict_table_skip_corrupt_index(index); \
+} while (0)
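+
+/* A minimal usage sketch (illustrative only; latching and error handling
+omitted): scan all indexes of a table while skipping any that are flagged
+corrupted.
+
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ dict_table_skip_corrupt_index(index);
+
+ while (index != NULL) {
+ ... process the uncorrupted index ...
+ dict_table_next_uncorrupted_index(index);
+ }
+*/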
+
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for a secondary index or the insert buffer tree,
+zero for the clustered index */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/************************************************************************//**
+Gets all the FTS indexes of the table. NOTE: must not be called for
+tables which do not have an FTS index.
+@return number of indexes collected */
+UNIV_INTERN
+ulint
+dict_table_get_all_fts_indexes(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ ib_vector_t* indexes)/*!< out: vector for collecting FTS indexes */
+ __attribute__((nonnull));
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, pure, warn_unused_result));
+/********************************************************************//**
+Gets the estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+ __attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) \
+((table)->cols + (pos))
+#define dict_table_get_sys_col(table, sys) \
+((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS)
+#endif /* UNIV_DEBUG */
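+/* For example (illustrative only): system columns are stored at the end of
+the cols array, so with DATA_N_SYS_COLS == 3 and a table of 4 user-defined
+columns (n_cols == 7), dict_table_get_sys_col(table, DATA_ROW_ID) resolves
+to cols + 7 + 0 - 3, i.e. cols[4], the first slot after the user-defined
+columns. */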
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Determine the file format from a dict_table_t::flags.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+ ulint flags) /*!< in: dict_table_t::flags */
+ __attribute__((warn_unused_result));
+/********************************************************************//**
+Set the various values in a dict_table_t::flags pointer. */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags, /*!< in/out: table flags */
+ rec_format_t format, /*!< in: file format */
+ ulint zip_ssize, /*!< in: zip shift size */
+ bool remote_path) /*!< in: table uses DATA DIRECTORY */
+ __attribute__((nonnull));
+/********************************************************************//**
+Convert the 32-bit table flags to the 32-bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field. The following chart shows
+the translation of the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@return tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+ __attribute__((const));
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_zip_size(
+/*=================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as TRUNCATE TABLE. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Release the exclusive locks on all index trees. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+/********************************************************************//**
+Wait until all the background threads of the given table have exited, i.e.,
+bg_threads == 0. Note: bg_threads_mutex must be reserved when
+calling this. */
+UNIV_INTERN
+void
+dict_table_wait_for_bg_threads_to_exit(
+/*===================================*/
+ dict_table_t* table, /*!< in: table */
+ ulint delay) /*!< in: time in microseconds to wait between
+ checks of bg_threads */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ index_id_t id) /*!< in: index id */
+ __attribute__((warn_unused_result));
+/**********************************************************************//**
+Make room in the table cache by evicting an unused table. The unused table
+should not be part of an FK relationship and must not currently be used in
+any user transaction. There is no guarantee that a table will be evicted.
+@return number of tables evicted. */
+UNIV_INTERN
+ulint
+dict_make_room_in_cache(
+/*====================*/
+ ulint max_tables, /*!< in: max tables allowed in cache */
+ ulint pct_check); /*!< in: max percent to check */
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+dberr_t
+dict_index_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table on which the index is */
+ dict_index_t* index, /*!< in, own: index; NOTE! The index memory
+ object is freed in this function! */
+ ulint page_no,/*!< in: root page number of the index */
+ ibool strict) /*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+ a B-tree page */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+ __attribute__((nonnull, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_or_prefix_pos(
+/*=================================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ibool inc_prefix) /*!< in: TRUE=consider
+ column prefixes too */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type) /*!< in: DATA_ROW_ID, ... */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Adds a column to an index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len) /*!< in: column prefix length */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+ __attribute__((warn_unused_result));
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+ __attribute__((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+ /** Require all indexes to be complete. */
+ CHECK_ALL_COMPLETE,
+ /** Allow aborted online index creation. */
+ CHECK_ABORTED_OK,
+ /** Allow partial indexes to exist. */
+ CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table, using the index name. */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+ __attribute__((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to
+ copy prefix */
+ ulint* n_fields,/*!< out: number of fields copied */
+ byte** buf, /*!< in/out: memory buffer for the
+ copied prefix, or NULL */
+ ulint* buf_size)/*!< in/out: buffer size */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record for which to build data tuple */
+ ulint n_fields,/*!< in: number of data fields */
+ mem_heap_t* heap) /*!< in: memory heap where tuple created */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space) /*!< in: space id */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* tree) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+ __attribute__((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+
+/** Create a dict_table_t's stats latch or delay for lazy creation.
+This function is only called either from a single-threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to create
+@param[in] enabled if false then the latch is disabled
+and dict_table_stats_lock()/unlock() become noop on this table. */
+
+void
+dict_table_stats_latch_create(
+ dict_table_t* table,
+ bool enabled);
+
+/** Destroy a dict_table_t's stats latch.
+This function is only called either from a single-threaded environment
+or from a thread that has not shared the table object with other threads.
+@param[in,out] table table whose stats latch to destroy */
+
+void
+dict_table_stats_latch_destroy(
+ dict_table_t* table);
+
+/**********************************************************************//**
+Lock the appropriate latch to protect a given table's statistics.
+table->id is used to pick the corresponding latch from a global array of
+latches. */
+UNIV_INTERN
+void
+dict_table_stats_lock(
+/*==================*/
+ dict_table_t* table, /*!< in: table */
+ ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */
+/**********************************************************************//**
+Unlock the latch that has been locked by dict_table_stats_lock() */
+UNIV_INTERN
+void
+dict_table_stats_unlock(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ ulint latch_mode); /*!< in: RW_S_LATCH or RW_X_LATCH */
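+/* A minimal usage sketch (illustrative only), assuming the table's stats
+latch was created enabled and the statistics are initialized:
+
+ dict_table_stats_lock(table, RW_S_LATCH);
+ n_rows = dict_table_get_n_rows(table);
+ dict_table_stats_unlock(table, RW_S_LATCH);
+*/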
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Removes an index from the cache */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Get index by name
+@return index, NULL if it does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+In case there is more than one index with the same name, return the index
+with the minimum id.
+@return index, NULL if it does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*====================================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+ __attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match, else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no) /*!< in: col number to search for */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Move a table from the LRU list to the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_lru_to_non_lru(
+/*================================*/
+ dict_table_t* table) /*!< in: table to move from LRU to non-LRU */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Move a table to the LRU list from the non-LRU list. */
+UNIV_INTERN
+void
+dict_table_move_from_non_lru_to_lru(
+/*================================*/
+ dict_table_t* table) /*!< in: table to move from non-LRU to LRU */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Move to the most recently used segment of the LRU list. */
+UNIV_INTERN
+void
+dict_move_to_mru(
+/*=============*/
+ dict_table_t* table) /*!< in: table to move to MRU */
+ __attribute__((nonnull));
+
+/** Maximum number of columns in a foreign key constraint. Note that MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint. */
+#define MAX_NUM_FK_COLUMNS 500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */
+
+/** the dictionary system */
+extern dict_sys_t* dict_sys;
+/** the data dictionary rw-latch protecting dict_sys */
+extern rw_lock_t dict_operation_lock;
+
+/* Dictionary system struct */
+struct dict_sys_t{
+ ib_mutex_t mutex; /*!< mutex protecting the data
+ dictionary; protects also the
+ disk-based dictionary system tables;
+ this mutex serializes CREATE TABLE
+ and DROP TABLE, as well as reading
+ the dictionary data for a table from
+ system tables */
+ row_id_t row_id; /*!< the next row id to assign;
+ NOTE that at a checkpoint this
+ must be written to the dict system
+ header and flushed to a file; in
+ recovery this must be derived from
+ the log records */
+ hash_table_t* table_hash; /*!< hash table of the tables, based
+ on name */
+ hash_table_t* table_id_hash; /*!< hash table of the tables, based
+ on id */
+ ulint size; /*!< varying space in bytes occupied
+ by the data dictionary table and
+ index objects */
+ dict_table_t* sys_tables; /*!< SYS_TABLES table */
+ dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
+ dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
+ dict_table_t* sys_fields; /*!< SYS_FIELDS table */
+
+ /*=============================*/
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_LRU; /*!< List of tables that can be evicted
+ from the cache */
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_non_LRU; /*!< List of tables that can't be
+ evicted from the cache */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+extern dict_index_t* dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+extern dict_index_t* dict_ind_compact;
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void);
+/*===============*/
+
+/* Auxiliary structs for checking a table definition @{ */
+
+/* This struct is used to specify the name and type that a column must
+have when checking a table's schema. */
+struct dict_col_meta_t {
+ const char* name; /* column name */
+ ulint mtype; /* required column main type */
+ ulint prtype_mask; /* required column precise type mask;
+ if this is non-zero then all the
+ bits it has set must also be set
+ in the column's prtype */
+ ulint len; /* required column length */
+};
+
+/* This struct is used for checking whether a given table exists and
+whether it has a predefined schema (number of columns and column names
+and types) */
+struct dict_table_schema_t {
+ const char* table_name; /* the name of the table whose
+ structure we are checking */
+ ulint n_cols; /* the number of columns the
+ table must have */
+ dict_col_meta_t* columns; /* metadata for the columns;
+ this array has n_cols
+ elements */
+ ulint n_foreign; /* number of foreign keys this
+ table has, pointing to other
+ tables (where this table is
+ FK child) */
+ ulint n_referenced; /* number of foreign keys other
+ tables have, pointing to this
+ table (where this table is
+ parent) */
+};
+/* @} */
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+The caller must own the dictionary mutex.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+UNIV_INTERN
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ dict_table_schema_t* req_schema, /*!< in/out: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS and
+ != DB_TABLE_NOT_FOUND is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+ __attribute__((nonnull, warn_unused_result));
+/* @} */
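+
+/* A minimal usage sketch (illustrative only; the table and column
+definitions below are hypothetical). The caller must hold the dictionary
+mutex, e.g. via dict_mutex_enter_for_mysql():
+
+ dict_col_meta_t cols[] = {
+ {"id", DATA_INT, DATA_NOT_NULL, 8},
+ {"name", DATA_VARMYSQL, 0, 192}
+ };
+ dict_table_schema_t schema = {"test/example", 2, cols, 0, 0};
+ char errstr[512];
+
+ dberr_t err = dict_table_schema_check(&schema, errstr, sizeof(errstr));
+*/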
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF-8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+UNIV_INTERN
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: dbname_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+ __attribute__((nonnull));
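+
+/* A minimal usage sketch (illustrative only), using the buffer sizes
+required above:
+
+ char db_utf8[MAX_DB_UTF8_LEN];
+ char table_utf8[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(table->name, db_utf8, sizeof(db_utf8),
+ table_utf8, sizeof(table_utf8));
+*/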
+
+/**********************************************************************//**
+Closes the data dictionary module. */
+UNIV_INTERN
+void
+dict_close(void);
+/*============*/
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Check whether the table is corrupted.
+@return nonzero for corrupted table, zero for valid tables */
+UNIV_INLINE
+ulint
+dict_table_is_corrupted(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Check whether the index is corrupted.
+@return nonzero for corrupted index, zero for valid indexes */
+UNIV_INLINE
+ulint
+dict_index_is_corrupted(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull, warn_unused_result));
+
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Flags an index and table corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES. */
+UNIV_INTERN
+void
+dict_set_corrupted(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ trx_t* trx, /*!< in/out: transaction */
+ const char* ctx) /*!< in: context */
+ UNIV_COLD __attribute__((nonnull));
+
+/**********************************************************************//**
+Flags an index corrupted in the data dictionary cache only. This
+is used mostly to mark a corrupted index when the index's own dictionary
+entry is corrupted, and we force such an index to be loaded for repair
+purposes */
+UNIV_INTERN
+void
+dict_set_corrupted_index_cache_only(
+/*================================*/
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Flags a table with specified space_id corrupted in the table dictionary
+cache.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+dict_set_corrupted_by_space(
+/*========================*/
+ ulint space_id); /*!< in: space ID */
+
+/********************************************************************//**
+Validate the table flags.
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+/*=============*/
+ ulint flags) /*!< in: table flags */
+ __attribute__((warn_unused_result));
+
+/********************************************************************//**
+Check if the tablespace for the table has been discarded.
+@return true if the tablespace has been discarded. */
+UNIV_INLINE
+bool
+dict_table_is_discarded(
+/*====================*/
+ const dict_table_t* table) /*!< in: table to check */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/********************************************************************//**
+Check if it is a temporary table.
+@return true if temporary table flag is set. */
+UNIV_INLINE
+bool
+dict_table_is_temporary(
+/*====================*/
+ const dict_table_t* table) /*!< in: table to check */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ __attribute__((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+UNIV_INTERN
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which the page will likely compress.
+@return page size beyond which the page may not compress */
+UNIV_INTERN
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flags to a row format string.
+@return row format name */
+UNIV_INTERN
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag); /*!< in: table flags */
+/*****************************************************************//**
+Get the index whose first field matches a given table column.
+@return index whose first field matches the column at position
+col_index in the table */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_index_on_first_col(
+/*==============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_index); /*!< in: position of column
+ in table */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic
new file mode 100644
index 00000000000..066ffe47e4a
--- /dev/null
+++ b/storage/innobase/include/dict0dict.ic
@@ -0,0 +1,1433 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0load.h"
+#include "rem0types.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "sync0rw.h" /* RW_S_LATCH */
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(DATA_MBMINLEN(col->mbminmaxlen));
+}
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+ulint
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(DATA_MBMAXLEN(col->mbminmaxlen));
+}
+/*********************************************************************//**
+Sets the minimum and maximum number of bytes per character. */
+UNIV_INLINE
+void
+dict_col_set_mbminmaxlen(
+/*=====================*/
+ dict_col_t* col, /*!< in/out: column */
+ ulint mbminlen, /*!< in: minimum multi-byte
+ character size, in bytes */
+ ulint mbmaxlen) /*!< in: maximum multi-byte
+ character size, in bytes */
+{
+ ut_ad(mbminlen < DATA_MBMAX);
+ ut_ad(mbmaxlen < DATA_MBMAX);
+ ut_ad(mbminlen <= mbmaxlen);
+
+ col->mbminmaxlen = DATA_MBMINMAXLEN(mbminlen, mbmaxlen);
+}
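+/* For example (illustrative only): a latin1 column stores exactly one byte
+per character (mbminlen == mbmaxlen == 1), while a 3-byte UTF-8 column has
+mbminlen == 1 and mbmaxlen == 3:
+
+ dict_col_set_mbminmaxlen(col, 1, 3);
+
+after which dict_col_get_mbminlen(col) returns 1 and
+dict_col_get_mbmaxlen(col) returns 3. */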
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col && type);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminmaxlen = col->mbminmaxlen;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col);
+ ut_ad(type);
+
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ //ut_ad(col->len == type->len);
+# ifndef UNIV_HOTBACKUP
+ ut_ad(col->mbminmaxlen == type->mbminmaxlen);
+# endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ ut_ad(col);
+
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(col);
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+ ->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->type & DICT_CLUSTERED);
+}
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->type & DICT_UNIQUE);
+}
+
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->type & DICT_IBUF);
+}
+
+/********************************************************************//**
+Check whether the index is a universal index tree.
+@return nonzero for universal tree, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_univ(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->type & DICT_UNIVERSAL);
+}
+
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for a secondary index or the insert buffer tree,
+zero for the clustered index */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint type;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ type = index->type;
+
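+ /* The insert buffer tree is created with both DICT_CLUSTERED and
+ DICT_IBUF set, so DICT_IBUF must be tested in addition to
+ DICT_CLUSTERED. */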
+ return(!(type & DICT_CLUSTERED) || (type & DICT_IBUF));
+}
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table __attribute__((unused))) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols);
+}
+
+/********************************************************************//**
+Gets the estimated number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->stat_initialized);
+
+ return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+ table->stat_n_rows = n_rows + 1;
+ }
+ }
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch, the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows > 0) {
+ table->stat_n_rows = n_rows - 1;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ col = dict_table_get_nth_col(table, table->n_cols
+ - DATA_N_SYS_COLS + sys);
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
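+
+/* Illustrative sketch (editorial, not from the original source): for a
+table with two user columns and DATA_N_SYS_COLS == 3 (ROW_ID, TRX_ID,
+ROLL_PTR), table->n_cols == 5 and the system columns occupy the tail
+positions, so with DATA_ROW_ID == 0, DATA_TRX_ID == 1 and
+DATA_ROLL_PTR == 2:
+
+ ut_ad(dict_table_get_sys_col_no(table, DATA_ROW_ID) == 2);
+ ut_ad(dict_table_get_sys_col_no(table, DATA_TRX_ID) == 3);
+ ut_ad(dict_table_get_sys_col_no(table, DATA_ROLL_PTR) == 4);
+*/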
+
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+#if DICT_TF_COMPACT != 1
+#error "DICT_TF_COMPACT must be 1"
+#endif
+
+ return(table->flags & DICT_TF_COMPACT);
+}
+
+/********************************************************************//**
+Check if the table has an FTS index.
+@return TRUE if table has an FTS index */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/********************************************************************//**
+Validate the table flags.
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+/*=============*/
+ ulint flags) /*!< in: table flags */
+{
+ ulint compact = DICT_TF_GET_COMPACT(flags);
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+ ulint unused = DICT_TF_GET_UNUSED(flags);
+
+ /* Make sure there are no bits that we do not know about. */
+ if (unused != 0) {
+
+ return(false);
+
+ } else if (atomic_blobs) {
+ /* Barracuda row formats COMPRESSED and DYNAMIC build on
+ the page structure introduced for the COMPACT row format
+ by allowing keys in secondary indexes to be made from
+ data stored off-page in the clustered index. */
+
+ if (!compact) {
+ return(false);
+ }
+
+ } else if (zip_ssize) {
+
+ /* Antelope does not support COMPRESSED row format. */
+ return(false);
+ }
+
+ if (zip_ssize) {
+
+ /* COMPRESSED row format must have compact and atomic_blobs
+ bits set and validate the number is within allowed range. */
+
+ if (!compact
+ || !atomic_blobs
+ || zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+
+ return(false);
+ }
+ }
+
+ /* CREATE TABLE ... DATA DIRECTORY is supported for any row format,
+ so the DATA_DIR flag is compatible with all other table flags. */
+
+ return(true);
+}
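+
+/* Hedged examples of these rules (illustrative sketch): a zip_ssize
+without the atomic_blobs bit is invalid, while plain REDUNDANT and
+COMPACT flag values are valid:
+
+ ut_ad(dict_tf_is_valid(0));
+ ut_ad(dict_tf_is_valid(DICT_TF_COMPACT));
+ ut_ad(!dict_tf_is_valid(DICT_TF_COMPACT | (1 << DICT_TF_POS_ZIP_SSIZE)));
+*/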
+
+/********************************************************************//**
+Validate a SYS_TABLES TYPE field and return it.
+@return Same as input after validating it as a SYS_TABLES TYPE field.
+If there is an error, return ULINT_UNDEFINED. */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_validate(
+/*==========================*/
+ ulint type, /*!< in: SYS_TABLES.TYPE */
+ ulint n_cols) /*!< in: SYS_TABLES.N_COLS */
+{
+ ulint low_order_bit = DICT_TF_GET_COMPACT(type);
+ ulint redundant = !(n_cols & DICT_N_COLS_COMPACT);
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type);
+ ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type);
+ ulint unused = DICT_TF_GET_UNUSED(type);
+
+ /* The low order bit of SYS_TABLES.TYPE is always set to 1.
+ If the format is UNIV_FORMAT_B or higher, this field is the same
+ as dict_table_t::flags. Zero is not allowed here. */
+ if (!low_order_bit) {
+ return(ULINT_UNDEFINED);
+ }
+
+ if (redundant) {
+ if (zip_ssize || atomic_blobs) {
+ return(ULINT_UNDEFINED);
+ }
+ }
+
+ /* Make sure there are no bits that we do not know about. */
+ if (unused) {
+ return(ULINT_UNDEFINED);
+ }
+
+ if (atomic_blobs) {
+ /* Barracuda row formats COMPRESSED and DYNAMIC build on
+ the page structure introduced for the COMPACT row format
+ by allowing keys in secondary indexes to be made from
+ data stored off-page in the clustered index.
+
+ The DICT_N_COLS_COMPACT flag should be in N_COLS,
+ but we already know that. */
+
+ } else if (zip_ssize) {
+ /* Antelope does not support COMPRESSED format. */
+ return(ULINT_UNDEFINED);
+ }
+
+ if (zip_ssize) {
+ /* COMPRESSED row format must have low_order_bit and
+ atomic_blobs bits set and the DICT_N_COLS_COMPACT flag
+ should be in N_COLS, but we already know about the
+ low_order_bit and DICT_N_COLS_COMPACT flags. */
+ if (!atomic_blobs) {
+ return(ULINT_UNDEFINED);
+ }
+
+ /* Validate that the number is within allowed range. */
+ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ return(ULINT_UNDEFINED);
+ }
+ }
+
+ /* There is nothing to validate for the data_dir field.
+ CREATE TABLE ... DATA DIRECTORY is supported for any row
+ format, so the DATA_DIR flag is compatible with any other
+ table flags. However, it is not used with TEMPORARY tables.*/
+
+ /* Return the validated SYS_TABLES.TYPE. */
+ return(type);
+}
+
+/********************************************************************//**
+Determine the record format from dict_table_t::flags.
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row format, the file format is > UNIV_FORMAT_A and DICT_TF_COMPACT
+will also be set.
+@return record format */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ut_a(dict_tf_is_valid(flags));
+
+ if (!DICT_TF_GET_COMPACT(flags)) {
+ return(REC_FORMAT_REDUNDANT);
+ }
+
+ if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(REC_FORMAT_COMPACT);
+ }
+
+ if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ return(REC_FORMAT_COMPRESSED);
+ }
+
+ return(REC_FORMAT_DYNAMIC);
+}
+
+/********************************************************************//**
+Determine the file format from dict_table_t::flags.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_tf_get_format(
+/*===============*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ if (DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(UNIV_FORMAT_B);
+ }
+
+ return(UNIV_FORMAT_A);
+}
+
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return(dict_tf_get_format(table->flags));
+}
+
+/********************************************************************//**
+Set the record format and zip size in dict_table_t::flags. If zip size
+is not needed, it should be 0. */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags, /*!< in/out: table flags */
+ rec_format_t format, /*!< in: record format */
+ ulint zip_ssize, /*!< in: zip shift size */
+ bool use_data_dir) /*!< in: table uses DATA DIRECTORY */
+{
+ switch (format) {
+ case REC_FORMAT_REDUNDANT:
+ *flags = 0;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPACT:
+ *flags = DICT_TF_COMPACT;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPRESSED:
+ *flags = DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+ break;
+ case REC_FORMAT_DYNAMIC:
+ *flags = DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ut_ad(zip_ssize == 0);
+ break;
+ }
+
+ if (use_data_dir) {
+ *flags |= (1 << DICT_TF_POS_DATA_DIR);
+ }
+}
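+
+/* Usage sketch (illustrative; assumes KEY_BLOCK_SIZE=8, for which
+zip_ssize is 4 because 512 << 4 == 8192):
+
+ ulint flags = 0;
+
+ dict_tf_set(&flags, REC_FORMAT_COMPRESSED, 4, false);
+ ut_ad(dict_tf_is_valid(flags));
+ ut_ad(DICT_TF_GET_ZIP_SSIZE(flags) == 4);
+*/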
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is
+written into the tablespace header at the offset FSP_SPACE_FLAGS and is
+also stored in the fil_space_t::flags field. The following chart shows
+the translation of the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@return tablespace flags (fil_space_t::flags) */
+UNIV_INLINE
+ulint
+dict_tf_to_fsp_flags(
+/*=================*/
+ ulint table_flags) /*!< in: dict_table_t::flags */
+{
+ ulint fsp_flags;
+
+ DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
+ return(ULINT_UNDEFINED););
+
+ /* Adjust bit zero. */
+ fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+ /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+ fsp_flags |= table_flags & DICT_TF_MASK_ZIP_SSIZE;
+ fsp_flags |= table_flags & DICT_TF_MASK_ATOMIC_BLOBS;
+
+ /* In addition, tablespace flags also contain the page size. */
+ fsp_flags |= fsp_flags_set_page_size(fsp_flags, UNIV_PAGE_SIZE);
+
+ /* The DATA_DIR flag is in a different position in fsp_flags. */
+ fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags)
+ ? FSP_FLAGS_MASK_DATA_DIR : 0;
+
+ ut_a(fsp_flags_is_valid(fsp_flags));
+
+ return(fsp_flags);
+}
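+
+/* Worked example of the chart above (illustrative sketch; the page size
+bits added by fsp_flags_set_page_size() are ignored here): a COMPACT
+table carries DICT_TF_COMPACT but no ATOMIC_BLOBS, so bit zero of the
+tablespace flags comes out as 0, matching the REDUNDANT/COMPACT column
+of the chart; a DYNAMIC table (COMPACT | ATOMIC_BLOBS) yields
+bit zero == 1. */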
+
+/********************************************************************//**
+Convert a 32 bit integer from SYS_TABLES.TYPE to dict_table_t::flags
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE | 1 | 1 | 1
+dict_table_t::flags | 0 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_sys_tables_type_to_tf(
+/*=======================*/
+ ulint type, /*!< in: SYS_TABLES.TYPE field */
+ ulint n_cols) /*!< in: SYS_TABLES.N_COLS field */
+{
+ ulint flags;
+ ulint redundant = !(n_cols & DICT_N_COLS_COMPACT);
+
+ /* Adjust bit zero. */
+ flags = redundant ? 0 : 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ flags |= type & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR);
+
+ return(flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags | 0 | 1 | 1
+SYS_TABLES.TYPE | 1 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ulint type;
+
+ ut_a(dict_tf_is_valid(flags));
+
+ /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+ type = 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR);
+
+ return(type);
+}
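+
+/* Round-trip sketch (illustrative): converting table flags to
+SYS_TABLES.TYPE and back through dict_sys_tables_type_to_tf() recovers
+the original flags, provided N_COLS carries DICT_N_COLS_COMPACT for
+non-REDUNDANT tables:
+
+ ulint flags = DICT_TF_COMPACT | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ulint type = dict_tf_to_sys_tables_type(flags);
+ ulint n_cols = DICT_TF_GET_COMPACT(flags) ? DICT_N_COLS_COMPACT : 0;
+
+ ut_ad(dict_sys_tables_type_to_tf(type, n_cols) == flags);
+*/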
+
+/********************************************************************//**
+Extract the compressed page size from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_zip_size(
+/*=================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+ ulint zip_size = (zip_ssize
+ ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+ : 0);
+
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ return(zip_size);
+}
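+
+/* Numeric sketch (illustrative; assumes UNIV_ZIP_SIZE_MIN == 1024, so
+the base term is 512): zip_ssize values 1..4 map to compressed page
+sizes 1024, 2048, 4096 and 8192 bytes, i.e. KEY_BLOCK_SIZE 1, 2, 4
+and 8; a zip_ssize of 0 yields 0 (not compressed). */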
+
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return(dict_tf_get_zip_size(table->flags));
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as truncate tables. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+
+ ut_a(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Loop through each index of the table and lock them */
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+}
+
+/*********************************************************************//**
+Release the exclusive locks on all index tree. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+
+ ut_a(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+}
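+
+/* Usage pattern sketch (illustrative): the lock/unlock pair brackets
+internal metadata updates such as TRUNCATE TABLE, with dict_sys->mutex
+held throughout:
+
+ mutex_enter(&dict_sys->mutex);
+ dict_table_x_lock_indexes(table);
+ ... update internal metadata ...
+ dict_table_x_unlock_indexes(table);
+ mutex_exit(&dict_sys->mutex);
+*/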
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account; in the B-tree, use the value
+returned by dict_index_get_n_unique_in_tree instead.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
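+
+/* Example (illustrative sketch): in a non-unique secondary index the
+dictionary appends the primary key fields to make a B-tree position
+unique, so every field counts; for a secondary index on one column in a
+table with a single-column primary key, both dict_index_get_n_fields()
+and dict_index_get_n_unique_in_tree() return 2, while the clustered
+index uses only its n_uniq ordering fields. */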
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(index);
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!dict_index_is_univ(index));
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, type),
+ index));
+ }
+
+ return(dict_index_get_nth_col_pos(
+ index, dict_table_get_sys_col_no(index->table, type)));
+}
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ ut_ad(field);
+
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+{
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint n = dict_index_get_n_fields(index);
+ ulint size = 0;
+
+ while (n--) {
+ size += dict_col_get_min_size(dict_index_get_nth_col(index,
+ n));
+ }
+
+ return(size);
+}
+
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->space);
+}
+
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space) /*!< in: space id */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->space = space;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(&(index->lock));
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(UNIV_PAGE_SIZE / 16);
+}
+
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+{
+ enum online_index_status status;
+
+ status = (enum online_index_status) index->online_status;
+
+ /* Without the index->lock protection, the online
+ status can change from ONLINE_INDEX_CREATION to
+ ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+ row_log_apply() once log application is done. So to make
+ sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE,
+ you should always recheck after acquiring index->lock. */
+
+#ifdef UNIV_DEBUG
+ switch (status) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ return(status);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(status);
+}
+
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+{
+ ut_ad(!(index->type & DICT_FTS));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ index->online_status = status;
+ ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+#ifdef UNIV_DEBUG
+ if (dict_index_is_clust(index)) {
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ ut_ad(0);
+ return(false);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+ != ONLINE_INDEX_COMPLETE));
+}
+
+/**********************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match, else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no) /*!< in: col number to search for */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ dict_index_t* index;
+
+ index = (dict_index_t*) ib_vector_getp(indexes, i);
+
+ if (dict_index_contains_col_or_prefix(index, col_no)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if the table format is UNIV_FORMAT_A (< UNIV_FORMAT_B), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+{
+ ulint prefix_len = 0;
+
+ if (dict_table_get_format(table) >= UNIV_FORMAT_B) {
+ prefix_len = col->max_prefix
+ ? col->max_prefix
+ : DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+ }
+
+ return(prefix_len);
+}
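+
+/* Behavior sketch (illustrative): for an Antelope table (REDUNDANT or
+COMPACT) this returns 0, since no externally stored column prefix needs
+to go into the undo log; for a Barracuda table with no per-column
+max_prefix it falls back to DICT_MAX_FIELD_LEN_BY_FORMAT(table). */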
+
+/********************************************************************//**
+Check whether the table is corrupted.
+@return nonzero for corrupted table, zero for valid tables */
+UNIV_INLINE
+ulint
+dict_table_is_corrupted(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->corrupted);
+}
+
+/********************************************************************//**
+Check whether the index is corrupted.
+@return nonzero for corrupted index, zero for valid indexes */
+UNIV_INLINE
+ulint
+dict_index_is_corrupted(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((index->type & DICT_CORRUPT)
+ || (index->table && index->table->corrupted));
+}
+
+/********************************************************************//**
+Check if the tablespace for the table has been discarded.
+@return true if the tablespace has been discarded. */
+UNIV_INLINE
+bool
+dict_table_is_discarded(
+/*====================*/
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_DISCARDED));
+}
+
+/********************************************************************//**
+Check if it is a temporary table.
+@return true if temporary table flag is set. */
+UNIV_INLINE
+bool
+dict_table_is_temporary(
+/*====================*/
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY));
+}
+
+/**********************************************************************//**
+Get the index whose first field is the given table column.
+@return the index whose first field matches the column at position
+col_index in the table */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_index_on_first_col(
+/*==============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_index) /*!< in: position of column
+ in table */
+{
+ ut_ad(col_index < table->n_cols);
+
+ dict_col_t* column = dict_table_get_nth_col(table, col_index);
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL; index = dict_table_get_next_index(index)) {
+
+ if (index->fields[0].col == column) {
+ return(index);
+ }
+ }
+ ut_error;
+ return(0);
+}
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000000..030190b1a8e
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,428 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */
+enum dict_system_id_t {
+ SYS_TABLES = 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_TABLESPACES,
+ SYS_DATAFILES,
+
+ /* This must be last item. Defines the number of system tables. */
+ SYS_NUM_SYSTEM_TABLES
+};
+
+/** Status bit for dict_process_sys_tables_rec_and_mtr_commit() */
+enum dict_table_info_t {
+ DICT_TABLE_LOAD_FROM_RECORD = 0,/*!< Directly populate a dict_table_t
+ structure with information from
+ a SYS_TABLES record */
+ DICT_TABLE_LOAD_FROM_CACHE = 1 /*!< Check first whether dict_table_t
+ is in the cache, if so, return it */
+};
+
+/** Check type for dict_check_tablespaces_and_store_max_id() */
+enum dict_check_t {
+ /** No user tablespaces have been opened
+ (no crash recovery, no transactions recovered). */
+ DICT_CHECK_NONE_LOADED = 0,
+ /** Some user tablespaces may have been opened
+ (no crash recovery; recovered table locks for transactions). */
+ DICT_CHECK_SOME_LOADED,
+ /** All user tablespaces have been opened (crash recovery). */
+ DICT_CHECK_ALL_LOADED
+};
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+ dict_check_t dict_check); /*!< in: how to check */
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if none exists; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name); /*!< in: database name which ends in '/' */
+
+/********************************************************************//**
+Loads a table definition from a SYS_TABLES record to dict_table_t.
+Does not load any columns or indexes.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_table_low(
+/*================*/
+ const char* name, /*!< in: table name */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table); /*!< out,own: table, or NULL */
+/********************************************************************//**
+Loads a table column definition from a SYS_COLUMNS record to
+dict_table_t.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_column_low(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table, could be NULL
+ if we just populate a dict_column_t
+ struct with information from
+ a SYS_COLUMNS record */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ dict_col_t* column, /*!< out: dict_column_t to fill,
+ or NULL if table != NULL */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name, /*!< out: column name */
+ const rec_t* rec); /*!< in: SYS_COLUMNS record */
+/********************************************************************//**
+Loads an index definition from a SYS_INDEXES record to dict_index_t.
+If allocate=TRUE, we will create a dict_index_t structure and fill it
+accordingly. If allocate=FALSE, the dict_index_t will be supplied by
+the caller and filled with information read from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_index_low(
+/*================*/
+ byte* table_id, /*!< in/out: table id (8 bytes),
+ an "in" value if allocate=TRUE
+ and "out" when allocate=FALSE */
+ const char* table_name, /*!< in: table name */
+ mem_heap_t* heap, /*!< in/out: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_INDEXES record */
+ ibool allocate, /*!< in: TRUE=allocate *index,
+ FALSE=fill in a pre-allocated
+ *index */
+ dict_index_t** index); /*!< out,own: index, or NULL */
+/********************************************************************//**
+Loads an index field definition from a SYS_FIELDS record to
+dict_index_t.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_load_field_low(
+/*================*/
+ byte* index_id, /*!< in/out: index id (8 bytes)
+ an "in" value if index != NULL
+ and "out" if index == NULL */
+ dict_index_t* index, /*!< in/out: index, could be NULL
+ if we just populate a dict_field_t
+ struct with information from
+ a SYS_FIELDS record */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ byte* last_index_id, /*!< in: last index id */
+ mem_heap_t* heap, /*!< in/out: memory heap
+ for temporary storage */
+ const rec_t* rec); /*!< in: SYS_FIELDS record */
+/********************************************************************//**
+Using the table->heap, copy the null-terminated filepath into
+table->data_dir_path and put a null byte before the extension.
+This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path.
+Save this data directory path only if it has not yet been saved. */
+UNIV_INTERN
+void
+dict_save_data_dir_path(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ char* filepath); /*!< in: filepath of tablespace */
+/*****************************************************************//**
+Make sure the data_file_name is saved in dict_table_t if needed. Try to
+read it from the file dictionary first, then from SYS_DATAFILES. */
+UNIV_INTERN
+void
+dict_get_and_save_data_dir_path(
+/*============================*/
+ dict_table_t* table, /*!< in/out: table */
+ bool dict_mutex_own); /*!< in: true if dict_sys->mutex
+ is owned already */
+/********************************************************************//**
+Loads a table definition and also all its index definitions, and also
+the cluster definition if the table is a member in a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+ const char* name, /*!< in: table name in the
+ databasename/tablename format */
+ ibool cached, /*!< in: TRUE=add to cache, FALSE=do not */
+ dict_err_ignore_t ignore_err);
+ /*!< in: error to be ignored when loading
+ table and its indexes' definition */
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ const char** col_names, /*!< in: column names, or NULL
+ to use table->col_names */
+ bool check_recursive,/*!< in: Whether to check
+ recursive load of tables
+ chained by FK */
+ bool check_charsets, /*!< in: whether to check
+ charset compatibility */
+ dict_err_ignore_t ignore_err) /*!< in: error to be ignored */
+ __attribute__((nonnull(1), warn_unused_result));
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void);
+/*============*/
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+UNIV_INTERN
+const rec_t*
+dict_startscan_system(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor to
+ the record */
+ mtr_t* mtr, /*!< in: the mini-transaction */
+ dict_system_id_t system_id); /*!< in: which system table to open */
+/********************************************************************//**
+This function gets the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+UNIV_INTERN
+const rec_t*
+dict_getnext_system(
+/*================*/
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor
+ to the record */
+ mtr_t* mtr); /*!< in: the mini-transaction */
+/********************************************************************//**
+This function processes one SYS_TABLES record and populates the dict_table_t
+struct for the table. Extracted out of dict_print() to be used by
+both monitor table output and information schema innodb_sys_tables output.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tables_rec_and_mtr_commit(
+/*=======================================*/
+ mem_heap_t* heap, /*!< in: temporary memory heap */
+ const rec_t* rec, /*!< in: SYS_TABLES record */
+ dict_table_t** table, /*!< out: dict_table_t to fill */
+ dict_table_info_t status, /*!< in: status bit controls
+ options such as whether we shall
+ look for dict_table_t from cache
+ first */
+ mtr_t* mtr); /*!< in/out: mini-transaction,
+ will be committed */
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_INDEXES rec */
+ dict_index_t* index, /*!< out: dict_index_t to be
+ filled */
+ table_id_t* table_id); /*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_COLUMNS rec */
+ dict_col_t* column, /*!< out: dict_col_t to be filled */
+ table_id_t* table_id, /*!< out: table id */
+ const char** col_name); /*!< out: column name */
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FIELDS rec */
+ dict_field_t* sys_field, /*!< out: dict_field_t to be
+ filled */
+ ulint* pos, /*!< out: Field position */
+ index_id_t* index_id, /*!< out: current index id */
+ index_id_t last_id); /*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN rec */
+ dict_foreign_t* foreign); /*!< out: dict_foreign_t to be
+ filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */
+ const char** name, /*!< out: foreign key constraint name */
+ const char** for_col_name, /*!< out: referencing column name */
+ const char** ref_col_name, /*!< out: referenced column name
+ in referenced table */
+ ulint* pos); /*!< out: column position */
+/********************************************************************//**
+This function parses a SYS_TABLESPACES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_tablespaces(
+/*=========================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
+ ulint* space, /*!< out: space id */
+ const char** name, /*!< out: tablespace name */
+ ulint* flags); /*!< out: tablespace flags */
+/********************************************************************//**
+This function parses a SYS_DATAFILES record, extracts necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_datafiles(
+/*=======================*/
+ mem_heap_t* heap, /*!< in/out: heap memory */
+ const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
+ ulint* space, /*!< out: space id */
+ const char** path); /*!< out: datafile path */
+/********************************************************************//**
+Get the filepath for a spaceid from SYS_DATAFILES. This function provides
+a temporary heap which is used for the table lookup, but not for the path.
+The caller must free the memory for the path returned. This function can
+return NULL if the space ID is not found in SYS_DATAFILES, then the caller
+will assume that the ibd file is in the normal datadir.
+@return own: A copy of the first datafile found in SYS_DATAFILES.PATH for
+the given space ID. NULL if space ID is zero or not found. */
+UNIV_INTERN
+char*
+dict_get_first_path(
+/*================*/
+ ulint space, /*!< in: space id */
+ const char* name); /*!< in: tablespace name */
+/********************************************************************//**
+Update the record for space_id in SYS_TABLESPACES to this filepath.
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_update_filepath(
+/*=================*/
+ ulint space_id, /*!< in: space id */
+ const char* filepath); /*!< in: filepath */
+/********************************************************************//**
+Insert records into SYS_TABLESPACES and SYS_DATAFILES.
+@return DB_SUCCESS if OK, dberr_t if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_insert_tablespace_and_filepath(
+/*================================*/
+ ulint space, /*!< in: space id */
+ const char* name, /*!< in: tablespace name */
+ const char* filepath, /*!< in: filepath */
+ ulint fsp_flags); /*!< in: tablespace flags */
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0load.ic b/storage/innobase/include/dict0load.ic
new file mode 100644
index 00000000000..2c0f1ff38a5
--- /dev/null
+++ b/storage/innobase/include/dict0load.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.ic
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000000..460a7e125ad
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,1214 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0types.h"
+# include "que0types.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "os0once.h"
+#include <set>
+#include <algorithm>
+#include <iterator>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED 1 /*!< clustered index */
+#define DICT_UNIQUE 2 /*!< unique index */
+#define DICT_UNIVERSAL 4 /*!< index which can contain records from any
+ other index */
+#define DICT_IBUF 8 /*!< insert buffer tree */
+#define DICT_CORRUPT 16 /*!< bit to store the corrupted flag
+ in SYS_INDEXES.TYPE */
+#define DICT_FTS 32 /* FTS index; can't be combined with the
+ other flags */
+
+#define DICT_IT_BITS 6 /*!< number of bits used for
+ SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE | 1 | 1 | 1
+dict_table_t::flags | 0 | 1 | 1
+FSP_SPACE_FLAGS | 0 | 0 | 1
+fil_space_t::flags | 0 | 0 | 1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT 0 /*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT 1 /*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e. ROW_FORMAT != REDUNDANT */
+#define DICT_N_COLS_COMPACT 0x80000000UL
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT 1
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE 4
+/** Width of the ATOMIC_BLOBS flag. The Antelope file formats broke up
+BLOB and TEXT fields, storing the first 768 bytes in the clustered index.
+Barracuda row formats store the whole BLOB or TEXT field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS 1
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ + DICT_TF_WIDTH_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in table flags */
+#define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS))
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE (DICT_TF_POS_COMPACT \
+ + DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS (DICT_TF_POS_ZIP_SSIZE \
+ + DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ + DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+ ((~(~0 << DICT_TF_WIDTH_COMPACT)) \
+ << DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+ ((~(~0 << DICT_TF_WIDTH_ZIP_SSIZE)) \
+ << DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+ ((~(~0 << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+ << DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \
+ << DICT_TF_POS_DATA_DIR)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+ ((flags & DICT_TF_MASK_COMPACT) \
+ >> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+ ((flags & DICT_TF_MASK_ZIP_SSIZE) \
+ >> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+ >> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+ ((flags & DICT_TF_MASK_DATA_DIR) \
+ >> DICT_TF_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define DICT_TF_GET_UNUSED(flags) \
+ (flags >> DICT_TF_POS_UNUSED)
+/* @} */
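+
+/* Worked example (illustrative sketch): with the widths and positions
+above, a COMPRESSED table with zip_ssize == 4 has
+flags == DICT_TF_COMPACT | (4 << DICT_TF_POS_ZIP_SSIZE)
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS) == 1 + 8 + 32 == 0x29,
+and the accessors invert this: DICT_TF_GET_ZIP_SSIZE(0x29) == 4 and
+DICT_TF_HAS_ATOMIC_BLOBS(0x29) == 1. */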
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to preserve backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS 7
+#define DICT_TF2_BIT_MASK ~(~0 << DICT_TF2_BITS)
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY 1
+/** The table has an internally defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID 2
+/** The table has an FTS index */
+#define DICT_TF2_FTS 4
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID 8
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_TABLESPACE 16
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED 32
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME 64
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+ (table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+ (table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+ (table->flags2 &= ~(flag))
+
+/** Tables could be chained together with foreign key constraints. When
+first loading the parent table, we would load all of its descendants.
+This could eventually result in recursive calls and a stack overflow.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when exceeded, the child table will not be loaded. It will be loaded when
+the foreign constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD 20
+
+/** Similarly, when tables are chained together with foreign key constraints
+with ON DELETE/UPDATE CASCADE clauses, a delete from the parent table could
+result in recursive cascading calls. This defines the maximum number of
+such cascading deletes/updates allowed. When exceeded, the delete from the
+parent table will fail, and the user has to drop the excessive foreign key
+constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL 255
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index
+ of the table is placed */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags, /*!< in: table flags */
+ ulint flags2); /*!< in: table flags2 */
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+ __attribute__((nonnull(1)));
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+UNIV_INTERN
+void
+dict_mem_table_col_rename(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ unsigned nth_col,/*!< in: column index */
+ const char* from, /*!< in: old column name */
+ const char* to) /*!< in: new column name */
+ __attribute__((nonnull));
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+UNIV_INTERN
+void
+dict_mem_fill_column_struct(
+/*========================*/
+ dict_col_t* column, /*!< out: column struct to be
+ filled */
+ ulint col_pos, /*!< in: column position */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint col_len); /*!< in: column length */
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len); /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /*!< in: index */
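+
+/* A minimal lifecycle sketch for the functions above (illustrative
+only; the space id, flags and column definitions are arbitrary
+examples, DATA_INT/DATA_VARCHAR/DATA_NOT_NULL come from data0type.h,
+and error handling is omitted):
+
+    dict_table_t*  table = dict_mem_table_create(
+        "test/t1", 0, 2, 0, 0);
+
+    dict_mem_table_add_col(table, table->heap, "id",
+                           DATA_INT, DATA_NOT_NULL, 4);
+    dict_mem_table_add_col(table, table->heap, "name",
+                           DATA_VARCHAR, 0, 100);
+
+    dict_index_t*  index = dict_mem_index_create(
+        "test/t1", "PRIMARY", 0,
+        DICT_CLUSTERED | DICT_UNIQUE, 1);
+
+    dict_mem_index_add_field(index, "id", 0);
+
+    // ... hand the objects to the dictionary, or discard them:
+    dict_mem_index_free(index);
+    dict_mem_table_free(table);
+*/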
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+UNIV_INTERN
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+UNIV_INTERN
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+ dict_foreign_t* foreign, /*!< in/out: foreign struct */
+ ibool do_alloc); /*!< in: is an alloc needed */
+
+/** Create a temporary tablename of the form "#sql-ib<tid>-<inc>", where
+	tid = the table ID
+	inc = a randomly initialized number that is incremented for each file
+The table ID is a 64-bit integer that can use up to 20 digits, and is
+initialized at bootstrap. The second number is 32 bits, can use up to 10
+digits, and is initialized at startup to a randomly distributed number.
+It is hoped that the combination of these two numbers will provide a
+reasonably unique temporary file name.
+@param[in] heap A memory heap
+@param[in] dbtab Table name in the form database/table name
+@param[in] id Table id
+@return A unique temporary tablename suitable for InnoDB use */
+UNIV_INTERN
+char*
+dict_mem_create_temporary_tablename(
+ mem_heap_t* heap,
+ const char* dbtab,
+ table_id_t id);
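+
+/* A sketch of the name formatting (illustrative only; the handling of
+the "database/" prefix from dbtab is omitted, and ut_snprintf(),
+UINT64PF and ULINTPF are assumed to be available from univ.i/ut0ut.h):
+
+    char   name[64];
+    ulint  inc = 42;   // the per-process incrementing counter
+
+    ut_snprintf(name, sizeof(name), "#sql-ib" UINT64PF "-" ULINTPF,
+                (ib_uint64_t) id, inc);
+*/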
+
+/** Initialize dict memory variables */
+
+void
+dict_mem_init(void);
+
+/** Data structure for a column in a table */
+struct dict_col_t{
+ /*----------------------*/
+ /** The following are copied from dtype_t,
+ so that all bit-fields can be packed tightly. */
+ /* @{ */
+ unsigned prtype:32; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+ unsigned mtype:8; /*!< main data type */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+
+ unsigned mbminmaxlen:5; /*!< minimum and maximum length of a
+ character, in bytes;
+ DATA_MBMINMAXLEN(mbminlen,mbmaxlen);
+ mbminlen=DATA_MBMINLEN(mbminmaxlen);
+ mbmaxlen=DATA_MBMAXLEN(mbminmaxlen) */
+ /*----------------------*/
+ /* End of definitions copied from dtype_t */
+ /* @} */
+
+ unsigned ind:10; /*!< table column position
+ (starting from 0) */
+ unsigned ord_part:1; /*!< nonzero if this column
+ appears in the ordering fields
+ of an index */
+ unsigned max_prefix:12; /*!< maximum index prefix length on
+ this column. Our current max limit is
+ 3072 for Barracuda table */
+};
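+
+/* For example, a UTF-8 column has mbminlen = 1 and mbmaxlen = 3, so its
+mbminmaxlen field would be set as in the sketch below (illustrative
+only; it assumes the DATA_MBMINMAXLEN encoding from data0type.h, which
+packs the pair as mbminlen * DATA_MBMAX + mbmaxlen):
+
+    ulint  mm = DATA_MBMINMAXLEN(1, 3);
+
+    ut_ad(DATA_MBMINLEN(mm) == 1);
+    ut_ad(DATA_MBMAXLEN(mm) == 3);
+*/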
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767). For
+Barracuda row formats COMPRESSED and DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+ ((dict_table_get_format(table) < UNIV_FORMAT_B) \
+ ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \
+ : REC_VERSION_56_MAX_INDEX_COL_LEN)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+ ((DICT_TF_HAS_ATOMIC_BLOBS(flags) < UNIV_FORMAT_B) \
+ ? (REC_ANTELOPE_MAX_INDEX_COL_LEN - 1) \
+ : REC_VERSION_56_MAX_INDEX_COL_LEN)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+ dict_col_t* col; /*!< pointer to the table column */
+ const char* name; /*!< name of the column */
+ unsigned prefix_len:12; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_FIELD_LEN_BY_FORMAT;
+ NOTE that in the UTF-8 charset, MySQL
+ sets this to (mbmaxlen * the prefix len)
+ in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If the data is not very compressible, then some
+extra space ('padding') is left in the uncompressed page, making it more
+likely that compression of the less than fully packed uncompressed page
+will succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure rate is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN (128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT (5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR (128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */
+struct zip_pad_info_t {
+ os_fast_mutex_t mutex; /*!< mutex protecting the info */
+ ulint pad; /*!< number of bytes used as pad */
+ ulint success;/*!< successful compression ops during
+ current round */
+ ulint failure;/*!< failed compression ops during
+ current round */
+ ulint n_rounds;/*!< number of currently successful
+ rounds */
+};
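+
+/* A sketch of the per-round decision described above (illustrative
+only; zip_pad_update() is a hypothetical name, locking and the
+zip_pad_max cap are omitted, and the function is assumed to run once
+per round, i.e. after ZIP_PAD_ROUND_LEN compression attempts, so
+success + failure > 0):
+
+    static void
+    zip_pad_update(zip_pad_info_t* info)
+    {
+        ulint  fail_pct = info->failure * 100
+            / (info->success + info->failure);
+
+        if (fail_pct > zip_failure_threshold_pct) {
+            // Too many failures in this round: pad more and
+            // restart the count of successful rounds.
+            info->pad += ZIP_PAD_INCR;
+            info->n_rounds = 0;
+        } else if (++info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT
+                   && info->pad >= ZIP_PAD_INCR) {
+            // Enough consecutive good rounds: pad less.
+            info->pad -= ZIP_PAD_INCR;
+            info->n_rounds = 0;
+        }
+
+        info->success = 0;
+        info->failure = 0;
+    }
+*/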
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t{
+ index_id_t id; /*!< id of the index */
+ mem_heap_t* heap; /*!< memory heap */
+ const char* name; /*!< index name */
+ const char* table_name;/*!< table name */
+ dict_table_t* table; /*!< back pointer to table */
+#ifndef UNIV_HOTBACKUP
+ unsigned space:32;
+ /*!< space where the index tree is placed */
+ unsigned page:32;/*!< index tree root page number */
+#endif /* !UNIV_HOTBACKUP */
+ unsigned type:DICT_IT_BITS;
+ /*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+ DICT_UNIVERSAL, DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+ unsigned trx_id_offset:MAX_KEY_LENGTH_BITS;
+ /*!< position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < MAX_KEY_LENGTH
+#endif
+ unsigned n_user_defined_cols:10;
+ /*!< number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ unsigned n_uniq:10;/*!< number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ unsigned n_def:10;/*!< number of fields defined so far */
+ unsigned n_fields:10;/*!< number of fields in the index */
+ unsigned n_nullable:10;/*!< number of nullable fields */
+ unsigned cached:1;/*!< TRUE if the index object is in the
+ dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if the index is to be dropped;
+ protected by dict_operation_lock */
+ unsigned online_status:2;
+ /*!< enum online_index_status.
+ Transitions from ONLINE_INDEX_COMPLETE (to
+ ONLINE_INDEX_CREATION) are protected
+ by dict_operation_lock and
+ dict_sys->mutex. Other changes are
+ protected by index->lock. */
+ dict_field_t* fields; /*!< array of field descriptions */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/*!< list of indexes of the table */
+ btr_search_t* search_info;
+ /*!< info used in optimistic searches */
+ row_log_t* online_log;
+ /*!< the log of modifications
+ during online index creation;
+ valid when online_status is
+ ONLINE_INDEX_CREATION */
+ /*----------------------*/
+ /** Statistics for query optimization */
+ /* @{ */
+ ib_uint64_t* stat_n_diff_key_vals;
+ /*!< approximate number of different
+ key values for this index, for each
+ n-column prefix where 1 <= n <=
+ dict_get_n_unique(index) (the array is
+ indexed from 0 to n_uniq-1); we
+ periodically calculate new
+ estimates */
+ ib_uint64_t* stat_n_sample_sizes;
+ /*!< number of pages that were sampled
+ to calculate each of stat_n_diff_key_vals[],
+ e.g. stat_n_sample_sizes[3] pages were sampled
+ to get the number stat_n_diff_key_vals[3]. */
+ ib_uint64_t* stat_n_non_null_key_vals;
+ /*!< approximate number of non-null key values
+ for this index, for each n-column prefix where
+ 1 <= n <= dict_get_n_unique(index) (the array
+ is indexed from 0 to n_uniq-1); This
+ is used when innodb_stats_method is
+ "nulls_ignored". */
+ ulint stat_index_size;
+ /*!< approximate index size in
+ database pages */
+ ulint stat_n_leaf_pages;
+ /*!< approximate number of leaf pages in the
+ index tree */
+ /* @} */
+ rw_lock_t lock; /*!< read-write lock protecting the
+ upper levels of the index tree */
+ trx_id_t trx_id; /*!< id of the transaction that created this
+ index, or 0 if the index existed
+ when InnoDB was started up */
+ zip_pad_info_t zip_pad;/*!< Information about state of
+ compression failures and successes */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_BLOB_DEBUG
+ ib_mutex_t blobs_mutex;
+ /*!< mutex protecting blobs */
+ ib_rbt_t* blobs; /*!< map of (page_no,heap_no,field_no)
+ to first_blob_page_no; protected by
+ blobs_mutex; @see btr_blob_dbg_t */
+#endif /* UNIV_BLOB_DEBUG */
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N 76789786
+#endif
+};
+
+/** The status of online index creation */
+enum online_index_status {
+ /** the index is complete and ready for access */
+ ONLINE_INDEX_COMPLETE = 0,
+ /** the index is being created, online
+ (allowing concurrent modifications) */
+ ONLINE_INDEX_CREATION,
+ /** secondary index creation was aborted and the index
+ should be dropped as soon as index->table->n_ref_count reaches 0,
+ or online table rebuild was aborted and the clustered index
+ of the original table should soon be restored to
+ ONLINE_INDEX_COMPLETE */
+ ONLINE_INDEX_ABORTED,
+ /** the online index creation was aborted, the index was
+ dropped from the data dictionary and the tablespace, and it
+ should be dropped from the data dictionary cache as soon as
+ index->table->n_ref_count reaches 0. */
+ ONLINE_INDEX_ABORTED_DROPPED
+};
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+ mem_heap_t* heap; /*!< this object is allocated from
+ this memory heap */
+ char* id; /*!< id of the constraint as a
+ null-terminated string */
+ unsigned n_fields:10; /*!< number of indexes' first fields
+ for which the foreign key
+ constraint is defined: we allow the
+ indexes to contain more fields than
+ mentioned in the constraint, as long
+ as the first fields are as mentioned */
+ unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+ or DICT_FOREIGN_ON_DELETE_SET_NULL */
+ char* foreign_table_name;/*!< foreign table name */
+ char* foreign_table_name_lookup;
+ /*!< foreign table name used for dict lookup */
+ dict_table_t* foreign_table; /*!< table where the foreign key is */
+ const char** foreign_col_names;/*!< names of the columns in the
+ foreign key */
+ char* referenced_table_name;/*!< referenced table name */
+ char* referenced_table_name_lookup;
+ /*!< referenced table name for dict lookup*/
+ dict_table_t* referenced_table;/*!< table where the referenced key
+ is */
+ const char** referenced_col_names;/*!< names of the referenced
+ columns in the referenced table */
+ dict_index_t* foreign_index; /*!< foreign index; we require that
+ both tables contain explicitly defined
+ indexes for the constraint: InnoDB
+ does not generate new indexes
+ implicitly */
+ dict_index_t* referenced_index;/*!< referenced index */
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+struct dict_foreign_print {
+
+ dict_foreign_print(std::ostream& out)
+ : m_out(out)
+ {}
+
+ void operator()(const dict_foreign_t* foreign) {
+ m_out << *foreign;
+ }
+private:
+ std::ostream& m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
+true if the first argument is considered to go before the second in the
+strict weak ordering it defines, and false otherwise. */
+struct dict_foreign_compare {
+
+ bool operator()(
+ const dict_foreign_t* lhs,
+ const dict_foreign_t* rhs) const
+ {
+ return(ut_strcmp(lhs->id, rhs->id) < 0);
+ }
+};
+
+/** A function object to find a foreign key with the given index as the
+referenced index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_index {
+
+ dict_foreign_with_index(const dict_index_t* index)
+ : m_index(index)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->referenced_index == m_index);
+ }
+
+ const dict_index_t* m_index;
+};
+
+/** A function object to check if the foreign key constraint is between
+different tables. Returns true if the foreign key constraint is between
+different tables, false otherwise. */
+struct dict_foreign_different_tables {
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ return(foreign->foreign_table != foreign->referenced_table);
+ }
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given. If the full name of the foreign key constraint doesn't match,
+then, check if removing the database name from the foreign key constraint
+matches. Return true if it matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+ dict_foreign_matches_id(const char* id)
+ : m_id(id)
+ {}
+
+ bool operator()(const dict_foreign_t* foreign) const
+ {
+ if (0 == innobase_strcasecmp(foreign->id, m_id)) {
+ return(true);
+ }
+ if (const char* pos = strchr(foreign->id, '/')) {
+ if (0 == innobase_strcasecmp(m_id, pos + 1)) {
+ return(true);
+ }
+ }
+ return(false);
+ }
+
+ const char* m_id;
+};
+
+typedef std::set<dict_foreign_t*, dict_foreign_compare> dict_foreign_set;
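+
+/* A minimal usage sketch of the function objects above together with
+this set type (illustrative only; <algorithm> is assumed to be
+included, and "table" to be a valid dict_table_t whose foreign_set has
+been populated):
+
+    dict_foreign_set::iterator it = std::find_if(
+        table->foreign_set.begin(),
+        table->foreign_set.end(),
+        dict_foreign_matches_id("my_fk"));
+
+    if (it != table->foreign_set.end()) {
+        // a constraint named "my_fk" exists in the table
+    }
+*/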
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not. It returns true if the
+foreign key is not found, false otherwise */
+struct dict_foreign_not_exists {
+ dict_foreign_not_exists(const dict_foreign_set& obj_)
+ : m_foreigns(obj_)
+ {}
+
+ /** Return true if the given foreign key is not found */
+ bool operator()(dict_foreign_t* const & foreign) const {
+ return(m_foreigns.find(foreign) == m_foreigns.end());
+ }
+private:
+ const dict_foreign_set& m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in] fk_set the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_foreign_set& fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in] table table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise. */
+bool
+dict_foreign_set_validate(
+ const dict_table_t& table);
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign) /*!< in, own: foreign key struct */
+{
+ mem_heap_free(foreign->heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. */
+struct dict_foreign_set_free {
+
+ dict_foreign_set_free(const dict_foreign_set& foreign_set)
+ : m_foreign_set(foreign_set)
+ {}
+
+ ~dict_foreign_set_free()
+ {
+ std::for_each(m_foreign_set.begin(),
+ m_foreign_set.end(),
+ dict_foreign_free);
+ }
+
+ const dict_foreign_set& m_foreign_set;
+};
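+
+/* A minimal usage sketch of the scope guard above (illustrative only):
+
+    {
+        dict_foreign_set       local_fk_set;
+        dict_foreign_set_free  local_fk_set_free(local_fk_set);
+
+        // Fill local_fk_set with dict_mem_foreign_create()'d
+        // objects; if anything below fails or returns early, the
+        // destructor frees every constraint still in the set.
+    }
+*/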
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE 1 /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 /*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 /*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 /*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 /*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** This flag is for synchronizing SQL DDL and memcached DML.
+If table->memcached_sync_count == DICT_TABLE_IN_DDL, there is DDL running
+on the table, and DML from memcached will be blocked. */
+#define DICT_TABLE_IN_DDL -1
+
+/** Data structure for a database table. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+struct dict_table_t{
+
+
+ table_id_t id; /*!< id of the table */
+ mem_heap_t* heap; /*!< memory heap */
+ char* name; /*!< table name */
+ const char* dir_path_of_temp_table;/*!< NULL or the directory path
+ where a TEMPORARY table that was explicitly
+ created by a user should be placed if
+ innodb_file_per_table is defined in my.cnf;
+ in Unix this is usually /tmp/..., in Windows
+ temp\... */
+ char* data_dir_path; /*!< NULL or the directory path
+ specified by DATA DIRECTORY */
+ unsigned space:32;
+ /*!< space where the clustered index of the
+ table is placed */
+ unsigned flags:DICT_TF_BITS; /*!< DICT_TF_... */
+ unsigned flags2:DICT_TF2_BITS; /*!< DICT_TF2_... */
+ unsigned ibd_file_missing:1;
+ /*!< TRUE if this is in a single-table
+ tablespace and the .ibd file is missing; then
+ we must return an error in ha_innodb.cc if the
+ user tries to query such an orphaned table */
+ unsigned cached:1;/*!< TRUE if the table object has been added
+ to the dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if the table is to be dropped, but
+ not yet actually dropped (it could be in the
+ background drop list); it is turned on at the beginning
+ of row_drop_table_for_mysql() and turned off
+ just before we start to update system tables
+ for the drop. It is protected by
+ dict_operation_lock */
+ unsigned n_def:10;/*!< number of columns defined so far */
+ unsigned n_cols:10;/*!< number of columns */
+ unsigned can_be_evicted:1;
+ /*!< TRUE if it's not an InnoDB system table
+ or a table that has no FK relationships */
+ unsigned corrupted:1;
+ /*!< TRUE if table is corrupted */
+ unsigned drop_aborted:1;
+ /*!< TRUE if some indexes should be dropped
+ after ONLINE_INDEX_ABORTED
+ or ONLINE_INDEX_ABORTED_DROPPED */
+ dict_col_t* cols; /*!< array of column descriptions */
+ const char* col_names;
+ /*!< Column names packed in a character string
+ "name1\0name2\0...nameN\0". Until
+ the string contains n_cols, it will be
+ allocated from a temporary heap. The final
+ string will be allocated from table->heap. */
+#ifndef UNIV_HOTBACKUP
+ hash_node_t name_hash; /*!< hash chain node */
+ hash_node_t id_hash; /*!< hash chain node */
+ UT_LIST_BASE_NODE_T(dict_index_t)
+ indexes; /*!< list of indexes of the table */
+
+ dict_foreign_set foreign_set;
+ /*!< set of foreign key constraints
+ in the table; these refer to columns
+ in other tables */
+
+ dict_foreign_set referenced_set;
+ /*!< list of foreign key constraints
+ which refer to this table */
+
+ UT_LIST_NODE_T(dict_table_t)
+ table_LRU; /*!< node of the LRU list of tables */
+ unsigned fk_max_recusive_level:8;
+ /*!< maximum recursive level we support when
+ loading tables chained together with FK
+ constraints; if this level is exceeded, we
+ will stop loading the child table into memory
+ along with its parent table */
+ ulint n_foreign_key_checks_running;
+ /*!< count of how many foreign key check
+ operations are currently being performed
+ on the table: we cannot drop the table while
+ there are foreign key checks running on
+ it! */
+ trx_id_t def_trx_id;
+ /*!< transaction id that last touched
+ the table definition, either when
+ loading the definition or CREATE
+ TABLE, or ALTER TABLE (prepare,
+ commit, and rollback phases) */
+ trx_id_t query_cache_inv_trx_id;
+ /*!< transactions whose trx id is
+ smaller than this number are not
+ allowed to store to the MySQL query
+ cache or retrieve from it; when a trx
+ with undo logs commits, it sets this
+ to the value of the trx id counter for
+ the tables it had an IX lock on */
+#ifdef UNIV_DEBUG
+ /*----------------------*/
+ ibool does_not_fit_in_memory;
+ /*!< this field is used to specify in
+ simulations tables which are so big
+ that disk should be accessed: disk
+ access is simulated by putting the
+ thread to sleep for a while; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about value TRUE if it has
+ to reload the table definition from
+ disk */
+#endif /* UNIV_DEBUG */
+ /*----------------------*/
+ unsigned big_rows:1;
+ /*!< flag: TRUE if the maximum length of
+ a single row exceeds BIG_ROW_SIZE;
+ initialized in dict_table_add_to_cache() */
+ /** Statistics for query optimization */
+ /* @{ */
+
+ volatile os_once::state_t stats_latch_created;
+ /*!< Creation state of 'stats_latch'. */
+
+ rw_lock_t* stats_latch; /*!< this latch protects:
+ dict_table_t::stat_initialized
+ dict_table_t::stat_n_rows (*)
+ dict_table_t::stat_clustered_index_size
+ dict_table_t::stat_sum_of_other_index_sizes
+ dict_table_t::stat_modified_counter (*)
+ dict_table_t::indexes*::stat_n_diff_key_vals[]
+ dict_table_t::indexes*::stat_index_size
+ dict_table_t::indexes*::stat_n_leaf_pages
+ (*) those are not always protected for
+ performance reasons */
+ unsigned stat_initialized:1; /*!< TRUE if statistics have
+ been calculated the first time
+ after database startup or table creation */
+#define DICT_TABLE_IN_USED -1
+ lint memcached_sync_count;
+ /*!< count of how many handles are opened
+ to this table from memcached; DDL on the
+ table is NOT allowed until this count
+ goes to zero. If it is -1, there is DDL
+ running on the table, and DML from memcached
+ will be blocked. */
+ ib_time_t stats_last_recalc;
+ /*!< Timestamp of last recalc of the stats */
+ ib_uint32_t stat_persistent;
+ /*!< The two bits below are set in the
+ ::stat_persistent member and have the following
+ meaning:
+ 1. _ON=0, _OFF=0, no explicit persistent stats
+ setting for this table, the value of the global
+ srv_stats_persistent is used to determine
+ whether the table has persistent stats enabled
+ or not
+ 2. _ON=0, _OFF=1, persistent stats are
+ explicitly disabled for this table, regardless
+ of the value of the global srv_stats_persistent
+ 3. _ON=1, _OFF=0, persistent stats are
+ explicitly enabled for this table, regardless
+ of the value of the global srv_stats_persistent
+ 4. _ON=1, _OFF=1, not allowed, we assert if
+ this ever happens. */
+#define DICT_STATS_PERSISTENT_ON (1 << 1)
+#define DICT_STATS_PERSISTENT_OFF (1 << 2)
+ ib_uint32_t stats_auto_recalc;
+ /*!< The two bits below are set in the
+ ::stats_auto_recalc member and have
+ the following meaning:
+ 1. _ON=0, _OFF=0, no explicit auto recalc
+ setting for this table, the value of the global
+ srv_stats_persistent_auto_recalc is used to
+ determine whether the table has auto recalc
+ enabled or not
+ 2. _ON=0, _OFF=1, auto recalc is explicitly
+ disabled for this table, regardless of the
+ value of the global
+ srv_stats_persistent_auto_recalc
+ 3. _ON=1, _OFF=0, auto recalc is explicitly
+ enabled for this table, regardless of the
+ value of the global
+ srv_stats_persistent_auto_recalc
+ 4. _ON=1, _OFF=1, not allowed, we assert if
+ this ever happens. */
+#define DICT_STATS_AUTO_RECALC_ON (1 << 1)
+#define DICT_STATS_AUTO_RECALC_OFF (1 << 2)
+ ulint stats_sample_pages;
+ /*!< the number of pages to sample for this
+ table during persistent stats estimation;
+ if this is 0, then the value of the global
+ srv_stats_persistent_sample_pages will be
+ used instead. */
+ ib_uint64_t stat_n_rows;
+ /*!< approximate number of rows in the table;
+ we periodically calculate new estimates */
+ ulint stat_clustered_index_size;
+ /*!< approximate clustered index size in
+ database pages */
+ ulint stat_sum_of_other_index_sizes;
+ /*!< other indexes in database pages */
+ ib_uint64_t stat_modified_counter;
+ /*!< when a row is inserted, updated,
+ or deleted,
+ we add 1 to this number; we calculate new
+ estimates for the stat_... values for the
+ table and the indexes when about 1 / 16 of
+ table has been modified;
+ also when the estimate operation is
+ called for MySQL SHOW TABLE STATUS; the
+ counter is reset to zero at statistics
+ calculation; this counter is not protected by
+ any latch, because this is only used for
+ heuristics */
+#define BG_STAT_NONE 0
+#define BG_STAT_IN_PROGRESS (1 << 0)
+ /*!< BG_STAT_IN_PROGRESS is set in
+ stats_bg_flag when the background
+ stats code is working on this table. The DROP
+ TABLE code waits for this to be cleared
+ before proceeding. */
+#define BG_STAT_SHOULD_QUIT (1 << 1)
+ /*!< BG_STAT_SHOULD_QUIT is set in
+ stats_bg_flag when DROP TABLE starts
+ waiting on BG_STAT_IN_PROGRESS to be cleared,
+ the background stats thread will detect this
+ and will eventually quit sooner */
+ byte stats_bg_flag;
+ /*!< see BG_STAT_* above.
+ Writes are covered by dict_sys->mutex.
+ Dirty reads are possible. */
+ /* @} */
+ /*----------------------*/
+ /** The following fields are used by the
+ AUTOINC code. The actual collection of
+ tables locked during AUTOINC read/write is
+ kept in trx_t. In order to quickly determine
+ whether a transaction has locked the AUTOINC
+ lock we keep a pointer to the transaction
+ here in the autoinc_trx variable. This is to
+ avoid acquiring the lock_sys_t::mutex and
+ scanning the vector in trx_t.
+
+ When an AUTOINC lock has to wait, the
+ corresponding lock instance is created on
+ the trx lock heap rather than use the
+ pre-allocated instance in autoinc_lock below.*/
+ /* @{ */
+ lock_t* autoinc_lock;
+ /*!< a buffer for an AUTOINC lock
+ for this table: we allocate the memory here
+ so that individual transactions can get it
+ and release it without a need to allocate
+ space from the lock heap of the trx:
+ otherwise the lock heap would grow rapidly
+ if we do a large insert from a select */
+ ib_mutex_t autoinc_mutex;
+ /*!< mutex protecting the autoincrement
+ counter */
+ ib_uint64_t autoinc;/*!< autoinc counter value to give to the
+ next inserted row */
+ ulong n_waiting_or_granted_auto_inc_locks;
+ /*!< This counter is used to track the number
+ of granted and pending autoinc locks on this
+ table. This value is set after acquiring the
+ lock_sys_t::mutex but we peek the contents to
+ determine whether other transactions have
+ acquired the AUTOINC lock or not. Of course
+ only one transaction can be granted the
+ lock but there can be multiple waiters. */
+ const trx_t* autoinc_trx;
+ /*!< The transaction that currently holds the
+ AUTOINC lock on this table.
+ Protected by lock_sys->mutex. */
+ fts_t* fts; /*!< FTS-specific state variables */
+ /* @} */
+ /*----------------------*/
+
+ ib_quiesce_t quiesce;/*!< Quiescing states, protected by the
+ dict_index_t::lock. ie. we can only change
+ the state if we acquire all the latches
+ (dict_index_t::lock) in X mode of this table's
+ indexes. */
+
+ /*----------------------*/
+ ulint n_rec_locks;
+ /*!< Count of the number of record locks on
+ this table. We use this to determine whether
+ we can evict the table from the dictionary
+ cache. It is protected by lock_sys->mutex. */
+ ulint n_ref_count;
+ /*!< count of how many handles are opened
+ to this table; dropping of the table is
+ NOT allowed until this count gets to zero;
+ MySQL does NOT itself check the number of
+ open handles at drop */
+ UT_LIST_BASE_NODE_T(lock_t)
+ locks; /*!< list of locks on the table; protected
+ by lock_sys->mutex */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_table_t::magic_n */
+# define DICT_TABLE_MAGIC_N 76333786
+#endif /* UNIV_DEBUG */
+};
+
+/** A function object to add the foreign key constraint to the referenced set
+of the referenced table, if it exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+ void operator()(dict_foreign_t* foreign) const
+ {
+ if (dict_table_t* table = foreign->referenced_table) {
+ std::pair<dict_foreign_set::iterator, bool> ret
+ = table->referenced_set.insert(foreign);
+ ut_a(ret.second);
+ }
+ }
+};
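+
+/* A minimal usage sketch (illustrative only; <algorithm> is assumed to
+be included, and "table" to be a valid dict_table_t):
+
+    std::for_each(table->foreign_set.begin(),
+                  table->foreign_set.end(),
+                  dict_foreign_add_to_referenced_table());
+*/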
+
+#ifndef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dict0mem.ic b/storage/innobase/include/dict0mem.ic
new file mode 100644
index 00000000000..38d51f61789
--- /dev/null
+++ b/storage/innobase/include/dict0mem.ic
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+ dict_index_t* index, /*!< out: index to be filled */
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+
+ if (heap) {
+ index->heap = heap;
+ index->name = mem_heap_strdup(heap, index_name);
+ index->fields = (dict_field_t*) mem_heap_alloc(
+ heap, 1 + n_fields * sizeof(dict_field_t));
+ } else {
+ index->name = index_name;
+ index->heap = NULL;
+ index->fields = NULL;
+ }
+
+ /* Assign a ulint to the type bit-field.
+ Only the low-order DICT_IT_BITS bits are kept. */
+ index->type = type;
+#ifndef UNIV_HOTBACKUP
+ index->space = (unsigned int) space;
+ index->page = FIL_NULL;
+#endif /* !UNIV_HOTBACKUP */
+ index->table_name = table_name;
+ index->n_fields = (unsigned int) n_fields;
+ /* The '1 +' above prevents allocation
+ of an empty mem block */
+#ifdef UNIV_DEBUG
+ index->magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+}
diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h
new file mode 100644
index 00000000000..9a3c8e22992
--- /dev/null
+++ b/storage/innobase/include/dict0priv.h
@@ -0,0 +1,63 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0priv.h
+Data dictionary private functions
+
+Created Fri 2 Jul 2010 13:30:38 EST - Sunny Bains
+*******************************************************/
+
+#ifndef dict0priv_h
+#define dict0priv_h
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function. Note: Not to be called from outside dict0*c functions.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name); /*!< in: table name */
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name); /*!< in: table name */
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*=====================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err); /*!< in: errors to ignore
+ when loading the table */
+
+#ifndef UNIV_NONINL
+#include "dict0priv.ic"
+#endif
+
+#endif /* dict0priv.h */
diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic
new file mode 100644
index 00000000000..30ba8fb60aa
--- /dev/null
+++ b/storage/innobase/include/dict0priv.ic
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0priv.ic
+Data dictionary system private include file
+
+Created Wed 13 Oct 2010 16:10:14 EST Sunny Bains
+***********************************************************************/
+
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0priv.h"
+#ifndef UNIV_HOTBACKUP
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table && table->corrupted) {
+ fprintf(stderr, "InnoDB: table");
+ ut_print_name(stderr, NULL, TRUE, table->name);
+ if (srv_load_corrupted) {
+ fputs(" is corrupted, but"
+ " innodb_force_load_corrupted is set\n", stderr);
+ } else {
+ fputs(" is corrupted\n", stderr);
+ return(NULL);
+ }
+ }
+
+ if (table == NULL) {
+ table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
+ }
+
+ ut_ad(!table || table->cached);
+
+ return(table);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_open_on_id_low(
+/*======================*/
+ table_id_t table_id, /*!< in: table id */
+ dict_err_ignore_t ignore_err) /*!< in: errors to ignore
+ when loading the table */
+{
+ dict_table_t* table;
+ ulint fold;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ fold = ut_fold_ull(table_id);
+
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ table->id == table_id);
+ if (table == NULL) {
+ table = dict_load_table_on_id(table_id, ignore_err);
+ }
+
+ ut_ad(!table || table->cached);
+
+ /* TODO: should get the type information from MySQL */
+
+ return(table);
+}
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+ ulint table_fold;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ table_fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !strcmp(table->name, table_name));
+ return(table);
+}
+#endif /*! UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000000..186f90e3694
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,202 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "univ.i"
+
+#include "db0err.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+ DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+ statistics using a precise and slow
+ algo and save them to the persistent
+ storage, if the persistent storage is
+ not present then emit a warning and
+ fall back to transient stats */
+ DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+ using an imprecise quick algo
+ without saving the results
+ persistently */
+ DICT_STATS_EMPTY_TABLE, /* Write all zeros (or 1 where it makes sense)
+ into a table and its indexes' statistics
+ members. The resulting stats correspond to an
+ empty table. If the table is using persistent
+ statistics, then they are saved on disk. */
+ DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+ from the persistent storage if the in-memory
+ structures have not been initialized yet,
+ otherwise do nothing */
+};
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced. */
+UNIV_INTERN
+void
+dict_stats_update_transient(
+/*========================*/
+ dict_table_t* table); /*!< in/out: table */
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+ __attribute__((nonnull));
+
+/*********************************************************************//**
+Check whether persistent statistics is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_is_persistent_enabled(
+/*=============================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off); /*!< in: explicitly disabled */
+
+/*********************************************************************//**
+Check whether auto recalc is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_auto_recalc_is_enabled(
+/*==============================*/
+ const dict_table_t* table); /*!< in: table */
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table); /*!< in/out: table */
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((nonnull));
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_stats_update(
+/*==============*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_stats_upd_option_t stats_upd_option);
+ /*!< in: whether to (re) calc
+ the stats or to fetch them from
+ the persistent storage */
+
+/*********************************************************************//**
+Removes the information for a particular index's stats from the persistent
+storage if it exists and if there is data stored for this index.
+This function creates its own trx and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_index(
+/*==================*/
+ const char* tname, /*!< in: table name */
+ const char* iname, /*!< in: index name */
+ char* errstr, /*!< out: error message if != DB_SUCCESS
+ is returned */
+ ulint errstr_sz);/*!< in: size of the errstr buffer */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage if it exists and if there is data stored for the table.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_drop_table(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ char* errstr, /*!< out: error message
+ if != DB_SUCCESS is returned */
+ ulint errstr_sz); /*!< in: size of errstr buffer */
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+UNIV_INTERN
+void
+dict_stats_update_for_index(
+/*========================*/
+ dict_index_t* index) /*!< in/out: index */
+ __attribute__((nonnull));
+
+/*********************************************************************//**
+Renames a table in InnoDB persistent stats storage.
+This function creates its own transaction and commits it.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_stats_rename_table(
+/*====================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ char* errstr, /*!< out: error string if != DB_SUCCESS
+ is returned */
+ size_t errstr_sz); /*!< in: errstr size */
+
+#ifndef UNIV_NONINL
+#include "dict0stats.ic"
+#endif
+
+#endif /* dict0stats_h */
diff --git a/storage/innobase/include/dict0stats.ic b/storage/innobase/include/dict0stats.ic
new file mode 100644
index 00000000000..ec9a9065470
--- /dev/null
+++ b/storage/innobase/include/dict0stats.ic
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.ic
+Code used for calculating and manipulating table statistics.
+
+Created Jan 23, 2012 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "dict0dict.h" /* dict_table_stats_lock() */
+#include "dict0types.h" /* dict_table_t */
+#include "srv0srv.h" /* srv_stats_persistent, srv_stats_auto_recalc */
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table. This is set only
+in the in-memory table object and is not saved on disk. It will be read
+from the .frm file upon first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_set_persistent(
+/*======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool ps_on, /*!< in: persistent stats explicitly enabled */
+ ibool ps_off) /*!< in: persistent stats explicitly disabled */
+{
+ /* Not allowed to have both flags set, but a CREATE or ALTER
+ statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would
+ end up having both set. In this case we clear the OFF flag. */
+ if (ps_on && ps_off) {
+ ps_off = FALSE;
+ }
+
+ ib_uint32_t stat_persistent = 0;
+
+ if (ps_on) {
+ stat_persistent |= DICT_STATS_PERSISTENT_ON;
+ }
+
+ if (ps_off) {
+ stat_persistent |= DICT_STATS_PERSISTENT_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stat_persistent = stat_persistent;
+}
+
+/*********************************************************************//**
+Check whether persistent statistics is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_is_persistent_enabled(
+/*=============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ /* Because of the nature of this check (non-locking) it is possible
+ that a table becomes:
+ * PS-disabled immediately after this function has returned TRUE or
+ * PS-enabled immediately after this function has returned FALSE.
+ This means that it is possible that we do:
+ + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has
+ just been PS-disabled or
+ + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
+ just been PS-enabled.
+ This is acceptable. Avoiding this would mean that we would have to
+ protect the ::stat_persistent with dict_table_stats_lock() like the
+ other ::stat_ members, which would be too big a performance penalty,
+ especially when this function is called from
+ row_update_statistics_if_needed(). */
+
+ /* we rely on this read to be atomic */
+ ib_uint32_t stat_persistent = table->stat_persistent;
+
+ if (stat_persistent & DICT_STATS_PERSISTENT_ON) {
+ ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF));
+ return(TRUE);
+ } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) {
+ return(FALSE);
+ } else {
+ return(srv_stats_persistent);
+ }
+}
+
+/*********************************************************************//**
+Set the auto recalc flag for a given table (only honored for a persistent
+stats enabled table). The flag is set only in the in-memory table object
+and is not saved in InnoDB files. It will be read from the .frm file upon
+first open from MySQL after a server restart. */
+UNIV_INLINE
+void
+dict_stats_auto_recalc_set(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool auto_recalc_on, /*!< in: explicitly enabled */
+ ibool auto_recalc_off) /*!< in: explicitly disabled */
+{
+ ut_ad(!auto_recalc_on || !auto_recalc_off);
+
+ ib_uint32_t stats_auto_recalc = 0;
+
+ if (auto_recalc_on) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON;
+ }
+
+ if (auto_recalc_off) {
+ stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF;
+ }
+
+ /* we rely on this assignment to be atomic */
+ table->stats_auto_recalc = stats_auto_recalc;
+}
+
+/*********************************************************************//**
+Check whether auto recalc is enabled for a given table.
+@return TRUE if enabled, FALSE otherwise */
+UNIV_INLINE
+ibool
+dict_stats_auto_recalc_is_enabled(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ /* we rely on this read to be atomic */
+ ib_uint32_t stats_auto_recalc = table->stats_auto_recalc;
+
+ if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) {
+ ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF));
+ return(TRUE);
+ } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) {
+ return(FALSE);
+ } else {
+ return(srv_stats_auto_recalc);
+ }
+}
+
+/*********************************************************************//**
+Initialize table's stats for the first time when opening a table. */
+UNIV_INLINE
+void
+dict_stats_init(
+/*============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!mutex_own(&dict_sys->mutex));
+
+ if (table->stat_initialized) {
+ return;
+ }
+
+ dict_stats_upd_option_t opt;
+
+ if (dict_stats_is_persistent_enabled(table)) {
+ opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY;
+ } else {
+ opt = DICT_STATS_RECALC_TRANSIENT;
+ }
+
+ dict_stats_update(table, opt);
+}
+
+/*********************************************************************//**
+Deinitialize table's stats after the last close of the table. This is
+used to detect "FLUSH TABLE" and refresh the stats upon next open. */
+UNIV_INLINE
+void
+dict_stats_deinit(
+/*==============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ ut_a(table->n_ref_count == 0);
+
+ dict_table_stats_lock(table, RW_X_LATCH);
+
+ if (!table->stat_initialized) {
+ dict_table_stats_unlock(table, RW_X_LATCH);
+ return;
+ }
+
+ table->stat_initialized = FALSE;
+
+#ifdef UNIV_DEBUG_VALGRIND
+ UNIV_MEM_INVALID(&table->stat_n_rows,
+ sizeof(table->stat_n_rows));
+ UNIV_MEM_INVALID(&table->stat_clustered_index_size,
+ sizeof(table->stat_clustered_index_size));
+ UNIV_MEM_INVALID(&table->stat_sum_of_other_index_sizes,
+ sizeof(table->stat_sum_of_other_index_sizes));
+ UNIV_MEM_INVALID(&table->stat_modified_counter,
+ sizeof(table->stat_modified_counter));
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ ulint n_uniq = dict_index_get_n_unique(index);
+
+ UNIV_MEM_INVALID(
+ index->stat_n_diff_key_vals,
+ n_uniq * sizeof(index->stat_n_diff_key_vals[0]));
+ UNIV_MEM_INVALID(
+ index->stat_n_sample_sizes,
+ n_uniq * sizeof(index->stat_n_sample_sizes[0]));
+ UNIV_MEM_INVALID(
+ index->stat_n_non_null_key_vals,
+ n_uniq * sizeof(index->stat_n_non_null_key_vals[0]));
+ UNIV_MEM_INVALID(
+ &index->stat_index_size,
+ sizeof(index->stat_index_size));
+ UNIV_MEM_INVALID(
+ &index->stat_n_leaf_pages,
+ sizeof(index->stat_n_leaf_pages));
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ dict_table_stats_unlock(table, RW_X_LATCH);
+}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
new file mode 100644
index 00000000000..e866ab419fe
--- /dev/null
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -0,0 +1,127 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.h
+Code used for background table and index stats gathering.
+
+Created Apr 26, 2012 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_bg_h
+#define dict0stats_bg_h
+
+#include "univ.i"
+
+#include "dict0types.h" /* dict_table_t, table_id_t */
+#include "os0sync.h" /* os_event_t */
+#include "os0thread.h" /* DECLARE_THREAD */
+
+/** Event to wake up the stats thread */
+extern os_event_t dict_stats_event;
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_add(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table to add */
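+
+/* Usage sketch (illustrative; the exact trigger heuristic lives in the
+row update path): a table is queued for background recalc instead of
+being recalculated synchronously once enough rows have changed:
+
+	if (dict_stats_auto_recalc_is_enabled(table)) {
+		dict_stats_recalc_pool_add(table);
+	}
+*/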
+
+/*****************************************************************//**
+Delete a given table from the auto recalc pool. */
+UNIV_INTERN
+void
+dict_stats_recalc_pool_del(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table to remove */
+
+/** Yield the data dictionary latch when waiting
+for the background thread to stop accessing a table.
+@param trx transaction holding the data dictionary locks */
+#define DICT_STATS_BG_YIELD(trx) do { \
+ row_mysql_unlock_data_dictionary(trx); \
+ os_thread_sleep(250000); \
+ row_mysql_lock_data_dictionary(trx); \
+} while (0)
+
+/*****************************************************************//**
+Request the background collection of statistics to stop for a table.
+@retval true when no background process is active
+@retval false when it is not safe to modify the table definition */
+UNIV_INLINE
+bool
+dict_stats_stop_bg(
+/*===============*/
+ dict_table_t* table) /*!< in/out: table */
+ __attribute__((warn_unused_result));
+
+/*****************************************************************//**
+Wait until background stats thread has stopped using the specified table.
+The caller must have locked the data dictionary using
+row_mysql_lock_data_dictionary() and this function may unlock it temporarily
+and restore the lock before it exits.
+The background stats thread is guaranteed not to start using the specified
+table after this function returns and before the caller unlocks the data
+dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
+under dict_sys->mutex. */
+UNIV_INTERN
+void
+dict_stats_wait_bg_to_stop_using_table(
+/*===================================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx); /*!< in/out: transaction to use for
+ unlocking/locking the data dict */
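+
+/* A minimal sketch of the wait protocol (illustrative; the actual loop
+is implemented in dict0stats_bg.cc): with the data dictionary locked,
+keep requesting the background thread to quit and yield until it has:
+
+	while (!dict_stats_stop_bg(table)) {
+		DICT_STATS_BG_YIELD(trx);
+	}
+*/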
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread().
+Must be called before dict_stats_thread() is started. */
+UNIV_INTERN
+void
+dict_stats_thread_init();
+/*====================*/
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_thread_init(), must be called
+after dict_stats_thread() has exited. */
+UNIV_INTERN
+void
+dict_stats_thread_deinit();
+/*======================*/
+
+/*****************************************************************//**
+This is the thread for background stats gathering. It pops tables from
+the auto recalc list and processes them, eventually recalculating their
+statistics.
+@return this function does not return, it calls os_thread_exit() */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(dict_stats_thread)(
+/*==============================*/
+ void* arg); /*!< in: a dummy parameter
+ required by os_thread_create */
+
+# ifndef UNIV_NONINL
+# include "dict0stats_bg.ic"
+# endif
+
+#endif /* dict0stats_bg_h */
diff --git a/storage/innobase/include/dict0stats_bg.ic b/storage/innobase/include/dict0stats_bg.ic
new file mode 100644
index 00000000000..87e3225de58
--- /dev/null
+++ b/storage/innobase/include/dict0stats_bg.ic
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats_bg.ic
+Code used for background table and index stats gathering.
+
+Created Feb 8, 2013 Marko Makela
+*******************************************************/
+
+/*****************************************************************//**
+Request the background collection of statistics to stop for a table.
+@retval true when no background process is active
+@retval false when it is not safe to modify the table definition */
+UNIV_INLINE
+bool
+dict_stats_stop_bg(
+/*===============*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) {
+ return(true);
+ }
+
+ table->stats_bg_flag |= BG_STAT_SHOULD_QUIT;
+ return(false);
+}
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
new file mode 100644
index 00000000000..d34b6f7eab3
--- /dev/null
+++ b/storage/innobase/include/dict0types.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+struct dict_sys_t;
+struct dict_col_t;
+struct dict_field_t;
+struct dict_index_t;
+struct dict_table_t;
+struct dict_foreign_t;
+
+struct ind_node_t;
+struct tab_node_t;
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids of the ibuf table and indexes are assigned as the number
+DICT_IBUF_ID_MIN plus the space id; e.g. the ibuf index of space id 1
+gets the id 0xFFFFFFFF00000001ULL. */
+#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL
+
+typedef ib_id_t table_id_t;
+typedef ib_id_t index_id_t;
+
+/** Errors to ignore when we load the table dictionary into memory.
+The table and index will still be marked as "corrupted", and the
+caller is responsible for dealing with the corrupted table or index.
+Note: the DICT_ERR_IGNORE_* values are defined as bits, so they can
+be OR-ed together */
+enum dict_err_ignore_t {
+ DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
+ DICT_ERR_IGNORE_INDEX_ROOT = 1, /*!< ignore error if index root
+ page is FIL_NULL or incorrect value */
+ DICT_ERR_IGNORE_CORRUPT = 2, /*!< skip corrupted indexes */
+ DICT_ERR_IGNORE_FK_NOKEY = 4, /*!< ignore error if any foreign
+ key is missing */
+ DICT_ERR_IGNORE_RECOVER_LOCK = 8,
+ /*!< Used when recovering table locks
+ for resurrected transactions.
+ Silently load a missing
+ tablespace, and do not load
+ incomplete index definitions. */
+ DICT_ERR_IGNORE_ALL = 0xFFFF /*!< ignore all errors */
+};
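+
+/* Example (illustrative): the bits can be OR-ed together when several
+error classes should be tolerated while loading a table, e.g.
+
+	dict_err_ignore_t ignore_err = static_cast<dict_err_ignore_t>(
+		DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT);
+*/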
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+ QUIESCE_NONE,
+ QUIESCE_START, /*!< Initialise, prepare to start */
+ QUIESCE_COMPLETE /*!< All done */
+};
+
+/** Prefix for tmp tables, adopted from sql/table.h */
+#define tmp_file_prefix "#sql"
+#define tmp_file_prefix_length 4
+#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
+
+#define TEMP_TABLE_PREFIX "#sql"
+#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Flag to control insert buffer debugging. */
+extern uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/dyn0dyn.h b/storage/innobase/include/dyn0dyn.h
new file mode 100644
index 00000000000..7f23302d1ff
--- /dev/null
+++ b/storage/innobase/include/dyn0dyn.h
@@ -0,0 +1,199 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.h
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dyn0dyn_h
+#define dyn0dyn_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "mem0mem.h"
+
+/** A block in a dynamically allocated array */
+struct dyn_block_t;
+/** Dynamically allocated array */
+typedef dyn_block_t dyn_array_t;
+
+/** This is the initial 'payload' size of a dynamic array;
+this must be > MLOG_BUF_MARGIN + 30! */
+#define DYN_ARRAY_DATA_SIZE 512
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr) /*!< in/out: memory buffer of
+ size sizeof(dyn_array_t) */
+ __attribute__((nonnull));
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr) /*!< in,own: dyn array */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ const byte* ptr) /*!< in: end of used space */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to
+the added element. The caller must copy the element to
+the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in/out: dynamic array */
+ ulint size) /*!< in: size in bytes of the element */
+ __attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ const dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos) /*!< in: position of element
+ in bytes from array start */
+ __attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ const dyn_array_t* arr) /*!< in: dyn array */
+ __attribute__((nonnull, warn_unused_result, pure));
+/************************************************************//**
+Gets the first block in a dyn array.
+@param arr dyn array
+@return first block */
+#define dyn_array_get_first_block(arr) (arr)
+/************************************************************//**
+Gets the last block in a dyn array.
+@param arr dyn array
+@return last block */
+#define dyn_array_get_last_block(arr) \
+ ((arr)->heap ? UT_LIST_GET_LAST((arr)->base) : (arr))
+/********************************************************************//**
+Gets the next block in a dyn array.
+@param arr dyn array
+@param block dyn array block
+@return pointer to next, NULL if end of list */
+#define dyn_array_get_next_block(arr, block) \
+ ((arr)->heap ? UT_LIST_GET_NEXT(list, block) : NULL)
+/********************************************************************//**
+Gets the previous block in a dyn array.
+@param arr dyn array
+@param block dyn array block
+@return pointer to previous, NULL if end of list */
+#define dyn_array_get_prev_block(arr, block) \
+ ((arr)->heap ? UT_LIST_GET_PREV(list, block) : NULL)
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ const dyn_block_t* block) /*!< in: dyn array block */
+ __attribute__((nonnull, warn_unused_result, pure));
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ const dyn_block_t* block) /*!< in: dyn array block */
+ __attribute__((nonnull, warn_unused_result, pure));
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in/out: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+ __attribute__((nonnull));
+
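+/* Usage sketch (a minimal, illustrative sequence): a dyn array lives in
+a caller-provided buffer and only spills into heap-allocated blocks once
+DYN_ARRAY_DATA_SIZE bytes are exceeded:
+
+	dyn_array_t	arr;
+	byte*		buf;
+
+	dyn_array_create(&arr);
+	buf = dyn_array_open(&arr, 4);
+	mach_write_to_4(buf, 42);	/X fill the reserved bytes X/
+	dyn_array_close(&arr, buf + 4);	/X mark 4 bytes as used X/
+	dyn_array_free(&arr);
+*/
+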
+/*#################################################################*/
+
+/** @brief A block in a dynamically allocated array.
+NOTE! Do not access the fields of the struct directly: the definition
+appears here only for the compiler to know its size! */
+struct dyn_block_t{
+ mem_heap_t* heap; /*!< in the first block this is != NULL
+ if dynamic allocation has been needed */
+ ulint used; /*!< number of data bytes used in this block;
+ DYN_BLOCK_FULL_FLAG is set when the block
+ becomes full */
+ byte data[DYN_ARRAY_DATA_SIZE];
+ /*!< storage for array elements */
+ UT_LIST_BASE_NODE_T(dyn_block_t) base;
+ /*!< linear list of dyn blocks: this node is
+ used only in the first block */
+ UT_LIST_NODE_T(dyn_block_t) list;
+ /*!< linear list node: used in all blocks */
+#ifdef UNIV_DEBUG
+ ulint buf_end;/*!< only in the debug version: if dyn
+ array is opened, this is the buffer
+ end offset, else this is 0 */
+ ulint magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */
+#endif
+};
+
+
+#ifndef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/dyn0dyn.ic b/storage/innobase/include/dyn0dyn.ic
new file mode 100644
index 00000000000..0296554e2ee
--- /dev/null
+++ b/storage/innobase/include/dyn0dyn.ic
@@ -0,0 +1,306 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.ic
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+/** Value of dyn_block_t::magic_n */
+#define DYN_BLOCK_MAGIC_N 375767
+/** Flag for dyn_block_t::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ dyn_array_t* arr) /*!< in/out: dyn array */
+ __attribute__((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ const dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return((block->used) & ~DYN_BLOCK_FULL_FLAG);
+}
+
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ const dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return(const_cast<byte*>(block->data));
+}
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr) /*!< in/out: memory buffer of
+ size sizeof(dyn_array_t) */
+{
+ ut_ad(arr);
+#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG
+# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG"
+#endif
+
+ arr->heap = NULL;
+ arr->used = 0;
+
+ ut_d(arr->buf_end = 0);
+ ut_d(arr->magic_n = DYN_BLOCK_MAGIC_N);
+
+ return(arr);
+}
+
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ if (arr->heap != NULL) {
+ mem_heap_free(arr->heap);
+ }
+
+ ut_d(arr->magic_n = 0);
+}
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to the added element.
+The caller must copy the element to the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in/out: dynamic array */
+ ulint size) /*!< in: size in bytes of the element */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+
+ if (block->used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+
+ if (block->used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ }
+ }
+
+ used = block->used;
+
+ block->used = used + size;
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+ return(block->data + used);
+}
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+
+ if (block->used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+
+ if (block->used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ ut_a(size <= DYN_ARRAY_DATA_SIZE);
+ }
+ }
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(arr->buf_end == 0);
+ ut_d(arr->buf_end = block->used + size);
+
+ return(block->data + block->used);
+}
+
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in/out: dynamic array */
+ const byte* ptr) /*!< in: end of used space */
+{
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ block = dyn_array_get_last_block(arr);
+
+ ut_ad(arr->buf_end + block->data >= ptr);
+
+ block->used = ptr - block->data;
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+ ut_d(arr->buf_end = 0);
+}
+
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ const dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos) /*!< in: position of element
+ in bytes from array start */
+{
+ const dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ if (arr->heap != NULL) {
+ for (;;) {
+ ulint used = dyn_block_get_used(block);
+
+ if (pos < used) {
+ break;
+ }
+
+ pos -= used;
+ block = UT_LIST_GET_NEXT(list, block);
+ ut_ad(block);
+ }
+ }
+
+ ut_ad(block);
+ ut_ad(dyn_block_get_used(block) >= pos);
+
+ return(const_cast<byte*>(block->data) + pos);
+}
+
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ const dyn_array_t* arr) /*!< in: dyn array */
+{
+ const dyn_block_t* block;
+ ulint sum = 0;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+
+ return(arr->used);
+ }
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ while (block != NULL) {
+ sum += dyn_block_get_used(block);
+ block = dyn_array_get_next_block(arr, block);
+ }
+
+ return(sum);
+}
+
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in/out: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+{
+ ulint n_copied;
+
+ while (len > 0) {
+ if (len > DYN_ARRAY_DATA_SIZE) {
+ n_copied = DYN_ARRAY_DATA_SIZE;
+ } else {
+ n_copied = len;
+ }
+
+ memcpy(dyn_array_push(arr, n_copied), str, n_copied);
+
+ str += n_copied;
+ len -= n_copied;
+ }
+}
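+
+/* Note on the loop above: dyn_array_push() can reserve at most
+DYN_ARRAY_DATA_SIZE bytes per call, so longer strings are appended in
+block-sized chunks. An illustrative call:
+
+	dyn_push_string(arr, reinterpret_cast<const byte*>("hello"), 5);
+*/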
diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h
new file mode 100644
index 00000000000..e3b1e6c16b6
--- /dev/null
+++ b/storage/innobase/include/eval0eval.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated by eval_node_alloc_val_buf(). The freeing for
+pushed column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val); /*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node); /*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len); /*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2); /*!< in: node to copy from */
+/*****************************************************************//**
+Gets an iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node); /*!< in: comparison node */
+
+
+#ifndef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/eval0eval.ic b/storage/innobase/include/eval0eval.ic
new file mode 100644
index 00000000000..e4b1dd08017
--- /dev/null
+++ b/storage/innobase/include/eval0eval.ic
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; the data field
+ of its value is set to point to the new
+ buffer, and its len field to size */
+ ulint size); /*!< in: buffer size */
+
+
+/*****************************************************************//**
+Allocates a new buffer if needed.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ que_node_t* node, /*!< in: query graph node; the data field
+ of its value is set to point to the new
+ buffer, and its len field to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /*!< in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /*!< in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*) exp_node);
+
+ return;
+ }
+
+ eval_func(static_cast<func_node_t*>(exp_node));
+}
+
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint) val);
+}
+
+/*****************************************************************//**
+Gets an integer value (which must not be SQL NULL) from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node) /*!< in: expression node */
+{
+ const byte* ptr;
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+ ptr = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int) mach_read_from_4(ptr));
+}
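+
+/* Round-trip sketch (illustrative): the value is stored as a 4-byte
+integer via mach_write_to_4() and read back with mach_read_from_4(),
+so after
+
+	eval_node_set_int_val(node, 7);
+
+a subsequent eval_node_get_int_val(node) returns 7. */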
+
+/*****************************************************************//**
+Gets an iboolean value from a query node.
+@return iboolean value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets an iboolean value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+ func_node_t* func_node, /*!< in: function node */
+ ibool val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len) /*!< in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ ut_memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2) /*!< in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(
+ node1,
+ static_cast<byte*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2));
+}
diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h
new file mode 100644
index 00000000000..7755fb10343
--- /dev/null
+++ b/storage/innobase/include/eval0proc.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/eval0proc.ic b/storage/innobase/include/eval0proc.ic
new file mode 100644
index 00000000000..81418bae2c9
--- /dev/null
+++ b/storage/innobase/include/eval0proc.ic
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<proc_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<func_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000000..168f2f5b594
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1019 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0rw.h"
+#include "ibuf0types.h"
+#include "log0log.h"
+#endif /* !UNIV_HOTBACKUP */
+
+#include <list>
+
+extern my_bool lower_case_file_system;
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+typedef std::list<const char*> space_name_list_t;
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and mysqlbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char* fil_path_to_mysql_datadir;
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef byte fil_faddr_t; /*!< 'type' definition in C: an address
+ stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE 0 /* first in address is the page offset */
+#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/
+
+#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */
+
+/** File space address */
+struct fil_addr_t{
+ ulint page; /*!< page number within a space */
+ ulint boffset; /*!< byte offset within the page */
+};
+
+/** The null file address */
+extern fil_addr_t fil_addr_null;
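+
+/* Layout sketch (illustrative): a file address occupies FIL_ADDR_SIZE
+(6) bytes on a page: a 4-byte page number at offset FIL_ADDR_PAGE
+followed by a 2-byte byte offset at FIL_ADDR_BYTE. Decoding one, where
+faddr is a hypothetical pointer to the stored address, could look like:
+
+	fil_addr_t	addr;
+
+	addr.page = mach_read_from_4(faddr + FIL_ADDR_PAGE);
+	addr.boffset = mach_read_from_2(faddr + FIL_ADDR_BYTE);
+*/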
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in MySQL < 4.0.14 this was the
+ space id the page belongs to
+ (== 0); in later versions it is
+ the 'new' checksum of the page */
+#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */
+#define FIL_PAGE_PREV 8 /*!< if there is a 'natural'
+ predecessor of the page, its
+ offset. Otherwise FIL_NULL.
+ This field is not set on BLOB
+ pages, which are stored as a
+ singly-linked list. See also
+ FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor
+ of the page, its offset.
+ Otherwise FIL_NULL.
+ B-tree index pages
+ (FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+ on the same PAGE_LEVEL are maintained
+ as a doubly linked list via
+ FIL_PAGE_PREV and FIL_PAGE_NEXT
+ in the collation order of the
+ smallest user record on each page. */
+#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,...,
+ 2 bytes.
+
+ The contents of this field can only
+ be trusted in the following case:
+ if the page is an uncompressed
+ B-tree index page, then it is
+ guaranteed that the value is
+ FIL_PAGE_INDEX.
+ The opposite does not hold.
+
+ In tablespaces created by
+ MySQL/InnoDB 5.1.7 or later, the
+ contents of this field are valid
+ for all uncompressed pages. */
+#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
+ first page in a system tablespace
+ data file (ibdata*, not *.ibd):
+ the file has been flushed to disk
+ at least up to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
+ contains the space id of the page */
+#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
+/* @} */
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
+ to store the page checksum, the
+ last 4 bytes should be identical
+ to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */
+/* @} */
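+
+/* Worked example (illustrative): given a page frame `page` in memory,
+the header fields above are read with the mach_read_* helpers, e.g.
+
+	ulint	space_id = mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+	ulint	page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+	ulint	type = mach_read_from_2(page + FIL_PAGE_TYPE);
+*/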
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
+#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
+#define FIL_PAGE_INODE 3 /*!< Index node */
+#define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */
+/* File page types introduced in MySQL/InnoDB 5.1.7 */
+#define FIL_PAGE_TYPE_ALLOCATED 0 /*!< Freshly allocated page */
+#define FIL_PAGE_IBUF_BITMAP 5 /*!< Insert buffer bitmap */
+#define FIL_PAGE_TYPE_SYS 6 /*!< System page */
+#define FIL_PAGE_TYPE_TRX_SYS 7 /*!< Transaction system data */
+#define FIL_PAGE_TYPE_FSP_HDR 8 /*!< File space header */
+#define FIL_PAGE_TYPE_XDES 9 /*!< Extent descriptor page */
+#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */
+#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2
+ /*!< Last page type */
+/* @} */
+
+/** Space types @{ */
+#define FIL_TABLESPACE 501 /*!< tablespace */
+#define FIL_LOG 502 /*!< redo log */
+/* @} */
+
+/** The number of fsyncs done to the log */
+extern ulint fil_n_log_flushes;
+
+/** Number of pending redo log flushes */
+extern ulint fil_n_pending_log_flushes;
+/** Number of pending tablespace flushes */
+extern ulint fil_n_pending_tablespace_flushes;
+
+/** Number of files currently open */
+extern ulint fil_n_file_opened;
+
+struct fsp_open_info {
+ ibool success; /*!< Has the tablespace been opened? */
+ const char* check_msg; /*!< fil_check_first_page() message */
+ ibool valid; /*!< Is the tablespace valid? */
+ os_file_t file; /*!< File handle */
+ char* filepath; /*!< File path to open */
+ lsn_t lsn; /*!< Flushed LSN from header page */
+ ulint id; /*!< Space ID */
+ ulint flags; /*!< Tablespace flags */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no; /*!< latest archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+};
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the latch of a file space.
+@return latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ ulint id, /*!< in: space id */
+ ulint* zip_size);/*!< out: compressed page size, or
+ 0 for uncompressed tablespaces */
+/*******************************************************************//**
+Returns the type of a file space.
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed.
+@return pointer to the file name, or NULL on error */
+UNIV_INTERN
+char*
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw) /*!< in: TRUE if a raw device or
+ a raw disk partition */
+ __attribute__((nonnull, warn_unused_result));
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len); /*!< in: truncate by this much; it is an error
+ if this does not equal to the combined size of
+ some initial files in the space */
+#endif /* UNIV_LOG_ARCHIVE */
+/*******************************************************************//**
+Creates a space memory object and puts it to the 'fil system' hash table.
+If there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or
+ 0 for uncompressed tablespaces */
+ ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may
+need to recycle ids.
+@return TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id); /*!< in/out: space id */
+/*******************************************************************//**
+Returns the path from the first fil_node_t found for the given space ID.
+The caller is responsible for freeing the memory allocated here for the
+value returned.
+@return a copy of fil_node_t::path, NULL if space is zero or not found. */
+UNIV_INTERN
+char*
+fil_space_get_first_path(
+/*=====================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+ ulint id); /*!< in: space id */
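+/* Usage sketch (illustrative; space_id is an assumption): reading cached
+metadata for a space that is already in the memory cache.
+
+	ulint	flags = fil_space_get_flags(space_id);
+
+	if (flags != ULINT_UNDEFINED) {
+		ulint	size	 = fil_space_get_size(space_id);
+		ulint	zip_size = fil_space_get_zip_size(space_id);
+	}
+*/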
+/*******************************************************************//**
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache.
+@return TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+ ulint hash_size, /*!< in: hash table size */
+ ulint max_n_open); /*!< in: max number of open files */
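+/* Usage sketch (the values are illustrative assumptions): called once at
+server startup, with the hash table size first and then the maximum
+number of open files.
+
+	fil_init(5000, 300);
+*/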
+/*******************************************************************//**
+Deinitializes the tablespace memory cache.
+UNIV_INTERN
+void
+fil_close(void);
+/*===========*/
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void);
+/*=====================*/
+/*******************************************************************//**
+Closes the redo log files. There must not be any pending i/o's or
+unflushed modifications in the files. */
+UNIV_INTERN
+void
+fil_close_log_files(
+/*================*/
+ bool free); /*!< in: whether to free the memory object */
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id);/*!< in: maximum known id */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ lsn_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no); /*!< in: latest archived log file number */
+/*******************************************************************//**
+Reads the flushed lsn, arch no, and tablespace flag fields from a data
+file at database startup.
+@retval NULL on success, or if innodb_force_recovery is set
+@return pointer to an error message string */
+UNIV_INTERN
+const char*
+fil_read_first_page(
+/*================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+ ulint* flags, /*!< out: tablespace flags */
+ ulint* space_id, /*!< out: tablespace ID */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no, /*!< out: min of archived
+ log numbers in data files */
+ ulint* max_arch_log_no, /*!< out: max of archived
+ log numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t* min_flushed_lsn, /*!< out: min of flushed
+ lsn values in data files */
+ lsn_t* max_flushed_lsn) /*!< out: max of flushed
+ lsn values in data files */
+ __attribute__((warn_unused_result));
+/*******************************************************************//**
+Increments the count of pending operations, if the space is not being deleted.
+@return TRUE if being deleted, and operation should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ops(
+/*================*/
+ ulint id, /*!< in: space id */
+ ibool print_err); /*!< in: need to print error or not */
+/*******************************************************************//**
+Decrements the count of pending operations. */
+UNIV_INTERN
+void
+fil_decr_pending_ops(
+/*=================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that mysqlbackup --apply-log sets fil_path_to_mysql_datadir to point to
+the datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+			not fit completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags); /*!< in: redo log flags
+ (stored in the page number parameter) */
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+ ulint id, /*!< in: space id */
+ buf_remove_t buf_remove); /*!< in: specify the action to take
+ on the tables pages in the buffer
+ pool */
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Frees all pages used by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+ trx_t* trx, /*!< in/out: Transaction covering the close */
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+
+ 1. We do not drop the table from the data dictionary;
+
+ 2. We remove all insert buffer entries for the tablespace immediately;
+ in DROP TABLE they are only removed gradually in the background;
+
+ 3. When the user does IMPORT TABLESPACE, the tablespace will have the
+ same id as it originally had.
+
+ 4. Free all the pages in use by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_discard_tablespace(
+/*===================*/
+ ulint id) /*!< in: space id */
+ __attribute__((warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+ const char* old_name_in, /*!< in: old table name in the
+ standard databasename/tablename
+ format of InnoDB, or NULL if we
+ do the rename based on the space
+ id only */
+ ulint id, /*!< in: space id */
+ const char* new_name, /*!< in: new table name in the
+ standard databasename/tablename
+ format of InnoDB */
+ const char* new_path); /*!< in: new full datafile path
+ if the tablespace is remotely
+ located, or NULL if it is located
+ in the normal data directory. */
+
+/*******************************************************************//**
+Allocates a file name for a single-table tablespace. The string must be freed
+by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_ibd_name(
+/*==============*/
+ const char* name, /*!< in: table name or a dir path */
+ bool is_full_path); /*!< in: TRUE if it is a dir path */
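+/* Example (a sketch of the expected behaviour, not a guarantee): with
+is_full_path == false, fil_make_ibd_name("test/t1", false) should yield
+"./test/t1.ibd" relative to the datadir; free the result with mem_free(). */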
+/*******************************************************************//**
+Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link).
+The string must be freed by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_isl_name(
+/*==============*/
+ const char* name); /*!< in: table name */
+/*******************************************************************//**
+Creates a new InnoDB Symbolic Link (ISL) file. It is always created
+under the 'datadir' of MySQL. The datadir is the directory of a
+running mysqld program. We can refer to it by simply using the path '.'.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_link_file(
+/*=================*/
+ const char* tablename, /*!< in: tablename */
+ const char* filepath); /*!< in: pathname of tablespace */
+/*******************************************************************//**
+Deletes an InnoDB Symbolic Link (ISL) file. */
+UNIV_INTERN
+void
+fil_delete_link_file(
+/*==================*/
+ const char* tablename); /*!< in: name of table */
+/*******************************************************************//**
+Reads an InnoDB Symbolic Link (ISL) file.
+It is always created under the 'datadir' of MySQL. The name is of the
+form {databasename}/{tablename}, and the ISL file '{tablename}.isl' is
+expected to be in the '{databasename}' directory. The caller must free
+the memory of the null-terminated path returned if it is not null.
+@return own: filepath found in link file, NULL if not found. */
+UNIV_INTERN
+char*
+fil_read_link_file(
+/*===============*/
+ const char* name); /*!< in: tablespace name */
+/*******************************************************************//**
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it simply by the
+path '.'. Tables created with CREATE TEMPORARY TABLE are placed in the
+temp dir of the mysqld server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint space_id, /*!< in: space id */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB */
+ const char* dir_path, /*!< in: NULL or a dir path */
+ ulint flags, /*!< in: tablespace flags */
+ ulint flags2, /*!< in: table flags2 */
+ ulint size) /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If it does not succeed, prints an error message
+to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+
+If the validate boolean is set, we read the first page of the file and
+check that the space id in the file is what we expect. We assume that
+this function runs much faster if no check is made, since accessing the
+file inode probably is much faster (the OS caches them) than accessing
+the first page of the file. This boolean may initially be false, but if
+a remote tablespace is found it will be changed to true.
+
+If the fix_dict boolean is set, then it is safe to use an internal SQL
+statement to update the dictionary tables if they are incorrect.
+
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_open_single_table_tablespace(
+/*=============================*/
+ bool validate, /*!< in: Do we validate tablespace? */
+ bool fix_dict, /*!< in: Can we fix the dictionary? */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* tablename, /*!< in: table name in the
+ databasename/tablename format */
+ const char* filepath) /*!< in: tablespace filepath */
+ __attribute__((nonnull(5), warn_unused_result));
+
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+ ib_int64_t version);/*!< in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name in the standard
+ 'databasename/tablename' format */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist,
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found from
+ memory */
+ bool adjust_space, /*!< in: whether to adjust space id
+					when a tablespace mismatch is found */
+ mem_heap_t* heap, /*!< in: heap memory */
+ table_id_t table_id); /*!< in: table id */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+mysqlbackup --apply-log phase we extended the spaces on-demand so that log
+records could be applied, but that may have left spaces still too small
+compared to the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend);/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_free_now, /*!< in: number of free extents now */
+ ulint n_to_reserve); /*!< in: how many one wants to reserve */
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_reserved); /*!< in: how many one reserved */
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id); /*!< in: space id */
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INTERN
+dberr_t
+fil_io(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ bool sync, /*!< in: true if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+ void* buf, /*!< in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+ __attribute__((nonnull(8)));
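+/* Usage sketch (space_id, page_no and buf are assumptions): a synchronous
+read of one uncompressed page; zip_size is passed as 0 and buf must be
+aligned as required by aio.
+
+	dberr_t	err = fil_io(OS_FILE_READ, true, space_id, 0,
+			     page_no, 0, UNIV_PAGE_SIZE, buf, NULL);
+*/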
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used by threads
+that handle completed requests. The aio array of pending requests is divided
+into segments (see os0file.cc for more info). The thread specifies which
+segment it wants to wait for. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment); /*!< in: the number of the segment in the aio
+ array to wait for */
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+ ulint space_id); /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+/**********************************************************************//**
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose); /*!< in: FIL_TABLESPACE, FIL_LOG */
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void);
+/*==============*/
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+ fil_addr_t addr); /*!< in: address */
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type); /*!< in: type */
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+ const byte* page); /*!< in: file page */
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace is being deleted.
+@return TRUE if being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_is_being_deleted(
+/*============================*/
+ ulint id); /*!< in: space id */
+
+/********************************************************************//**
+Delete the tablespace file and any related files like .cfg.
+This should not be called for temporary tables. */
+UNIV_INTERN
+void
+fil_delete_file(
+/*============*/
+ const char* path); /*!< in: filepath of the ibd tablespace */
+
+/** Callback functor. */
+struct PageCallback {
+
+ /**
+ Default constructor */
+ PageCallback()
+ :
+ m_zip_size(),
+ m_page_size(),
+ m_filepath() UNIV_NOTHROW {}
+
+ virtual ~PageCallback() UNIV_NOTHROW {}
+
+ /**
+ Called for page 0 in the tablespace file at the start.
+ @param file_size - size of the file in bytes
+ @param block - contents of the first page in the tablespace file
+ @retval DB_SUCCESS or error code.*/
+ virtual dberr_t init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW = 0;
+
+ /**
+ Called for every page in the tablespace. If the page was not
+ updated then its state must be set to BUF_PAGE_NOT_USED. For
+ compressed tables the page descriptor memory will be at offset:
+ block->frame + UNIV_PAGE_SIZE;
+ @param offset - physical offset within the file
+ @param block - block read from file, note it is not from the buffer pool
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t operator()(
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW = 0;
+
+ /**
+	Set the name of the physical file and the file handle that is used
+	to open it, for the file that is being iterated over.
+	@param filename - the physical name of the tablespace file.
+ @param file - OS file handle */
+ void set_file(const char* filename, os_file_t file) UNIV_NOTHROW
+ {
+ m_file = file;
+ m_filepath = filename;
+ }
+
+ /**
+ @return the space id of the tablespace */
+ virtual ulint get_space_id() const UNIV_NOTHROW = 0;
+
+ /** The compressed page size
+ @return the compressed page size */
+ ulint get_zip_size() const
+ {
+ return(m_zip_size);
+ }
+
+ /**
+	Set the tablespace compressed page size.
+	@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */
+ dberr_t set_zip_size(const buf_frame_t* page) UNIV_NOTHROW;
+
+	/** The tablespace page size
+	@return the tablespace page size */
+ ulint get_page_size() const
+ {
+ return(m_page_size);
+ }
+
+ /** Compressed table page size */
+ ulint m_zip_size;
+
+ /** The tablespace page size. */
+ ulint m_page_size;
+
+ /** File handle to the tablespace */
+ os_file_t m_file;
+
+ /** Physical file path. */
+ const char* m_filepath;
+
+protected:
+ // Disable copying
+ PageCallback(const PageCallback&);
+ PageCallback& operator=(const PageCallback&);
+};
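+/* Usage sketch (illustrative; the subclass name and members are
+assumptions): a minimal PageCallback that records the space id and zip
+size from page 0 and counts the pages it visits.
+
+	struct PageCounter : public PageCallback {
+
+		PageCounter() : m_space_id(), m_n_pages() {}
+
+		virtual dberr_t init(
+			os_offset_t		file_size,
+			const buf_block_t*	block) UNIV_NOTHROW
+		{
+			m_space_id = fsp_header_get_space_id(block->frame);
+			return(set_zip_size(block->frame));
+		}
+
+		virtual dberr_t operator()(
+			os_offset_t	offset,
+			buf_block_t*	block) UNIV_NOTHROW
+		{
+			++m_n_pages;
+			return(DB_SUCCESS);
+		}
+
+		virtual ulint get_space_id() const UNIV_NOTHROW
+		{
+			return(m_space_id);
+		}
+
+		ulint	m_space_id;
+		ulint	m_n_pages;
+	};
+*/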
+
+/********************************************************************//**
+Iterate over all the pages in the tablespace.
+@param table - the table definition in the server
+@param n_io_buffers - number of blocks to read and write together
+@param callback - functor that will do the page updates
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fil_tablespace_iterate(
+/*===================*/
+ dict_table_t* table,
+ ulint n_io_buffers,
+ PageCallback& callback)
+ __attribute__((nonnull, warn_unused_result));
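+/* Usage sketch, continuing the PageCounter example above (illustrative):
+
+	PageCounter	counter;
+	dberr_t		err = fil_tablespace_iterate(table, 64, counter);
+*/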
+
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return space id, ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+ const char* name); /*!< in: table name in the standard
+ 'databasename/tablename' format */
+
+/**
+Iterate over all the spaces in the space list and fetch the
+tablespace names. It will return a copy of the name that must be
+freed by the caller using: delete[].
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fil_get_space_names(
+/*================*/
+ space_name_list_t& space_name_list)
+ /*!< in/out: Vector for collecting the names. */
+ __attribute__((warn_unused_result));
+
+/****************************************************************//**
+Generate redo logs for swapping two .ibd files */
+UNIV_INTERN
+void
+fil_mtr_rename_log(
+/*===============*/
+ ulint old_space_id, /*!< in: tablespace id of the old
+ table. */
+ const char* old_name, /*!< in: old table name */
+ ulint new_space_id, /*!< in: tablespace id of the new
+ table */
+ const char* new_name, /*!< in: new table name */
+ const char* tmp_name, /*!< in: temp table name used while
+ swapping */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+
+/*******************************************************************//**
+Finds the given page_no of the given space id in the doublewrite buffer,
+and copies it to the corresponding .ibd file.
+@return true if the copy was successful, false otherwise. */
+bool
+fil_user_tablespace_restore_page(
+/*==============================*/
+	fsp_open_info*	fsp,		/*!< in: contains space id and .ibd
+					file information */
+	ulint		page_no);	/*!< in: page_no to obtain from the
+					doublewrite buffer */
+
+#endif /* !UNIV_INNOCHECKSUM */
+#endif /* fil0fil_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
new file mode 100644
index 00000000000..a587ccc9f20
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.h
@@ -0,0 +1,747 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "fsp0types.h"
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */
+
+/** Width of the POST_ANTELOPE flag */
+#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1
+/** Number of flag bits used to indicate the tablespace zip page size */
+#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4
+/** Width of the ATOMIC_BLOBS flag. The ability to break up a long
+column into an in-record prefix and an externally stored part is available
+to the two Barracuda row formats COMPRESSED and DYNAMIC. */
+#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1
+/** Number of flag bits used to indicate the tablespace page size */
+#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4
+/** Width of the DATA_DIR flag. This flag indicates that the tablespace
+is found in a remote location, not the default data directory. */
+#define FSP_FLAGS_WIDTH_DATA_DIR 1
+/** Width of all the currently known tablespace flags */
+#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** A mask of all the known/used bits in tablespace flags */
+#define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH))
+
+/** Zero relative shift position of the POST_ANTELOPE field */
+#define FSP_FLAGS_POS_POST_ANTELOPE 0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \
+ + FSP_FLAGS_WIDTH_POST_ANTELOPE)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \
+ + FSP_FLAGS_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the DATA_DIR field */
+#define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \
+ + FSP_FLAGS_WIDTH_PAGE_SSIZE)
+/** Zero relative shift position of the start of the UNUSED bits */
+#define FSP_FLAGS_POS_UNUSED (FSP_FLAGS_POS_DATA_DIR \
+ + FSP_FLAGS_WIDTH_DATA_DIR)
+
+/** Bit mask of the POST_ANTELOPE field */
+#define FSP_FLAGS_MASK_POST_ANTELOPE \
+ ((~(~0 << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \
+ << FSP_FLAGS_POS_POST_ANTELOPE)
+/** Bit mask of the ZIP_SSIZE field */
+#define FSP_FLAGS_MASK_ZIP_SSIZE \
+ ((~(~0 << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \
+ << FSP_FLAGS_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_MASK_ATOMIC_BLOBS \
+ ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \
+ << FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Bit mask of the PAGE_SSIZE field */
+#define FSP_FLAGS_MASK_PAGE_SSIZE \
+ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \
+ << FSP_FLAGS_POS_PAGE_SSIZE)
+/** Bit mask of the DATA_DIR field */
+#define FSP_FLAGS_MASK_DATA_DIR \
+ ((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \
+ << FSP_FLAGS_POS_DATA_DIR)
+
+/** Return the value of the POST_ANTELOPE field */
+#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \
+ ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \
+ >> FSP_FLAGS_POS_POST_ANTELOPE)
+/** Return the value of the ZIP_SSIZE field */
+#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \
+ >> FSP_FLAGS_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \
+ ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \
+ >> FSP_FLAGS_POS_ATOMIC_BLOBS)
+/** Return the value of the PAGE_SSIZE field */
+#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \
+ ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \
+ >> FSP_FLAGS_POS_PAGE_SSIZE)
+/** Return the value of the DATA_DIR field */
+#define FSP_FLAGS_HAS_DATA_DIR(flags) \
+ ((flags & FSP_FLAGS_MASK_DATA_DIR) \
+ >> FSP_FLAGS_POS_DATA_DIR)
+/** Return the contents of the UNUSED bits */
+#define FSP_FLAGS_GET_UNUSED(flags) \
+ (flags >> FSP_FLAGS_POS_UNUSED)
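+/* Worked example (editorial, under the field layout above): flags == 0x29
+(binary 101001) decodes as POST_ANTELOPE == 1, ZIP_SSIZE == 4,
+ATOMIC_BLOBS == 1, PAGE_SSIZE == 0 and DATA_DIR == 0, i.e. a COMPRESSED
+tablespace with an 8K compressed page size and the default logical page
+size. */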
+
+/** Set a PAGE_SSIZE into the correct bits in a given
+tablespace flags. */
+#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \
+ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE))
+
+/* @} */
+
+/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
+
+/** Offset of the space header within a file page */
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA
+
+/* The data structures in files are defined just as byte strings in C */
+typedef byte fsp_header_t;
+typedef byte xdes_t;
+
+/* SPACE HEADER
+ ============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID 0 /* space id */
+#define FSP_NOT_USED 4 /* this field contained a value up to
+ which we know that the modifications
+ in the database have been flushed to
+ the file space; not used now */
+#define FSP_SIZE 8 /* Current size of the space in
+ pages */
+#define FSP_FREE_LIMIT 12 /* Minimum page number for which the
+ free list has not been initialized:
+ the pages >= this limit are, by
+ definition, free; note that in a
+ single-table tablespace where size
+ < 64 pages, this number is 64, i.e.,
+ we have initialized the space
+ about the first extent, but have not
+				physically allocated those pages to the
+ file */
+#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to
+ dict_table_t::flags */
+#define FSP_FRAG_N_USED 20 /* number of used pages in the
+ FSP_FREE_FRAG list */
+#define FSP_FREE 24 /* list of free extents */
+#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents not
+ belonging to any segment */
+#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents not belonging
+ to any segment */
+#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE)
+ /* 8 bytes which give the first unused
+ segment id */
+#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where all the segment inode
+ slots are reserved */
+#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where not all the segment
+ header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define FSP_FREE_ADD 4 /* this many free extents are added
+ to the free list from above
+ FSP_FREE_LIMIT at a time */
+/* @} */
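+/* Sketch (assuming the mach_read_from_4() helper from mach0data.h): the
+header fields are big-endian byte strings, so e.g. the current space size
+in pages can be read from page 0 with
+
+	mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE);
+*/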
+
+#ifndef UNIV_INNOCHECKSUM
+
+/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */
+
+/* FILE SEGMENT INODE
+ ==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef byte fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA
+ /* the list node for linking
+ segment inode pages */
+
+#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0,
+ it means that the header is unused */
+#define FSEG_NOT_FULL_N_USED 8
+ /* number of used segment pages in
+ the FSEG_NOT_FULL list */
+#define FSEG_FREE 12
+ /* list of free extents of this
+ segment */
+#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents */
+#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents */
+#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE)
+ /* magic number used in debugging */
+#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE)
+ /* array of individual pages
+ belonging to this segment in fsp
+ fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2)
+ /* number of slots in the array for
+ the fragment pages */
+#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its
+ page number within space, FIL_NULL
+ means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE \
+ (16 + 3 * FLST_BASE_NODE_SIZE \
+ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+#define FSP_SEG_INODES_PER_PAGE(zip_size) \
+ (((zip_size ? zip_size : UNIV_PAGE_SIZE) \
+ - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
+ /* Number of segment inodes which fit on a
+ single page */
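+/* Worked example (editorial) for the default 16K page size:
+FSEG_ARR_OFFSET == 38 + 12 == 50 and
+FSEG_INODE_SIZE == 16 + 3 * 16 + 32 * 4 == 192, so
+(16384 - 50 - 10) / 192 == 85 segment inodes fit on one page. */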
+
+#define FSEG_MAGIC_N_VALUE 97937874
+
+#define FSEG_FILLFACTOR 8 /* If this value is x, then if
+ the number of unused but reserved
+ pages in a segment is less than
+ reserved pages * 1/x, and there are
+ at least FSEG_FRAG_LIMIT used pages,
+ then we allow a new empty extent to
+ be added to the segment in
+ fseg_alloc_free_page. Otherwise, we
+ use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
+ /* If the segment has >= this many
+ used pages, it may be expanded by
+ allocating extents to the segment;
+ until that only individual fragment
+ pages are allocated from the space */
+
+#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment
+ is at least this many extents, we
+ allow extents to be put to the free
+ list of the extent: at most
+ FSEG_FREE_LIST_MAX_LEN many */
+#define FSEG_FREE_LIST_MAX_LEN 4
+/* @} */
+
+/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */
+
+/* EXTENT DESCRIPTOR
+ =================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple versions to clean. */
+
+/*-------------------------------------*/
+#define XDES_ID 0 /* The identifier of the segment
+ to which this extent belongs */
+#define XDES_FLST_NODE 8 /* The list node data structure
+ for the descriptors */
+#define XDES_STATE (FLST_NODE_SIZE + 8)
+ /* contains state information
+ of the extent */
+#define XDES_BITMAP (FLST_NODE_SIZE + 12)
+ /* Descriptor bitmap of the pages
+ in the extent */
+/*-------------------------------------*/
+
+#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */
+#define XDES_FREE_BIT 0 /* Index of the bit which tells if
+ the page is free */
+#define XDES_CLEAN_BIT 1 /* NOTE: currently not used!
+ Index of the bit which tells if
+ there are old versions of tuples
+ on the page */
+/* States of a descriptor */
+#define XDES_FREE 1 /* extent is in free list of space */
+#define XDES_FREE_FRAG 2 /* extent is in free fragment list of
+ space */
+#define XDES_FULL_FRAG 3 /* extent is in full fragment list of
+ space */
+#define XDES_FSEG 4 /* extent belongs to a segment */
+
+/** File extent data structure size in bytes. */
+#define XDES_SIZE \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
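+/* Worked example (editorial) for the default 16K page size: XDES_BITMAP
+is at byte 12 + 12 == 24, and FSP_EXTENT_SIZE == 64 pages need
+64 * 2 == 128 bitmap bits == 16 bytes, so XDES_SIZE == 24 + 16 == 40. */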
+
+/** File extent data structure size in bytes for MAX page size. */
+#define XDES_SIZE_MAX \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE))
+
+/** File extent data structure size in bytes for MIN page size. */
+#define XDES_SIZE_MIN \
+ (XDES_BITMAP \
+ + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE))
+
+/** Offset of the descriptor array on a descriptor page */
+#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+/* @} */
+
+/**********************************************************************//**
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+ page_t* page); /*!< in: header page (page 0 in the tablespace) */
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Writes the space id and flags to a tablespace header. The flags contain
+row type, physical/compressed page size, and logical/uncompressed page
+size of the tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
+ 0, or table->flags if newer than COMPACT */
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint size, /*!< in: current size in blocks */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint size_inc, /*!< in: size increment in pages */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
+@param[in/out] seg_header segment header
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR
+@param[in/out] mtr mini-transaction
+@return X-latched block, or NULL if no page could be allocated */
+#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \
+ fseg_alloc_free_page_general(seg_header, hint, direction, \
+ FALSE, mtr, mtr)
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@retval NULL if no page could be allocated
+@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
+(init_mtr == mtr, or the page was not previously freed in mtr)
+@retval block (not allocated or initialized) otherwise */
+UNIV_INTERN
+buf_block_t*
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in/out: segment header */
+ ulint hint, /*!< in: hint of which page would be
+ desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ in which the page should be initialized.
+ If init_mtr!=mtr, but the page is already
+ latched in mtr, do not initialize the page. */
+ __attribute__((warn_unused_result, nonnull));
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ ulint* n_reserved,/*!< out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
+ ulint space, /*!< in: space id */
+ ulint n_ext, /*!< in: number of extents to reserve */
+ ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr); /*!< in: mini-transaction */
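+/* Usage sketch (space and mtr are assumptions) of the reserve/use/release
+pattern described above:
+
+	ulint	n_reserved;
+
+	if (fsp_reserve_free_extents(&n_reserved, space, 2,
+				     FSP_NORMAL, mtr)) {
+		... perform the multi-page operation ...
+		fil_space_release_free_extents(space, n_reserved);
+	}
+*/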
+/**********************************************************************//**
+This function should be used to get information on how much new data we
+can still insert into the tablespace without running out of space. Only
+free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ ulint space); /*!< in: space id */
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Checks if a single page of a segment is free.
+@return true if free */
+UNIV_INTERN
+bool
+fseg_page_is_free(
+/*==============*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page) /*!< in: page offset */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ buf_block_t* block); /*!< in: block or NULL */
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+ ulint space); /*!< in: space id */
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+ ulint space); /*!< in: space id */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/********************************************************************//**
+Validate and return the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return true if check ok */
+UNIV_INLINE
+bool
+fsp_flags_is_valid(
+/*===============*/
+ ulint flags) /*!< in: tablespace flags */
+ __attribute__((warn_unused_result, const));
+/********************************************************************//**
+Determine if the tablespace is compressed from dict_table_t::flags.
+@return TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+ ulint flags); /*!< in: tablespace flags */
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page.
+@return descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset); /*!< in: page offset */
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset);/*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides.
+@return descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset); /*!< in: page offset */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags. A tablespace has only one
+physical page size whether that page is compressed or not.
+@return compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*====================*/
+ ulint flags); /*!< in: tablespace flags */
+/********************************************************************//**
+Extract the page size from tablespace flags.
+@return page size of the tablespace in bytes */
+UNIV_INLINE
+ulint
+fsp_flags_get_page_size(
+/*====================*/
+ ulint flags); /*!< in: tablespace flags */
+
+#ifndef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic
new file mode 100644
index 00000000000..0d81e817cc9
--- /dev/null
+++ b/storage/innobase/include/fsp0fsp.ic
@@ -0,0 +1,314 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.ic
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return((page_no & (UNIV_PAGE_SIZE - 1)) == FSP_XDES_OFFSET);
+ }
+
+ return((page_no & (zip_size - 1)) == FSP_XDES_OFFSET);
+}
+
+/********************************************************************//**
+Validate the tablespace flags, which are stored in the
+tablespace header at offset FSP_SPACE_FLAGS. They should be 0 for
+ROW_FORMAT=COMPACT and ROW_FORMAT=REDUNDANT. The newer row formats,
+COMPRESSED and DYNAMIC, use a file format > Antelope so they should
+have a file format number plus the DICT_TF_COMPACT bit set.
+@return true if check ok */
+UNIV_INLINE
+bool
+fsp_flags_is_valid(
+/*===============*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(flags);
+ ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+ ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
+ ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+ ulint unused = FSP_FLAGS_GET_UNUSED(flags);
+
+ DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false););
+
+ /* fsp_flags is zero unless atomic_blobs is set. */
+ /* Make sure there are no bits that we do not know about. */
+ if (unused != 0 || flags == 1) {
+ return(false);
+ } else if (post_antelope) {
+ /* The Antelope row formats REDUNDANT and COMPACT did
+ not use tablespace flags, so this flag and the entire
+ 4-byte field is zero for Antelope row formats. */
+
+ if (!atomic_blobs) {
+ return(false);
+ }
+ }
+
+ if (!atomic_blobs) {
+ /* Barracuda row formats COMPRESSED and DYNAMIC build on
+ the page structure introduced for the COMPACT row format
+ by allowing long fields to be broken into prefix and
+ externally stored parts. */
+
+ if (post_antelope || zip_ssize != 0) {
+ return(false);
+ }
+
+ } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ return(false);
+ } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) {
+
+ /* The page size field can be used for any row type, or it may
+ be zero for the original 16k page size.
+ Validate that the page shift size is within the allowed range. */
+
+ return(false);
+
+ } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) {
+ return(false);
+ }
+
+#if UNIV_FORMAT_MAX != UNIV_FORMAT_B
+# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations."
+#endif
+
+ /* The DATA_DIR field can be used for any row type so there is
+ nothing here to validate. */
+
+ return(true);
+}
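
To make the bit layout concrete, here is a minimal standalone sketch of composing a valid Barracuda flags word, assuming the 5.6-era field positions (POST_ANTELOPE at bit 0, ZIP_SSIZE in bits 1-4, ATOMIC_BLOBS at bit 5, PAGE_SSIZE in bits 6-9); the real FSP_FLAGS_SET_* macros encapsulate these shifts:

    /* Hypothetical illustration; the bit positions are assumptions
    mirroring the FSP_FLAGS_* macros, not a copy of them. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long flags = 0;

        flags |= 1UL << 0;  /* post_antelope: newer than Antelope */
        flags |= 4UL << 1;  /* zip_ssize = 4, i.e. 8K compressed pages */
        flags |= 1UL << 5;  /* atomic_blobs: Barracuda row formats */
        /* page_ssize stays 0 for the original 16K page size */

        printf("flags = 0x%lx\n", flags); /* prints flags = 0x29 */
        return 0;
    }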
+
+/********************************************************************//**
+Determine if the tablespace is compressed from dict_table_t::flags.
+@return TRUE if compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_compressed(
+/*====================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return(FSP_FLAGS_GET_ZIP_SSIZE(flags) != 0);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************************//**
+Extract the zip size from tablespace flags.
+@return compressed page size of the file-per-table tablespace in bytes,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_zip_size(
+/*===================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ ulint zip_size = 0;
+ ulint ssize = FSP_FLAGS_GET_ZIP_SSIZE(flags);
+
+ /* Convert from a 'log2 minus 9' to a page size in bytes. */
+ if (ssize) {
+ zip_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
+
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+ }
+
+ return(zip_size);
+}
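
As a worked example of the 'log2 minus 9' encoding, assuming UNIV_ZIP_SIZE_MIN is 1024 on this branch (so UNIV_ZIP_SIZE_MIN >> 1 is 512): an 8192-byte compressed page is stored as ssize = log2(8192) - 9 = 4, and the function recovers 512 << 4 = 8192. A standalone check of the arithmetic:

    #include <assert.h>

    /* Assumption: UNIV_ZIP_SIZE_MIN >> 1 == 512, per the code above. */
    static unsigned long ssize_to_bytes(unsigned long ssize)
    {
        return ssize ? 512UL << ssize : 0; /* 0 means not compressed */
    }

    int main(void)
    {
        assert(ssize_to_bytes(1) == 1024);  /* 1K  */
        assert(ssize_to_bytes(4) == 8192);  /* 8K  */
        assert(ssize_to_bytes(5) == 16384); /* 16K */
        return 0;
    }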
+
+/********************************************************************//**
+Extract the page size from tablespace flags.
+@return page size of the tablespace in bytes */
+UNIV_INLINE
+ulint
+fsp_flags_get_page_size(
+/*====================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ ulint page_size = 0;
+ ulint ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
+
+ /* Convert from a 'log2 minus 9' to a page size in bytes. */
+ if (UNIV_UNLIKELY(ssize)) {
+ page_size = ((UNIV_ZIP_SIZE_MIN >> 1) << ssize);
+
+ ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
+ } else {
+ /* If the page size was not stored, then it is the
+ original 16k. */
+ page_size = UNIV_PAGE_SIZE_ORIG;
+ }
+
+ return(page_size);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/********************************************************************//**
+Add the page size to the tablespace flags.
+@return tablespace flags after page size is added */
+UNIV_INLINE
+ulint
+fsp_flags_set_page_size(
+/*====================*/
+ ulint flags, /*!< in: tablespace flags */
+ ulint page_size) /*!< in: page size in bytes */
+{
+ ulint ssize = 0;
+ ulint shift;
+
+ /* Page size should be >= UNIV_PAGE_SIZE_MIN */
+ ut_ad(page_size >= UNIV_PAGE_SIZE_MIN);
+ ut_ad(page_size <= UNIV_PAGE_SIZE_MAX);
+
+ if (page_size == UNIV_PAGE_SIZE_ORIG) {
+ ut_ad(0 == FSP_FLAGS_GET_PAGE_SSIZE(flags));
+ return(flags);
+ }
+
+ for (shift = UNIV_PAGE_SIZE_SHIFT_MAX;
+ shift >= UNIV_PAGE_SIZE_SHIFT_MIN;
+ shift--) {
+ ulint mask = (1 << shift);
+ if (page_size & mask) {
+ ut_ad(!(page_size & ~mask));
+ ssize = shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1;
+ break;
+ }
+ }
+
+ ut_ad(ssize);
+ ut_ad(ssize <= UNIV_PAGE_SSIZE_MAX);
+
+ flags = FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize);
+
+ ut_ad(fsp_flags_is_valid(flags));
+
+ return(flags);
+}
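
The setter and fsp_flags_get_page_size() are inverses: the original 16K size is encoded as ssize 0 (the flags are returned untouched), and any other size uses the same 'log2 minus 9' scheme. A hedged usage sketch, assuming the InnoDB internal headers are available:

    /* Sketch only; needs fsp0fsp.h and a debug build for ut_ad(). */
    ulint flags = fsp_flags_set_page_size(0, 8192);

    ut_ad(fsp_flags_get_page_size(flags) == 8192);
    ut_ad(fsp_flags_set_page_size(0, UNIV_PAGE_SIZE_ORIG) == 0);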
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page.
+@return descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset) /*!< in: page offset */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (zip_size == 0) {
+ return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
+ / FSP_EXTENT_SIZE);
+ } else {
+ return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
+ }
+}
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset) /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+{
+ ut_ad(offset < FSP_EXTENT_SIZE);
+ ut_ad(bit == XDES_FREE_BIT || bit == XDES_CLEAN_BIT);
+
+ ulint index = bit + XDES_BITS_PER_PAGE * offset;
+
+ ulint bit_index = index % 8;
+ ulint byte_index = index / 8;
+
+ return(ut_bit_get_nth(
+ mach_read_ulint(descr + XDES_BITMAP + byte_index,
+ MLOG_1BYTE),
+ bit_index));
+}
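
To see the indexing at work, assume XDES_BITS_PER_PAGE is 2 (one free bit and one clean bit per page): for page offset 37 within the extent, XDES_FREE_BIT (value 0) lands at index 0 + 2 * 37 = 74, which is byte 9, bit 2 of the bitmap. A standalone sketch of that arithmetic:

    #include <stdio.h>

    /* Assumption: two descriptor bits per page, as on this branch. */
    enum { BITS_PER_PAGE = 2 };

    int main(void)
    {
        unsigned bit = 0;     /* XDES_FREE_BIT */
        unsigned offset = 37; /* page within the extent */
        unsigned index = bit + BITS_PER_PAGE * offset;

        printf("byte %u, bit %u\n", index / 8, index % 8); /* byte 9, bit 2 */
        return 0;
    }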
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides.
+@return descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset) /*!< in: page offset */
+{
+#ifndef DOXYGEN /* Doxygen gets confused by these */
+# if UNIV_PAGE_SIZE_MAX <= XDES_ARR_OFFSET \
+ + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX) \
+ * XDES_SIZE_MAX
+# error
+# endif
+# if UNIV_ZIP_SIZE_MIN <= XDES_ARR_OFFSET \
+ + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE_MIN) \
+ * XDES_SIZE_MIN
+# error
+# endif
+#endif /* !DOXYGEN */
+
+ ut_ad(UNIV_PAGE_SIZE > XDES_ARR_OFFSET
+ + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+ ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+ + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+ * XDES_SIZE);
+
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (zip_size == 0) {
+ return(ut_2pow_round(offset, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(zip_size > XDES_ARR_OFFSET
+ + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+ return(ut_2pow_round(offset, zip_size));
+ }
+}
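
Concretely, for uncompressed tablespaces a descriptor page covers UNIV_PAGE_SIZE consecutive pages: with 16384-byte pages, the descriptor of page 40000 lives on page ut_2pow_round(40000, 16384) = 32768, while in a 1024-byte compressed tablespace the same page's descriptor is on page 39936 (40000 rounded down to a multiple of 1024).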
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
new file mode 100644
index 00000000000..94fd908ab0c
--- /dev/null
+++ b/storage/innobase/include/fsp0types.h
@@ -0,0 +1,116 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#ifndef fsp0types_h
+#define fsp0types_h
+
+#include "univ.i"
+
+#include "fil0fil.h" /* for FIL_PAGE_DATA */
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page) */
+/* @{ */
+#define FSP_UP ((byte)111) /*!< alphabetically upwards */
+#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /*!< no order */
+/* @} */
+
+/** File space extent size (one megabyte) in pages */
+#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE)
+
+/** File space extent size (one megabyte) in pages for MAX page size */
+#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX)
+
+/** File space extent size (one megabyte) in pages for MIN page size */
+#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
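
For the default 16 KiB page size this works out to 1048576 / 16384 = 64 pages per extent; smaller page sizes give proportionally more pages per extent (for example 256 pages at 4 KiB).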
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */
+#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10 /*!< Length of the file segment
+ header, in bytes */
+/* @} */
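
Taken together, the three offsets describe a 10-byte big-endian triple. A hedged sketch of decoding one, assuming the mach_read_from_4()/mach_read_from_2() byte readers from mach0data.h:

    /* Sketch only; header points at an fseg_header_t in a buffer page. */
    ulint space_id = mach_read_from_4(header + FSEG_HDR_SPACE);
    ulint page_no  = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
    ulint offset   = mach_read_from_2(header + FSEG_HDR_OFFSET);
    /* 4 + 4 + 2 bytes = FSEG_HEADER_SIZE (10) in total. */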
+
+/** Flags for fsp_reserve_free_extents @{ */
+#define FSP_NORMAL 1000000
+#define FSP_UNDO 2000000
+#define FSP_CLEANING 3000000
+/* @} */
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */
+/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0 /*!< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET 1 /*!< insert buffer bitmap */
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO 2 /*!< in every tablespace */
+ /* The following pages exist
+ in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO 3 /*!< insert buffer
+ header page, in
+ tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /*!< insert buffer
+ B-tree root page in
+ tablespace 0 */
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5 /*!< transaction
+ system header, in
+ tablespace 0 */
+#define FSP_FIRST_RSEG_PAGE_NO 6 /*!< first rollback segment
+ page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO 7 /*!< data dictionary header
+ page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+#endif /* fsp0types_h */
diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h
new file mode 100644
index 00000000000..b2380f78b39
--- /dev/null
+++ b/storage/innobase/include/fts0ast.h
@@ -0,0 +1,339 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0ast.h
+The FTS query parser (AST) abstract syntax tree routines
+
+Created 2007/03/16/03 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FST0AST_H
+#define INNOBASE_FST0AST_H
+
+#include "mem0mem.h"
+#include "ha_prototypes.h"
+
+/* The type of AST Node */
+enum fts_ast_type_t {
+ FTS_AST_OPER, /*!< Operator */
+ FTS_AST_NUMB, /*!< Number */
+ FTS_AST_TERM, /*!< Term (or word) */
+ FTS_AST_TEXT, /*!< Text string */
+ FTS_AST_LIST, /*!< Expression list */
+ FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */
+};
+
+/* The FTS query operators that we support */
+enum fts_ast_oper_t {
+ FTS_NONE, /*!< No operator */
+
+ FTS_IGNORE, /*!< Ignore rows that contain
+ this word */
+
+ FTS_EXIST, /*!< Include rows that contain
+ this word */
+
+ FTS_NEGATE, /*!< Include rows that contain
+ this word but rank them
+ lower*/
+
+ FTS_INCR_RATING, /*!< Increase the rank for this
+ word*/
+
+ FTS_DECR_RATING, /*!< Decrease the rank for this
+ word*/
+
+ FTS_DISTANCE, /*!< Proximity distance */
+ FTS_IGNORE_SKIP, /*!< Transient node operator
+ signifies that this is a
+ FTS_IGNORE node, and ignored in
+ the first pass of
+ fts_ast_visit() */
+ FTS_EXIST_SKIP /*!< Transient node operator
+ signifies that this is a
+ FTS_EXIST node, and ignored in
+ the first pass of
+ fts_ast_visit() */
+};
+
+/* Data types used by the FTS parser */
+struct fts_lexer_t;
+struct fts_ast_node_t;
+struct fts_ast_state_t;
+struct fts_ast_string_t;
+
+typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*);
+
+/********************************************************************
+Parse the string using the lexer set up within state. */
+int
+fts_parse(
+/*======*/
+ /* out: 0 on OK, 1 on error */
+ fts_ast_state_t* state); /*!< in: ast state instance.*/
+
+/********************************************************************
+Create an AST operator node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_oper_t oper); /*!< in: ast operator */
+/********************************************************************
+Create an AST term node, makes a copy of ptr */
+extern
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: term string */
+/********************************************************************
+Create an AST text node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_text(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ const fts_ast_string_t* ptr); /*!< in: text string */
+/********************************************************************
+Create an AST expr list node */
+extern
+fts_ast_node_t*
+fts_ast_create_node_list(
+/*=====================*/
+ void* arg, /*!< in: ast state */
+ fts_ast_node_t* expr); /*!< in: ast expr */
+/********************************************************************
+Create a sub-expression list node. This function takes ownership of
+expr and is responsible for deleting it. */
+extern
+fts_ast_node_t*
+fts_ast_create_node_subexp_list(
+/*============================*/
+ /* out: new node */
+ void* arg, /*!< in: ast state instance */
+ fts_ast_node_t* expr); /*!< in: ast expr instance */
+/********************************************************************
+Set the wildcard attribute of a term.*/
+extern
+void
+fts_ast_term_set_wildcard(
+/*======================*/
+ fts_ast_node_t* node); /*!< in: term to change */
+/********************************************************************
+Set the proximity attribute of a text node. */
+
+void
+fts_ast_term_set_distance(
+/*======================*/
+ fts_ast_node_t* node, /*!< in/out: text node */
+ ulint distance); /*!< in: the text proximity
+ distance */
+/********************************************************************//**
+Free a fts_ast_node_t instance.
+@return next node to free */
+UNIV_INTERN
+fts_ast_node_t*
+fts_ast_free_node(
+/*==============*/
+ fts_ast_node_t* node); /*!< in: node to free */
+/********************************************************************
+Add a sub-expression to an AST*/
+extern
+fts_ast_node_t*
+fts_ast_add_node(
+/*=============*/
+ fts_ast_node_t* list, /*!< in: list node instance */
+ fts_ast_node_t* node); /*!< in: (sub) expr to add */
+/********************************************************************
+Print the AST node recursively.*/
+extern
+void
+fts_ast_node_print(
+/*===============*/
+ fts_ast_node_t* node); /*!< in: ast node to print */
+/********************************************************************
+For tracking node allocations, in case there is an error during parsing. */
+extern
+void
+fts_ast_state_add_node(
+/*===================*/
+ fts_ast_state_t*state, /*!< in: ast state instance */
+ fts_ast_node_t* node); /*!< in: node to add to state */
+/********************************************************************
+Free node and expr allocations.*/
+extern
+void
+fts_ast_state_free(
+/*===============*/
+ fts_ast_state_t*state); /*!< in: state instance
+ to free */
+/******************************************************************//**
+Traverse the AST - in-order traversal.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit(
+/*==========*/
+ fts_ast_oper_t oper, /*!< in: FTS operator */
+ fts_ast_node_t* node, /*!< in: instance to traverse*/
+ fts_ast_callback visitor, /*!< in: callback */
+ void* arg, /*!< in: callback arg */
+ bool* has_ignore) /*!< out: whether we encountered
+ and ignored an operator while
+ processing; currently we only
+ ignore the FTS_IGNORE operator */
+ __attribute__((nonnull, warn_unused_result));
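
A minimal sketch of a visitor matching the fts_ast_callback typedef above, counting term nodes; the field names follow the fts_ast_node_t definition further down in this header, and a real visitor would live in the query code:

    /* Hypothetical callback; arg is assumed to point at a ulint counter. */
    static dberr_t
    count_terms_visitor(
        fts_ast_oper_t  oper,  /* operator in effect (unused here) */
        fts_ast_node_t* node,  /* node currently being visited */
        void*           arg)   /* user argument passed to fts_ast_visit() */
    {
        if (node->type == FTS_AST_TERM) {
            ++*(ulint*) arg;
        }

        return(DB_SUCCESS);
    }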
+/*****************************************************************//**
+Process (nested) sub-expression, create a new result set to store the
+sub-expression result by processing nodes under current sub-expression
+list. Merge the sub-expression result with that of parent expression list.
+@return DB_SUCCESS if all went well */
+UNIV_INTERN
+dberr_t
+fts_ast_visit_sub_exp(
+/*==================*/
+ fts_ast_node_t* node, /*!< in: instance to traverse*/
+ fts_ast_callback visitor, /*!< in: callback */
+ void* arg) /*!< in: callback arg */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************
+Create a lex instance.*/
+UNIV_INTERN
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+ ibool boolean_mode, /*!< in: query type */
+ const byte* query, /*!< in: query string */
+ ulint query_len) /*!< in: query string len */
+ __attribute__((nonnull, malloc, warn_unused_result));
+/********************************************************************
+Free an fts_lexer_t instance.*/
+UNIV_INTERN
+void
+fts_lexer_free(
+/*===========*/
+ fts_lexer_t* fts_lexer) /*!< in: lexer instance to
+ free */
+ __attribute__((nonnull));
+
+/**
+Create an ast string object, with NUL-terminator, so the string
+has one more byte than len
+@param[in] str pointer to string
+@param[in] len length of the string
+@return ast string with NUL-terminator */
+UNIV_INTERN
+fts_ast_string_t*
+fts_ast_string_create(
+ const byte* str,
+ ulint len);
+
+/**
+Free an ast string instance
+@param[in,out] ast_str string to free */
+UNIV_INTERN
+void
+fts_ast_string_free(
+ fts_ast_string_t* ast_str);
+
+/**
+Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul
+@param[in] str string to translate
+@param[in] base the base
+@return translated number */
+UNIV_INTERN
+ulint
+fts_ast_string_to_ul(
+ const fts_ast_string_t* ast_str,
+ int base);
+
+/**
+Print the ast string
+@param[in] str string to print */
+UNIV_INTERN
+void
+fts_ast_string_print(
+ const fts_ast_string_t* ast_str);
+
+/* String of length len.
+We always store the string of length len with a terminating '\0',
+regardless of whether there is any 0x00 in the string itself */
+struct fts_ast_string_t {
+ /*!< Pointer to string. */
+ byte* str;
+
+ /*!< Length of the string. */
+ ulint len;
+};
+
+/* Query term type */
+struct fts_ast_term_t {
+ fts_ast_string_t* ptr; /*!< Pointer to term string.*/
+ ibool wildcard; /*!< TRUE if wild card set.*/
+};
+
+/* Query text type */
+struct fts_ast_text_t {
+ fts_ast_string_t* ptr; /*!< Pointer to text string.*/
+ ulint distance; /*!< > 0 if proximity distance
+ set */
+};
+
+/* The list of nodes in an expr list */
+struct fts_ast_list_t {
+ fts_ast_node_t* head; /*!< Children list head */
+ fts_ast_node_t* tail; /*!< Children list tail */
+};
+
+/* FTS AST node to store the term, text, operator and sub-expressions.*/
+struct fts_ast_node_t {
+ fts_ast_type_t type; /*!< The type of node */
+ fts_ast_text_t text; /*!< Text node */
+ fts_ast_term_t term; /*!< Term node */
+ fts_ast_oper_t oper; /*!< Operator value */
+ fts_ast_list_t list; /*!< Expression list */
+ fts_ast_node_t* next; /*!< Link for expr list */
+ fts_ast_node_t* next_alloc; /*!< For tracking allocations */
+ bool visited; /*!< whether this node is
+ already processed */
+};
+
+/* To track state during parsing */
+struct fts_ast_state_t {
+ mem_heap_t* heap; /*!< Heap to use for alloc */
+ fts_ast_node_t* root; /*!< If all goes OK, then this
+ will point to the root.*/
+
+ fts_ast_list_t list; /*!< List of nodes allocated */
+
+ fts_lexer_t* lexer; /*!< Lexer callback + arg */
+ CHARSET_INFO* charset; /*!< charset used for
+ tokenization */
+};
+
+#ifdef UNIV_DEBUG
+const char*
+fts_ast_oper_name_get(fts_ast_oper_t oper);
+const char*
+fts_ast_node_type_get(fts_ast_type_t type);
+#endif /* UNIV_DEBUG */
+
+#endif /* INNOBASE_FST0AST_H */
diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h
new file mode 100644
index 00000000000..d0e4cae0678
--- /dev/null
+++ b/storage/innobase/include/fts0blex.h
@@ -0,0 +1,349 @@
+#ifndef fts0bHEADER_H
+#define fts0bHEADER_H 1
+#define fts0bIN_HEADER 1
+
+#line 6 "../include/fts0blex.h"
+
+#line 8 "../include/fts0blex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif /* defined (__STDC__) */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0brestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0b_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0b_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0b_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0bpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0bpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0b_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0b_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0balloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0brealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0bfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0bwrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0blex_init (yyscan_t* scanner);
+
+int fts0blex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int fts0blex_destroy (yyscan_t yyscanner );
+
+int fts0bget_debug (yyscan_t yyscanner );
+
+void fts0bset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0bget_extra (yyscan_t yyscanner );
+
+void fts0bset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0bget_in (yyscan_t yyscanner );
+
+void fts0bset_in (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0bget_out (yyscan_t yyscanner );
+
+void fts0bset_out (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0bget_leng (yyscan_t yyscanner );
+
+char *fts0bget_text (yyscan_t yyscanner );
+
+int fts0bget_lineno (yyscan_t yyscanner );
+
+void fts0bset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0bwrap (yyscan_t yyscanner );
+#else
+extern int fts0bwrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0blex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0blex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 73 "fts0blex.l"
+
+
+#line 348 "../include/fts0blex.h"
+#undef fts0bIN_HEADER
+#endif /* fts0bHEADER_H */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
new file mode 100644
index 00000000000..a2996ecacc8
--- /dev/null
+++ b/storage/innobase/include/fts0fts.h
@@ -0,0 +1,1039 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0fts.h
+Full text search header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef fts0fts_h
+#define fts0fts_h
+
+#include "univ.i"
+
+#include "data0type.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "ut0rbt.h"
+#include "ut0wqueue.h"
+#include "que0types.h"
+#include "ft_global.h"
+
+/** "NULL" value of a document id. */
+#define FTS_NULL_DOC_ID 0
+
+/** FTS hidden column that is used to map to and from the row */
+#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID"
+
+/** The name of the index created by FTS */
+#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX"
+
+#define FTS_DOC_ID_INDEX_NAME_LEN 16
+
+/** Doc ID is an 8 byte value */
+#define FTS_DOC_ID_LEN 8
+
+/** The number of fields to sort when we build FT index with
+FIC. Three fields are sorted: (word, doc_id, position) */
+#define FTS_NUM_FIELDS_SORT 3
+
+/** Threshold on the number of rows in a table: below it, we will
+optimize by using a 4 byte Doc ID for the FIC merge sort, to reduce the sort size */
+#define MAX_DOC_ID_OPT_VAL 1073741824
+
+/** Document id type. */
+typedef ib_uint64_t doc_id_t;
+
+/** doc_id_t printf format */
+#define FTS_DOC_ID_FORMAT IB_ID_FMT
+
+/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */
+#define fts_write_doc_id(d, s) mach_write_to_8(d, s)
+
+/** Read a document id to internal format. */
+#define fts_read_doc_id(s) mach_read_from_8(s)
+
+/** Bind the doc id to a variable */
+#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v)
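
Because the macros above wrap mach_write_to_8()/mach_read_from_8(), a doc id round-trips through its big-endian on-disk form unchanged; a hedged sketch:

    /* Sketch only; buf holds the on-disk (BIG ENDIAN) representation. */
    byte     buf[FTS_DOC_ID_LEN];
    doc_id_t id = 42;

    fts_write_doc_id(buf, id);
    ut_ad(fts_read_doc_id(buf) == id);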
+
+/** Defines for FTS query mode, they have the same values as
+those defined in mysql file ft_global.h */
+#define FTS_NL 0
+#define FTS_BOOL 1
+#define FTS_SORTED 2
+#define FTS_EXPAND 4
+#define FTS_PROXIMITY 8
+#define FTS_PHRASE 16
+#define FTS_OPT_RANKING 32
+
+#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND"
+
+/** Threshold where our optimize thread automatically kicks in */
+#define FTS_OPTIMIZE_THRESHOLD 10000000
+
+#define FTS_DOC_ID_MAX_STEP 10000
+/** Variable specifying the FTS parallel sort degree */
+extern ulong fts_sort_pll_degree;
+
+/** Variable specifying the number of words to optimize for each optimize table
+call */
+extern ulong fts_num_word_optimize;
+
+/** Variable specifying whether we do additional FTS diagnostic printout
+in the log */
+extern char fts_enable_diag_print;
+
+/** FTS rank type, which will be between 0 .. 1 inclusive */
+typedef float fts_rank_t;
+
+/** Type of a row during a transaction. FTS_NOTHING means the row can be
+forgotten from the FTS system's POV, FTS_INVALID is an internal value used
+to mark invalid states.
+
+NOTE: Do not change the order or value of these, fts_trx_row_get_new_state
+depends on them being exactly as they are. */
+enum fts_row_state {
+ FTS_INSERT = 0,
+ FTS_MODIFY,
+ FTS_DELETE,
+ FTS_NOTHING,
+ FTS_INVALID
+};
+
+/** The FTS table types. */
+enum fts_table_type_t {
+ FTS_INDEX_TABLE, /*!< FTS auxiliary table that is
+ specific to a particular FTS index
+ on a table */
+
+ FTS_COMMON_TABLE /*!< FTS auxiliary table that is common
+ for all FTS index on a table */
+};
+
+struct fts_doc_t;
+struct fts_cache_t;
+struct fts_token_t;
+struct fts_doc_ids_t;
+struct fts_index_cache_t;
+
+
+/** Initialize the "fts_table" for internal query into FTS auxiliary
+tables */
+#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_table->id; \
+ (fts_table)->parent = m_table->name; \
+ (fts_table)->table = m_table; \
+} while (0)
+
+#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\
+do { \
+ (fts_table)->suffix = m_suffix; \
+ (fts_table)->type = m_type; \
+ (fts_table)->table_id = m_index->table->id; \
+ (fts_table)->parent = m_index->table->name; \
+ (fts_table)->table = m_index->table; \
+ (fts_table)->index_id = m_index->id; \
+} while (0)
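
A hedged usage sketch of the first initializer, filling an fts_table_t for the common CONFIG auxiliary table (assuming table is a dict_table_t* with an FTS index):

    fts_table_t fts_table;

    FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
    /* fts_table.parent now names the user table, and fts_table.suffix
    selects which auxiliary table the query helpers will target. */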
+
+/** Information about changes in a single transaction affecting
+the FTS system. */
+struct fts_trx_t {
+ trx_t* trx; /*!< InnoDB transaction */
+
+ ib_vector_t* savepoints; /*!< Active savepoints, must have at
+ least one element, the implied
+ savepoint */
+ ib_vector_t* last_stmt; /*!< savepoint state of the
+ last statement */
+
+ mem_heap_t* heap; /*!< heap */
+};
+
+/** Information required for transaction savepoint handling. */
+struct fts_savepoint_t {
+ char* name; /*!< First entry is always NULL, the
+ default instance. Otherwise the name
+ of the savepoint */
+
+ ib_rbt_t* tables; /*!< Modified FTS tables */
+};
+
+/** Information about changed rows in a transaction for a single table. */
+struct fts_trx_table_t {
+ dict_table_t* table; /*!< table */
+
+ fts_trx_t* fts_trx; /*!< link to parent */
+
+ ib_rbt_t* rows; /*!< rows changed; indexed by doc-id,
+ cells are fts_trx_row_t* */
+
+ fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until
+ the first addition) */
+
+ /*!< for adding doc ids */
+ que_t* docs_added_graph;
+};
+
+/** Information about one changed row in a transaction. */
+struct fts_trx_row_t {
+ doc_id_t doc_id; /*!< Id of the ins/upd/del document */
+
+ fts_row_state state; /*!< state of the row */
+
+ ib_vector_t* fts_indexes; /*!< The indexes that are affected */
+};
+
+/** List of document ids that were added during a transaction. This
+list is passed on to a background 'Add' thread and OPTIMIZE, so it
+needs its own memory heap. */
+struct fts_doc_ids_t {
+ ib_vector_t* doc_ids; /*!< document ids (each element is
+ of type doc_id_t). */
+
+ ib_alloc_t* self_heap; /*!< Allocator used to create an
+ instance of this type and the
+ doc_ids vector */
+};
+
+// FIXME: Get rid of this if possible.
+/** Since MySQL's character set support for Unicode is woefully inadequate
+(it supports basic operations like isalpha etc. only for 8-bit characters),
+we have to implement our own. We use UTF-16 without surrogate processing
+as our in-memory format. This typedef is a single such character. */
+typedef unsigned short ib_uc_t;
+
+/** A UTF-16 or UTF-8 string. */
+struct fts_string_t {
+ byte* f_str; /*!< string, not necessarily terminated in
+ any way */
+ ulint f_len; /*!< Length of the string in bytes */
+ ulint f_n_char; /*!< Number of characters */
+};
+
+/** Query ranked doc ids. */
+struct fts_ranking_t {
+ doc_id_t doc_id; /*!< Document id */
+
+ fts_rank_t rank; /*!< Rank is between 0 .. 1 */
+
+ byte* words; /*!< this contains the words
+ that were queried
+ and found in this document */
+ ulint words_len; /*!< words len */
+};
+
+/** Query result. */
+struct fts_result_t {
+ ib_rbt_node_t* current; /*!< Current element */
+
+ ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t
+ indexed by doc id */
+ ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t
+ indexed by rank */
+};
+
+/** This is used to generate the FTS auxiliary table name; we need the
+table id and the index id to generate the column specific FTS auxiliary
+table name. */
+struct fts_table_t {
+ const char* parent; /*!< Parent table name, this is
+ required only for the database
+ name */
+
+ fts_table_type_t
+ type; /*!< The auxiliary table type */
+
+ table_id_t table_id; /*!< The table id */
+
+ index_id_t index_id; /*!< The index id */
+
+ const char* suffix; /*!< The suffix of the fts auxiliary
+ table name, can be NULL, not used
+ everywhere (yet) */
+ const dict_table_t*
+ table; /*!< Parent table */
+ CHARSET_INFO* charset; /*!< charset info if it is for FTS
+ index auxiliary table */
+};
+
+enum fts_status {
+ BG_THREAD_STOP = 1, /*!< TRUE if the FTS background thread
+ has finished reading the ADDED table,
+ meaning more items can be added to
+ the table. */
+
+ BG_THREAD_READY = 2, /*!< TRUE if the FTS background thread
+ is ready */
+
+ ADD_THREAD_STARTED = 4, /*!< TRUE if the FTS add thread
+ has started */
+
+ ADDED_TABLE_SYNCED = 8, /*!< TRUE if the ADDED table record is
+ sync-ed after crash recovery */
+
+ TABLE_DICT_LOCKED = 16 /*!< Set if the table has
+ dict_sys->mutex */
+};
+
+typedef enum fts_status fts_status_t;
+
+/** The state of the FTS sub system. */
+struct fts_t {
+ /*!< mutex protecting bg_threads* and
+ fts_add_wq. */
+ ib_mutex_t bg_threads_mutex;
+
+ ulint bg_threads; /*!< number of background threads
+ accessing this table */
+
+ ulint fts_status; /*!< Status bits for the fts
+ running state; includes the flag
+ telling background threads to
+ stop themselves */
+
+ ib_wqueue_t* add_wq; /*!< Work queue for scheduling jobs
+ for the FTS 'Add' thread, or NULL
+ if the thread has not yet been
+ created. Each work item is a
+ fts_trx_doc_ids_t*. */
+
+ fts_cache_t* cache; /*!< FTS memory buffer for this table,
+ or NULL if the table has no FTS
+ index. */
+
+ ulint doc_col; /*!< FTS doc id hidden column number
+ in the CLUSTERED index. */
+
+ ib_vector_t* indexes; /*!< Vector of FTS indexes, this is
+ mainly for caching purposes. */
+ mem_heap_t* fts_heap; /*!< heap for fts_t allocation */
+};
+
+struct fts_stopword_t;
+
+/** status bits for fts_stopword_t status field. */
+#define STOPWORD_NOT_INIT 0x1
+#define STOPWORD_OFF 0x2
+#define STOPWORD_FROM_DEFAULT 0x4
+#define STOPWORD_USER_TABLE 0x8
+
+extern const char* fts_default_stopword[];
+
+/** Variable specifying the maximum FTS cache size for each table */
+extern ulong fts_max_cache_size;
+
+/** Variable specifying the total memory allocated for FTS cache */
+extern ulong fts_max_total_cache_size;
+
+/** Variable specifying the FTS result cache limit for each query */
+extern ulong fts_result_cache_limit;
+
+/** Variable specifying the maximum FTS max token size */
+extern ulong fts_max_token_size;
+
+/** Variable specifying the minimum FTS max token size */
+extern ulong fts_min_token_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+extern bool fts_need_sync;
+
+/** Maximum possible Fulltext word length */
+#define FTS_MAX_WORD_LEN HA_FT_MAXBYTELEN
+
+/** Maximum possible Fulltext word length (in characters) */
+#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN
+
+/** Variable specifying the table that has Fulltext index to display its
+content through information schema table */
+extern char* fts_internal_tbl_name;
+
+#define fts_que_graph_free(graph) \
+do { \
+ mutex_enter(&dict_sys->mutex); \
+ que_graph_free(graph); \
+ mutex_exit(&dict_sys->mutex); \
+} while (0)
+
+/******************************************************************//**
+Create a FTS cache. */
+UNIV_INTERN
+fts_cache_t*
+fts_cache_create(
+/*=============*/
+ dict_table_t* table); /*!< table owns the FTS cache */
+
+/******************************************************************//**
+Create a FTS index cache.
+@return Index Cache */
+UNIV_INTERN
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+ dict_table_t* table, /*!< in: table with FTS index */
+ dict_index_t* index); /*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Update the next and last Doc ID in the CONFIG table to be the input
+"doc_id" value (+ 1). We do this after each FTS index build or
+table truncate */
+UNIV_INTERN
+void
+fts_update_next_doc_id(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const dict_table_t* table, /*!< in: table */
+ const char* table_name, /*!< in: table name, or NULL */
+ doc_id_t doc_id) /*!< in: DOC ID to set */
+ __attribute__((nonnull(2)));
+
+/******************************************************************//**
+Create a new document id.
+@return DB_SUCCESS if all went well else error */
+UNIV_INTERN
+dberr_t
+fts_create_doc_id(
+/*==============*/
+ dict_table_t* table, /*!< in: row is of this
+ table. */
+ dtuple_t* row, /*!< in/out: add doc id
+ value to this row. This is the
+ current row that is being
+ inserted. */
+ mem_heap_t* heap) /*!< in: heap */
+ __attribute__((nonnull));
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+UNIV_INTERN
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/******************************************************************//**
+Free a fts_doc_ids_t. */
+UNIV_INTERN
+void
+fts_doc_ids_free(
+/*=============*/
+ fts_doc_ids_t* doc_ids); /*!< in: doc_ids to free */
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+UNIV_INTERN
+void
+fts_trx_add_op(
+/*===========*/
+ trx_t* trx, /*!< in: InnoDB transaction */
+ dict_table_t* table, /*!< in: table */
+ doc_id_t doc_id, /*!< in: doc id */
+ fts_row_state state, /*!< in: state of the row */
+ ib_vector_t* fts_indexes) /*!< in: FTS indexes affected
+ (NULL=all) */
+ __attribute__((nonnull(1,2)));
+
+/******************************************************************//**
+Free an FTS trx. */
+UNIV_INTERN
+void
+fts_trx_free(
+/*=========*/
+ fts_trx_t* fts_trx); /*!< in, own: FTS trx */
+
+/******************************************************************//**
+Creates the common ancillary tables needed for supporting an FTS index
+on the given table. row_mysql_lock_data_dictionary must have been
+called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_common_tables(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const dict_table_t*
+ table, /*!< in: table with one FTS
+ index */
+ const char* name, /*!< in: table name */
+ bool skip_doc_id_index) /*!< in: Skip index on doc id */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Wrapper function of fts_create_index_tables_low(), create auxiliary
+tables for an FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const dict_index_t* index) /*!< in: the FTS index
+ instance */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table. row_mysql_lock_data_dictionary must have
+been called before this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_create_index_tables_low(
+/*========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const dict_index_t*
+ index, /*!< in: the FTS index
+ instance */
+ const char* table_name, /*!< in: the table name */
+ table_id_t table_id) /*!< in: the table id */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+UNIV_INTERN
+void
+fts_add_doc_id_column(
+/*==================*/
+ dict_table_t* table, /*!< in/out: Table with FTS index */
+ mem_heap_t* heap) /*!< in: temporary memory heap, or NULL */
+ __attribute__((nonnull(1)));
+
+/*********************************************************************//**
+Drops the ancillary tables needed for supporting an FTS index on the
+given table. row_mysql_lock_data_dictionary must have been called before
+this.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_tables(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table has the FTS
+ index */
+ __attribute__((nonnull));
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_commit(
+/*=======*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+FTS Query entry point.
+@return DB_SUCCESS if successful otherwise error code */
+UNIV_INTERN
+dberr_t
+fts_query(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index to search */
+ uint flags, /*!< in: FTS search mode */
+ const byte* query, /*!< in: FTS query */
+ ulint query_len, /*!< in: FTS query string len
+ in bytes */
+ fts_result_t** result) /*!< out: query result, to be
+ freed by the caller.*/
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+UNIV_INTERN
+float
+fts_retrieve_ranking(
+/*=================*/
+ fts_result_t* result, /*!< in: FTS result structure */
+ doc_id_t doc_id); /*!< in: the interested document
+ doc_id */
+
+/******************************************************************//**
+Sort the FTS query result, returned by fts_query(), on fts_ranking_t::rank. */
+UNIV_INTERN
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+ fts_result_t* result); /*!< out: result instance
+ to sort.*/
+
+/******************************************************************//**
+Free the FTS query result returned by fts_query(). */
+UNIV_INTERN
+void
+fts_query_free_result(
+/*==================*/
+ fts_result_t* result); /*!< in: result instance
+ to free.*/
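
Taken together, the declarations above form the query lifecycle: run the query, inspect rankings, free the result. A hedged sketch (error handling elided; trx and index are assumed to be an open transaction and the FTS index to search):

    fts_result_t* result = NULL;
    dberr_t       err;

    err = fts_query(trx, index, FTS_BOOL,
                    (const byte*) "+database -noise", 16, &result);

    if (err == DB_SUCCESS && result != NULL) {
        doc_id_t doc_id = 1; /* placeholder: a doc id of interest */
        float    rank = fts_retrieve_ranking(result, doc_id);
        /* rank is in [0, 1] per fts_rank_t above */

        fts_query_free_result(result);
    }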
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ dtuple_t* row); /*!< in: row whose FTS doc id we
+ want to extract.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+UNIV_INTERN
+doc_id_t
+fts_get_doc_id_from_rec(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ const rec_t* rec, /*!< in: rec */
+ mem_heap_t* heap); /*!< in: heap */
+
+/******************************************************************//**
+Update the query graph with a new document id.
+@return Doc ID used */
+UNIV_INTERN
+doc_id_t
+fts_update_doc_id(
+/*==============*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* ufield, /*!< out: update node */
+ doc_id_t* next_doc_id); /*!< out: buffer for writing */
+
+/******************************************************************//**
+FTS initialize. */
+UNIV_INTERN
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Signal FTS threads to initiate shutdown. */
+UNIV_INTERN
+void
+fts_start_shutdown(
+/*===============*/
+ dict_table_t* table, /*!< in: table with FTS
+ indexes */
+ fts_t* fts); /*!< in: fts instance to
+ shutdown */
+
+/******************************************************************//**
+Wait for FTS threads to shutdown. */
+UNIV_INTERN
+void
+fts_shutdown(
+/*=========*/
+ dict_table_t* table, /*!< in: table with FTS
+ indexes */
+ fts_t* fts); /*!< in: fts instance to
+ shutdown */
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+UNIV_INTERN
+fts_t*
+fts_create(
+/*=======*/
+ dict_table_t* table); /*!< out: table with FTS
+ indexes */
+
+/**********************************************************************//**
+Free the FTS resources. */
+UNIV_INTERN
+void
+fts_free(
+/*=====*/
+ dict_table_t* table); /*!< in/out: table with
+ FTS indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_optimize_table(
+/*===============*/
+ dict_table_t* table) /*!< in: table to optimize */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+UNIV_INTERN
+void
+fts_optimize_init(void);
+/*====================*/
+
+/**********************************************************************//**
+Check whether the work queue is initialized.
+@return TRUE if the optimize queue is initialized. */
+UNIV_INTERN
+ibool
+fts_optimize_is_init(void);
+/*======================*/
+
+/****************************************************************//**
+Drops index ancillary tables for a FTS index
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_tables(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: Index to drop */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+UNIV_INTERN
+void
+fts_optimize_remove_table(
+/*======================*/
+ dict_table_t* table); /*!< in: table to remove */
+
+/**********************************************************************//**
+Signal the optimize thread to prepare for shutdown. */
+UNIV_INTERN
+void
+fts_optimize_start_shutdown(void);
+/*==============================*/
+
+/**********************************************************************//**
+Inform optimize to clean up. */
+UNIV_INTERN
+void
+fts_optimize_end(void);
+/*===================*/
+
+/**********************************************************************//**
+Take a FTS savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_take(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ fts_trx_t* fts_trx, /*!< in: fts transaction */
+ const char* name) /*!< in: savepoint name */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Refresh last statement savepoint. */
+UNIV_INTERN
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_release(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/**********************************************************************//**
+Free the FTS cache. */
+UNIV_INTERN
+void
+fts_cache_destroy(
+/*==============*/
+ fts_cache_t* cache); /*!< in: cache */
+
+/*********************************************************************//**
+Clear cache. */
+UNIV_INTERN
+void
+fts_cache_clear(
+/*============*/
+ fts_cache_t* cache); /*!< in: cache */
+
+/*********************************************************************//**
+Initialize things in cache. */
+UNIV_INTERN
+void
+fts_cache_init(
+/*===========*/
+ fts_cache_t* cache); /*!< in: cache */
+
+/*********************************************************************//**
+Roll back to and including the savepoint identified by name. */
+UNIV_INTERN
+void
+fts_savepoint_rollback(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name); /*!< in: savepoint name */
+
+/*********************************************************************//**
+Roll back the FTS changes of the last statement. */
+UNIV_INTERN
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+ trx_t* trx); /*!< in: transaction */
+
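+/* Savepoint lifecycle (an illustrative sketch; it assumes trx_t::fts_trx
+carries the FTS transaction state):
+
+ fts_savepoint_take(trx, trx->fts_trx, "sp1");
+ ...
+ fts_savepoint_rollback(trx, "sp1"); (or fts_savepoint_release())
+*/
+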
+/***********************************************************************//**
+Drop all orphaned FTS auxiliary tables, those that don't have a parent
+table or FTS index defined on them. */
+UNIV_INTERN
+void
+fts_drop_orphaned_tables(void);
+/*==========================*/
+
+/******************************************************************//**
+Since we do a horizontal split on the index table, we need to drop
+all the split tables.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_drop_index_split_tables(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index) /*!< in: FTS index */
+ __attribute__((nonnull, warn_unused_result));
+
+/****************************************************************//**
+Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sync_table(
+/*===========*/
+ dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+
+/****************************************************************//**
+Free the query graph, but first check whether dict_sys->mutex is
+already held. */
+UNIV_INTERN
+void
+fts_que_graph_free_check_lock(
+/*==========================*/
+ fts_table_t* fts_table, /*!< in: FTS table */
+ const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
+ que_t* graph); /*!< in: query graph */
+
+/****************************************************************//**
+Get the character set of an FTS index.
+@return charset of the index */
+UNIV_INTERN
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+ dict_index_t* index); /*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+UNIV_INTERN
+doc_id_t
+fts_init_doc_id(
+/*============*/
+ const dict_table_t* table); /*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+ CHARSET_INFO* cs, /*!< in: Character set */
+ char* src, /*!< in: string to put in
+ lower case */
+ size_t src_len, /*!< in: input string length */
+ char* dst, /*!< in: buffer for result
+ string */
+ size_t dst_len); /*!< in: buffer size */
+
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* charset, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past
+ end of text */
+ fts_string_t* token, /*!< out: token's text */
+ ulint* offset); /*!< out: offset to token,
+ measured as characters from
+ 'start' */
+
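+/* Illustrative tokenization loop (a sketch; it assumes, as the callers in
+fts0fts.cc do, that the return value is the number of bytes consumed;
+process() is a hypothetical placeholder):
+
+ fts_string_t token;
+ const byte* p = start;
+
+ while (p < end) {
+ ulint offset;
+ ulint inc = innobase_mysql_fts_get_token(
+ charset, p, end, &token, &offset);
+
+ if (inc == 0) break;
+ if (token.f_len > 0) process(&token);
+ p += inc;
+ }
+*/
+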
+/*********************************************************************//**
+Fetch COUNT(*) from specified table.
+@return the number of rows in the table */
+UNIV_INTERN
+ulint
+fts_get_rows_count(
+/*===============*/
+ fts_table_t* fts_table); /*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+UNIV_INTERN
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+ dict_table_t* table); /*!< in: user table */
+
+/******************************************************************//**
+Check whether a user-supplied stopword table exists and is of
+the right format.
+@return the stopword column charset if it qualifies */
+UNIV_INTERN
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+ const char* stopword_table_name); /*!< in: Stopword table
+ name */
+/****************************************************************//**
+This function loads the specified stopword table into the FTS cache.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+fts_load_stopword(
+/*==============*/
+ const dict_table_t*
+ table, /*!< in: Table with FTS */
+ trx_t* trx, /*!< in: Transaction */
+ const char* global_stopword_table, /*!< in: Global stopword table
+ name */
+ const char* session_stopword_table, /*!< in: Session stopword table
+ name */
+ ibool stopword_is_on, /*!< in: Whether stopword
+ option is turned on/off */
+ ibool reload); /*!< in: Whether it is during
+ reload of FTS table */
+
+/****************************************************************//**
+Create the vector of fts_get_doc_t instances.
+@return vector of fts_get_doc_t instances */
+UNIV_INTERN
+ib_vector_t*
+fts_get_docs_create(
+/*================*/
+ fts_cache_t* cache); /*!< in: fts cache */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+UNIV_INTERN
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_doc_ids_t* doc_ids); /*!< in: For collecting
+ doc ids */
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. Documents that had not yet been synced to the auxiliary tables
+when the server last shut down abnormally must be brought into the FTS
+cache before any further operations.
+@return TRUE if all OK */
+UNIV_INTERN
+ibool
+fts_init_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table with FTS */
+ ibool has_cache_lock); /*!< in: Whether we already
+ have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache. */
+UNIV_INTERN
+void
+fts_add_index(
+/*==========*/
+ dict_index_t* index, /*!< in: FTS index to be added */
+ dict_table_t* table); /*!< in: table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+fts_drop_index(
+/*===========*/
+ dict_table_t* table, /*!< in: Table where indexes are dropped */
+ dict_index_t* index, /*!< in: Index to be dropped */
+ trx_t* trx) /*!< in: Transaction for the drop */
+ __attribute__((nonnull));
+
+/****************************************************************//**
+Rename the auxiliary tables of all FTS indexes for a table
+@return DB_SUCCESS or error code */
+
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+ dict_table_t* table, /*!< in: user Table */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and in the table->indexes list
+@return TRUE if all indexes match */
+UNIV_INTERN
+ibool
+fts_check_cached_index(
+/*===================*/
+ dict_table_t* table); /*!< in: table to check */
+#endif /*!< fts0fts.h */
+
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000000..92eaf8270d2
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/********************************************************************
+Callback function invoked for each row fetched from an FTS INDEX table. */
+UNIV_INTERN
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+ /* out: always returns TRUE */
+ void* row, /* in: sel_node_t* */
+ void* user_arg); /* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000000..8108e811599
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison interface for Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ FTS_OPER = 258,
+ FTS_TEXT = 259,
+ FTS_TERM = 260,
+ FTS_NUMB = 261
+ };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+ int oper;
+ fts_ast_string_t* token;
+ fts_ast_node_t* node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000000..b4d9e1d41ec
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,653 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS subsystem pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+ /*!< This must be 0, since we insert
+ a hard-coded '0' at create time
+ into the config table */
+
+ FTS_TABLE_STATE_RUNNING = 0, /*!< Auxiliary tables created OK */
+
+ FTS_TABLE_STATE_OPTIMIZING, /*!< This is a substate of RUNNING */
+
+ FTS_TABLE_STATE_DELETED /*!< All aux tables to be dropped when
+ it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The default time to wait for the background thread (in microseconds). */
+#define FTS_MAX_BACKGROUND_THREAD_WAIT 10000
+
+/** Maximum number of iterations to wait before we complain */
+#define FTS_BACKGROUND_THREAD_WAIT_COUNT 1000
+
+/** The maximum length of the config table's name column in bytes */
+#define FTS_MAX_CONFIG_NAME_LEN 64
+
+/** The maximum length of the config table's value column in bytes */
+#define FTS_MAX_CONFIG_VALUE_LEN 1024
+
+/** Approx. upper limit of ilist length in bytes. */
+#define FTS_ILIST_MAX_SIZE (64 * 1024)
+
+/** FTS config table name parameters */
+
+/** The number of seconds after which an OPTIMIZE run will stop */
+#define FTS_OPTIMIZE_LIMIT_IN_SECS "optimize_checkpoint_limit"
+
+/** The next doc id */
+#define FTS_SYNCED_DOC_ID "synced_doc_id"
+
+/** The last word that was OPTIMIZED */
+#define FTS_LAST_OPTIMIZED_WORD "last_optimized_word"
+
+/** Total number of documents that have been deleted. The next_doc_id
+minus this count gives us the total number of documents. */
+#define FTS_TOTAL_DELETED_COUNT "deleted_doc_count"
+
+/** Total number of words parsed from all documents */
+#define FTS_TOTAL_WORD_COUNT "total_word_count"
+
+/** Start of optimize of an FTS index */
+#define FTS_OPTIMIZE_START_TIME "optimize_start_time"
+
+/** End of optimize for an FTS index */
+#define FTS_OPTIMIZE_END_TIME "optimize_end_time"
+
+/** User specified stopword table name */
+#define FTS_STOPWORD_TABLE_NAME "stopword_table_name"
+
+/** Whether to use (turn on/off) stopword */
+#define FTS_USE_STOPWORD "use_stopword"
+
+/** State of the FTS system for this table. It can be one of
+ RUNNING, OPTIMIZING, DELETED. */
+#define FTS_TABLE_STATE "table_state"
+
+/** The minimum length of an FTS auxiliary table name's id component
+e.g., For an auxiliary table name
+
+ FTS_<TABLE_ID>_SUFFIX
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH 48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN 32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql(
+/*==========*/
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql) /*!< in: SQL string to evaluate */
+ __attribute__((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_eval_sql(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t* graph) /*!< in: Parsed statement */
+ __attribute__((nonnull, warn_unused_result));
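+
+/* The parse/evaluate pattern used throughout this module (an illustrative
+sketch; que_graph_free() is from que0que.h):
+
+ que_t* graph = fts_parse_sql(fts_table, info, sql);
+
+ error = fts_eval_sql(trx, graph);
+
+ if (error == DB_SUCCESS) {
+ fts_sql_commit(trx);
+ } else {
+ fts_sql_rollback(trx);
+ }
+
+ que_graph_free(graph);
+*/
+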
+/******************************************************************//**
+Construct the name of an ancillary FTS table for the given table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name(
+/*===============*/
+ const fts_table_t*
+ fts_table) /*!< in: FTS aux table info */
+ __attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column specification string */
+UNIV_INTERN
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+ dict_index_t* index, /*!< in: FTS index */
+ pars_info_t* info, /*!< in/out: parser info */
+ mem_heap_t* heap) /*!< in: memory heap */
+ __attribute__((nonnull, warn_unused_result));
+
+/** Defines for the fts_doc_fetch_by_doc_id() "option" value: whether to
+fetch the document whose ID is equal to, greater than, or smaller than
+the supplied ID */
+#define FTS_FETCH_DOC_BY_ID_EQUAL 1
+#define FTS_FETCH_DOC_BY_ID_LARGE 2
+#define FTS_FETCH_DOC_BY_ID_SMALL 3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+UNIV_INTERN
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option, one of
+ FTS_FETCH_DOC_BY_ID_* */
+ fts_sql_callback
+ callback, /*!< in: callback to read
+ records */
+ void* arg) /*!< in: callback arg */
+ __attribute__((nonnull(6)));
+
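+/* Illustrative call (a sketch; my_fetch_cb and my_arg are hypothetical
+placeholders, not part of this header):
+
+ error = fts_doc_fetch_by_doc_id(
+ get_doc, doc_id, NULL,
+ FTS_FETCH_DOC_BY_ID_EQUAL, my_fetch_cb, &my_arg);
+*/
+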
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+UNIV_INTERN
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: fts_doc_t* */
+ __attribute__((nonnull));
+/********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+UNIV_INTERN
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: the FTS aux index */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document(
+/*==================*/
+ fts_doc_t* doc, /*!< in/out: document to
+ tokenize */
+ fts_doc_t* result) /*!< out: if provided, save
+ result tokens here */
+ __attribute__((nonnull(1)));
+
+/*******************************************************************//**
+Continue to tokenize a document. */
+UNIV_INTERN
+void
+fts_tokenize_document_next(
+/*=======================*/
+ fts_doc_t* doc, /*!< in/out: document to
+ tokenize */
+ ulint add_pos, /*!< in: add this position to all
+ tokens from this tokenization */
+ fts_doc_t* result) /*!< out: if provided, save
+ result tokens here */
+ __attribute__((nonnull(1)));
+/******************************************************************//**
+Initialize a document. */
+UNIV_INTERN
+void
+fts_doc_init(
+/*=========*/
+ fts_doc_t* doc) /*!< in: doc to initialize */
+ __attribute__((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array.
+@return +ve index if found, -ve index where it should be
+ inserted if not found */
+UNIV_INTERN
+int
+fts_bsearch(
+/*========*/
+ fts_update_t* array, /*!< in: sorted array to search */
+ int lower, /*!< in: lower bound of array */
+ int upper, /*!< in: upper bound of array */
+ doc_id_t doc_id) /*!< in: doc id to lookup */
+ __attribute__((nonnull, warn_unused_result));
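+
+/* Illustrative lookup (a sketch; the exact encoding of the negative
+"insertion point" return value is defined by the implementation in
+fts0fts.cc):
+
+ int pos = fts_bsearch(array, 0, (int) n, doc_id);
+
+ if (pos >= 0) {
+ ... found: array[pos].doc_id == doc_id ...
+ }
+*/
+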
+/******************************************************************//**
+Free document. */
+UNIV_INTERN
+void
+fts_doc_free(
+/*=========*/
+ fts_doc_t* doc) /*!< in: document */
+ __attribute__((nonnull));
+/******************************************************************//**
+Free a fts_word_t instance. */
+UNIV_INTERN
+void
+fts_word_free(
+/*==========*/
+ fts_word_t* word) /*!< in: instance to free.*/
+ __attribute__((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: prepared statement */
+ fts_table_t* fts_table, /*!< in: FTS aux table */
+ const fts_string_t*
+ word, /*!< in: the word to fetch */
+ fts_fetch_t* fetch) /*!< in: fetch callback.*/
+ __attribute__((nonnull));
+/******************************************************************//**
+Initialize a fts_word_t instance.
+@return new instance */
+UNIV_INTERN
+fts_word_t*
+fts_word_init(
+/*==========*/
+ fts_word_t* word, /*!< in: word to initialize */
+ byte* utf8, /*!< in: UTF-8 string */
+ ulint len) /*!< in: length of string in bytes */
+ __attribute__((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances; we actually compare the
+table ids here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* v1, /*!< in: id1 */
+ const void* v2) /*!< in: id2 */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Commit a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_commit(
+/*===========*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/******************************************************************//**
+Rollback a transaction.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+fts_sql_rollback(
+/*=============*/
+ trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id. Don't acquire
+the dict mutex
+@return query graph */
+UNIV_INTERN
+que_t*
+fts_parse_sql_no_dict_lock(
+/*=======================*/
+ fts_table_t* fts_table, /*!< in: table with FTS index */
+ pars_info_t* info, /*!< in: parser info */
+ const char* sql) /*!< in: SQL string to evaluate */
+ __attribute__((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_value(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ __attribute__((nonnull));
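+
+/* Illustrative read (a sketch; fts_string_t::f_str / f_len are used as
+elsewhere in this module, and the caller owns the buffer):
+
+ byte buf[FTS_MAX_CONFIG_VALUE_LEN + 1];
+ fts_string_t value;
+
+ value.f_str = buf;
+ value.f_len = sizeof(buf) - 1;
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_SYNCED_DOC_ID, &value);
+*/
+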
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< out: value read from
+ config table */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_value(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: get config value for
+ this parameter name */
+ const fts_string_t*
+ value) /*!< in: value to update */
+ __attribute__((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ const char* param, /*!< in: get config value for
+ this parameter name */
+ fts_string_t* value) /*!< in: value to write to the
+ config table */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Increment the value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_value(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this much */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Increment the per index value in the config table for column name.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_increment_index_value(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: increment config value
+ for this parameter name */
+ ulint delta) /*!< in: increment by this much */
+ __attribute__((nonnull));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: FTS index */
+ const char* name, /*!< in: param name */
+ ulint int_value) /*!< in: value */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ fts_table_t* fts_table, /*!< in: the indexed FTS table */
+ const char* name, /*!< in: param name */
+ ulint* int_value) /*!< out: value */
+ __attribute__((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */
+UNIV_INTERN
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+ const fts_index_cache_t*
+ index_cache, /*!< in: cache to search */
+ const fts_string_t*
+ text) /*!< in: word to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Check cache for deleted doc id.
+@return TRUE if deleted */
+UNIV_INTERN
+ibool
+fts_cache_is_deleted_doc_id(
+/*========================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ doc_id_t doc_id) /*!< in: doc id to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Append deleted doc ids to vector and sort the vector. */
+UNIV_INTERN
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to use */
+ ib_vector_t* vector); /*!< in: append to this vector */
+/******************************************************************//**
+Wait for the background thread to start. We poll to detect change
+of state, which is acceptable, since the wait should happen only
+once during startup.
+@return TRUE if the thread started, else FALSE (i.e. timed out) */
+UNIV_INTERN
+ibool
+fts_wait_for_background_thread_to_start(
+/*====================================*/
+ dict_table_t* table, /*!< in: table to which the thread
+ is attached */
+ ulint max_wait); /*!< in: time in microseconds, if set
+ to 0 then it disables timeout
+ checking */
+#ifdef FTS_DOC_STATS_DEBUG
+/******************************************************************//**
+Get the total number of words in the FTS for a particular FTS index.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+fts_get_total_word_count(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: for this index */
+ ulint* total) /*!< out: total words */
+ __attribute__((nonnull, warn_unused_result));
+#endif
+/******************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+UNIV_INTERN
+fts_index_cache_t*
+fts_find_index_cache(
+/*====================*/
+ const fts_cache_t*
+ cache, /*!< in: cache to search */
+ const dict_index_t*
+ index) /*!< in: index to search for */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /*!< in: a table/index id */
+ char* str, /*!< in: buffer to write the id to */
+ bool hex_format __attribute__((unused)))
+ /*!< in: true for fixed hex format,
+ false for old ambiguous format */
+ __attribute__((nonnull));
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /*!< out: a table id */
+ const char* str) /*!< in: buffer to read from */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Get the table id.
+@return number of bytes written */
+UNIV_INTERN
+int
+fts_get_table_id(
+/*=============*/
+ const fts_table_t*
+ fts_table, /*!< in: FTS Auxiliary table */
+ char* table_id) /*!< out: table id, must be at least
+ FTS_AUX_MIN_TABLE_ID_LENGTH bytes
+ long */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the table to the OPTIMIZER's list. */
+UNIV_INTERN
+void
+fts_optimize_add_table(
+/*===================*/
+ dict_table_t* table) /*!< in: table to add */
+ __attribute__((nonnull));
+/******************************************************************//**
+Optimize a table. */
+UNIV_INTERN
+void
+fts_optimize_do_table(
+/*==================*/
+ dict_table_t* table) /*!< in: table to optimize */
+ __attribute__((nonnull));
+/******************************************************************//**
+Construct the prefix name of an FTS table.
+@return own: table name, must be freed with mem_free() */
+UNIV_INTERN
+char*
+fts_get_table_name_prefix(
+/*======================*/
+ const fts_table_t*
+ fts_table) /*!< in: Auxiliary table type */
+ __attribute__((nonnull, malloc, warn_unused_result));
+/******************************************************************//**
+Add node positions. */
+UNIV_INTERN
+void
+fts_cache_node_add_positions(
+/*=========================*/
+ fts_cache_t* cache, /*!< in: cache */
+ fts_node_t* node, /*!< in: word node */
+ doc_id_t doc_id, /*!< in: doc id */
+ ib_vector_t* positions) /*!< in: fts_token_t::positions */
+ __attribute__((nonnull(2,4)));
+
+/******************************************************************//**
+Create the config table parameter name for retrieving an
+index-specific value.
+@return index config parameter name */
+UNIV_INTERN
+char*
+fts_config_create_index_param_name(
+/*===============================*/
+ const char* param, /*!< in: base name of param */
+ const dict_index_t* index) /*!< in: index for config */
+ __attribute__((nonnull, malloc, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "fts0priv.ic"
+#endif
+
+#endif /* INNOBASE_FTS0PRIV_H */
diff --git a/storage/innobase/include/fts0priv.ic b/storage/innobase/include/fts0priv.ic
new file mode 100644
index 00000000000..2d07c60f980
--- /dev/null
+++ b/storage/innobase/include/fts0priv.ic
@@ -0,0 +1,130 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.ic
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+ ib_id_t id, /* in: a table/index id */
+ char* str, /* in: buffer to write the id to */
+ bool hex_format __attribute__((unused)))
+ /* in: true for fixed hex format,
+ false for old ambiguous format */
+{
+
+#ifdef _WIN32
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+ /* Use this to construct the old (5.6.14 and 5.7.3) Windows
+ ambiguous aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, "%016llu", id)););
+
+#else /* _WIN32 */
+
+ /* Use this to construct the old (5.6.14 and 5.7.3) Windows
+ ambiguous aux table names */
+ DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+ return(sprintf(str, "%016"PRIu64, id)););
+
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ return(sprintf(str, UINT64PFx, id)););
+
+#endif /* _WIN32 */
+
+ /* As above, but this is only for those tables failing to rename. */
+ if (!hex_format) {
+#ifdef _WIN32
+ /* FIXME: Use ut_snprintf(); the same applies to the call below. */
+ return(sprintf(str, "%016llu", id));
+#else /* _WIN32 */
+ return(sprintf(str, "%016"PRIu64, id));
+#endif /* _WIN32 */
+ }
+
+ return(sprintf(str, UINT64PFx, id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+ ib_id_t* id, /* out: an id */
+ const char* str) /* in: buffer to read from */
+{
+ /* NOTE: this func doesn't care about whether current table
+ is set with HEX_NAME, the user of the id read here will check
+ if the id is HEX or DEC and do the right thing with it. */
+ return(sscanf(str, UINT64PFx, id) == 1);
+}
+
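+/* Illustrative round trip (a sketch; it assumes UINT64PFx is the
+fixed-width hex format used above, so the hex form parses back):
+
+ char buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
+ ib_id_t id = 123;
+ ib_id_t parsed;
+
+ fts_write_object_id(id, buf, true);
+ ut_a(fts_read_object_id(&parsed, buf) && parsed == id);
+*/
+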
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const dict_table_t* table1 = (*(const fts_trx_table_t**) p1)->table;
+ const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+ return((table1->id > table2->id)
+ ? 1
+ : (table1->id == table2->id)
+ ? 0
+ : -1);
+}
+
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const ullint* table_id = (const ullint*) p1;
+ const dict_table_t* table2 = (*(const fts_trx_table_t**) p2)->table;
+
+ return((*table_id > table2->id)
+ ? 1
+ : (*table_id == table2->id)
+ ? 0
+ : -1);
+}
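+
+/* Both comparators follow the standard qsort()/bsearch() contract, e.g.
+(an illustrative sketch; tables and n_tables are assumed to exist):
+
+ qsort(tables, n_tables, sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+*/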
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000000..f91533803e8
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,349 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif /* defined (__STDC__) */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* An opaque pointer. */
+#ifndef YY_TYPEDEF_YY_SCANNER_T
+#define YY_TYPEDEF_YY_SCANNER_T
+typedef void* yyscan_t;
+#endif
+
+/* For convenience, these vars (plus the bison vars far below)
+ are macros in the reentrant scanner. */
+#define yyin yyg->yyin_r
+#define yyout yyg->yyout_r
+#define yyextra yyg->yyextra_r
+#define yyleng yyg->yyleng_r
+#define yytext yyg->yytext_r
+#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
+#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
+#define yy_flex_debug yyg->yy_flex_debug_r
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+void fts0trestart (FILE *input_file ,yyscan_t yyscanner );
+void fts0t_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
+void fts0t_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0t_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
+void fts0tpush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
+void fts0tpop_buffer_state (yyscan_t yyscanner );
+
+YY_BUFFER_STATE fts0t_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
+YY_BUFFER_STATE fts0t_scan_bytes (yyconst char *bytes,int len ,yyscan_t yyscanner );
+
+void *fts0talloc (yy_size_t ,yyscan_t yyscanner );
+void *fts0trealloc (void *,yy_size_t ,yyscan_t yyscanner );
+void fts0tfree (void * ,yyscan_t yyscanner );
+
+/* Begin user sect3 */
+
+#define fts0twrap(n) 1
+#define YY_SKIP_YYWRAP
+
+#define yytext_ptr yytext_r
+
+#ifdef YY_HEADER_EXPORT_START_CONDITIONS
+#define INITIAL 0
+
+#endif
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int fts0tlex_init (yyscan_t* scanner);
+
+int fts0tlex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+int fts0tlex_destroy (yyscan_t yyscanner );
+
+int fts0tget_debug (yyscan_t yyscanner );
+
+void fts0tset_debug (int debug_flag ,yyscan_t yyscanner );
+
+YY_EXTRA_TYPE fts0tget_extra (yyscan_t yyscanner );
+
+void fts0tset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
+
+FILE *fts0tget_in (yyscan_t yyscanner );
+
+void fts0tset_in (FILE * in_str ,yyscan_t yyscanner );
+
+FILE *fts0tget_out (yyscan_t yyscanner );
+
+void fts0tset_out (FILE * out_str ,yyscan_t yyscanner );
+
+int fts0tget_leng (yyscan_t yyscanner );
+
+char *fts0tget_text (yyscan_t yyscanner );
+
+int fts0tget_lineno (yyscan_t yyscanner );
+
+void fts0tset_lineno (int line_number ,yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int fts0twrap (yyscan_t yyscanner );
+#else
+extern int fts0twrap (yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int fts0tlex (yyscan_t yyscanner);
+
+#define YY_DECL int fts0tlex (yyscan_t yyscanner)
+#endif /* !YY_DECL */
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+
+#line 68 "fts0tlex.l"
+
+
+#line 348 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000000..64677428331
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,474 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "que0types.h"
+#include "ut0byte.h"
+#include "fut0fut.h"
+#include "ut0rbt.h"
+#include "fts0fts.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+struct fts_utf8_str_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+ doc_id_t doc_id; /*!< Document id */
+ ulint word_count; /*!< Total words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /*!< Parsed sql statement */
+ que_t* get_document_graph;
+ fts_cache_t* cache; /*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per-index cache of words, etc. */
+struct fts_index_cache_t {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; indexed by fts_string_t*,
+ cells are fts_tokenizer_word_t*.*/
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be in sorted order (ascending).
+ The ideal choice is an rb tree but
+ the rb tree imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< charset */
+};
+
+/** For supporting the tracking of updates on multiple FTS indexes we need
+to track which FTS indexes need to be updated. For INSERT and DELETE we
+update all fts indexes. */
+struct fts_update_t {
+ doc_id_t doc_id; /*!< The doc id affected */
+
+ ib_vector_t* fts_indexes; /*!< The FTS indexes that need to be
+ updated. A NULL value means all
+ indexes need to be updated. This
+ vector is not allocated on the heap
+ and so must be freed explicitly,
+ when we are done with it */
+};
+
+/** Stopword control information. */
+struct fts_stopword_t {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< This stores all active stopwords */
+ CHARSET_INFO* charset; /*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Max size in bytes of the cache */
+ ibool cache_full; /*!< flag, when true it indicates that
+ we need to sync the cache to disk */
+ ulint lower_index; /*!< the start index of the doc id
+ vector from where to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< max index of the doc id vector to
+ add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+ doc_id_t min_doc_id; /*!< The smallest doc id added to the
+ cache. It should be equal to
+ doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full, we use this to
+ set the upper_limit field */
+ ib_time_t start_time; /*!< SYNC start time */
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table. */
+struct fts_cache_t {
+ rw_lock_t lock; /*!< lock protecting all access to the
+ memory buffer. FIXME: this needs to
+ be our new upgrade-capable rw-lock */
+
+ rw_lock_t init_lock; /*!< lock used for the cache
+ initialization; it has a different
+ SYNC level from the cache lock above */
+
+ ib_mutex_t optimize_lock; /*!< Lock for OPTIMIZE */
+
+ ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+
+ ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< We store the stats and inverted
+ index for the individual FTS indexes
+ in this vector. Each element is
+ an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< information required to read
+ the document from the table. Each
+ element is of type fts_doc_t */
+
+ ulint total_size; /*!< total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ fts_sync_t* sync; /*!< sync structure to sync data to
+ disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes
+ and deleted_doc_ids, i.e. transient
+ objects; they are recreated after
+ a SYNC is completed */
+
+
+ ib_alloc_t* self_heap; /*!< This heap is the heap out of
+ which an instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */
+
+ doc_id_t first_doc_id; /*!< first doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ last optimized. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since last
+ optimized. This variable is covered
+ by deleted_lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache Heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table */
+struct fts_node_t {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of documents & word
+ positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist in
+ bytes */
+};
+
+/** A tokenizer word. Contains information about one word. */
+struct fts_tokenizer_word_t {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists, each element is
+ of type fts_node_t */
+};
+
+/** Word text plus its array of nodes, as stored on disk in the FTS index */
+struct fts_word_t {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+ void* read_arg; /*!< Arg for the sql_callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading index
+ record */
+ ulint total_memory; /*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+ fts_string_t text; /*!< document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< This is filled when the document
+ is tokenized. Tokens are indexed by
+ fts_string_t*; cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+ fts_string_t text; /*!< token text */
+
+ ib_vector_t* positions; /*!< an array of the positions the
+ token is found in; each item is
+ actually an ulint. */
+};
+
+/** Defined in fts/fts0fts.cc */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two UTF-8 strings. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value. */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_update_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+ /*!< out:
+ < 0 if n1 < n2,
+ 0 if n1 == n2,
+ > 0 if n1 > n2 */
+ const void* p1, /*!< in: id1 */
+ const void* p2); /*!< in: id2 */
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.*/
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+ /*!< out: value decoded */
+ byte** ptr); /*!< in: ptr to decode from, this ptr is
+ incremented by the number of bytes decoded */
+
+/******************************************************************//**
+Duplicate a UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+ fts_string_t* dst, /*!< out: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap); /*!< in: heap to use */
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme. */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val); /*!< in: value to encode */
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes. */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+ /*!< out: length of value
+ encoded, in bytes */
+ ulint val, /*!< in: value to encode */
+ byte* buf); /*!< in: buffer, must have
+ enough space */
+
+/******************************************************************//**
+Decode a UTF-8 character.
+
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx 0xxxxxxx
+00000yyy yyxxxxxx 110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits).
+
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+ /*!< out: UTF8_ERROR if *ptr
+ did not point to a valid
+ UTF-8 sequence, or the
+ Unicode code point. */
+ const byte** ptr); /*!< in/out: pointer to
+ UTF-8 string. The
+ pointer is advanced to
+ the start of the next
+ character. */
+
+/******************************************************************//**
+Lowercase a UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+ fts_string_t* str); /*!< in/out: string */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected); /*!< in: selected index */
+
+/********************************************************************//**
+Get the number of index selectors. */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void);
+/*=====================*/
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given string.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+ const CHARSET_INFO* cs, /*!< Charset */
+ const byte* str, /*!< in: word string */
+ ulint len); /*!< in: string length */
+
+/********************************************************************//**
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+ const CHARSET_INFO* cs, /*!< Charset */
+ const byte* str, /*!< in: string */
+ ulint len); /*!< in: string length */
+
+#ifndef UNIV_NONINL
+#include "fts0types.ic"
+#include "fts0vlc.ic"
+#endif
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
new file mode 100644
index 00000000000..f0dfd023a70
--- /dev/null
+++ b/storage/innobase/include/fts0types.ic
@@ -0,0 +1,388 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.ic
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+#include <ctype.h>
+
+#include "rem0cmp.h"
+#include "ha_prototypes.h"
+
+extern const ulint UTF8_ERROR;
+
+/* Determine if a UTF-8 continuation byte is valid. */
+#define fts_utf8_is_valid(b) (((b) & 0xC0) == 0x80)
+
+/******************************************************************//**
+Duplicate a UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_string_dup(
+/*================*/
+ fts_string_t* dst, /*!< out: dup to here */
+ const fts_string_t* src, /*!< in: src string */
+ mem_heap_t* heap) /*!< in: heap to use */
+{
+ dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
+ memcpy(dst->f_str, src->f_str, src->f_len);
+
+ dst->f_len = src->f_len;
+ dst->f_str[src->f_len] = 0;
+ dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_trx_row_t* tr1 = (const fts_trx_row_t*) p1;
+ const fts_trx_row_t* tr2 = (const fts_trx_row_t*) p2;
+
+ /* Compare explicitly: casting the 64-bit difference to int
+ could return the wrong sign for distant doc ids. */
+ return(tr1->doc_id > tr2->doc_id
+        ? 1 : tr1->doc_id < tr2->doc_id ? -1 : 0);
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_ranking_t* rk1 = (const fts_ranking_t*) p1;
+ const fts_ranking_t* rk2 = (const fts_ranking_t*) p2;
+
+ /* Compare explicitly: casting the 64-bit difference to int
+ could return the wrong sign for distant doc ids. */
+ return(rk1->doc_id > rk2->doc_id
+        ? 1 : rk1->doc_id < rk2->doc_id ? -1 : 0);
+}
+
+/******************************************************************//**
+Compare two fts_update_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_update_doc_id_cmp(
+/*==================*/
+ const void* p1, /*!< in: id1 */
+ const void* p2) /*!< in: id2 */
+{
+ const fts_update_t* up1 = (const fts_update_t*) p1;
+ const fts_update_t* up2 = (const fts_update_t*) p2;
+
+ /* Compare explicitly: casting the 64-bit difference to int
+ could return the wrong sign for distant doc ids. */
+ return(up1->doc_id > up2->doc_id
+        ? 1 : up1->doc_id < up2->doc_id ? -1 : 0);
+}
+
+
+/******************************************************************//**
+Lowercase a UTF-8 string. */
+UNIV_INLINE
+void
+fts_utf8_tolower(
+/*=============*/
+ fts_string_t* str) /*!< in/out: string */
+{
+ innobase_casedn_str((char*) str->f_str);
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_utf8_string_cmp(
+/*================*/
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+
+ return(cmp_data_data_slow_varchar(
+ s1->f_str, s1->f_len, s2->f_str, s2->f_len));
+}
+
+/******************************************************************//**
+Compare two UTF-8 strings, and return match (0) if
+passed in "key" value equals or is the prefix of the "node" value.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_utf8_string_cmp_prefix(
+/*=======================*/
+ const void* p1, /*!< in: key */
+ const void* p2) /*!< in: node */
+{
+ int result;
+ ulint len;
+
+ const fts_string_t* s1 = (const fts_string_t*) p1;
+ const fts_string_t* s2 = (const fts_string_t*) p2;
+
+ len = ut_min(s1->f_len, s2->f_len);
+
+ result = cmp_data_data_slow_varchar(s1->f_str, len, s2->f_str, len);
+
+ if (result) {
+ return(result);
+ }
+
+ if (s1->f_len > s2->f_len) {
+ return(1);
+ }
+
+ return(0);
+}
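+
+/* Worked example (illustrative note): with key "for" (f_len = 3) and
+node "forest" (f_len = 6), the first ut_min(3, 6) = 3 bytes compare
+equal and the key is not longer than the node, so 0 (a prefix match)
+is returned. With key "forge" (f_len = 5) and node "for", the common
+prefix also compares equal, but 5 > 3, so 1 is returned. */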
+
+/******************************************************************//**
+Decode a UTF-8 character.
+
+http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf:
+
+ Scalar Value 1st Byte 2nd Byte 3rd Byte 4th Byte
+00000000 0xxxxxxx 0xxxxxxx
+00000yyy yyxxxxxx 110yyyyy 10xxxxxx
+zzzzyyyy yyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
+000uuuzz zzzzyyyy yyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
+
+This function decodes UTF-8 sequences up to 6 bytes (31 bits).
+
+On error *ptr will point to the first byte that was not correctly
+decoded. This will hopefully help in resyncing the input.
+@return UTF8_ERROR if *ptr did not point to a valid
+UTF-8 sequence, or the Unicode code point. */
+UNIV_INLINE
+ulint
+fts_utf8_decode(
+/*============*/
+ const byte** ptr) /*!< in/out: pointer to
+ UTF-8 string. The
+ pointer is advanced to
+ the start of the next
+ character. */
+{
+ const byte* p = *ptr;
+ ulint ch = *p++;
+#ifdef UNIV_DEBUG
+ ulint min_ch;
+#endif /* UNIV_DEBUG */
+
+ if (UNIV_LIKELY(ch < 0x80)) {
+ /* 0xxxxxxx */
+ } else if (UNIV_UNLIKELY(ch < 0xC0)) {
+ /* A continuation byte cannot start a code. */
+ goto err_exit;
+ } else if (ch < 0xE0) {
+ /* 110yyyyy 10xxxxxx */
+ ch &= 0x1F;
+ ut_d(min_ch = 0x80);
+ goto get1;
+ } else if (ch < 0xF0) {
+ /* 1110zzzz 10yyyyyy 10xxxxxx */
+ ch &= 0x0F;
+ ut_d(min_ch = 0x800);
+ goto get2;
+ } else if (ch < 0xF8) {
+ /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
+ ch &= 0x07;
+ ut_d(min_ch = 0x10000);
+ goto get3;
+ } else if (ch < 0xFC) {
+ /* 111110tt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+ ch &= 0x03;
+ ut_d(min_ch = 0x200000);
+ goto get4;
+ } else if (ch < 0xFE) {
+ /* 1111110s 10tttttt 10uuuuuu 10zzzzzz 10yyyyyy 10xxxxxx */
+ ut_d(min_ch = 0x4000000);
+ if (!fts_utf8_is_valid(*p)) {
+ goto err_exit;
+ }
+ ch <<= 6;
+ ch |= (*p++) & 0x3F;
+get4:
+ if (!fts_utf8_is_valid(*p)) {
+ goto err_exit;
+ }
+ ch <<= 6;
+ ch |= (*p++) & 0x3F;
+get3:
+ if (!fts_utf8_is_valid(*p)) {
+ goto err_exit;
+ }
+ ch <<= 6;
+ ch |= (*p++) & 0x3F;
+get2:
+ if (!fts_utf8_is_valid(*p)) {
+ goto err_exit;
+ }
+ ch <<= 6;
+ ch |= (*p++) & 0x3F;
+get1:
+ if (!fts_utf8_is_valid(*p)) {
+ goto err_exit;
+ }
+ ch <<= 6;
+ ch |= (*p++) & 0x3F;
+
+ /* The following is needed in the 6-byte case
+ when ulint is wider than 32 bits. */
+ ch &= 0xFFFFFFFF;
+
+ /* The code positions U+D800 to U+DFFF (UTF-16 surrogate pairs)
+ and U+FFFE and U+FFFF cannot occur in valid UTF-8. */
+
+ if ( (ch >= 0xD800 && ch <= 0xDFFF)
+#ifdef UNIV_DEBUG
+ || ch < min_ch
+#endif /* UNIV_DEBUG */
+ || ch == 0xFFFE || ch == 0xFFFF) {
+
+ ch = UTF8_ERROR;
+ }
+ } else {
+err_exit:
+ ch = UTF8_ERROR;
+ }
+
+ *ptr = p;
+
+ return(ch);
+}
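+
+/* Worked example (illustrative sketch; fts_utf8_decode_example() is a
+hypothetical helper, not compiled): decoding the two-byte sequence
+0xC3 0xA9, i.e. U+00E9 LATIN SMALL LETTER E WITH ACUTE. The lead byte
+matches 110yyyyy, so ch = 0xC3 & 0x1F = 0x03; the continuation byte
+contributes its low six bits: ch = (0x03 << 6) | (0xA9 & 0x3F) = 0xE9. */
+#if 0
+static void
+fts_utf8_decode_example(void)
+{
+	static const byte	str[] = { 0xC3, 0xA9, 0x00 };
+	const byte*		ptr = str;
+
+	ut_a(fts_utf8_decode(&ptr) == 0xE9);
+
+	/* The pointer has been advanced past both bytes. */
+	ut_a(ptr == str + 2);
+}
+#endif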
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+ const CHARSET_INFO* cs, /*!< in: Character set */
+ const uchar* p2, /*!< in: string */
+ const ulint len2); /*!< in: string length */
+
+/******************************************************************//**
+Select the FTS auxiliary index for the given character.
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+/*=============*/
+ const CHARSET_INFO* cs, /*!< in: Charset */
+ const byte* str, /*!< in: string */
+ ulint len) /*!< in: string length */
+{
+ ulint selected = 0;
+ ulint value = innobase_strnxfrm(cs, str, len);
+
+ while (fts_index_selector[selected].value != 0) {
+
+ if (fts_index_selector[selected].value == value) {
+
+ return(selected);
+
+ } else if (fts_index_selector[selected].value > value) {
+
+ return(selected > 0 ? selected - 1 : 0);
+ }
+
+ ++selected;
+ }
+
+ ut_ad(selected > 1);
+
+ return(selected - 1);
+}
+
+/******************************************************************//**
+Select the next FTS auxiliary index for the given character.
+@return the next index to use for character */
+UNIV_INLINE
+ulint
+fts_select_next_index(
+/*==================*/
+ const CHARSET_INFO* cs, /*!< in: Charset */
+ const byte* str, /*!< in: string */
+ ulint len) /*!< in: string length */
+{
+ ulint selected = 0;
+ ulint value = innobase_strnxfrm(cs, str, len);
+
+ while (fts_index_selector[selected].value != 0) {
+
+ if (fts_index_selector[selected].value == value) {
+
+ return(selected + 1);
+
+ } else if (fts_index_selector[selected].value > value) {
+
+ return(selected);
+ }
+
+ ++selected;
+ }
+
+ ut_ad(selected > 0);
+
+ return((ulint) selected);
+}
+
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected) /*!< in: selected index */
+{
+ return(fts_index_selector[selected].suffix);
+}
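+
+/* Usage sketch (illustrative only; fts_example_pick_suffix() is a
+hypothetical helper, and the concrete fts_index_selector[] entries live
+in fts/fts0fts.cc): selecting the auxiliary index table suffix for a
+tokenized word. */
+#if 0
+static const char*
+fts_example_pick_suffix(
+	const CHARSET_INFO*	cs,	/*!< in: charset of the word */
+	const fts_string_t*	word)	/*!< in: tokenized word */
+{
+	ulint	selected = fts_select_index(cs, word->f_str, word->f_len);
+
+	/* The suffix is appended to the FTS auxiliary table name, so
+	that words are horizontally partitioned across tables. */
+	return(fts_get_suffix(selected));
+}
+#endif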
+
+/******************************************************************//**
+Get the number of index selectors.
+@return The number of selectors */
+UNIV_INLINE
+ulint
+fts_get_n_selectors(void)
+/*=====================*/
+{
+ ulint i = 0;
+
+ // FIXME: This is a hack
+ while (fts_index_selector[i].value != 0) {
+ ++i;
+ }
+
+ return(i);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.ic b/storage/innobase/include/fts0vlc.ic
new file mode 100644
index 00000000000..e79bcf59347
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.ic
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0vlc.ic
+Full text variable length integer encoding/decoding.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0VLC_IC
+#define INNOBASE_FTS0VLC_IC
+
+#include "fts0types.h"
+
+/******************************************************************//**
+Return length of val if it were encoded using our VLC scheme.
+FIXME: We will need to be able encode 8 bytes value
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_get_encoded_len(
+/*================*/
+ ulint val) /* in: value to encode */
+{
+ if (val <= 127) {
+ return(1);
+ } else if (val <= 16383) {
+ return(2);
+ } else if (val <= 2097151) {
+ return(3);
+ } else if (val <= 268435455) {
+ return(4);
+ } else {
+ /* Possibly we should care that on 64-bit machines ulint can
+ contain values that we can't encode in 5 bytes, but
+ fts_encode_int doesn't handle them either so it doesn't much
+ matter. */
+
+ return(5);
+ }
+}
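+
+/* Note (illustrative): the thresholds above are 2^7 - 1, 2^14 - 1,
+2^21 - 1 and 2^28 - 1, since each encoded byte carries 7 payload bits
+and the 8th bit marks the final byte. */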
+
+/******************************************************************//**
+Encode an integer using our VLC scheme and return the length in bytes.
+@return length of value encoded, in bytes */
+UNIV_INLINE
+ulint
+fts_encode_int(
+/*===========*/
+ ulint val, /* in: value to encode */
+ byte* buf) /* in: buffer, must have enough space */
+{
+ ulint len;
+
+ if (val <= 127) {
+ *buf = (byte) val;
+
+ len = 1;
+ } else if (val <= 16383) {
+ *buf++ = (byte)(val >> 7);
+ *buf = (byte)(val & 0x7F);
+
+ len = 2;
+ } else if (val <= 2097151) {
+ *buf++ = (byte)(val >> 14);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 3;
+ } else if (val <= 268435455) {
+ *buf++ = (byte)(val >> 21);
+ *buf++ = (byte)((val >> 14) & 0x7F);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 4;
+ } else {
+ /* Best to keep the limitations of the 32/64 bit versions
+ identical, at least for the time being. */
+ ut_ad(val <= 4294967295u);
+
+ *buf++ = (byte)(val >> 28);
+ *buf++ = (byte)((val >> 21) & 0x7F);
+ *buf++ = (byte)((val >> 14) & 0x7F);
+ *buf++ = (byte)((val >> 7) & 0x7F);
+ *buf = (byte)(val & 0x7F);
+
+ len = 5;
+ }
+
+ /* High-bit on means "last byte in the encoded integer". */
+ *buf |= 0x80;
+
+ return(len);
+}
+
+/******************************************************************//**
+Decode and return the integer that was encoded using our VLC scheme.
+@return value decoded */
+UNIV_INLINE
+ulint
+fts_decode_vlc(
+/*===========*/
+ byte** ptr) /* in: ptr to decode from, this ptr is
+ incremented by the number of bytes decoded */
+{
+ ulint val = 0;
+
+ for (;;) {
+ byte b = **ptr;
+
+ ++*ptr;
+ val |= (b & 0x7F);
+
+ /* High-bit on means "last byte in the encoded integer". */
+ if (b & 0x80) {
+ break;
+ } else {
+ val <<= 7;
+ }
+ }
+
+ return(val);
+}
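+
+/* Round-trip sketch (illustrative only; fts_vlc_example() is a
+hypothetical helper, not compiled): 300 = 0b10 0101100 splits into the
+7-bit groups 0x02 and 0x2C, so fts_encode_int() emits 0x02 followed by
+0x2C | 0x80 = 0xAC, and fts_decode_vlc() reassembles the value while
+advancing the pointer. */
+#if 0
+static void
+fts_vlc_example(void)
+{
+	byte	buf[5];	/* 5 is the largest length the encoder emits */
+	byte*	ptr = buf;
+	ulint	len = fts_encode_int(300, buf);
+
+	ut_a(len == 2);
+	ut_a(len == fts_get_encoded_len(300));
+
+	/* Decoding returns the value and moves ptr past it. */
+	ut_a(fts_decode_vlc(&ptr) == 300);
+	ut_a(ptr == buf + len);
+}
+#endif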
+
+#endif
diff --git a/storage/innobase/include/fut0fut.h b/storage/innobase/include/fut0fut.h
new file mode 100644
index 00000000000..851cdb44cdf
--- /dev/null
+++ b/storage/innobase/include/fut0fut.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.h
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr handle */
+
+#ifndef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
+#endif
+
diff --git a/storage/innobase/include/fut0fut.ic b/storage/innobase/include/fut0fut.ic
new file mode 100644
index 00000000000..b065b10b9ca
--- /dev/null
+++ b/storage/innobase/include/fut0fut.ic
@@ -0,0 +1,56 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.ic
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ buf_block_t* block;
+ byte* ptr;
+
+ ut_ad(addr.boffset < UNIV_PAGE_SIZE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr);
+ ptr = buf_block_get_frame(block) + addr.boffset;
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(ptr);
+}
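+
+/* Usage sketch (illustrative only; fut_example_read_byte() is a
+hypothetical helper, not compiled): dereferencing a file address under
+a mini-transaction. A real caller, such as the file-based list code in
+fut0lst, supplies the space id, compressed page size and address. */
+#if 0
+static byte
+fut_example_read_byte(
+	ulint		space,		/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size, or 0 */
+	fil_addr_t	addr)		/*!< in: file address to read */
+{
+	mtr_t	mtr;
+	byte	b;
+
+	mtr_start(&mtr);
+
+	/* The page stays buffer-fixed and S-latched until the
+	mini-transaction commits. */
+	b = *fut_get_ptr(space, zip_size, addr, RW_S_LATCH, &mtr);
+
+	mtr_commit(&mtr);
+
+	return(b);
+}
+#endif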
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000000..90f9a65d4fa
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,217 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof macro cannot be
+applied to these types! */
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node1, /*!< in: node to insert after */
+ flst_node_t* node2, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to insert */
+ flst_node_t* node3, /*!< in: node to insert before */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node not to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file address */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file address */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Validates a file-based list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr1); /*!< in: mtr */
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr); /*!< in: mtr */
+
+
+#ifndef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/fut0lst.ic b/storage/innobase/include/fut0lst.ic
new file mode 100644
index 00000000000..d18cf21378f
--- /dev/null
+++ b/storage/innobase/include/fut0lst.ic
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.ic
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+#include "mtr0log.h"
+#include "buf0buf.h"
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
+ last element of the list; undefined
+ if empty list */
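+
+/* Putting the offsets together (illustrative summary; the sizes follow
+from FIL_ADDR_SIZE == 6): a list node is 12 bytes, FLST_PREV at offset 0
+and FLST_NEXT at offset 6, matching FLST_NODE_SIZE in fut0lst.h. A base
+node is 16 bytes: the 4-byte FLST_LEN count followed by the 6-byte
+FLST_FIRST and FLST_LAST addresses, matching FLST_BASE_NODE_SIZE. */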
+
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file address */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(faddr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX));
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+
+ mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
+ mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
+ MLOG_2BYTES, mtr);
+}
+
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file address */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fil_addr_t addr;
+
+ ut_ad(faddr && mtr);
+
+ addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr);
+ addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES,
+ mtr);
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+ return(addr);
+}
+
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ flst_write_addr(base + FLST_LAST, fil_addr_null, mtr);
+}
+
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr));
+}
+
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_FIRST, mtr));
+}
+
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_LAST, mtr));
+}
+
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_NEXT, mtr));
+}
+
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_PREV, mtr));
+}
diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h
new file mode 100644
index 00000000000..7351b407e8c
--- /dev/null
+++ b/storage/innobase/include/ha0ha.h
@@ -0,0 +1,265 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "univ.i"
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "rem0types.h"
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: folded value of the searched data */
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and updates
+the pointer to data if found.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+ha_search_and_update_if_found_func(
+/*===============================*/
+ hash_table_t* table, /*!< in/out: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data, /*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* new_data);/*!< in: new pointer to the data */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block in: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_block,new_data)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block ignored: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_data)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+ ulint n, /*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+ ulint mutex_level, /*!< in: level of the mutexes in the latching
+ order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes, /*!< in: number of mutexes to protect the
+ hash table: must be a power of 2, or 0 */
+ ulint type); /*!< in: type of datastructure for which
+ the memory heap is going to be used e.g.:
+ MEM_HEAP_FOR_BTR_SEARCH or
+ MEM_HEAP_FOR_PAGE_HASH */
+#ifdef UNIV_SYNC_DEBUG
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level in: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0
+@param type in: type of datastructure for which the memory heap is used */
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,level,n_m,type)
+#else /* UNIV_SYNC_DEBUG */
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level ignored: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0
+@param type in: type of datastructure for which the memory heap is used */
+# define ha_create(n_c,n_m,type,level) ha_create_func(n_c,n_m,type)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+ hash_table_t* table); /*!< in, own: hash table */
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data); /*!< in: data, must not be NULL */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b in: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) do { \
+ ha_insert_for_fold_func(t,f,b,d); \
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \
+} while(0)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeed, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b ignored: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) do { \
+ ha_insert_for_fold_func(t,f,d); \
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); \
+} while (0)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and deletes
+it from the hash table if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data); /*!< in: pointer to the data */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: fold value */
+ const page_t* page); /*!< in: buffer page */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint start_index, /*!< in: start index */
+ ulint end_index); /*!< in: end index */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ hash_table_t* table); /*!< in: hash table */
+#endif /* !UNIV_HOTBACKUP */
+
+/** The hash table external chain node */
+struct ha_node_t {
+ ha_node_t* next; /*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block; /*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data; /*!< pointer to the data */
+ ulint fold; /*!< fold value for the data */
+};
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold value */
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold value */
+#else /* UNIV_DEBUG */
+#define hash_assert_can_modify(t, f)
+#define hash_assert_can_search(t, f)
+#endif /* UNIV_DEBUG */
+
+
+#ifndef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic
new file mode 100644
index 00000000000..c478ff54303
--- /dev/null
+++ b/storage/innobase/include/ha0ha.ic
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ha_node_t* del_node); /*!< in: node to be deleted */
+
+/******************************************************************//**
+Gets a hash node data.
+@return pointer to the data */
+UNIV_INLINE
+const rec_t*
+ha_node_get_data(
+/*=============*/
+ const ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->data);
+}
+
+/******************************************************************//**
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+ ha_node_t* node, /*!< in: hash chain node */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = data;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Gets the next node in a hash chain.
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->next);
+}
+
+/******************************************************************//**
+Gets the first node in a hash chain.
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value determining the chain */
+{
+ return((ha_node_t*)
+ hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Assert that the synchronization object in a hash operation involving
+possible change in the hash table is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held in exclusive mode. */
+UNIV_INLINE
+void
+hash_assert_can_modify(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value */
+{
+ if (table->type == HASH_TABLE_SYNC_MUTEX) {
+ ut_ad(mutex_own(hash_get_mutex(table, fold)));
+ } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_t* lock = hash_get_lock(table, fold);
+ ut_ad(rw_lock_own(lock, RW_LOCK_EX));
+# endif
+ } else {
+ ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+ }
+}
+
+/********************************************************************//**
+Assert that the synchronization object in a hash search operation is held.
+Note that in case of mutexes we assert that mutex is owned while in case
+of rw-locks we assert that it is held either in x-mode or s-mode. */
+UNIV_INLINE
+void
+hash_assert_can_search(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value */
+{
+ if (table->type == HASH_TABLE_SYNC_MUTEX) {
+ ut_ad(mutex_own(hash_get_mutex(table, fold)));
+ } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) {
+# ifdef UNIV_SYNC_DEBUG
+ rw_lock_t* lock = hash_get_lock(table, fold);
+ ut_ad(rw_lock_own(lock, RW_LOCK_EX)
+ || rw_lock_own(lock, RW_LOCK_SHARED));
+# endif
+ } else {
+ ut_ad(table->type == HASH_TABLE_SYNC_NONE);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+const rec_t*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ hash_assert_can_search(table, fold);
+ ut_ad(btr_search_enabled);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data.
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ hash_assert_can_search(table, fold);
+
+ ut_ad(btr_search_enabled);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and deletes
+it from the hash table, if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ const rec_t* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ hash_assert_can_modify(table, fold);
+ ut_ad(btr_search_enabled);
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ ha_delete_hash_node(table, node);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
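+
+/* Illustrative usage sketch (not part of the original file): the search
+helpers above are typically combined as follows, with the caller holding
+the latch that hash_assert_can_modify() checks; the variable names are
+hypothetical.
+
+	const rec_t*	rec = ha_search_and_get_data(table, fold);
+
+	if (rec != NULL) {
+		ibool	deleted = ha_search_and_delete_if_found(
+			table, fold, rec);
+		ut_a(deleted);
+	}
+*/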
diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h
new file mode 100644
index 00000000000..0073930b502
--- /dev/null
+++ b/storage/innobase/include/ha0storage.h
@@ -0,0 +1,140 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS 4096
+
+/** Hash storage */
+struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells); /*!< in: initial number of cells
+ in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit".
+@return pointer to the copy */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim); /*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the data */
+#define ha_storage_put(storage, data, data_len) \
+ ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then a pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str) \
+ ((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then a pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim) \
+ ((const char*) ha_storage_put_memlim((storage), (str), \
+ strlen(str) + 1, (memlim)))
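+
+/* Illustrative usage sketch (not part of the original file): because
+equal chunks are deduplicated, repeated puts of the same string return
+the same stored pointer.
+
+	ha_storage_t*	storage = ha_storage_create(0, 0);
+
+	const char*	p1 = ha_storage_put_str(storage, "trx");
+	const char*	p2 = ha_storage_put_str(storage, "trx");
+	ut_a(p1 == p2);
+
+	ha_storage_free(storage);
+*/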
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage); /*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage); /*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage); /*!< in: hash storage */
+
+#ifndef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+#endif /* ha0storage_h */
diff --git a/storage/innobase/include/ha0storage.ic b/storage/innobase/include/ha0storage.ic
new file mode 100644
index 00000000000..7150ca045ec
--- /dev/null
+++ b/storage/innobase/include/ha0storage.ic
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage for strings */
+struct ha_storage_t {
+ mem_heap_t* heap; /*!< memory heap from which memory is
+ allocated */
+ hash_table_t* hash; /*!< hash table used to avoid
+ duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+struct ha_storage_node_t {
+ ulint data_len;/*!< length of the data */
+ const void* data; /*!< pointer to data */
+ ha_storage_node_t* next; /*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells) /*!< in: initial number of cells
+ in the hash table */
+{
+ ha_storage_t* storage;
+ mem_heap_t* heap;
+
+ if (initial_heap_bytes == 0) {
+
+ initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+ }
+
+ if (initial_hash_cells == 0) {
+
+ initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+ }
+
+ /* we put "storage" within "storage->heap" */
+
+ heap = mem_heap_create(sizeof(ha_storage_t)
+ + initial_heap_bytes);
+
+ storage = (ha_storage_t*) mem_heap_alloc(heap,
+ sizeof(ha_storage_t));
+
+ storage->heap = heap;
+ storage->hash = hash_create(initial_hash_cells);
+
+ return(storage);
+}
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage) /*!< in/out: hash storage */
+{
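+	/* Note: "storage" itself was allocated from (*storage)->heap by
+	ha_storage_create(), so emptying the heap invalidates the struct
+	as well; copy its members to the stack first and re-allocate the
+	struct from the surviving heap afterwards. */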
+ ha_storage_t temp_storage;
+
+ temp_storage.heap = (*storage)->heap;
+ temp_storage.hash = (*storage)->hash;
+
+ hash_table_clear(temp_storage.hash);
+ mem_heap_empty(temp_storage.heap);
+
+ *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+ sizeof(ha_storage_t));
+
+ (*storage)->heap = temp_storage.heap;
+ (*storage)->hash = temp_storage.hash;
+}
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage) /*!< in, own: hash storage */
+{
+ /* order is important because the pointer storage->hash is
+ within the heap */
+ hash_table_free(storage->hash);
+ mem_heap_free(storage->heap);
+}
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage) /*!< in: hash storage */
+{
+ ulint ret;
+
+ ret = mem_heap_get_size(storage->heap);
+
+ /* this assumes hash->heap and hash->heaps are NULL */
+ ret += sizeof(hash_table_t);
+ ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash);
+
+ return(ret);
+}
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
new file mode 100644
index 00000000000..fa202aa773e
--- /dev/null
+++ b/storage/innobase/include/ha_prototypes.h
@@ -0,0 +1,596 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code
+
+Created 5/11/2006 Osku Salerma
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "my_dbug.h"
+#include "mysqld_error.h"
+#include "my_compare.h"
+#include "my_sys.h"
+#include "m_string.h"
+#include "debug_sync.h"
+
+#include "trx0types.h"
+#include "m_ctype.h" /* CHARSET_INFO */
+
+// Forward declarations
+class Field;
+struct fts_string_t;
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert
+ from */
+ uint* errors); /*!< out: number of errors encountered
+ during the conversion */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len); /*!< in: full name length where
+ also the null chars count */
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ THD* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id);/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ THD* thd); /*!< in: thread handle */
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing either a prepare or commit record to the log
+buffer.
+@return the durability property. */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+ const THD* thd) /*!< in: thread handle */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ THD* thd); /*!< in: thread handle */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ THD* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them.
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ __attribute__((nonnull, warn_unused_result));
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return DATA_BINARY, DATA_VARCHAR, ... */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+ ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
+ 'unsigned type';
+ at least ENUM and SET,
+ and unsigned integer
+ types are 'unsigned types' */
+ const void* field) /*!< in: MySQL Field */
+ __attribute__((nonnull));
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen); /*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b); /*!< in: second string to compare */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively. The
+second string contains wildcards.
+@return 0 if a match is found, 1 if not */
+UNIV_INTERN
+int
+innobase_wildcasecmp(
+/*=================*/
+ const char* a, /*!< in: string to compare */
+ const char* b); /*!< in: wildcard string to compare */
+
+/******************************************************************//**
+Strip dir name from a full path name and return only its file name.
+@return file name or "null" if no file name */
+UNIV_INTERN
+const char*
+innobase_basename(
+/*==============*/
+ const char* path_name); /*!< in: full path name */
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+ const THD* thd); /*!< in: thread handle */
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 5 * strlen(to) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes;
+ should be at least 3 * strlen(to) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a); /*!< in/out: string to put in lower case */
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ THD* thd); /*!< in: MySQL thread handle */
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+ THD* thd, /*!< in: MySQL thread handle */
+ size_t* length) /*!< out: length of the SQL statement */
+ __attribute__((nonnull));
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str); /*!< in: character string */
+
+/*************************************************************//**
+InnoDB index push-down condition check
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+UNIV_INTERN
+enum icp_result
+innobase_index_cond(
+/*================*/
+ void* file) /*!< in/out: pointer to ha_innobase */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return true if thd supports XA */
+UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+ THD* thd); /*!< in: thread handle, or NULL to query
+ the global innodb_supports_xa */
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ THD* thd); /*!< in: thread handle, or NULL to query
+ the global innodb_lock_wait_timeout */
+/******************************************************************//**
+Add up the time waited for the lock for the current query. */
+UNIV_INTERN
+void
+thd_set_lock_wait_time(
+/*===================*/
+ THD* thd, /*!< in/out: thread handle */
+ ulint value); /*!< in: time waited for the lock */
+
+/**********************************************************************//**
+Get the current setting of the table_cache_size global parameter. We do
+a dirty read because for one there is no synchronization object and
+secondly there is little harm in doing so even if we get a torn read.
+@return value of table_cache_size */
+UNIV_INTERN
+ulint
+innobase_get_table_cache_size(void);
+/*===============================*/
+
+/**********************************************************************//**
+Get the current setting of the lower_case_table_names global parameter from
+mysqld.cc. We do a dirty read because for one there is no synchronization
+object and secondly there is little harm in doing so even if we get a torn
+read.
+@return value of lower_case_table_names */
+UNIV_INTERN
+ulint
+innobase_get_lower_case_table_names(void);
+/*=====================================*/
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+UNIV_INTERN
+int
+innobase_close_thd(
+/*===============*/
+ THD* thd); /*!< in: MySQL thread handle for
+ which to close the connection */
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+UNIV_INTERN
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+ CHARSET_INFO* charset, /*!< in: Character set */
+ const byte* start, /*!< in: start of text */
+ const byte* end, /*!< in: one character past end of
+ text */
+ fts_string_t* token, /*!< out: token's text */
+ ulint* offset); /*!< out: offset to token,
+ measured as characters from
+ 'start' */
+
+/******************************************************************//**
+Compares two character strings case insensitively according to their
+charset. */
+UNIV_INTERN
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+ const void* cs, /*!< in: Character set */
+ const void* p1, /*!< in: key */
+ const void* p2); /*!< in: node */
+
+/****************************************************************//**
+Get FTS field charset info from the field's prtype
+@return charset info */
+UNIV_INTERN
+CHARSET_INFO*
+innobase_get_fts_charset(
+/*=====================*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number);/*!< in: number of the charset */
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+ THD* thd); /*!< in/out: thread handle */
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction. */
+UNIV_INTERN
+ibool
+thd_trx_is_auto_commit(
+/*===================*/
+ THD* thd); /*!< in: thread handle, or NULL */
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(): converts a table or
+index name to the MySQL system_charset_info (UTF-8) and quotes it if
+needed. Unlike innobase_convert_name(), it returns nothing; the result
+is written to "buf". */
+UNIV_INTERN
+void
+innobase_format_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted
+ identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* name, /*!< in: index or table name
+ to format */
+ ibool is_index_name) /*!< in: index name */
+ __attribute__((nonnull));
+
+/** Corresponds to Sql_condition::enum_warning_level. */
+enum ib_log_level_t {
+ IB_LOG_LEVEL_INFO,
+ IB_LOG_LEVEL_WARN,
+ IB_LOG_LEVEL_ERROR,
+ IB_LOG_LEVEL_FATAL
+};
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_errf(
+/*====*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+ __attribute__((format(printf, 4, 5)));
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client, it is a wrapper around:
+
+void push_warning_printf(
+ THD *thd, Sql_condition::enum_warning_level level,
+ uint code, const char *format, ...);
+*/
+UNIV_INTERN
+void
+ib_senderrf(
+/*========*/
+ THD* thd, /*!< in/out: session */
+ ib_log_level_t level, /*!< in: warning level */
+ ib_uint32_t code, /*!< MySQL error code */
+ ...); /*!< Args */
+
+/******************************************************************//**
+Write a message to the MySQL log, prefixed with "InnoDB: ".
+Wrapper around sql_print_information() */
+UNIV_INTERN
+void
+ib_logf(
+/*====*/
+ ib_log_level_t level, /*!< in: warning level */
+ const char* format, /*!< printf format */
+ ...) /*!< Args */
+ __attribute__((format(printf, 2, 3)));
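+
+/* Illustrative usage sketch (not part of the original file): ib_logf()
+follows printf conventions; the message text and "name" variable below
+are hypothetical.
+
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"Cannot open tablespace %s", name);
+*/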
+
+/******************************************************************//**
+Returns the NUL terminated value of glob_hostname.
+@return pointer to glob_hostname. */
+UNIV_INTERN
+const char*
+server_get_hostname();
+/*=================*/
+
+/******************************************************************//**
+Get the error message format string.
+@return the format string or 0 if not found. */
+UNIV_INTERN
+const char*
+innobase_get_err_msg(
+/*=================*/
+ int error_code); /*!< in: MySQL error code */
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with "need" set to the number of
+values we want to reserve for multi-value inserts, e.g.,
+
+	INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with need = 3 where
+autoinc_lock_mode != TRADITIONAL, because we want to reserve 3 values
+for the multi-value INSERT above.
+@return the next value */
+UNIV_INTERN
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong need, /*!< in: count of values needed */
+ ulonglong step, /*!< in: AUTOINC increment step */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+ __attribute__((pure, warn_unused_result));
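+
+/* Worked example (illustrative): with 3 replication nodes configured
+as step = 3 and offset = 2, the values handed out on one node form the
+progression 2, 5, 8, ... A call with current = 7 and need = 1 would
+therefore return 8, assuming no overflow against max_value. */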
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+UNIV_INTERN
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/**********************************************************************
+Check if the length of the identifier exceeds the maximum allowed.
+The input to this function is an identifier in charset my_charset_filename.
+@return true when the length of the identifier is too long. */
+UNIV_INTERN
+my_bool
+innobase_check_identifier_length(
+/*=============================*/
+ const char* id); /* in: identifier to check. it must belong
+ to charset my_charset_filename */
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to UTF-8 charset. */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len, /* in: length of 'to', in bytes */
+ uint* errors); /* out: error return */
+
+/**********************************************************************
+Converts an identifier from UTF-8 to the my_charset_filename charset. */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+ char* to, /* out: converted identifier */
+ const char* from, /* in: identifier to convert */
+ ulint len); /* in: length of 'to', in bytes */
+
+
+#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h
new file mode 100644
index 00000000000..66b963ae39a
--- /dev/null
+++ b/storage/innobase/include/handler0alter.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets)/*!< in: rec_get_offsets(
+ rec, index, ...) */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB index entry to table->record[0]. */
+UNIV_INTERN
+void
+innobase_fields_to_mysql(
+/*=====================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_index_t* index, /*!< in: InnoDB index */
+ const dfield_t* fields) /*!< in: InnoDB index fields */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0]. */
+UNIV_INTERN
+void
+innobase_row_to_mysql(
+/*==================*/
+ struct TABLE* table, /*!< in/out: MySQL table */
+ const dict_table_t* itab, /*!< in: InnoDB table */
+ const dtuple_t* row) /*!< in: InnoDB row */
+ __attribute__((nonnull));
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+ struct TABLE* table) /*!< in/out: MySQL table */
+ __attribute__((nonnull));
+
+/** Generate the next autoinc based on a snapshot of the session
+auto_increment_increment and auto_increment_offset variables. */
+struct ib_sequence_t {
+
+ /**
+ @param thd - the session
+ @param start_value - the lower bound
+ @param max_value - the upper bound (inclusive) */
+ ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value);
+
+ /**
+ Postfix increment
+ @return the value to insert */
+ ulonglong operator++(int) UNIV_NOTHROW;
+
+ /** Check if the autoinc "sequence" is exhausted.
+ @return true if the sequence is exhausted */
+ bool eof() const UNIV_NOTHROW
+ {
+ return(m_eof);
+ }
+
+ /**
+ @return the next value in the sequence */
+ ulonglong last() const UNIV_NOTHROW
+ {
+ ut_ad(m_next_value > 0);
+
+ return(m_next_value);
+ }
+
+	/** Maximum column value if adding an AUTOINC column, else 0. Once
+	we reach the end of the sequence it will be set to ~0. */
+ const ulonglong m_max_value;
+
+ /** Value of auto_increment_increment */
+ ulong m_increment;
+
+ /** Value of auto_increment_offset */
+ ulong m_offset;
+
+ /** Next value in the sequence */
+ ulonglong m_next_value;
+
+ /** true if no more values left in the sequence */
+ bool m_eof;
+};
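+
+/* Illustrative usage sketch (not part of the original file): values are
+drawn with the postfix increment until the snapshot of the session
+variables is exhausted. "thd" is assumed to be in scope and
+write_row_with_autoinc() is a hypothetical consumer.
+
+	ib_sequence_t	seq(thd, 1, 1000);
+
+	while (!seq.eof()) {
+		ulonglong	value = seq++;
+
+		write_row_with_autoinc(value);
+	}
+*/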
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
new file mode 100644
index 00000000000..6f9a628df5d
--- /dev/null
+++ b/storage/innobase/include/hash0hash.h
@@ -0,0 +1,575 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_table_t;
+struct hash_cell_t;
+
+typedef void* hash_node_t;
+
+/* Fix Bug #13859: symbol collision between imap/mysql */
+#define hash_create hash0_create
+
+/* Different types of hash_table based on the synchronization
+method used for it. */
+enum hash_table_sync_t {
+ HASH_TABLE_SYNC_NONE = 0, /*!< Don't use any internal
+ synchronization objects for
+ this hash_table. */
+ HASH_TABLE_SYNC_MUTEX, /*!< Use mutexes to control
+ access to this hash_table. */
+ HASH_TABLE_SYNC_RW_LOCK /*!< Use rw_locks to control
+ access to this hash_table. */
+};
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+ ulint n); /*!< in: number of array cells */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a sync object array to protect a hash table.
+::sync_obj can be mutexes or rw_locks depending on the type of
+hash table. */
+UNIV_INTERN
+void
+hash_create_sync_obj_func(
+/*======================*/
+ hash_table_t* table, /*!< in: hash table */
+ enum hash_table_sync_t type, /*!< in: HASH_TABLE_SYNC_MUTEX
+ or HASH_TABLE_SYNC_RW_LOCK */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level,/*!< in: latching order level
+ of the mutexes: used in the
+ debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_sync_obj);/*!< in: number of sync objects,
+ must be a power of 2 */
+#ifdef UNIV_SYNC_DEBUG
+# define hash_create_sync_obj(t, s, n, level) \
+ hash_create_sync_obj_func(t, s, level, n)
+#else /* UNIV_SYNC_DEBUG */
+# define hash_create_sync_obj(t, s, n, level) \
+ hash_create_sync_obj_func(t, s, n)
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
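+
+/* Illustrative usage sketch (not part of the original file): a table
+protected by an array of mutexes is built in two steps; the number of
+sync objects must be a power of 2, and SYNC_NO_ORDER_CHECK stands in
+here for a real latching-order level.
+
+	hash_table_t*	table = hash_create(1024);
+
+	hash_create_sync_obj(table, HASH_TABLE_SYNC_MUTEX,
+			     16, SYNC_NO_ORDER_CHECK);
+*/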
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table); /*!< in, own: hash table */
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table); /*!< in: hash table */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Assert that the mutex for the table is held */
+# define HASH_ASSERT_OWN(TABLE, FOLD) \
+ ut_ad((TABLE)->type != HASH_TABLE_SYNC_MUTEX \
+ || (mutex_own(hash_get_mutex((TABLE), FOLD))));
+#else /* !UNIV_HOTBACKUP */
+# define HASH_ASSERT_OWN(TABLE, FOLD)
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWN(TABLE, FOLD)\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+} while (0)
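+
+/* Illustrative usage sketch (not part of the original file): the NAME
+argument names the link field of the stored struct, so a chainable node
+only needs a fold value and a pointer member. foo_t and node are
+hypothetical; the 'hash' member is the link field passed as NAME.
+
+	struct foo_t {
+		ulint	fold;
+		foo_t*	hash;
+	};
+
+	HASH_INSERT(foo_t, hash, table, node->fold, node);
+*/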
+
+#ifdef UNIV_HASH_DEBUG
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWN(TABLE, FOLD)\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == DATA) {\
+ HASH_ASSERT_VALID(DATA->NAME);\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ ut_a(struct3333);\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+ HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL)\
+ (hash_get_nth_cell(TABLE, HASH_VAL)->node)
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/********************************************************************//**
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+\
+ HASH_ASSERT_OWN(TABLE, FOLD)\
+\
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+ HASH_ASSERT_VALID(DATA);\
+\
+ while ((DATA) != NULL) {\
+ ASSERTION;\
+ if (TEST) {\
+ break;\
+ } else {\
+ HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
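+
+/* Illustrative usage sketch (not part of the original file), reusing
+the hypothetical foo_t above: TEST is re-evaluated with DATA bound to
+each node in the chain until it yields true or the chain ends, leaving
+DATA NULL.
+
+	foo_t*	node;
+
+	HASH_SEARCH(hash, table, fold, foo_t*, node,
+		    ut_ad(node != NULL), node->fold == fold);
+*/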
+
+/********************************************************************//**
+Looks for an item in all hash buckets. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \
+do { \
+ ulint i3333; \
+ \
+ for (i3333 = (TABLE)->n_cells; i3333--; ) { \
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \
+ \
+ while ((DATA) != NULL) { \
+ HASH_ASSERT_VALID(DATA); \
+ ASSERTION; \
+ \
+ if (TEST) { \
+ break; \
+ } \
+ \
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \
+ } \
+ \
+ if ((DATA) != NULL) { \
+ break; \
+ } \
+ } \
+} while (0)
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n); /*!< in: cell index */
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table); /*!< in/out: hash table */
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table); /*!< in: table */
+/*******************************************************************//**
+Deletes a struct which is stored in the heap of the hash table, and compacts
+the heap. The fold value must be stored in the struct NODE in a field named
+'fold'. */
+
+#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
+do {\
+ TYPE* node111;\
+ TYPE* top_node111;\
+ hash_cell_t* cell111;\
+ ulint fold111;\
+\
+ fold111 = (NODE)->fold;\
+\
+ HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
+\
+ top_node111 = (TYPE*) mem_heap_get_top(\
+ hash_get_heap(TABLE, fold111),\
+ sizeof(TYPE));\
+\
+ /* If the node to remove is not the top node in the heap, compact the\
+ heap of nodes by moving the top node in the place of NODE. */\
+\
+ if (NODE != top_node111) {\
+\
+ /* Copy the top node in place of NODE */\
+\
+ *(NODE) = *top_node111;\
+\
+ cell111 = hash_get_nth_cell(TABLE,\
+ hash_calc_hash(top_node111->fold, TABLE));\
+\
+ /* Look for the pointer to the top node, to update it */\
+\
+ if (cell111->node == top_node111) {\
+ /* The top node is the first in the chain */\
+\
+ cell111->node = NODE;\
+ } else {\
+ /* We have to look for the predecessor of the top\
+ node */\
+ node111 = static_cast<TYPE*>(cell111->node);\
+\
+ while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
+\
+ node111 = static_cast<TYPE*>(\
+ HASH_GET_NEXT(NAME, node111));\
+ }\
+\
+ /* Now we have the predecessor node */\
+\
+ node111->NAME = NODE;\
+ }\
+ }\
+\
+ /* Free the space occupied by the top node */\
+\
+ mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
+} while (0)
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Move all hash table entries from OLD_TABLE to NEW_TABLE. */
+
+#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
+do {\
+ ulint i2222;\
+ ulint cell_count2222;\
+\
+ cell_count2222 = hash_get_n_cells(OLD_TABLE);\
+\
+ for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+ NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\
+\
+ while (node2222) {\
+ NODE_TYPE* next2222 = node2222->PTR_NAME;\
+ ulint fold2222 = FOLD_FUNC(node2222);\
+\
+ HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
+ fold2222, node2222);\
+\
+ node2222 = next2222;\
+ }\
+ }\
+} while (0)
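+
+/* Illustrative usage sketch (not part of the original file): growing a
+table by re-folding every node into a larger one. foo_t and foo_fold()
+are hypothetical; foo_fold() must recompute a node's fold value from
+the node pointer.
+
+	hash_table_t*	big = hash_create(2 * hash_get_n_cells(table));
+
+	HASH_MIGRATE(table, big, foo_t, hash, foo_fold);
+	hash_table_free(table);
+*/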
+
+/************************************************************//**
+Gets the sync object index for a fold value in a hash table.
+@return index */
+UNIV_INLINE
+ulint
+hash_get_sync_obj_index(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the heap */
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+ib_mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the mutex */
+/************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the rw_lock */
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+ib_mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_lock(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in mutex of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all_but(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ib_mutex_t* keep_mutex); /*!< in: mutex to keep */
+/************************************************************//**
+S-locks a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_s(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+X-locks a lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_lock_x(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Releases an s-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_s(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Releases an x-lock for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_lock_x_all(
+/*============*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all the locks of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_unlock_x_all(
+/*==============*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all but the passed in lock of a hash table. */
+UNIV_INTERN
+void
+hash_unlock_x_all_but(
+/*==================*/
+ hash_table_t* table, /*!< in: hash table */
+ rw_lock_t* keep_lock); /*!< in: lock to keep */
+
+#else /* !UNIV_HOTBACKUP */
+# define hash_get_heap(table, fold) ((table)->heap)
+# define hash_mutex_enter(table, fold) ((void) 0)
+# define hash_mutex_exit(table, fold) ((void) 0)
+# define hash_mutex_enter_all(table) ((void) 0)
+# define hash_mutex_exit_all(table) ((void) 0)
+# define hash_mutex_exit_all_but(t, m) ((void) 0)
+# define hash_lock_s(t, f) ((void) 0)
+# define hash_lock_x(t, f) ((void) 0)
+# define hash_unlock_s(t, f) ((void) 0)
+# define hash_unlock_x(t, f) ((void) 0)
+# define hash_lock_x_all(t) ((void) 0)
+# define hash_unlock_x_all(t) ((void) 0)
+# define hash_unlock_x_all_but(t, l) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_cell_t{
+ void* node; /*!< hash chain node, NULL if none */
+};
+
+/* The hash table structure */
+struct hash_table_t {
+	enum hash_table_sync_t type; /*!< type of hash_table. */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ ibool adaptive;/* TRUE if this is the hash
+ table of the adaptive hash
+ index */
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ulint n_cells;/* number of cells in the hash table */
+ hash_cell_t* array; /*!< pointer to cell array */
+#ifndef UNIV_HOTBACKUP
+	ulint n_sync_obj;/* if sync_obj is non-NULL, then
+					the number of mutexes or the
+					number of rw_locks, depending
+					on the type. Must be a power
+					of 2 */
+ union {
+ ib_mutex_t* mutexes;/* NULL, or an array of mutexes
+ used to protect segments of the
+ hash table */
+		rw_lock_t* rw_locks;/* NULL, or an array of rw_locks
+ used to protect segments of the
+ hash table */
+ } sync_obj;
+
+ mem_heap_t** heaps; /*!< if this is non-NULL, hash
+ chain nodes for external chaining
+ can be allocated from these memory
+					heaps; there are then n_sync_obj
+					of these heaps */
+#endif /* !UNIV_HOTBACKUP */
+ mem_heap_t* heap;
+#ifdef UNIV_DEBUG
+ ulint magic_n;
+# define HASH_TABLE_MAGIC_N 76561114
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic
new file mode 100644
index 00000000000..254f3f82e5d
--- /dev/null
+++ b/storage/innobase/include/hash0hash.ic
@@ -0,0 +1,225 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.ic
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ut0rnd.h"
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n) /*!< in: cell index */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(n < table->n_cells);
+
+ return(table->array + n);
+}
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table) /*!< in/out: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ memset(table->array, 0x0,
+ table->n_cells * sizeof(*table->array));
+}
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ return(table->n_cells);
+}
+
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table) /*!< in: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ return(ut_hash_ulint(fold, table->n_cells));
+}
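+
+/* Editor's sketch (not part of the original source): how a fold value
+resolves to a chain head using only the functions above. The node type
+behind cell->node is supplied by the caller through the HASH_ macros. */
+#if 0
+	hash_cell_t*	cell;
+
+	cell = hash_get_nth_cell(table, hash_calc_hash(fold, table));
+	/* cell->node is the first node of the chain, or NULL if empty. */
+#endif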
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Gets the sync object index for a fold value in a hash table.
+@return index */
+UNIV_INLINE
+ulint
+hash_get_sync_obj_index(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+ ut_ad(ut_is_2pow(table->n_sync_obj));
+ return(ut_2pow_remainder(hash_calc_hash(fold, table),
+ table->n_sync_obj));
+}
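+
+/* Editor's note (illustrative): since n_sync_obj is a power of 2,
+ut_2pow_remainder() is just a bit mask. For example, with
+n_sync_obj == 16 and hash_calc_hash() == 47, the sync object index is
+47 & (16 - 1) == 15. */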
+
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the heap */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(table->type != HASH_TABLE_SYNC_NONE);
+ ut_ad(i < table->n_sync_obj);
+
+ return(table->heaps[i]);
+}
+
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ if (table->heap) {
+ return(table->heap);
+ }
+
+ i = hash_get_sync_obj_index(table, fold);
+
+ return(hash_get_nth_heap(table, i));
+}
+
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+ib_mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the mutex */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(table->type == HASH_TABLE_SYNC_MUTEX);
+ ut_ad(i < table->n_sync_obj);
+
+ return(table->sync_obj.mutexes + i);
+}
+
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+ib_mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ i = hash_get_sync_obj_index(table, fold);
+
+ return(hash_get_nth_mutex(table, i));
+}
+
+/************************************************************//**
+Gets the nth rw_lock in a hash table.
+@return rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_nth_lock(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the rw_lock */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(i < table->n_sync_obj);
+
+ return(table->sync_obj.rw_locks + i);
+}
+
+/************************************************************//**
+Gets the rw_lock for a fold value in a hash table.
+@return rw_lock */
+UNIV_INLINE
+rw_lock_t*
+hash_get_lock(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ i = hash_get_sync_obj_index(table, fold);
+
+ return(hash_get_nth_lock(table, i));
+}
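+
+/* Editor's sketch (not part of the original source): the usual pattern
+for a reader of a table of type HASH_TABLE_SYNC_RW_LOCK, using the
+hash_lock_s()/hash_unlock_s() macros declared in hash0hash.h. */
+#if 0
+	hash_lock_s(table, fold);
+
+	cell = hash_get_nth_cell(table, hash_calc_hash(fold, table));
+	/* ... search the chain starting at cell->node ... */
+
+	hash_unlock_s(table, fold);
+#endif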
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000000..9c3b686c998
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,467 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "ibuf0types.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE (25)
+
+/* Possible operations buffered in the insert/change buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+ IBUF_OP_INSERT = 0,
+ IBUF_OP_DELETE_MARK = 1,
+ IBUF_OP_DELETE = 2,
+
+ /* Number of different operation types. */
+ IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered. Because the enum
+values are used for indexing innobase_change_buffering_values[], they
+should start at 0 and there should not be any gaps. */
+typedef enum {
+ IBUF_USE_NONE = 0,
+ IBUF_USE_INSERT, /* insert */
+ IBUF_USE_DELETE_MARK, /* delete */
+ IBUF_USE_INSERT_DELETE_MARK, /* insert+delete */
+ IBUF_USE_DELETE, /* delete+purge */
+ IBUF_USE_ALL, /* insert+delete+purge */
+
+ IBUF_USE_COUNT /* number of entries in ibuf_use_t */
+} ibuf_use_t;
+
+/** Operations that can currently be buffered. */
+extern ibuf_use_t ibuf_use;
+
+/** The insert buffer control structure */
+extern ibuf_t* ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
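+
+/* Editor's sketch (not part of the original source): the buffering
+decision as a caller might make it with the functions declared below.
+page_is_in_buf_pool() is a hypothetical placeholder for the buffer
+pool residency check; the other arguments come from the caller. */
+#if 0
+	if (!page_is_in_buf_pool(space, page_no)
+	    && ibuf_should_try(index, 0)
+	    && ibuf_insert(IBUF_OP_INSERT, entry, index,
+			   space, zip_size, page_no, thr)) {
+		/* Buffered: the leaf page was not read into memory. */
+	} else {
+		/* Fall back to modifying the B-tree leaf page itself. */
+	}
+#endif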
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
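+
+/* Editor's note (illustrative): concretely, decreasing the bits may be
+done in a separate mini-transaction (see ibuf_reset_free_bits() and
+ibuf_update_free_bits_if_full() below), while increasing them must
+happen in the very mini-transaction that frees the space on the page
+(see ibuf_update_free_bits_low()). */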
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+UNIV_INTERN
+void
+ibuf_max_size_update(
+/*=================*/
+ ulint new_val); /*!< in: new value in terms of
+ percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+ __attribute__((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ buf_block_t* block, /*!< in: bitmap page */
+ mtr_t* mtr); /*!< in: mtr */
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block); /*!< in: index page; free bits are set to 0
+					if the index is non-clustered and
+					non-unique, and the page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+A basic partial test of whether an insert to the insert buffer could be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique); /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden for threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull, pure));
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page_low(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number */
+#ifdef UNIV_DEBUG
+ ibool x_latch,/*!< in: FALSE if relaxed check
+ (avoid latching the bitmap page) */
+#endif /* UNIV_DEBUG */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr which will contain an
+ x-latch to the bitmap page if the page
+ is not one of the fixed address ibuf
+				pages, or NULL, in which case a new
+				mini-transaction is created. */
+ __attribute__((warn_unused_result));
+#ifdef UNIV_DEBUG
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of
+pages. Must not be called when recv_no_ibuf_operations==TRUE.
+@param space tablespace identifier
+@param zip_size compressed page size in bytes, or 0
+@param page_no page number
+@param mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(space, zip_size, page_no, mtr) \
+ ibuf_page_low(space, zip_size, page_no, TRUE, __FILE__, __LINE__, mtr)
+#else /* UNIV_DEBUG */
+/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of
+pages. Must not be called when recv_no_ibuf_operations==TRUE.
+@param space tablespace identifier
+@param zip_size compressed page size in bytes, or 0
+@param page_no page number
+@param mtr mini-transaction or NULL
+@return TRUE if level 2 or level 3 page */
+# define ibuf_page(space, zip_size, page_no, mtr) \
+ ibuf_page_low(space, zip_size, page_no, __FILE__, __LINE__, mtr)
+#endif /* UNIV_DEBUG */
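+
+/* Editor's note (illustrative): ibuf_page() forwards __FILE__/__LINE__
+so that debug builds can attribute the bitmap-page latch to its call
+site. A caller that only needs the classification can pass NULL for
+mtr, e.g.:
+
+	if (ibuf_page(space, zip_size, page_no, NULL)) {
+		... the page belongs to the insert buffer ...
+	}
+*/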
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+/*********************************************************************//**
+Buffer an operation in the insert/delete buffer, instead of doing it
+directly to the disk page, if this is possible. Does not do it if the index
+is clustered or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+ ibuf_op_t op, /*!< in: operation type */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+applies any buffered operations to the page and deletes the entries from the
+insert buffer. If the page is not read, but created in the buffer pool, this
+function deletes its buffered entries from the insert buffer; there can
+exist entries for such a page if the page belonged to an index which
+subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ buf_block_t* block, /*!< in: if page has been read from
+ disk, pointer to the page x-latched,
+ else NULL */
+ ulint space, /*!< in: space id of the index page */
+ ulint page_no,/*!< in: page number of the index page */
+ ulint zip_size,/*!< in: compressed page size in bytes,
+ or 0 */
+ ibool update_ibuf_bitmap);/*!< in: normally this is set
+ to TRUE, but if we have deleted or are
+ deleting the tablespace, then we
+ naturally do not want to update a
+ non-existent bitmap page */
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space); /*!< in: space id */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_in_background(
+/*========================*/
+ table_id_t table_id, /*!< in: if merge should be done only
+ for a specific table, for all tables
+ this should be 0 */
+ ibool full); /*!< in: TRUE if the caller wants to
+ do a full contract based on PCT_IO(100).
+ If FALSE then the size of contract
+ batch is determined based on the
+ current size of the ibuf tree. */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+#endif
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return true if empty */
+UNIV_INTERN
+bool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+/********************************************************************//**
+Reads the first two bytes from a record's fourth field (the counter field
+in new records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if it cannot be read */
+UNIV_INTERN
+ulint
+ibuf_rec_get_counter(
+/*=================*/
+ const rec_t* rec); /*!< in: ibuf record */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+UNIV_INTERN
+void
+ibuf_close(void);
+/*============*/
+
+/******************************************************************//**
+Checks the insert buffer bitmaps on IMPORT TABLESPACE.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+ibuf_check_bitmap_on_import(
+/*========================*/
+ const trx_t* trx, /*!< in: transaction */
+ ulint space_id) /*!< in: tablespace identifier */
+ __attribute__((nonnull, warn_unused_result));
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+#endif /* !UNIV_HOTBACKUP */
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID 0
+
+#ifndef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.ic b/storage/innobase/include/ibuf0ibuf.ic
new file mode 100644
index 00000000000..21747fdceac
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.ic
@@ -0,0 +1,367 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0lru.h"
+
+/** An index page must contain at least UNIV_PAGE_SIZE /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
+
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+{
+ mtr_start(mtr);
+ mtr->inside_ibuf = TRUE;
+}
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->inside_ibuf);
+ ut_d(mtr->inside_ibuf = FALSE);
+ mtr_commit(mtr);
+}
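+
+/* Editor's sketch (not part of the original source): the intended
+pairing of the two helpers above; any check between them will see
+ibuf_inside(&mtr) == TRUE. */
+#if 0
+	mtr_t	mtr;
+
+	ibuf_mtr_start(&mtr);
+	/* ... access insert buffer pages ... */
+	ibuf_mtr_commit(&mtr);
+#endif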
+
+/** Insert buffer struct */
+struct ibuf_t{
+ ulint size; /*!< current size of the ibuf index
+ tree, in pages */
+ ulint max_size; /*!< recommended maximum size of the
+ ibuf index tree, in pages */
+ ulint seg_size; /*!< allocated pages of the file
+ segment containing ibuf header and
+ tree */
+ bool empty; /*!< Protected by the page
+ latch of the root page of the
+ insert buffer tree
+ (FSP_IBUF_TREE_ROOT_PAGE_NO). true
+ if and only if the insert
+ buffer tree is empty. */
+ ulint free_list_len; /*!< length of the free list */
+ ulint height; /*!< tree height */
+ dict_index_t* index; /*!< insert buffer index */
+
+ ulint n_merges; /*!< number of pages merged */
+ ulint n_merged_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ merged to index pages */
+ ulint n_discarded_ops[IBUF_OP_COUNT];
+ /*!< number of operations of each type
+ discarded without merging due to the
+ tablespace being deleted or the
+ index being dropped */
+};
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val); /*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test of whether an insert to the insert buffer could be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique) /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+{
+ return(ibuf_use != IBUF_USE_NONE
+ && ibuf->max_size != 0
+ && !dict_index_is_clust(index)
+ && index->table->quiesce == QUIESCE_NONE
+ && (ignore_sec_unique || !dict_index_is_unique(index)));
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden for threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+ const mtr_t* mtr) /*!< in: mini-transaction */
+{
+ return(mtr->inside_ibuf);
+}
+
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return((page_no & (UNIV_PAGE_SIZE - 1))
+ == FSP_IBUF_BITMAP_OFFSET);
+ }
+
+ return((page_no & (zip_size - 1)) == FSP_IBUF_BITMAP_OFFSET);
+}
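+
+/* Editor's note (illustrative): for uncompressed pages the test is
+page_no modulo UNIV_PAGE_SIZE, so bitmap pages recur at a stride of
+UNIV_PAGE_SIZE pages; assuming FSP_IBUF_BITMAP_OFFSET == 1 and 16KiB
+pages, that is page 1, 16385, 32769, and so on. */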
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+/*===========================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint max_ins_size) /*!< in: maximum insert size after reorganize
+ for the page */
+{
+ ulint n;
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ if (zip_size) {
+ n = max_ins_size
+ / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ } else {
+ n = max_ins_size
+ / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
+
+/*********************************************************************//**
+Translates the ibuf free bits to the free space on a page in bytes.
+@return maximum insert size after reorganize for the page */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_from_bits(
+/*================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bits) /*!< in: value for ibuf bitmap bits */
+{
+ ut_ad(bits < 4);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ if (zip_size) {
+ if (bits == 3) {
+ return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (bits == 3) {
+ return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE));
+}
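+
+/* Editor's note (worked example, not in the original source): with
+UNIV_PAGE_SIZE == 16384 and IBUF_PAGE_SIZE_PER_FREE_SPACE == 32 the
+granularity is 512 bytes, so:
+
+	ibuf_index_page_calc_free_bits(0, 1400) == 2
+	ibuf_index_page_calc_free_from_bits(0, 2) == 1024
+	ibuf_index_page_calc_free_from_bits(0, 3) == 2048
+
+Bits value 3 is only reported for at least 4 granules (2048 bytes), and
+exactly 3 granules are rounded down to bits value 2, so the bitmap
+never overstates the free space. */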
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+ ulint zip_size,
+ /*!< in: compressed page size in bytes */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint max_ins_size;
+ const page_zip_des_t* page_zip;
+ lint zip_max_ins;
+
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+ ut_ad(zip_size);
+
+ /* Consider the maximum insert size on the uncompressed page
+ without reorganizing the page. We must not assume anything
+ about the compression ratio. If zip_max_ins > max_ins_size and
+ there is 1/4 garbage on the page, recompression after the
+ reorganize could fail, in theory. So, let us guarantee that
+ merging a buffered insert to a compressed page will always
+ succeed without reorganizing or recompressing the page, just
+ by using the page modification log. */
+ max_ins_size = page_get_max_insert_size(
+ buf_block_get_frame(block), 1);
+
+ page_zip = buf_block_get_page_zip(block);
+ zip_max_ins = page_zip_max_ins_size(page_zip,
+ FALSE/* not clustered */);
+
+ if (zip_max_ins < 0) {
+ return(0);
+ } else if (max_ins_size > (ulint) zip_max_ins) {
+ max_ins_size = (ulint) zip_max_ins;
+ }
+
+ return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+
+ if (!zip_size) {
+ ulint max_ins_size;
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ return(ibuf_index_page_calc_free_bits(0, max_ins_size));
+ } else {
+ return(ibuf_index_page_calc_free_zip(zip_size, block));
+ }
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ ut_ad(!buf_block_get_page_zip(block));
+
+ before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+
+ if (max_ins_size >= increase) {
+#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX
+# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE_MAX"
+#endif
+ after = ibuf_index_page_calc_free_bits(0, max_ins_size
+ - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(0, block));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(0, block);
+ }
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent pages to which we
+		cannot make buffered inserts from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(block, after, before);
+ }
+}
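+
+/* Editor's sketch (not part of the original source): a typical call
+after appending a record of known size rec_size to an uncompressed
+page whose maximum insert size before the operation was max_ins_size;
+pass ULINT_UNDEFINED when the space consumed is not known. */
+#if 0
+	ibuf_update_free_bits_if_full(block, max_ins_size, rec_size);
+#endif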
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/ibuf0types.h b/storage/innobase/include/ibuf0types.h
new file mode 100644
index 00000000000..3fdbf078b0b
--- /dev/null
+++ b/storage/innobase/include/ibuf0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0types.h
+Insert buffer global types
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+struct ibuf_t;
+
+#endif
diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h
new file mode 100644
index 00000000000..0054850b526
--- /dev/null
+++ b/storage/innobase/include/lock0iter.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "univ.i"
+#include "lock0types.h"
+
+struct lock_queue_iterator_t {
+ const lock_t* current_lock;
+ /* In case this is a record lock queue (not table lock queue)
+ then bit_no is the record number within the heap in which the
+ record is stored. */
+ ulint bit_no;
+};
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no);/*!< in: record number in the
+ heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue; returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+moved one step backwards (if non-NULL is returned).
+@return previous lock or NULL */
+
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter); /*!< in/out: iterator */
+
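+/* Editor's sketch (not part of the original source): scanning a lock
+queue backwards from a given lock; ULINT_UNDEFINED is the bit_no for a
+table lock or a record wait lock, as described above. */
+#if 0
+	lock_queue_iterator_t	iter;
+	const lock_t*		prev;
+
+	lock_queue_iterator_reset(&iter, lock, ULINT_UNDEFINED);
+
+	while ((prev = lock_queue_iterator_get_prev(&iter)) != NULL) {
+		/* ... inspect each earlier lock in the queue ... */
+	}
+#endif
+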
+#endif /* lock0iter_h */
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
new file mode 100644
index 00000000000..6d5ed35d5d8
--- /dev/null
+++ b/storage/innobase/include/lock0lock.h
@@ -0,0 +1,979 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "univ.i"
+#include "buf0types.h"
+#include "trx0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "lock0types.h"
+#include "read0types.h"
+#include "hash0hash.h"
+#include "srv0srv.h"
+#include "ut0vec.h"
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void);
+/*===============*/
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells); /*!< in: number of slots in lock hash table */
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void);
+/*================*/
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block); /*!< in: buffer block */
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock);/*!< in: copy of the old, not
+ reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec); /*!< in: record on page: this
+ is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end); /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block); /*!< in: merged index
+ page which will be
+ discarded */
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they make sense only on leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root); /*!< in: root page */
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block); /*!< in: index page;
+ NOT the root! */
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block); /*!< in: merged index page
+ which will be discarded */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no); /*!< in: heap_no of the
+ donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block); /*!< in: index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator);/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+/*********************************************************************//**
+Determines if there are explicit record locks on a page.
+@return an explicit record lock on the page, or NULL if there are none */
+UNIV_INTERN
+lock_t*
+lock_rec_expl_exist_on_page(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+ __attribute__((warn_unused_result));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+	ibool*		inherit)/*!< out: set to TRUE if the newly
+				inserted record may need to inherit
+ LOCK_GAP type locks from the successor
+ record */
+ __attribute__((nonnull, warn_unused_result));
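+
+/* Editor's sketch (not part of the original source): the caller-side
+pattern for the check-and-lock functions; err and the arguments are
+assumed to come from the surrounding insert code. */
+#if 0
+	err = lock_rec_insert_check_and_lock(
+		0, rec, block, index, thr, mtr, &inherit);
+
+	switch (err) {
+	case DB_SUCCESS:
+		break;	/* proceed with the insert */
+	case DB_LOCK_WAIT:
+		break;	/* the transaction waits for the conflicting lock */
+	default:
+		break;	/* DB_DEADLOCK or DB_QUE_THR_SUSPENDED */
+	}
+#endif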
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((warn_unused_result, nonnull));
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((warn_unused_result, nonnull(2,3,4,6)));
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return true if the record is seen, or false if an earlier version of
+the record should be retrieved */
+UNIV_INTERN
+bool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ read_view_t* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even when this function returns false, the
+present version of rec may still be the right one; this must be checked
+from the clustered index record.
+
+@return true if certainly sees, or false if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+bool
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const read_view_t* view) /*!< in: consistent read view */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_table(
+/*=======*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ enum lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+UNIV_INTERN
+void
+lock_table_ix_resurrect(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx); /*!< in/out: transaction */
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in/out: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ enum lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
+/*********************************************************************//**
+Releases a transaction's locks, and releases any other transactions
+waiting because of these locks. Changes the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
+UNIV_INTERN
+void
+lock_trx_release_locks(
+/*===================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+ dict_table_t* table, /*!< in: table to be dropped
+ or truncated */
+ ibool remove_also_table_sx_locks);/*!< in: also removes
+ table S and X locks */
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+ __attribute__((const));
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED
+if none is found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock); /*!< in: record lock with at least one
+ bit set */
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction. The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* dest, /*!< in: destination of ALTER TABLE */
+ enum lock_mode* mode); /*!< out: lock mode of the source table */
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Reports that a transaction id is not sensible, i.e., that it is in the
+future. */
+UNIV_INTERN
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: table type lock */
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: record type lock */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if it was not able to obtain the lock mutex and exited
+without printing info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+	ibool	nowait)	/*!< in: TRUE if we should not wait
+				for the lock mutex */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and more importantly it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+ __attribute__((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+UNIV_INTERN
+trx_id_t
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+UNIV_INTERN
+table_id_t
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+ const lock_t* lock); /*!< in: lock */
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return TRUE if locks exist */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+ const dict_table_t* table); /*!< in: check if there are any locks
+ held on records in this table or on the
+ table itself */
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ user OS thread */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+ trx_t* trx) /*!< in/out: trx lock state */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+UNIV_INTERN
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+UNIV_INTERN
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+ __attribute__((warn_unused_result));
+
+/*******************************************************************//**
+Check if the transaction holds an exclusive lock on a record.
+@return whether the locks are held */
+UNIV_INTERN
+bool
+lock_trx_has_rec_x_lock(
+/*====================*/
+ const trx_t* trx, /*!< in: transaction to check */
+ const dict_table_t* table, /*!< in: table to check */
+ const buf_block_t* block, /*!< in: buffer block of the record */
+ ulint heap_no)/*!< in: record heap number */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/** Lock modes and types */
+/* @{ */
+#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+ type_mode field in a lock */
+/** Lock types */
+/* @{ */
+#define LOCK_TABLE 16 /*!< table lock */
+#define LOCK_REC 32 /*!< record lock */
+#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
+ type_mode field in a lock */
+#if LOCK_MODE_MASK & LOCK_TYPE_MASK
+# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
+#endif
+
+#define LOCK_WAIT 256 /*!< Waiting lock flag; when set, it
+ means that the lock has not yet been
+ granted, it is just waiting for its
+ turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary
+ next-key lock in contrast to LOCK_GAP
+ or LOCK_REC_NOT_GAP */
+#define LOCK_GAP 512 /*!< when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+#define LOCK_REC_NOT_GAP 1024 /*!< this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting
+ gap type record lock request in order to let
+ an insert of an index record to wait until
+ there are no conflicting locks by other
+ transactions on the gap; note that this flag
+ remains set when the waiting lock is granted,
+ or if the lock is inherited to a neighboring
+ record */
+
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
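+
+/* Illustrative sketch (not part of this header): with the layout above,
+the mode, the lock type and the record-lock flags are unpacked from a
+lock's type_mode word with plain bit operations:
+
+	ulint		type_mode = LOCK_REC | LOCK_X
+				| LOCK_REC_NOT_GAP | LOCK_WAIT;
+
+	enum lock_mode	mode	= (enum lock_mode)
+				  (type_mode & LOCK_MODE_MASK);
+	ulint		type	= type_mode & LOCK_TYPE_MASK;
+	ibool		waiting	= (type_mode & LOCK_WAIT) != 0;
+
+Here mode == LOCK_X, type == LOCK_REC and waiting == TRUE. */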
+
+/** Lock operation struct */
+struct lock_op_t{
+ dict_table_t* table; /*!< table to be locked */
+ enum lock_mode mode; /*!< lock mode */
+};
+
+/** The lock system struct */
+struct lock_sys_t{
+ ib_mutex_t mutex; /*!< Mutex protecting the
+ locks */
+ hash_table_t* rec_hash; /*!< hash table of the record
+ locks */
+ ib_mutex_t wait_mutex; /*!< Mutex protecting the
+ next two fields */
+ srv_slot_t* waiting_threads; /*!< Array of user threads
+ suspended while waiting for
+ locks within InnoDB, protected
+ by the lock_sys->wait_mutex */
+ srv_slot_t* last_slot; /*!< highest slot ever used
+ in the waiting_threads array,
+ protected by
+ lock_sys->wait_mutex */
+ ibool rollback_complete;
+ /*!< TRUE if rollback of all
+ recovered transactions is
+ complete. Protected by
+ lock_sys->mutex */
+
+ ulint n_lock_max_wait_time; /*!< Max wait time */
+
+ os_event_t timeout_event; /*!< Set to the event that is
+ created in the lock wait monitor
+ thread. A value of 0 means the
+ thread is not active */
+
+ bool timeout_thread_active; /*!< True if the timeout thread
+ is running */
+};
+
+/** The lock system */
+extern lock_sys_t* lock_sys;
+
+/** Test if lock_sys->mutex can be acquired without waiting. */
+#define lock_mutex_enter_nowait() mutex_enter_nowait(&lock_sys->mutex)
+
+/** Test if lock_sys->mutex is owned. */
+#define lock_mutex_own() mutex_own(&lock_sys->mutex)
+
+/** Acquire the lock_sys->mutex. */
+#define lock_mutex_enter() do { \
+ mutex_enter(&lock_sys->mutex); \
+} while (0)
+
+/** Release the lock_sys->mutex. */
+#define lock_mutex_exit() do { \
+ mutex_exit(&lock_sys->mutex); \
+} while (0)
+
+/** Test if lock_sys->wait_mutex is owned. */
+#define lock_wait_mutex_own() mutex_own(&lock_sys->wait_mutex)
+
+/** Acquire the lock_sys->wait_mutex. */
+#define lock_wait_mutex_enter() do { \
+ mutex_enter(&lock_sys->wait_mutex); \
+} while (0)
+
+/** Release the lock_sys->wait_mutex. */
+#define lock_wait_mutex_exit() do { \
+ mutex_exit(&lock_sys->wait_mutex); \
+} while (0)
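+
+/* Usage sketch (illustrative; assumes the lock system has already been
+created at startup): code outside the lock module brackets any
+inspection of the lock queues with these macros:
+
+	lock_mutex_enter();
+	ut_ad(lock_mutex_own());
+
+	... look at lock_sys->rec_hash ...
+
+	lock_mutex_exit();
+
+lock_mutex_enter_nowait() is the non-blocking variant, used when the
+caller would rather skip the work than stall, as the nowait parameter of
+lock_print_info_summary() above suggests. */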
+
+#ifndef UNIV_NONINL
+#include "lock0lock.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic
new file mode 100644
index 00000000000..736936954cb
--- /dev/null
+++ b/storage/innobase/include/lock0lock.ic
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "row0row.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "row0vers.h"
+#include "que0que.h"
+#include "btr0cur.h"
+#include "read0read.h"
+#include "log0recv.h"
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(hash_calc_hash(lock_rec_fold(space, page_no),
+ lock_sys->rec_hash));
+}
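+
+/* Example (illustrative only): all record locks on the page with
+space id 0 and page number 42 live in one hash cell, found as
+
+	ulint	fold = lock_rec_fold(0, 42);
+	ulint	cell = lock_rec_hash(0, 42);
+
+where cell == hash_calc_hash(fold, lock_sys->rec_hash); the heap number
+of the record within the page is kept in the lock bitmap, not in the
+hash key. */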
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000000..9f7ab9f76b6
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,126 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "univ.i"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "ut0lst.h"
+
+/** A table lock */
+struct lock_table_t {
+ dict_table_t* table; /*!< database table in dictionary
+ cache */
+ UT_LIST_NODE_T(lock_t)
+ locks; /*!< list of locks on the same
+ table */
+};
+
+/** Record lock for a page */
+struct lock_rec_t {
+ ulint space; /*!< space id */
+ ulint page_no; /*!< page number */
+ ulint n_bits; /*!< number of bits in the lock
+ bitmap; NOTE: the lock bitmap is
+ placed immediately after the
+ lock struct */
+};
+
+/** Lock struct; protected by lock_sys->mutex */
+struct lock_t {
+ trx_t* trx; /*!< transaction owning the
+ lock */
+ UT_LIST_NODE_T(lock_t)
+ trx_locks; /*!< list of the locks of the
+ transaction */
+ ulint type_mode; /*!< lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
+ wait flag, ORed */
+ hash_node_t hash; /*!< hash chain node for a record
+ lock */
+ dict_index_t* index; /*!< index for a record lock */
+ union {
+ lock_table_t tab_lock;/*!< table lock */
+ lock_rec_t rec_lock;/*!< record lock */
+ } un_member; /*!< lock details */
+};
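+
+/* Access sketch (illustrative; mirrors what lock/lock0lock.cc does):
+the un_member union is read according to lock_get_type_low(), and for a
+record lock the bitmap sits immediately after the struct in memory:
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+		ulint		space	= lock->un_member.rec_lock.space;
+		ulint		page_no	= lock->un_member.rec_lock.page_no;
+		const byte*	bitmap	= (const byte*) &lock[1];
+
+		... bit i of bitmap covers the record with heap number i ...
+	} else {
+		dict_table_t*	table	= lock->un_member.tab_lock.table;
+		...
+	}
+*/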
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in/out: waiting lock request */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "lock0priv.ic"
+#endif
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic
new file mode 100644
index 00000000000..6b70dc33d3c
--- /dev/null
+++ b/storage/innobase/include/lock0priv.ic
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.cc.
+That is, lock/lock0lock.cc contains more internal inline
+methods, but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_TYPE_MASK);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ return(row_get_rec_trx_id(rec, index, offsets));
+}
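+
+/* Note (an illustration, not the authoritative call site): "implicit
+x-lock" means that the transaction whose id is stored in the DB_TRX_ID
+field of rec may still be active, in which case it owns the record
+without any explicit lock struct. A caller typically does roughly
+
+	trx_id_t	trx_id = lock_clust_rec_some_has_impl(
+					rec, index, offsets);
+
+	if (trx_rw_is_active(trx_id, NULL)) {
+		... convert the implicit lock to an explicit one ...
+	}
+
+using trx_rw_is_active() from trx0sys.h; the exact conversion logic
+lives in the lock module. */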
+
+/* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
new file mode 100644
index 00000000000..cf32e72f864
--- /dev/null
+++ b/storage/innobase/include/lock0types.h
@@ -0,0 +1,47 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+struct lock_t;
+struct lock_sys_t;
+
+/* Basic lock modes */
+enum lock_mode {
+ LOCK_IS = 0, /* intention shared */
+ LOCK_IX, /* intention exclusive */
+ LOCK_S, /* shared */
+ LOCK_X, /* exclusive */
+ LOCK_AUTO_INC, /* locks the auto-inc counter of a table
+ in an exclusive mode */
+	LOCK_NONE,	/* this is used elsewhere to denote a consistent read */
+ LOCK_NUM = LOCK_NONE, /* number of lock modes */
+ LOCK_NONE_UNSET = 255
+};
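+
+/* For orientation (a sketch; the authoritative table is the
+compatibility matrix in lock/lock0lock.cc): lock modes held by two
+different transactions combine as follows, "+" meaning compatible and
+"-" meaning conflicting:
+
+	     IS   IX   S    X    AI
+	IS   +    +    +    -    +
+	IX   +    +    -    -    +
+	S    +    -    +    -    -
+	X    -    -    -    -    -
+	AI   +    +    -    -    -
+*/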
+
+
+#endif
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000000..1318b62c242
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,999 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/* Type used for all log sequence number storage and arithmetic */
+typedef ib_uint64_t lsn_t;
+#define LSN_MAX IB_UINT64_MAX
+
+#define LSN_PF UINT64PF
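+
+/* Example (illustrative only): LSN_PF is the printf-style conversion
+specifier for lsn_t, so an lsn is printed portably as
+
+	fprintf(stderr, "log sequence number " LSN_PF "\n",
+		log_get_lsn());
+*/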
+
+/** Redo log buffer */
+struct log_t;
+/** Redo log group */
+struct log_group_t;
+
+#ifdef UNIV_DEBUG
+/** Flag: write to log file? */
+extern ibool log_do_write;
+/** Flag: enable debug output when writing to the log? */
+extern ibool log_debug_writes;
+#else /* UNIV_DEBUG */
+/** Write to log */
+# define log_do_write TRUE
+#endif /* UNIV_DEBUG */
+
+/** Wait modes for log_write_up_to @{ */
+#define LOG_NO_WAIT 91
+#define LOG_WAIT_ONE_GROUP 92
+#define LOG_WAIT_ALL_GROUPS 93
+/* @} */
+/** Maximum number of log groups in log_group_t::checkpoint_buf */
+#define LOG_MAX_N_GROUPS 32
+
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+ ib_int64_t* log_file_offset, /*!< out: offset in that file
+ (including the header) */
+ ib_uint64_t first_header_lsn, /*!< in: first log file start
+ lsn */
+ ib_uint64_t lsn, /*!< in: lsn whose position to
+ determine */
+ ulint n_log_files, /*!< in: total number of log
+ files */
+ ib_int64_t log_file_size); /*!< in: log file size
+ (including the header) */
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes to the log the string given. The log must be released with
+log_release.
+@return end lsn of the log record, zero if it did not succeed */
+UNIV_INLINE
+lsn_t
+log_reserve_and_write_fast(
+/*=======================*/
+ const void* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ lsn_t* start_lsn);/*!< out: start lsn of the log record */
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void);
+/*=============*/
+/***********************************************************************//**
+Checks whether a log buffer flush or a new checkpoint is needed, and performs
+it if so. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return start lsn of the log record */
+UNIV_INTERN
+lsn_t
+log_reserve_and_open(
+/*=================*/
+ ulint len); /*!< in: length of data to be catenated */
+/************************************************************//**
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+ byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/************************************************************//**
+Closes the log.
+@return lsn */
+UNIV_INTERN
+lsn_t
+log_close(void);
+/*===========*/
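+
+/* Usage sketch (a rough outline of what the mini-transaction commit in
+mtr0mtr.cc does; error handling and the flush-list work are omitted):
+a log record of len bytes is appended either on the fast path or with
+the open/write/close sequence, and the log mutex is then released:
+
+	lsn_t	start_lsn;
+	lsn_t	end_lsn = log_reserve_and_write_fast(rec, len, &start_lsn);
+
+	if (end_lsn == 0) {
+		start_lsn = log_reserve_and_open(len);
+		log_write_low(rec, len);
+		end_lsn = log_close();
+	}
+
+	log_release();
+*/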
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+lsn_t
+log_get_lsn(void);
+/*=============*/
+/****************************************************************//**
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+lsn_t
+log_get_capacity(void);
+/*==================*/
+/****************************************************************//**
+Get log_sys->max_modified_age_async. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return max_modified_age_async */
+UNIV_INLINE
+lsn_t
+log_get_max_modified_age_async(void);
+/*================================*/
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void);
+/*==========*/
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+ ulint id, /*!< in: group id */
+ ulint n_files, /*!< in: number of log files */
+ lsn_t file_size, /*!< in: log file size in bytes */
+ ulint space_id, /*!< in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id); /*!< in: space id of the file space
+ which contains some archived log
+ files for this group; currently, only
+ for the first log group this is
+ used */
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group); /*!< in: log group */
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits for it and checks
+whether it flushed enough. If not, it starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ lsn_t lsn, /*!< in: log sequence number up to which
+ the log should be written, LSN_MAX if not specified */
+ ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk);
+ /*!< in: TRUE if we want the written log
+ also to be flushed to disk */
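+
+/* Example (illustrative only): to make a committed transaction durable,
+the server waits until the log is written and flushed up to the commit
+lsn:
+
+	log_write_up_to(commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);
+
+where commit_lsn (a hypothetical variable here) is the end lsn of the
+transaction's last log record; passing TRUE asks for an fsync of the
+log file as well. */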
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/****************************************************************//**
+This function writes the log buffer to the log file and, if 'flush' is
+set, it forces a flush of the log file as well. This is meant to be
+called from the background master thread only, as it does not wait for
+the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+	ibool	flush);	/*!< in: flush the logs to disk */
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what the lsn of the oldest
+modification in the pool is, and writes information about that lsn to
+the log files. Use log_make_checkpoint_at() to also flush the pool.
+@return TRUE on success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is
+ desired */
+	ibool	write_always);	/*!< in: the function normally checks if
+					the new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ lsn_t lsn, /*!< in: make a checkpoint at this or a
+ later lsn, if LSN_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always); /*!< in: the function normally checks if
+ the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log data in the log files to the
+log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/******************************************************//**
+Reads checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint field); /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ const byte* buf, /*!< in: buffer containing checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint* file_no,/*!< out: archived file number */
+ ulint* offset);/*!< out: archived file offset */
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if it succeeded, FALSE if an archiving operation was already
+running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is desired */
+ ulint* n_bytes);/*!< out: archive log buffer size, 0 if nothing to
+ archive */
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+/****************************************************************//**
+Starts archiving the log again after it has been stopped; a gap may then
+exist in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /*!< in: buffer where to write */
+ ulint id, /*!< in: group id */
+ ulint file_no);/*!< in: file number */
+#else /* !UNIV_HOTBACKUP */
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/*!< in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start); /*!< in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /*!< in: buffer where to read */
+ log_group_t* group, /*!< in: log group */
+ lsn_t start_lsn, /*!< in: read area start */
+ lsn_t end_lsn); /*!< in: read area end */
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ lsn_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset);/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /*!< in/out: group */
+ lsn_t lsn); /*!< in: lsn for which the values should be
+ set */
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return capacity in bytes */
+UNIV_INTERN
+lsn_t
+log_group_get_capacity(
+/*===================*/
+ const log_group_t* group); /*!< in: log group */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len); /*!< in: data length */
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block); /*!< in: log block */
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum); /*!< in: checksum */
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset); /*!< in: offset, 0 if none */
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ lsn_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ lsn_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number; it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ lsn_t lsn); /*!< in: lsn of a byte within the block */
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+ FILE* file); /*!< in: file where to print */
+/******************************************************//**
+Peeks at the current lsn.
+@return TRUE on success, FALSE if it could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+ lsn_t* lsn); /*!< out: if returns TRUE, current lsn is here */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void);
+/*===================*/
+/********************************************************//**
+Closes all log groups. */
+UNIV_INTERN
+void
+log_group_close_all(void);
+/*=====================*/
+/********************************************************//**
+Shutdown the log system but do not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void);
+/*==============*/
+/********************************************************//**
+Free the log system data structures. */
+UNIV_INTERN
+void
+log_mem_free(void);
+/*==============*/
+
+extern log_t* log_sys;
+
+/* Values used as flags */
+#define LOG_FLUSH 7652559
+#define LOG_CHECKPOINT 78656949
+#ifdef UNIV_LOG_ARCHIVE
+# define LOG_ARCHIVE 11122331
+#endif /* UNIV_LOG_ARCHIVE */
+#define LOG_RECOVER 98887331
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN ((lsn_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+
+#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE)
+#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and
+ is allowed to wrap around at 2G; the
+ highest bit is set to 1 if this is the
+ first log block in a log flush write
+ segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+ /* mask used to get the highest bit in
+ the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
+ this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an
+ mtr log record group in this log block,
+ 0 if none; if the value is the same
+ as LOG_BLOCK_HDR_DATA_LEN, it means
+ that the first rec group has not yet
+ been catenated to this log block, but
+ if it will, it will start at this
+ offset; an archive recovery can
+ start parsing the log records starting
+ from this offset in this log block,
+ if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of
+ log_sys->next_checkpoint_no when the
+ log block was last written to: if the
+ block has not yet been written full,
+ this value is only updated before a
+ log buffer flush */
+#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in
+ bytes */
+
+/* Offsets of a log block trailer from the end of the block */
+#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block
+ contents; in InnoDB versions
+ < 3.23.52 this did not contain the
+ checksum but the same value as
+ .._HDR_NO */
+#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */
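+
+/* Decoding sketch (illustrative; the inline accessors declared above
+implement exactly this): given a log block pointer blk,
+
+	ulint	hdr	 = mach_read_from_4(blk + LOG_BLOCK_HDR_NO);
+	ulint	block_no = hdr & ~LOG_BLOCK_FLUSH_BIT_MASK;
+	ibool	flush	 = (hdr & LOG_BLOCK_FLUSH_BIT_MASK) != 0;
+	ulint	data_len = mach_read_from_2(blk + LOG_BLOCK_HDR_DATA_LEN);
+
+mach_read_from_4()/mach_read_from_2() are the big-endian readers from
+mach0data.h. */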
+
+/* Offsets for a checkpoint field */
+#define LOG_CHECKPOINT_NO 0
+#define LOG_CHECKPOINT_LSN 8
+#define LOG_CHECKPOINT_OFFSET_LOW32 16
+#define LOG_CHECKPOINT_LOG_BUF_SIZE 20
+#define LOG_CHECKPOINT_ARCHIVED_LSN 24
+#define LOG_CHECKPOINT_GROUP_ARRAY 32
+
+/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */
+
+#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0
+#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4
+
+#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\
+ + LOG_MAX_N_GROUPS * 8)
+#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END
+#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END)
+#if 0
+#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END)
+ /*!< Not used (0);
+ This used to contain the
+ current fsp free limit in
+ tablespace 0, in units of one
+ megabyte.
+
+ This information might have been used
+ since mysqlbackup version 0.35 but
+ before 1.41 to decide if unused ends of
+ non-auto-extending data files
+ in space 0 can be truncated.
+
+ This information was made obsolete
+ by mysqlbackup --compress. */
+#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END)
+ /*!< Not used (0);
+ This magic number tells if the
+ checkpoint contains the above field:
+ the field was added to
+ InnoDB-3.23.50 and
+ removed from MySQL 5.6 */
+#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243
+ /*!< if LOG_CHECKPOINT_FSP_MAGIC_N
+ contains this value, then
+ LOG_CHECKPOINT_FSP_FREE_LIMIT
+ is valid */
+#endif
+#define LOG_CHECKPOINT_OFFSET_HIGH32 (16 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_SIZE (20 + LOG_CHECKPOINT_ARRAY_END)
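+
+/* Decoding sketch (illustrative): with a checkpoint field read into a
+buffer buf by log_group_read_checkpoint_info(), the checkpoint number
+and lsn are
+
+	ib_uint64_t	cp_no  = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
+	lsn_t		cp_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
+
+mach_read_from_8() is the 8-byte big-endian reader from mach0data.h. */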
+
+
+/* Offsets of a log file header */
+#define LOG_GROUP_ID 0 /* log group number */
+#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this
+ log file */
+#define LOG_FILE_NO 12 /* 4-byte archived log file number;
+ this field is only defined in an
+ archived log file */
+#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
+ /* a 32-byte field which contains
+ the string 'ibbackup' and the
+ creation time if the log file was
+ created by mysqlbackup --restore;
+					when mysqld is started for the first
+					time on the restored database, it can
+ print helpful info for the user */
+#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE
+ /* this 4-byte field is TRUE when
+ the writing of an archived log file
+ has been completed; this field is
+ only defined in an archived log file */
+#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4)
+ /* lsn where the archived log file
+ at least extends: actually the
+ archived log file may extend to a
+ later lsn, as long as it is within the
+ same log block as this lsn; this field
+ is defined only when an archived log
+ file has been completely written */
+#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
+ /* first checkpoint field in the log
+ header; we write alternately to the
+ checkpoint fields when we make new
+ checkpoints; this field is only defined
+ in the first log file of a log group */
+#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
+ /* second checkpoint field in the log
+ header */
+#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_GROUP_OK 301
+#define LOG_GROUP_CORRUPTED 302
+
+/** Log group consists of a number of log files, each of the same size; a log
+group is implemented as a space in the sense of the module fil0fil. */
+struct log_group_t{
+ /* The following fields are protected by log_sys->mutex */
+ ulint id; /*!< log group id */
+ ulint n_files; /*!< number of files in the group */
+ lsn_t file_size; /*!< individual log file size in bytes,
+ including the log file header */
+ ulint space_id; /*!< file space which implements the log
+ group */
+ ulint state; /*!< LOG_GROUP_OK or
+ LOG_GROUP_CORRUPTED */
+ lsn_t lsn; /*!< lsn used to fix coordinates within
+ the log group */
+ lsn_t lsn_offset; /*!< the offset of the above lsn */
+ ulint n_pending_writes;/*!< number of currently pending flush
+ writes for this log group */
+ byte** file_header_bufs_ptr;/*!< unaligned buffers */
+ byte** file_header_bufs;/*!< buffers for each file
+ header in the group */
+#ifdef UNIV_LOG_ARCHIVE
+ /*-----------------------------*/
+ byte** archive_file_header_bufs_ptr;/*!< unaligned buffers */
+ byte** archive_file_header_bufs;/*!< buffers for each file
+ header in the group */
+ ulint archive_space_id;/*!< file space which
+ implements the log group
+ archive */
+ ulint archived_file_no;/*!< file number corresponding to
+ log_sys->archived_lsn */
+ ulint archived_offset;/*!< file offset corresponding to
+ log_sys->archived_lsn, 0 if we have
+ not yet written to the archive file
+ number archived_file_no */
+ ulint next_archived_file_no;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_file_no here: the write
+ completion function then sets the new
+ value to ..._file_no */
+ ulint next_archived_offset; /*!< like the preceding field */
+#endif /* UNIV_LOG_ARCHIVE */
+ /*-----------------------------*/
+ lsn_t scanned_lsn; /*!< used only in recovery: recovery scan
+ succeeded up to this lsn in this log
+ group */
+ byte* checkpoint_buf_ptr;/*!< unaligned checkpoint header */
+ byte* checkpoint_buf; /*!< checkpoint header is written from
+ this buffer to the group */
+ UT_LIST_NODE_T(log_group_t)
+ log_groups; /*!< list of log groups */
+};
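+
+/* A minimal sketch: since every file starts with LOG_FILE_HDR_SIZE
+bytes of header, the usable capacity of a group follows from n_files
+and file_size (capacity is a placeholder variable): */
+#if 0
+	capacity = (group->file_size - LOG_FILE_HDR_SIZE) * group->n_files;
+#endif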
+
+/** Redo log buffer */
+struct log_t{
+ byte pad[64]; /*!< padding to prevent other memory
+ update hotspots from residing on the
+ same memory cache line */
+ lsn_t lsn; /*!< log sequence number */
+ ulint buf_free; /*!< first free offset within the log
+ buffer */
+#ifndef UNIV_HOTBACKUP
+ ib_mutex_t mutex; /*!< mutex protecting the log */
+
+ ib_mutex_t log_flush_order_mutex;/*!< mutex to serialize access to
+ the flush list when we are putting
+ dirty blocks in the list. The idea
+ behind this mutex is to be able
+ to release log_sys->mutex during
+ mtr_commit and still ensure that
+ insertions in the flush_list happen
+ in the LSN order. */
+#endif /* !UNIV_HOTBACKUP */
+	byte*		buf_ptr;	/*!< unaligned log buffer */
+ byte* buf; /*!< log buffer */
+ ulint buf_size; /*!< log buffer size in bytes */
+ ulint max_buf_free; /*!< recommended maximum value of
+ buf_free, after which the buffer is
+ flushed */
+#ifdef UNIV_LOG_DEBUG
+ ulint old_buf_free; /*!< value of buf free when log was
+ last time opened; only in the debug
+ version */
+ ib_uint64_t old_lsn; /*!< value of lsn when log was
+ last time opened; only in the
+ debug version */
+#endif /* UNIV_LOG_DEBUG */
+ ibool check_flush_or_checkpoint;
+ /*!< this is set to TRUE when there may
+ be need to flush the log buffer, or
+ preflush buffer pool pages, or make
+ a checkpoint; this MUST be TRUE when
+ lsn - last_checkpoint_lsn >
+ max_checkpoint_age; this flag is
+ peeked at by log_free_check(), which
+ does not reserve the log mutex */
+ UT_LIST_BASE_NODE_T(log_group_t)
+ log_groups; /*!< log groups */
+
+#ifndef UNIV_HOTBACKUP
+ /** The fields involved in the log buffer flush @{ */
+
+ ulint buf_next_to_write;/*!< first offset in the log buffer
+					whose byte content may not yet have
+					been written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed to all the log
+ groups */
+	volatile bool	is_extending;	/*!< this is set to true while the
+					log buffer size is being extended */
+ lsn_t written_to_some_lsn;
+ /*!< first log sequence number not yet
+ written to any log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for any
+ one log group */
+ lsn_t written_to_all_lsn;
+ /*!< first log sequence number not yet
+ written to some log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for all
+ log groups.
+					Note that since InnoDB currently
+					has only one log group, this
+					value is redundant. Also it
+ is possible that this value
+ falls behind the
+ flushed_to_disk_lsn transiently.
+ It is appropriate to use either
+ flushed_to_disk_lsn or
+ write_lsn which are always
+ up-to-date and accurate. */
+ lsn_t write_lsn; /*!< end lsn for the current running
+ write */
+ ulint write_end_offset;/*!< the data in buffer has
+ been written up to this offset
+ when the current write ends:
+ this field will then be copied
+ to buf_next_to_write */
+ lsn_t current_flush_lsn;/*!< end lsn for the current running
+ write + flush operation */
+ lsn_t flushed_to_disk_lsn;
+ /*!< how far we have written the log
+ AND flushed to disk */
+ ulint n_pending_writes;/*!< number of currently
+ pending flushes or writes */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+	or other method to flush it to disk. The names below should really
+ be 'flush_or_write'! */
+ os_event_t no_flush_event; /*!< this event is in the reset state
+ when a flush or a write is running;
+ a thread should wait for this without
+ owning the log mutex, but NOTE that
+ to set or reset this event, the
+ thread MUST own the log mutex! */
+ ibool one_flushed; /*!< during a flush, this is
+ first FALSE and becomes TRUE
+ when one log group has been
+ written or flushed */
+ os_event_t one_flushed_event;/*!< this event is reset when the
+ flush or write has not yet completed
+ for any log group; e.g., this means
+ that a transaction has been committed
+ when this is set; a thread should wait
+ for this without owning the log mutex,
+ but NOTE that to set or reset this
+ event, the thread MUST own the log
+ mutex! */
+ ulint n_log_ios; /*!< number of log i/os initiated thus
+ far */
+	ulint		n_log_ios_old;	/*!< number of log i/os at the
+ previous printout */
+ time_t last_printout_time;/*!< when log_print was last time
+ called */
+ /* @} */
+
+ /** Fields involved in checkpoints @{ */
+ lsn_t log_group_capacity; /*!< capacity of the log group; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
+ lsn_t max_modified_age_async;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start an
+ asynchronous preflush of pool pages */
+ lsn_t max_modified_age_sync;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start a
+ synchronous preflush of pool pages */
+ lsn_t max_checkpoint_age_async;
+ /*!< when this checkpoint age
+ is exceeded we start an
+ asynchronous writing of a new
+ checkpoint */
+ lsn_t max_checkpoint_age;
+ /*!< this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ ib_uint64_t next_checkpoint_no;
+ /*!< next checkpoint number */
+ lsn_t last_checkpoint_lsn;
+ /*!< latest checkpoint lsn */
+ lsn_t next_checkpoint_lsn;
+ /*!< next checkpoint lsn */
+ ulint n_pending_checkpoint_writes;
+ /*!< number of currently pending
+ checkpoint writes */
+ rw_lock_t checkpoint_lock;/*!< this latch is x-locked when a
+ checkpoint write is running; a thread
+ should wait for this without owning
+ the log mutex */
+#endif /* !UNIV_HOTBACKUP */
+	byte*		checkpoint_buf_ptr;/*!< unaligned checkpoint header */
+ byte* checkpoint_buf; /*!< checkpoint header is read to this
+ buffer */
+ /* @} */
+#ifdef UNIV_LOG_ARCHIVE
+ /** Fields involved in archiving @{ */
+	ulint		archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING,
+					LOG_ARCH_STOPPING2, LOG_ARCH_STOPPED,
+					or LOG_ARCH_OFF */
+ lsn_t archived_lsn; /*!< archiving has advanced to this
+ lsn */
+ lsn_t max_archived_lsn_age_async;
+ /*!< recommended maximum age of
+ archived_lsn, before we start
+ asynchronous copying to the archive */
+ lsn_t max_archived_lsn_age;
+ /*!< maximum allowed age for
+ archived_lsn */
+ lsn_t next_archived_lsn;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_lsn here: the write
+ completion function then sets the new
+ value to archived_lsn */
+ ulint archiving_phase;/*!< LOG_ARCHIVE_READ or
+ LOG_ARCHIVE_WRITE */
+ ulint n_pending_archive_ios;
+ /*!< number of currently pending reads
+ or writes in archiving */
+ rw_lock_t archive_lock; /*!< this latch is x-locked when an
+ archive write is running; a thread
+ should wait for this without owning
+ the log mutex */
+ ulint archive_buf_size;/*!< size of archive_buf */
+ byte* archive_buf; /*!< log segment is written to the
+ archive from this buffer */
+ os_event_t archiving_on; /*!< if archiving has been stopped,
+ a thread can wait for this event to
+ become signaled */
+ /* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+};
+
+/** Test if flush order mutex is owned. */
+#define log_flush_order_mutex_own() \
+ mutex_own(&log_sys->log_flush_order_mutex)
+
+/** Acquire the flush order mutex. */
+#define log_flush_order_mutex_enter() do { \
+ mutex_enter(&log_sys->log_flush_order_mutex); \
+} while (0)
+/** Release the flush order mutex. */
+#define log_flush_order_mutex_exit() do {		\
+ mutex_exit(&log_sys->log_flush_order_mutex); \
+} while (0)
+
+#ifdef UNIV_LOG_ARCHIVE
+/** Archiving state @{ */
+#define LOG_ARCH_ON 71
+#define LOG_ARCH_STOPPING 72
+#define LOG_ARCH_STOPPING2 73
+#define LOG_ARCH_STOPPED 74
+#define LOG_ARCH_OFF 75
+/* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/log0log.ic b/storage/innobase/include/log0log.ic
new file mode 100644
index 00000000000..9fc12f766bf
--- /dev/null
+++ b/storage/innobase/include/log0log.ic
@@ -0,0 +1,462 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.ic
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "mach0data.h"
+#include "mtr0mtr.h"
+#include "srv0mon.h"
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+ const byte* buf, /*!< in: pointer to the start of
+ the log segment in the
+ log_sys->buf log buffer */
+ ulint len, /*!< in: segment length in bytes */
+ ib_uint64_t buf_start_lsn); /*!< in: buffer start lsn */
+#endif /* UNIV_LOG_DEBUG */
+
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block) /*!< in: log block */
+{
+ if (LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/************************************************************//**
+Sets the log block flush bit. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+ byte* log_block, /*!< in/out: log block */
+ ibool val) /*!< in: value to set */
+{
+ ulint field;
+
+ field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO);
+
+ if (val) {
+ field = field | LOG_BLOCK_FLUSH_BIT_MASK;
+ } else {
+ field = field & ~LOG_BLOCK_FLUSH_BIT_MASK;
+ }
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field);
+}
+
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(~LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO));
+}
+
+/************************************************************//**
+Sets the log block number stored in the header; NOTE that this must be set
+before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint n) /*!< in: log block number: must be > 0 and
+ < LOG_BLOCK_FLUSH_BIT_MASK */
+{
+ ut_ad(n > 0);
+ ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n);
+}
+
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN));
+}
+
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len) /*!< in: data length */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len);
+}
+
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP));
+}
+
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset) /*!< in: offset, 0 if none */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset);
+}
+
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO));
+}
+
+/************************************************************//**
+Sets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+ byte* log_block, /*!< in/out: log block */
+ ib_uint64_t no) /*!< in: checkpoint no */
+{
+ mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no);
+}
+
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number; the value is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn of a byte within the block */
+{
+ return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1);
+}
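+
+/* Worked example: with 512-byte blocks, an lsn of 8192 lies in block
+8192 / 512 = 16, so the returned number is (16 & 0x3FFFFFFF) + 1 = 17;
+masking to 30 bits keeps the value clear of the flush bit, and the + 1
+keeps it positive, matching the > 0 rule of log_block_set_hdr_no(). */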
+
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block) /*!< in: log block */
+{
+ ulint sum;
+ ulint sh;
+ ulint i;
+
+ sum = 1;
+ sh = 0;
+
+ for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
+ ulint b = (ulint) block[i];
+ sum &= 0x7FFFFFFFUL;
+ sum += b;
+ sum += b << sh;
+ sh++;
+ if (sh > 24) {
+ sh = 0;
+ }
+ }
+
+ return(sum);
+}
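+
+/* A minimal sketch of validating a block read from disk with the
+routines in this file; the recovery code performs a similar check,
+also accepting the pre-3.23.52 format in which the trailer repeated
+the header number instead of a checksum. */
+#if 0
+	if (log_block_calc_checksum(log_block)
+	    != log_block_get_checksum(log_block)) {
+		/* the block was not completely written: stop scanning */
+	}
+#endif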
+
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM));
+}
+
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum) /*!< in: checksum */
+{
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM,
+ checksum);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+	byte*	log_block,	/*!< in/out: pointer to the log buffer */
+ lsn_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer in the old format, where there
+was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+	byte*	log_block,	/*!< in/out: pointer to the log buffer */
+ lsn_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM, no);
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes the given string to the log. On success the log mutex remains
+owned and must be released with log_release(); if zero is returned,
+the mutex has already been released.
+@return end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+lsn_t
+log_reserve_and_write_fast(
+/*=======================*/
+ const void* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ lsn_t* start_lsn)/*!< out: start lsn of the log record */
+{
+ ulint data_len;
+#ifdef UNIV_LOG_LSN_DEBUG
+ /* length of the LSN pseudo-record */
+ ulint lsn_len;
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ mutex_enter(&log_sys->mutex);
+#ifdef UNIV_LOG_LSN_DEBUG
+ lsn_len = 1
+ + mach_get_compressed_size(log_sys->lsn >> 32)
+ + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ data_len = len
+#ifdef UNIV_LOG_LSN_DEBUG
+ + lsn_len
+#endif /* UNIV_LOG_LSN_DEBUG */
+ + log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+ /* The string does not fit within the current log block
+ or the log block would become full */
+
+ mutex_exit(&log_sys->mutex);
+
+ return(0);
+ }
+
+ *start_lsn = log_sys->lsn;
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ {
+ /* Write the LSN pseudo-record. */
+ byte* b = &log_sys->buf[log_sys->buf_free];
+ *b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str);
+ /* Write the LSN in two parts,
+ as a pseudo page number and space id. */
+ b += mach_write_compressed(b, log_sys->lsn >> 32);
+ b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL);
+ ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]);
+
+ memcpy(b, str, len);
+ len += lsn_len;
+ }
+#else /* UNIV_LOG_LSN_DEBUG */
+ memcpy(log_sys->buf + log_sys->buf_free, str, len);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ log_block_set_data_len((byte*) ut_align_down(log_sys->buf
+ + log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE),
+ data_len);
+#ifdef UNIV_LOG_DEBUG
+ log_sys->old_buf_free = log_sys->buf_free;
+ log_sys->old_lsn = log_sys->lsn;
+#endif
+ log_sys->buf_free += len;
+
+ ut_ad(log_sys->buf_free <= log_sys->buf_size);
+
+ log_sys->lsn += len;
+
+ MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+ log_sys->lsn - log_sys->last_checkpoint_lsn);
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log_sys->buf + log_sys->old_buf_free,
+ log_sys->buf_free - log_sys->old_buf_free,
+ log_sys->old_lsn);
+#endif
+ return(log_sys->lsn);
+}
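+
+/* A minimal sketch of the intended calling pattern, modelled on
+mtr_commit; buf and len are placeholders. On a zero return the caller
+takes the general path, shown here with log_reserve_and_open(),
+log_write_low() and log_close() as declared in log0log.h. */
+#if 0
+	lsn_t	start_lsn;
+	lsn_t	end_lsn = log_reserve_and_write_fast(buf, len, &start_lsn);
+
+	if (end_lsn == 0) {
+		start_lsn = log_reserve_and_open(len);
+		log_write_low(buf, len);
+		end_lsn = log_close();
+	}
+
+	log_release();
+#endif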
+
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void)
+/*=============*/
+{
+ mutex_exit(&(log_sys->mutex));
+}
+
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+lsn_t
+log_get_lsn(void)
+/*=============*/
+{
+ lsn_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
+/****************************************************************//**
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+lsn_t
+log_get_capacity(void)
+/*==================*/
+{
+ return(log_sys->log_group_capacity);
+}
+
+/****************************************************************//**
+Gets log_sys->max_modified_age_async. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return max_modified_age_async */
+UNIV_INLINE
+lsn_t
+log_get_max_modified_age_async(void)
+/*================================*/
+{
+ return(log_sys->max_modified_age_async);
+}
+
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ log_check_margins();
+ }
+}
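+
+/* A minimal usage sketch: a typical caller runs this while holding no
+latches, right before opening a mini-transaction (mtr is a placeholder
+mtr_t variable): */
+#if 0
+	log_free_check();
+	mtr_start(&mtr);
+	/* ... access and modify pages ... */
+	mtr_commit(&mtr);
+#endif
+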
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000000..8ede49d4ecc
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,505 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0recv_h
+#define log0recv_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "log0log.h"
+#include <list>
+
+#ifdef UNIV_HOTBACKUP
+extern ibool recv_replay_file_ops;
+
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
+ const byte* hdr, /*!< in: buffer containing the log group
+ header */
+ lsn_t* lsn, /*!< out: checkpoint lsn */
+ lsn_t* offset, /*!< out: checkpoint offset in the log group */
+ lsn_t* cp_no, /*!< out: checkpoint number */
+ lsn_t* first_header_lsn)
+					/*!< out: lsn of the start of the
+ first log file */
+ __attribute__((nonnull));
+/*******************************************************************//**
+Scans the log segment and sets n_bytes_scanned to the length of valid
+log data scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+ byte* buf, /*!< in: buffer containing log data */
+ ulint buf_len, /*!< in: data length in that buffer */
+ lsn_t* scanned_lsn, /*!< in/out: lsn of buffer start,
+ we return scanned lsn */
+ ulint* scanned_checkpoint_no,
+ /*!< in/out: 4 lowest bytes of the
+ highest scanned checkpoint number so
+ far */
+ ulint* n_bytes_scanned);/*!< out: how much we were able to
+ scan, smaller than buf_len if log
+ data ended here */
+#endif /* UNIV_HOTBACKUP */
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void);
+/*=====================*/
+#ifdef UNIV_LOG_ARCHIVE
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void);
+/*=================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+ ibool just_read_in,
+ /*!< in: TRUE if the i/o handler calls
+ this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+ buf_block_t* block); /*!< in/out: buffer block */
+#ifndef UNIV_HOTBACKUP
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(jri, block)
+#else /* !UNIV_HOTBACKUP */
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri ignored: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(block)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint type, /*!< in: LOG_CHECKPOINT or
+ LOG_ARCHIVE */
+ lsn_t limit_lsn, /*!< in: recover up to this lsn
+ if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t min_flushed_lsn,/*!< in: min flushed lsn from
+ data files */
+ lsn_t max_flushed_lsn);/*!< in: max flushed lsn from
+ data files */
+#ifdef UNIV_LOG_ARCHIVE
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type in: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim in: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(type,lim,min,max)
+#else /* UNIV_LOG_ARCHIVE */
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type ignored: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim ignored: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(min,max)
+#endif /* UNIV_LOG_ARCHIVE */
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void);
+/*======================================*/
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void);
+/*===============================*/
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data found. Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+	ulint		available_memory,/*!< in: we let the hash table of recs
+					grow to at most this size */
+ ibool store_to_hash, /*!< in: TRUE if the records should be
+ stored to the hash table; this is set
+ to FALSE if just debug checking is
+ needed */
+ const byte* buf, /*!< in: buffer containing a log
+ segment or garbage */
+ ulint len, /*!< in: buffer length */
+ lsn_t start_lsn, /*!< in: buffer start lsn */
+ lsn_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ lsn_t* group_scanned_lsn);/*!< out: scanning succeeded up to
+ this lsn */
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no, /*!< in: next archived log file number */
+ ibool new_logs_created,/*!< in: TRUE if resetting logs
+ is done at the log creation;
+ FALSE if it is done after
+ archive recovery */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t lsn); /*!< in: reset to this lsn
+ rounded up to be divisible by
+ OS_FILE_LOG_BLOCK_SIZE, after
+ which we add
+ LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+ const char* log_dir, /*!< in: log file directory path */
+ ulint n_log_files, /*!< in: number of log files */
+ lsn_t log_file_size, /*!< in: log file size */
+ lsn_t lsn); /*!< in: new start lsn, must be
+ divisible by OS_FILE_LOG_BLOCK_SIZE */
+#endif /* UNIV_HOTBACKUP */
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void);
+/*=================*/
+/**********************************************************//**
+Release recovery system mutexes. */
+UNIV_INTERN
+void
+recv_sys_close(void);
+/*================*/
+/********************************************************//**
+Frees the recovery system memory. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void);
+/*===================*/
+/********************************************************//**
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+ ulint available_memory); /*!< in: available memory in bytes */
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Reset the state of the recovery system variables. */
+UNIV_INTERN
+void
+recv_sys_var_init(void);
+/*===================*/
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application */
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void);
+/*================================*/
+#endif /* UNIV_HOTBACKUP */
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ lsn_t min_flushed_lsn,/*!< in: min flushed lsn field from the
+ data files */
+ lsn_t limit_lsn, /*!< in: recover up to this lsn if
+ possible */
+ ulint first_log_no); /*!< in: number of the first archived
+ log file to use in the recovery; the
+ file will be searched from
+ INNOBASE_LOG_ARCH_DIR specified in
+ server config file */
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void);
+/*===================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Block of log record data */
+struct recv_data_t{
+ recv_data_t* next; /*!< pointer to the next block or NULL */
+			/*!< the log record data is stored physically
+			immediately after this struct, at most
+			RECV_DATA_BLOCK_SIZE bytes of it */
+};
+
+/** Stored log record struct */
+struct recv_t{
+ byte type; /*!< log record type */
+ ulint len; /*!< log record body length in bytes */
+ recv_data_t* data; /*!< chain of blocks containing the log record
+ body */
+ lsn_t start_lsn;/*!< start lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the start lsn of
+ this log record */
+ lsn_t end_lsn;/*!< end lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the end lsn of
+ this log record */
+ UT_LIST_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+};
+
+/** States of recv_addr_t */
+enum recv_addr_state {
+ /** not yet processed */
+ RECV_NOT_PROCESSED,
+ /** page is being read */
+ RECV_BEING_READ,
+ /** log records are being applied on the page */
+ RECV_BEING_PROCESSED,
+ /** log records have been applied on the page, or they have
+ been discarded because the tablespace does not exist */
+ RECV_PROCESSED
+};
+
+/** Hashed page file address struct */
+struct recv_addr_t{
+ enum recv_addr_state state;
+ /*!< recovery state of the page */
+ unsigned space:32;/*!< space id */
+ unsigned page_no:32;/*!< page number */
+ UT_LIST_BASE_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+ hash_node_t addr_hash;/*!< hash node in the hash bucket chain */
+};
+
+struct recv_dblwr_t {
+ void add(byte* page);
+
+ byte* find_page(ulint space_id, ulint page_no);
+
+	std::list<byte*>	pages; /*!< recovered pages from the
+				       doublewrite buffer */
+
+ void operator() () {
+ pages.clear();
+ }
+};
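+
+/* A minimal sketch of a doublewrite lookup during recovery; space_id,
+page_no and page are placeholders, and the real logic lives in the
+buffer and file layers. When a data page fails its checksum, a sane
+copy may be recovered from the pages list: */
+#if 0
+	byte*	copy = recv_sys->dblwr.find_page(space_id, page_no);
+
+	if (copy != NULL) {
+		memcpy(page, copy, UNIV_PAGE_SIZE);
+	}
+#endif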
+
+/** Recovery system data structure */
+struct recv_sys_t{
+#ifndef UNIV_HOTBACKUP
+ ib_mutex_t mutex; /*!< mutex protecting the fields apply_log_recs,
+ n_addrs, and the state field in each recv_addr
+ struct */
+ ib_mutex_t writer_mutex;/*!< mutex coordinating
+ flushing between recv_writer_thread and
+ the recovery thread. */
+#endif /* !UNIV_HOTBACKUP */
+ ibool apply_log_recs;
+ /*!< this is TRUE when log rec application to
+ pages is allowed; this flag tells the
+ i/o-handler if it should do log record
+ application */
+ ibool apply_batch_on;
+ /*!< this is TRUE when a log rec application
+ batch is running */
+ lsn_t lsn; /*!< log sequence number */
+ ulint last_log_buf_size;
+ /*!< size of the log buffer when the database
+ last time wrote to the log */
+ byte* last_block;
+ /*!< possible incomplete last recovered log
+ block */
+ byte* last_block_buf_start;
+ /*!< the nonaligned start address of the
+ preceding buffer */
+ byte* buf; /*!< buffer for parsing log records */
+ ulint len; /*!< amount of data in buf */
+ lsn_t parse_start_lsn;
+ /*!< this is the lsn from which we were able to
+ start parsing log records and adding them to
+ the hash table; zero if a suitable
+ start point not found yet */
+ lsn_t scanned_lsn;
+ /*!< the log data has been scanned up to this
+ lsn */
+ ulint scanned_checkpoint_no;
+ /*!< the log data has been scanned up to this
+ checkpoint number (lowest 4 bytes) */
+ ulint recovered_offset;
+ /*!< start offset of non-parsed log records in
+ buf */
+ lsn_t recovered_lsn;
+ /*!< the log records have been parsed up to
+ this lsn */
+ lsn_t limit_lsn;/*!< recovery should be made at most
+ up to this lsn */
+ ibool found_corrupt_log;
+			/*!< this is set to TRUE if, during the
+			log scan, we find a corrupt log block, or
+			a corrupt log record, or the log parsing
+			buffer overflows */
+#ifdef UNIV_LOG_ARCHIVE
+ log_group_t* archive_group;
+ /*!< in archive recovery: the log group whose
+ archive is read */
+#endif /* UNIV_LOG_ARCHIVE */
+	mem_heap_t*	heap;	/*!< memory heap of log records and file
+				addresses */
+ hash_table_t* addr_hash;/*!< hash table of file addresses of pages */
+ ulint n_addrs;/*!< number of not processed hashed file
+ addresses in the hash table */
+
+ recv_dblwr_t dblwr;
+};
+
+/** The recovery system */
+extern recv_sys_t* recv_sys;
+
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise. Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+extern ibool recv_recovery_on;
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+extern ibool recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern ibool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+extern ibool recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+extern ibool recv_lsn_checks_on;
+#ifdef UNIV_HOTBACKUP
+/** TRUE when the redo log is being backed up */
+extern ibool recv_is_making_a_backup;
+#endif /* UNIV_HOTBACKUP */
+/** Maximum page number encountered in the redo log */
+extern ulint recv_max_parsed_page_no;
+
+/** Size of the parsing buffer; it must be able to hold RECV_SCAN_SIZE
+many times over! */
+#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024)
+
+/** Size of block reads when the log groups are scanned forward to do a
+roll-forward */
+#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE)
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+extern ulint recv_n_pool_free_frames;
+
+#ifndef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/log0recv.ic b/storage/innobase/include/log0recv.ic
new file mode 100644
index 00000000000..32c28dd03e6
--- /dev/null
+++ b/storage/innobase/include/log0recv.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.ic
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void)
+/*=====================*/
+{
+ return(recv_recovery_on);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+extern ibool recv_recovery_from_backup_on;
+
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void)
+/*=================================*/
+{
+ return(recv_recovery_from_backup_on);
+}
+#endif /* UNIV_LOG_ARCHIVE */
diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h
new file mode 100644
index 00000000000..d0087f56aaa
--- /dev/null
+++ b/storage/innobase/include/mach0data.h
@@ -0,0 +1,418 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */
+/********************************************************//**
+The following function is used to fetch data from two consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer, >= 0, < 64k */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to two bytes */
+ __attribute__((nonnull, pure));
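+
+/* Worked example: mach_write_to_2(b, 0x1234) stores b[0] = 0x12 and
+b[1] = 0x34, so mach_read_from_2(b) yields 0x1234 on any host; because
+the encoding is big-endian, memcmp() orders such fields the same way
+as the integers they encode. */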
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+ __attribute__((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+ __attribute__((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
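+
+/* A minimal sketch of the variable-length encoding behind the three
+declarations above, as implemented in mach0data.ic: values below 2^7
+take one byte; below 2^14, two bytes tagged 0x80; below 2^21, three
+bytes tagged 0xC0; below 2^28, four bytes tagged 0xE0; anything larger
+is a 0xF0 byte followed by the full 4-byte big-endian value. */
+#if 0
+	if (n < 0x80UL) {
+		mach_write_to_1(b, n);			/* 1 byte */
+	} else if (n < 0x4000UL) {
+		mach_write_to_2(b, n | 0x8000UL);	/* 2 bytes */
+	} else if (n < 0x200000UL) {
+		mach_write_to_3(b, n | 0xC00000UL);	/* 3 bytes */
+	} else if (n < 0x10000000UL) {
+		mach_write_to_4(b, n | 0xE0000000UL);	/* 4 bytes */
+	} else {
+		mach_write_to_1(b, 0xF0UL);	/* tag byte, 5 bytes total */
+		mach_write_to_4(b + 1, n);
+	}
+#endif
+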
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t id); /*!< in: 48-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n); /*!< in: 56-bit integer */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Returns the size of a 64-bit integer when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_get_compressed_size(
+/*=========================*/
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_ull_read_compressed(
+/*=====================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/*********************************************************//**
+Returns the size of a 64-bit integer when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_get_much_compressed_size(
+/*==============================*/
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_ull_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ulint* val); /*!< out: read value */
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form
+if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INLINE
+byte*
+mach_ull_parse_compressed(
+/*======================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ib_uint64_t* val); /*!< out: read value */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d); /*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d); /*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type); /*!< in: signed or unsigned flag */
+/***********************************************************//**
+Convert integral type from host byte order to (big-endian) storage
+byte order. */
+UNIV_INLINE
+void
+mach_write_int_type(
+/*================*/
+	byte*		dest,		/*!< in: where to write */
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ bool usign); /*!< in: signed or unsigned flag */
+
+/*************************************************************//**
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign); /*!< in: signed or unsigned flag */
+
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INLINE
+ulint
+mach_read_ulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+
+#endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mach0data.ic b/storage/innobase/include/mach0data.ic
new file mode 100644
index 00000000000..7449d2da2b8
--- /dev/null
+++ b/storage/innobase/include/mach0data.ic
@@ -0,0 +1,881 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0mem.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFUL) <= 0xFFUL);
+
+ b[0] = (byte) n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+{
+ ut_ad(b);
+ return((ulint)(b[0]));
+}
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFFFUL) <= 0xFFFFUL);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to 2 bytes */
+{
+ return(((ulint)(b[0]) << 8) | (ulint)(b[1]));
+}
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+{
+ uint16 ret;
+ ut_ad(2 == sizeof ret);
+ mach_write_to_2((byte*) &ret, n);
+ return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+{
+ ut_ad(2 == sizeof n);
+ return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFFFFFUL) <= 0xFFFFFFUL);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 16)
+ | ((ulint)(b[1]) << 8)
+ | (ulint)(b[2])
+ );
+}
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte) n;
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 24)
+ | ((ulint)(b[1]) << 16)
+ | ((ulint)(b[2]) << 8)
+ | (ulint)(b[3])
+ );
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ ut_ad(b);
+
+ if (n < 0x80UL) {
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000UL) {
+ mach_write_to_2(b, n | 0x8000UL);
+ return(2);
+ } else if (n < 0x200000UL) {
+ mach_write_to_3(b, n | 0xC00000UL);
+ return(3);
+ } else if (n < 0x10000000UL) {
+ mach_write_to_4(b, n | 0xE0000000UL);
+ return(4);
+ } else {
+ mach_write_to_1(b, 0xF0UL);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80UL) {
+ return(1);
+ } else if (n < 0x4000UL) {
+ return(2);
+ } else if (n < 0x200000UL) {
+ return(3);
+ } else if (n < 0x10000000UL) {
+ return(4);
+ } else {
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint flag;
+
+ ut_ad(b);
+
+ flag = mach_read_from_1(b);
+
+ if (flag < 0x80UL) {
+ return(flag);
+ } else if (flag < 0xC0UL) {
+ return(mach_read_from_2(b) & 0x7FFFUL);
+ } else if (flag < 0xE0UL) {
+ return(mach_read_from_3(b) & 0x3FFFFFUL);
+ } else if (flag < 0xF0UL) {
+ return(mach_read_from_4(b) & 0x1FFFFFFFUL);
+ } else {
+ ut_ad(flag == 0xF0UL);
+ return(mach_read_from_4(b + 1));
+ }
+}
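+
+/* A minimal round-trip sketch (illustrative only, not part of the
+original interface): 1000000 == 0xF4240 is below 0x200000, so it takes
+the 3-byte form 0xC00000 | 0xF4240, i.e. the bytes 0xCF 0x42 0x40.
+
+	byte	buf[5];
+	ulint	len = mach_write_compressed(buf, 1000000);
+
+	ut_a(len == 3);
+	ut_a(len == mach_get_compressed_size(1000000));
+	ut_a(mach_read_compressed(buf) == 1000000);
+*/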
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ void* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(static_cast<byte*>(b), (ulint) (n >> 32));
+ mach_write_to_4(static_cast<byte*>(b) + 4, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ib_uint64_t ull;
+
+ ull = ((ib_uint64_t) mach_read_from_4(b)) << 32;
+ ull |= (ib_uint64_t) mach_read_from_4(b + 4);
+
+ return(ull);
+}
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ ib_uint64_t n) /*!< in: 56-bit integer */
+{
+ ut_ad(b);
+
+ mach_write_to_3(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 3, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 56-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+{
+ ut_ad(b);
+
+ return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3)));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ ib_uint64_t n) /*!< in: 48-bit integer */
+{
+ ut_ad(b);
+
+ mach_write_to_2(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 2, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 48-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+{
+ ut_ad(b);
+
+ return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2)));
+}
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_write_compressed(
+/*======================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ size = mach_write_compressed(b, (ulint) (n >> 32));
+ mach_write_to_4(b + size, (ulint) n);
+
+ return(size + 4);
+}
+
+/*********************************************************//**
+Returns the size of a 64-bit integer when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_get_compressed_size(
+/*=========================*/
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ return(4 + mach_get_compressed_size((ulint) (n >> 32)));
+}
+
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_ull_read_compressed(
+/*=====================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ib_uint64_t n;
+ ulint size;
+
+ ut_ad(b);
+
+ n = (ib_uint64_t) mach_read_compressed(b);
+
+ size = mach_get_compressed_size((ulint) n);
+
+ n <<= 32;
+ n |= (ib_uint64_t) mach_read_from_4(b + size);
+
+ return(n);
+}
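+
+/* A worked layout example (illustrative only): for
+n == 0x1234ABCD5678 the high word 0x1234 is written in the compressed
+form (two bytes, 0x92 0x34) and the low word 0xABCD5678 follows in
+four fixed bytes, so the total size is 6 and
+mach_ull_get_compressed_size(n) == 6. */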
+
+/*********************************************************//**
+Writes a 64-bit integer in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_write_much_compressed(
+/*===========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ if (!(n >> 32)) {
+ return(mach_write_compressed(b, (ulint) n));
+ }
+
+ *b = (byte)0xFF;
+ size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32));
+
+	size += mach_write_compressed(b + size, (ulint) n & ULINT32_MASK);
+
+ return(size);
+}
+
+/*********************************************************//**
+Returns the size of a 64-bit integer when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_ull_get_much_compressed_size(
+/*==============================*/
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ if (!(n >> 32)) {
+ return(mach_get_compressed_size((ulint) n));
+ }
+
+ return(1 + mach_get_compressed_size((ulint) (n >> 32))
+ + mach_get_compressed_size((ulint) n & ULINT32_MASK));
+}
+
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form.
+@return the value read */
+UNIV_INLINE
+ib_uint64_t
+mach_ull_read_much_compressed(
+/*==========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ib_uint64_t n;
+ ulint size;
+
+ ut_ad(b);
+
+ if (*b != (byte)0xFF) {
+ n = 0;
+ size = 0;
+ } else {
+ n = (ib_uint64_t) mach_read_compressed(b + 1);
+
+ size = 1 + mach_get_compressed_size((ulint) n);
+ n <<= 32;
+ }
+
+ n |= mach_read_compressed(b + size);
+
+ return(n);
+}
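+
+/* A worked layout example (illustrative only): values below 2^32 are
+stored exactly as by mach_write_compressed() (1..5 bytes). For
+n == 0x1234ABCD5678 the marker byte 0xFF is written first, then the
+high word 0x1234 in compressed form (2 bytes) and the low word
+0xABCD5678 in compressed form (5 bytes, since it is >= 0x10000000),
+8 bytes in total; the worst case is 1 + 5 + 5 == 11 bytes. */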
+
+/*********************************************************//**
+Reads a 64-bit integer in a compressed form
+if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INLINE
+byte*
+mach_ull_parse_compressed(
+/*======================*/
+	byte*		ptr,	/*!< in: pointer to buffer from where to read */
+	byte*		end_ptr,/*!< in: pointer to end of the buffer */
+	ib_uint64_t*	val)	/*!< out: read value */
+{
+ ulint size;
+
+ ut_ad(ptr);
+ ut_ad(end_ptr);
+ ut_ad(val);
+
+ if (end_ptr < ptr + 5) {
+
+ return(NULL);
+ }
+
+ *val = mach_read_compressed(ptr);
+
+ size = mach_get_compressed_size((ulint) *val);
+
+ ptr += size;
+
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ *val <<= 32;
+ *val |= mach_read_from_4(ptr);
+
+ return(ptr + 4);
+}
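+
+/* A minimal usage sketch (illustrative only): a NULL return means the
+record is incomplete, not that parsing failed.
+
+	ib_uint64_t	val;
+
+	ptr = mach_ull_parse_compressed(ptr, end_ptr, &val);
+	if (ptr == NULL) {
+		return(NULL);
+	}
+*/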
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d) /*!< in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d) /*!< in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*) &d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+{
+ ulint n = 0;
+ const byte* ptr;
+
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
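+
+/* A minimal round-trip sketch (illustrative only): with
+buf == {0x34, 0x12}, mach_read_from_n_little_endian(buf, 2) returns
+0x1234, and mach_write_to_n_little_endian(buf, 2, 0x1234) stores the
+same two bytes back, least significant byte first. */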
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+{
+ return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8));
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFFUL);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ib_uint64_t
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ ullint ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
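+
+/* A worked example (illustrative only) of the sign-bit flip used by
+the storage format: a signed 32-bit -1 is stored as the bytes
+0x7F 0xFF 0xFF 0xFF (the sign bit is inverted so that signed values
+sort correctly as unsigned byte strings), and
+mach_read_int_type(src, 4, FALSE) sign-extends this back to
+0xFFFFFFFFFFFFFFFF, i.e. -1. */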
+/*********************************************************//**
+Swap byte ordering. */
+UNIV_INLINE
+void
+mach_swap_byte_order(
+/*=================*/
+ byte* dest, /*!< out: where to write */
+ const byte* from, /*!< in: where to read from */
+ ulint len) /*!< in: length of src */
+{
+ ut_ad(len > 0);
+ ut_ad(len <= 8);
+
+ dest += len;
+
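+	/* Copy the bytes in reverse order; each case falls through to
+	the next, and len == 8 enters at case 0. */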
+ switch (len & 0x7) {
+ case 0: *--dest = *from++;
+ case 7: *--dest = *from++;
+ case 6: *--dest = *from++;
+ case 5: *--dest = *from++;
+ case 4: *--dest = *from++;
+ case 3: *--dest = *from++;
+ case 2: *--dest = *from++;
+ case 1: *--dest = *from;
+ }
+}
+
+/*************************************************************//**
+Convert integral type from host byte order to (big-endian) storage
+byte order. */
+UNIV_INLINE
+void
+mach_write_int_type(
+/*================*/
+ byte* dest, /*!< in: where to write */
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, src, len);
+#else
+ mach_swap_byte_order(dest, src, len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
+
+/*************************************************************//**
+Convert a ulonglong integer from host byte order to (big-endian)
+storage byte order. */
+UNIV_INLINE
+void
+mach_write_ulonglong(
+/*=================*/
+ byte* dest, /*!< in: where to write */
+ ulonglong src, /*!< in: where to read from */
+ ulint len, /*!< in: length of dest */
+ bool usign) /*!< in: signed or unsigned flag */
+{
+ byte* ptr = reinterpret_cast<byte*>(&src);
+
+ ut_ad(len <= sizeof(ulonglong));
+
+#ifdef WORDS_BIGENDIAN
+ memcpy(dest, ptr + (sizeof(src) - len), len);
+#else
+ mach_swap_byte_order(dest, reinterpret_cast<byte*>(ptr), len);
+#endif /* WORDS_BIGENDIAN */
+
+ if (!usign) {
+ *dest ^= 0x80;
+ }
+}
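+
+/* A minimal usage sketch (illustrative only; the value is assumed to
+fit in len bytes): writing the unsigned value 0x1234 with len == 2
+stores the big-endian bytes 0x12 0x34; with usign == false the sign
+bit of the leading byte is flipped in addition, as in
+mach_write_int_type() above. */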
+
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INLINE
+ulint
+mach_read_ulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+	ulint		type)	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+ switch (type) {
+ case 1:
+ return(mach_read_from_1(ptr));
+ case 2:
+ return(mach_read_from_2(ptr));
+ case 4:
+ return(mach_read_from_4(ptr));
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/mem0dbg.h b/storage/innobase/include/mem0dbg.h
new file mode 100644
index 00000000000..cc339b82910
--- /dev/null
+++ b/storage/innobase/include/mem0dbg.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0dbg.h
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+/* In the debug version each allocated field is surrounded with
+check fields whose sizes are given below */
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables in mem0dbg.cc. */
+extern ib_mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
+#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
+ UNIV_MEM_ALIGNMENT)
+#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
+#else
+#define MEM_FIELD_HEADER_SIZE 0
+#endif
+
+
+/* Space needed when allocating for a user a field of
+length N. The space is allocated only in multiples of
+UNIV_MEM_ALIGNMENT. In the debug version there are also
+check fields at the both ends of the field. */
+#ifdef UNIV_MEM_DEBUG
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\
+ + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT)
+#else
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT)
+#endif
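+
+/* A worked example (illustrative; assumes the usual
+UNIV_MEM_ALIGNMENT of 8): in a non-debug build MEM_SPACE_NEEDED(13)
+rounds up to 16, so a 13-byte user request consumes 16 bytes of the
+block. */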
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ byte* top, /*!< in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /*!< in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /*!< out: TRUE if error */
+ ulint* us_size,/*!< out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+ non-debug version this is always -1 */
+ ulint* ph_size,/*!< out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks); /*!< out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it)
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void);
+/*===============*/
+/*****************************************************************//**
+Validates the dynamic memory
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+/************************************************************//**
+Validates the dynamic memory
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void);
+/*===============*/
+#endif /* UNIV_MEM_DEBUG */
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr); /*!< in: pointer to place of possible corruption */
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void);
+/*================*/
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void);
+/*====================*/
diff --git a/storage/innobase/include/mem0dbg.ic b/storage/innobase/include/mem0dbg.ic
new file mode 100644
index 00000000000..ec60ed35337
--- /dev/null
+++ b/storage/innobase/include/mem0dbg.ic
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0dbg.ic
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+extern ulint mem_current_allocated_memory;
+
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Inserts a created memory heap to the hash table of
+current allocated memory heaps.
+Initializes the hash table when first called. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /*!< in: the created heap */
+ const char* file_name, /*!< in: file name of creation */
+ ulint line); /*!< in: line where created */
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. Returns the size of the heap
+in terms of how much memory in bytes was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /*!< in: the heap to be freed */
+ const char* file_name, /*!< in: file name of freeing */
+ ulint line); /*!< in: line where freed */
+
+
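+/* Accessors for the check fields that surround each allocated field
+in the debug version (see MEM_FIELD_HEADER_SIZE above in mem0dbg.h). */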
+void
+mem_field_header_set_len(byte* field, ulint len);
+
+ulint
+mem_field_header_get_len(byte* field);
+
+void
+mem_field_header_set_check(byte* field, ulint check);
+
+ulint
+mem_field_header_get_check(byte* field);
+
+void
+mem_field_trailer_set_check(byte* field, ulint check);
+
+ulint
+mem_field_trailer_get_check(byte* field);
+#endif /* UNIV_MEM_DEBUG */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000000..f30034f3074
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,425 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "univ.i"
+#include "ut0mem.h"
+#include "ut0byte.h"
+#include "ut0rnd.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0lst.h"
+#include "mach0data.h"
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/* A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef struct mem_block_info_t mem_block_t;
+
+/* A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/* Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be
+ ORed to MEM_HEAP_BUFFER, in which
+ case heap->free_block is used in
+ some cases for memory allocations,
+ and if it's NULL, the memory
+ allocation functions can return
+ NULL. */
+
+/* Different types of heaps, in terms of which data structure uses them */
+#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_PAGE_HASH (MEM_HEAP_DYNAMIC)
+#define MEM_HEAP_FOR_RECV_SYS (MEM_HEAP_BUFFER)
+#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER)
+
+/* The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE \
+ (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/* If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200)
+
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+ ulint size); /*!< in: common pool size in bytes */
+/******************************************************************//**
+Closes the memory system. */
+UNIV_INTERN
+void
+mem_close(void);
+/*===========*/
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+# define mem_heap_create(N) mem_heap_create_func( \
+ (N), __FILE__, __LINE__, MEM_HEAP_DYNAMIC)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+# define mem_heap_create_typed(N, T) mem_heap_create_func( \
+ (N), __FILE__, __LINE__, (T))
+
+#else /* UNIV_DEBUG */
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+# define mem_heap_create(N) mem_heap_create_func( \
+ (N), MEM_HEAP_DYNAMIC)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+# define mem_heap_create_typed(N, T) mem_heap_create_func( \
+ (N), (T))
+
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap freeing. */
+
+#define mem_heap_free(heap) mem_heap_free_func(\
+ (heap), __FILE__, __LINE__)
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+arguments.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+#ifdef UNIV_DEBUG
+ const char* file_name, /*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type); /*!< in: heap type */
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name, /*!< in: file name where freed */
+ ulint line); /*!< in: line where freed */
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
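+
+/* A minimal heap life cycle sketch (illustrative only): allocations
+are never freed individually; the whole heap is released at once.
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	byte*		buf = (byte*) mem_heap_alloc(heap, 100);
+
+	... fill and use buf ...
+
+	mem_heap_free(heap);
+*/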
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap); /*!< in: memory heap */
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top);/*!< in: pointer to old top of heap */
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap); /*!< in: heap to empty */
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /*!< in: heap */
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+
+#define mem_zalloc(N) memset(mem_alloc(N), 0, (N))
+
+#ifdef UNIV_DEBUG
+#define mem_alloc(N) mem_alloc_func((N), __FILE__, __LINE__, NULL)
+#define mem_alloc2(N,S) mem_alloc_func((N), __FILE__, __LINE__, (S))
+#else /* UNIV_DEBUG */
+#define mem_alloc(N) mem_alloc_func((N), NULL)
+#define mem_alloc2(N,S) mem_alloc_func((N), (S))
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: requested size in bytes */
+#ifdef UNIV_DEBUG
+ const char* file_name, /*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint* size); /*!< out: allocated size in bytes,
+				or NULL if not wanted */
+
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer freeing */
+
+#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Frees a single buffer of storage from
+the dynamic memory of C compiler. Similar to free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2); /*!< in: string 2 */
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+ mem_heap_t* heap, /*!< in: memory heap where copy is allocated */
+ const void* data, /*!< in: data to be copied */
+ ulint len); /*!< in: length of data, in bytes */
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...) __attribute__ ((format (printf, 2, 3)));
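+
+/* A minimal usage sketch (illustrative; heap, name and n_rows are
+assumed to exist in the caller; only %s and %lu are supported):
+
+	char*	msg = mem_heap_printf(heap, "%s has %lu rows",
+				      name, (ulint) n_rows);
+*/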
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void);
+/*=========================*/
+#endif
+
+/*#######################################################################*/
+
+/** The info structure stored at the beginning of a heap block */
+struct mem_block_info_t {
+ ulint magic_n;/* magic number for debugging */
+#ifdef UNIV_DEBUG
+ char file_name[8];/* file name where the mem heap was created */
+ ulint line; /*!< line number where the mem heap was created */
+#endif /* UNIV_DEBUG */
+	UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in
+			the list this is the base node of the list of blocks;
+ in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /*!< physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
+	ulint	type;	/*!< type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed with MEM_HEAP_BTR_SEARCH */
+ ulint free; /*!< offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /*!< the value of the struct field 'free' at the
+ creation of the block */
+#ifndef UNIV_HOTBACKUP
+ void* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ void* buf_block;
+ /* if this block has been allocated from the buffer
+ pool, this contains the buf_block_t handle;
+ otherwise, this is NULL */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef MEM_PERIODIC_CHECK
+ UT_LIST_NODE_T(mem_block_t) mem_block_list;
+ /* List of all mem blocks allocated; protected
+ by the mem_comm_pool mutex */
+#endif
+};
+
+#define MEM_BLOCK_MAGIC_N 764741555
+#define MEM_FREED_BLOCK_MAGIC_N 547711122
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
+#include "mem0dbg.h"
+
+#ifndef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mem0mem.ic b/storage/innobase/include/mem0mem.ic
new file mode 100644
index 00000000000..0d983d69e1a
--- /dev/null
+++ b/storage/innobase/include/mem0mem.ic
@@ -0,0 +1,649 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0dbg.ic"
+#ifndef UNIV_HOTBACKUP
+# include "mem0pool.h"
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, file_name, line, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC)
+#else /* UNIV_DEBUG */
+# define mem_heap_create_block(heap, n, type, file_name, line) \
+ mem_heap_create_block_func(heap, n, type)
+# define mem_heap_create_at(N, file_name, line) \
+ mem_heap_create_func(N, MEM_HEAP_DYNAMIC)
+#endif /* UNIV_DEBUG */
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+ const char* file_name,/*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block); /*!< in: block to free */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /*!< in: heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes user needs */
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ ut_ad(heap);
+ ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+ return(memset(mem_heap_alloc(heap, n), 0, n));
+}
+
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ mem_block_t* block;
+ void* buf;
+ ulint free;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+	/* Check if there is enough space in block. If not, add a new
+ block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*) block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+#ifdef UNIV_MEM_DEBUG
+ UNIV_MEM_ALLOC(buf,
+ n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
+
+ /* In the debug version write debugging info to the field */
+ mem_field_init((byte*) buf, n);
+
+ /* Advance buf to point at the storage which will be given to the
+ caller */
+ buf = (byte*) buf + MEM_FIELD_HEADER_SIZE;
+
+#endif
+ UNIV_MEM_ALLOC(buf, n);
+ return(buf);
+}
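
[Editor's note: a minimal usage sketch, not part of this diff. The typical lifecycle of a heap uses the mem_heap_create() and mem_heap_free() macros from mem0mem.h; individual allocations are never freed one by one, the whole heap is released at once.]

	mem_heap_t*	heap = mem_heap_create(1024);	/* start block size */
	byte*		buf = (byte*) mem_heap_alloc(heap, 100);

	memset(buf, 0, 100);		/* use the buffer */

	mem_heap_free(heap);		/* frees every allocation at once */
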
+
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ mem_block_t* block;
+ byte* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top)/*!< in: pointer to old top of heap */
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+ ibool error;
+ ulint total_size;
+ ulint size;
+
+ ut_ad(mem_heap_check(heap));
+
+ /* Validate the heap and get its total allocated size */
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
+ NULL, NULL);
+ ut_a(!error);
+
+ /* Get the size below top pointer */
+ mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL,
+ NULL);
+ ut_a(!error);
+
+#endif
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*) block + mem_block_get_free(block) >= old_top)
+ && ((byte*) block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block, old_top - (byte*) block);
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+ UNIV_MEM_ASSERT_W(old_top, (byte*) block + block->len - old_top);
+#if defined UNIV_MEM_DEBUG
+ /* In the debug version erase block from top up */
+ mem_erase_buf(old_top, (byte*) block + block->len - old_top);
+
+ /* Update allocated memory count */
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= (total_size - size);
+ mutex_exit(&mem_hash_mutex);
+#endif /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ALLOC(old_top, (byte*) block + block->len - old_top);
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
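
[Editor's note: an illustrative sketch of the save/rollback pattern these two functions support: record the current top, make transient allocations, then truncate the heap back to the saved top.]

	byte*	top = mem_heap_get_heap_top(heap);

	/* ... transient allocations from the heap ... */
	(void) mem_heap_alloc(heap, 256);

	mem_heap_free_heap_top(heap, top);	/* release everything above top */
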
+
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap) /*!< in: heap to empty */
+{
+ mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap));
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap. The size of the
+element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+ byte* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+#ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <= (ulint) (buf - (byte*) block));
+
+ /* In the debug version, advance buf to point at the storage which
+	was given to the caller in the allocation */
+
+ buf += MEM_FIELD_HEADER_SIZE;
+
+ /* Check that the field lengths agree */
+ ut_ad(n == mem_field_header_get_len(buf));
+#endif
+
+ return((void*) buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+	/* Decrease the free field of the block by the element size */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+ UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n);
+#ifdef UNIV_MEM_DEBUG
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version check the consistency, and erase field */
+ mem_field_erase((byte*) block + mem_block_get_free(block), n);
+#endif
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ } else {
+ /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a
+ subsequent invocation of mem_heap_free_top().
+ Originally, this was UNIV_MEM_FREE(), to catch writes
+ to freed memory. */
+ UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n);
+ }
+}
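
[Editor's note: mem_heap_get_top()/mem_heap_free_top() give the heap a stack discipline. A hedged sketch; the caller must pass the exact size of the topmost allocation:]

	void*	elem = mem_heap_alloc(heap, 64);	/* push */

	ut_ad(mem_heap_get_top(heap, 64) == elem);	/* peek */
	mem_heap_free_top(heap, 64);			/* pop */
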
+
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, it also takes the file name and
+line as arguments.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+#ifdef UNIV_DEBUG
+ const char* file_name, /*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type) /*!< in: heap type */
+{
+ mem_block_t* block;
+
+ if (!n) {
+ n = MEM_BLOCK_START_SIZE;
+ }
+
+ block = mem_heap_create_block(NULL, n, type, file_name, line);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_INIT(block->base);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(list, block->base, block);
+
+#ifdef UNIV_MEM_DEBUG
+
+ mem_hash_insert(block, file_name, line);
+
+#endif
+
+ return(block);
+}
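
[Editor's note: in callers this is reached through the corresponding macro, which supplies __FILE__ and __LINE__ in debug builds, e.g.:]

	heap = mem_heap_create(256);	/* expands to mem_heap_create_func() */

[Note that the returned mem_heap_t* is also the heap's first mem_block_t, which is why the code above can compare heap != block directly.]
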
+
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name __attribute__((unused)),
+ /*!< in: file name where freed */
+ ulint line __attribute__((unused)))
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+#ifdef UNIV_MEM_DEBUG
+
+ /* In the debug version remove the heap from the hash table of heaps
+ and check its consistency */
+
+ mem_hash_remove(heap, file_name, line);
+
+#endif
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ while (block != NULL) {
+		/* Store the prev_block pointer before freeing the current
+		block (the current block is erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. It behaves like C's malloc(). The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: desired number of bytes */
+#ifdef UNIV_DEBUG
+ const char* file_name, /*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+	ulint*		size)	/*!< out: allocated size in bytes;
+				may be passed as NULL if not needed */
+{
+ mem_heap_t* heap;
+ void* buf;
+
+ heap = mem_heap_create_at(n, file_name, line);
+
+ /* Note that as we created the first block in the heap big enough
+ for the buffer requested by the caller, the buffer will be in the
+ first block and thus we can calculate the pointer to the heap from
+ the pointer to the buffer when we free the memory buffer. */
+
+ if (size) {
+ /* Adjust the allocation to the actual size of the
+ memory block. */
+ ulint m = mem_block_get_len(heap)
+ - mem_block_get_free(heap);
+#ifdef UNIV_MEM_DEBUG
+ m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
+#endif /* UNIV_MEM_DEBUG */
+ ut_ad(m >= n);
+ n = m;
+ *size = m;
+ }
+
+ buf = mem_heap_alloc(heap, n);
+
+ ut_a((byte*) heap == (byte*) buf - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ return(buf);
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees a single
+buffer of storage from the dynamic memory of the C compiler. Similar to
+C's free(). */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_heap_t* heap;
+
+ heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ mem_heap_free_func(heap, file_name, line);
+}
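
[Editor's note: a usage sketch for this malloc/free-style pair, through the mem_alloc()/mem_free() macros defined in mem0mem.h. The pointer arithmetic above recovers the enclosing single-block heap from the buffer pointer.]

	char*	p = (char*) mem_alloc(100);

	/* ... use p ... */

	mem_free(p);	/* frees the one-block heap that backs p */
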
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint size = 0;
+
+ ut_ad(mem_heap_check(heap));
+
+ size = heap->total_size;
+
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ size += UNIV_PAGE_SIZE;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str) /*!< in: string to be copied */
+{
+ ulint len = strlen(str) + 1;
+ return((char*) memcpy(mem_alloc(len), str, len));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_alloc(len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_heap_alloc(heap, len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
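
[Editor's note: for example, mem_heap_strdupl() can take a NUL-terminated copy of a prefix of a longer string (illustrative):]

	char*	s = mem_heap_strdupl(heap, "foobar", 3);	/* s == "foo" */
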
diff --git a/storage/innobase/include/mem0pool.h b/storage/innobase/include/mem0pool.h
new file mode 100644
index 00000000000..a65ba50fdf9
--- /dev/null
+++ b/storage/innobase/include/mem0pool.h
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0pool.h
+The lowest-level memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0pool_h
+#define mem0pool_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "ut0lst.h"
+
+/** Memory pool */
+struct mem_pool_t;
+
+/** The common memory pool */
+extern mem_pool_t* mem_comm_pool;
+
+/** Memory area header */
+struct mem_area_t{
+	ulint		size_and_free;	/*!< memory area size is obtained by
+					ANDing with ~MEM_AREA_FREE; the area
+					is in a free list if ANDing with
+					MEM_AREA_FREE results in nonzero */
+ UT_LIST_NODE_T(mem_area_t)
+ free_list; /*!< free list node */
+};
+
+/** Each memory area takes this many extra bytes for control information */
+#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_t),\
+ UNIV_MEM_ALIGNMENT))
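
[Editor's note: the size and the free flag share one word. A hedged decoding sketch, assuming MEM_AREA_FREE is the single-bit flag defined in mem0pool.cc (not shown in this header):]

	ulint	size = area->size_and_free & ~MEM_AREA_FREE;
	ibool	is_free = area->size_and_free & MEM_AREA_FREE;
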
+
+/********************************************************************//**
+Creates a memory pool.
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size); /*!< in: pool size in bytes */
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+ mem_pool_t* pool); /*!< in, own: memory pool */
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool); /*!< in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
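
[Editor's note: a usage sketch for the pool API above (illustrative). mem_area_alloc() takes the size by pointer and rounds it up internally, so the caller learns the true allocation size.]

	ulint	size = 1000;	/* in: requested size; out: actual size */
	void*	buf = mem_area_alloc(&size, mem_comm_pool);

	/* ... size now holds the allocated size, >= 1000 ... */

	mem_area_free(buf, mem_comm_pool);
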
diff --git a/storage/innobase/include/mem0pool.ic b/storage/innobase/include/mem0pool.ic
new file mode 100644
index 00000000000..f4bafb8ba63
--- /dev/null
+++ b/storage/innobase/include/mem0pool.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0pool.ic
+The lowest-level memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000000..18a345d050f
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.h
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0log_h
+#define mtr0log_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log if mtr is not NULL. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /*!< in: pointer where to write */
+ ulint val, /*!< in: value to write */
+ byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes 8 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log, only if mtr is not NULL */
+UNIV_INTERN
+void
+mlog_write_ull(
+/*===========*/
+ byte* ptr, /*!< in: pointer where to write */
+ ib_uint64_t val, /*!< in: value to write */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ const byte* str, /*!< in: string to write */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+ byte* ptr, /*!< in: pointer written to */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes the initial part of a log record, consisting of a one-byte item
+type and the compressed space id and page number. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* str, /*!< in: string to write */
+ ulint len); /*!< in: string length */
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val); /*!< in: value to write */
+/********************************************************//**
+Catenates a compressed 64-bit integer to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ull_compressed(
+/*=========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ib_uint64_t val); /*!< in: value to write */
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size); /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr); /*!< in: buffer space from ptr up was not used */
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr); /*!< in: mtr */
+#else /* !UNIV_HOTBACKUP */
+# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte*) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* type, /*!< out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /*!< out: space id */
+ ulint* page_no);/*!< out: page number */
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_ull.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ ulint type, /*!< in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index. Reserves space
+for further log entries. The log entry must be closed with
+mlog_close().
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* rec, /*!< in: index record or page */
+ const dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: log item type */
+ ulint size); /*!< in: requested buffer size in bytes
+ (if 0, calls mlog_close() and
+ returns NULL) */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+ byte* ptr, /*!< in: buffer */
+ const byte* end_ptr,/*!< in: buffer end */
+ ibool comp, /*!< in: TRUE=compact record format */
+ dict_index_t** index); /*!< out, own: dummy index */
+
+#ifndef UNIV_HOTBACKUP
+/* Insert, update, and maybe other functions may use this value to define an
+extra mlog buffer size for variable size data */
+#define MLOG_BUF_MARGIN 256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mtr0log.ic b/storage/innobase/include/mtr0log.ic
new file mode 100644
index 00000000000..3ed4876eeab
--- /dev/null
+++ b/storage/innobase/include/mtr0log.ic
@@ -0,0 +1,276 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.ic
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "fsp0types.h"
+#include "trx0sys.h"
+
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size) /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_array_t* mlog;
+
+ mtr->modifications = TRUE;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return(NULL);
+ }
+
+ mlog = &(mtr->log);
+
+ return(dyn_array_open(mlog, size));
+}
+
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr) /*!< in: buffer space from ptr up was not used */
+{
+ dyn_array_t* mlog;
+
+ ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
+
+ mlog = &(mtr->log);
+
+ dyn_array_close(mlog, ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type) /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+ dyn_array_t* mlog;
+ byte* ptr;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+#if MLOG_1BYTE != 1
+# error "MLOG_1BYTE != 1"
+#endif
+#if MLOG_2BYTES != 2
+# error "MLOG_2BYTES != 2"
+#endif
+#if MLOG_4BYTES != 4
+# error "MLOG_4BYTES != 4"
+#endif
+#if MLOG_8BYTES != 8
+# error "MLOG_8BYTES != 8"
+#endif
+ ptr = (byte*) dyn_array_push(mlog, type);
+
+ if (type == MLOG_4BYTES) {
+ mach_write_to_4(ptr, val);
+ } else if (type == MLOG_2BYTES) {
+ mach_write_to_2(ptr, val);
+ } else {
+ ut_ad(type == MLOG_1BYTE);
+ mach_write_to_1(ptr, val);
+ }
+}
+
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 10);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Catenates a compressed 64-bit integer to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ull_compressed(
+/*=========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ib_uint64_t val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 15);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_ull_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+ buf_block_t* block;
+#endif
+ const byte* page;
+ ulint space;
+ ulint offset;
+
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(ptr && log_ptr);
+
+ page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+ space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ /* check whether the page is in the doublewrite buffer;
+ the doublewrite buffer is located in pages
+ FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
+ system tablespace */
+ if (space == TRX_SYS_SPACE
+ && offset >= FSP_EXTENT_SIZE && offset < 3 * FSP_EXTENT_SIZE) {
+ if (buf_dblwr_being_created) {
+			/* Do nothing: we only come to this branch during
+			InnoDB database creation. We do not write redo log
+			for the doublewrite buffer pages. */
+ return(log_ptr);
+ } else {
+ fprintf(stderr,
+ "Error: trying to redo log a record of type "
+ "%d on page %lu of space %lu in the "
+ "doublewrite buffer, continuing anyway.\n"
+ "Please post a bug report to "
+ "bugs.mysql.com.\n",
+ type, offset, space);
+ ut_ad(0);
+ }
+ }
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, space);
+ log_ptr += mach_write_compressed(log_ptr, offset);
+
+ mtr->n_log_recs++;
+
+#ifdef UNIV_LOG_DEBUG
+ fprintf(stderr,
+ "Adding to mtr log record type %lu space %lu page no %lu\n",
+ (ulong) type, space, offset);
+#endif
+
+#ifdef UNIV_DEBUG
+ /* We now assume that all x-latched pages have been modified! */
+ block = (buf_block_t*) buf_block_align(ptr);
+
+ if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) {
+
+ mtr_memo_push(mtr, block, MTR_MEMO_MODIFY);
+ }
+#endif
+ return(log_ptr);
+}
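
[Editor's note: a hedged sketch of how the helpers in this file compose a complete record; this mirrors what mlog_write_ulint() does for MLOG_4BYTES, assuming page_offset() from page0page.h; error handling elided.]

	byte*	log_ptr = mlog_open(mtr, 11 + 2 + 5);

	if (log_ptr != NULL) {	/* NULL when the log mode is MTR_LOG_NONE */
		log_ptr = mlog_write_initial_log_record_fast(
			ptr, MLOG_4BYTES, log_ptr, mtr);
		mach_write_to_2(log_ptr, page_offset(ptr));
		log_ptr += 2;
		log_ptr += mach_write_compressed(log_ptr, val);
		mlog_close(mtr, log_ptr);
	}
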
+
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(log_ptr);
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+
+ /* We write dummy space id and page number */
+ log_ptr += mach_write_compressed(log_ptr, space_id);
+ log_ptr += mach_write_compressed(log_ptr, page_no);
+
+ mtr->n_log_recs++;
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000000..ed7fd76d425
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,420 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "dyn0dyn.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Logging modes for a mini-transaction */
+#define MTR_LOG_ALL 21 /* default mode: log all operations
+ modifying disk-based data */
+#define MTR_LOG_NONE 22 /* log no operations */
+#define MTR_LOG_NO_REDO 23 /* Don't generate REDO */
+/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying
+ file space page allocation data
+ (operations in fsp0fsp.* ) */
+#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter
+ form */
+
+/* Types for the mlock objects to store in the mtr memo; NOTE that the
+first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH
+#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH
+#define MTR_MEMO_BUF_FIX RW_NO_LATCH
+#ifdef UNIV_DEBUG
+# define MTR_MEMO_MODIFY 54
+#endif /* UNIV_DEBUG */
+#define MTR_MEMO_S_LOCK 55
+#define MTR_MEMO_X_LOCK 56
+
+/** @name Log item types
+The log items are declared 'byte' so that the compiler can warn if val
+and type parameters are switched in a call to mlog_write_ulint. NOTE!
+For 1 - 8 bytes, the flag value must give the length also! @{ */
+#define MLOG_SINGLE_REC_FLAG 128 /*!< if the mtr contains only
+ one log record for one page,
+ i.e., write_initial_log_record
+ has been called only once,
+ this flag is ORed to the type
+ of that first log record */
+#define MLOG_1BYTE (1) /*!< one byte is written */
+#define MLOG_2BYTES (2) /*!< 2 bytes ... */
+#define MLOG_4BYTES (4) /*!< 4 bytes ... */
+#define MLOG_8BYTES (8) /*!< 8 bytes ... */
+#define MLOG_REC_INSERT ((byte)9) /*!< record insert */
+#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /*!< mark clustered index record
+ deleted */
+#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /*!< mark secondary index record
+ deleted */
+#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /*!< update of a record,
+ preserves record field sizes */
+#define MLOG_REC_DELETE ((byte)14) /*!< delete a record from a
+ page */
+#define MLOG_LIST_END_DELETE ((byte)15) /*!< delete record list end on
+ index page */
+#define MLOG_LIST_START_DELETE ((byte)16) /*!< delete record list start on
+ index page */
+#define MLOG_LIST_END_COPY_CREATED ((byte)17) /*!< copy record list end to a
+						newly created index page */
+#define MLOG_PAGE_REORGANIZE ((byte)18) /*!< reorganize an
+ index page in
+ ROW_FORMAT=REDUNDANT */
+#define MLOG_PAGE_CREATE ((byte)19) /*!< create an index page */
+#define MLOG_UNDO_INSERT ((byte)20) /*!< insert entry in an undo
+ log */
+#define MLOG_UNDO_ERASE_END ((byte)21) /*!< erase an undo log
+ page end */
+#define MLOG_UNDO_INIT ((byte)22) /*!< initialize a page in an
+ undo log */
+#define MLOG_UNDO_HDR_DISCARD ((byte)23) /*!< discard an update undo log
+ header */
+#define MLOG_UNDO_HDR_REUSE ((byte)24) /*!< reuse an insert undo log
+ header */
+#define MLOG_UNDO_HDR_CREATE ((byte)25) /*!< create an undo
+ log header */
+#define MLOG_REC_MIN_MARK ((byte)26) /*!< mark an index
+ record as the
+ predefined minimum
+ record */
+#define MLOG_IBUF_BITMAP_INIT ((byte)27) /*!< initialize an
+ ibuf bitmap page */
+/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */
+#ifdef UNIV_LOG_LSN_DEBUG
+# define MLOG_LSN ((byte)28) /* current LSN */
+#endif
+#define MLOG_INIT_FILE_PAGE ((byte)29) /*!< this means that a
+ file page is taken
+ into use and the prior
+ contents of the page
+ should be ignored: in
+ recovery we must not
+ trust the lsn values
+ stored to the file
+ page */
+#define MLOG_WRITE_STRING ((byte)30) /*!< write a string to
+ a page */
+#define MLOG_MULTI_REC_END ((byte)31) /*!< if a single mtr writes
+ several log records,
+ this log record ends the
+ sequence of these records */
+#define MLOG_DUMMY_RECORD ((byte)32) /*!< dummy log record used to
+ pad a log block full */
+#define MLOG_FILE_CREATE ((byte)33) /*!< log record about an .ibd
+ file creation */
+#define MLOG_FILE_RENAME ((byte)34) /*!< log record about an .ibd
+ file rename */
+#define MLOG_FILE_DELETE ((byte)35) /*!< log record about an .ibd
+ file deletion */
+#define MLOG_COMP_REC_MIN_MARK ((byte)36) /*!< mark a compact
+ index record as the
+ predefined minimum
+ record */
+#define MLOG_COMP_PAGE_CREATE ((byte)37) /*!< create a compact
+ index page */
+#define MLOG_COMP_REC_INSERT ((byte)38) /*!< compact record insert */
+#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39)
+ /*!< mark compact
+ clustered index record
+ deleted */
+#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact
+ secondary index record
+ deleted; this log
+ record type is
+ redundant, as
+ MLOG_REC_SEC_DELETE_MARK
+ is independent of the
+ record format. */
+#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a
+ compact record,
+ preserves record field
+ sizes */
+#define MLOG_COMP_REC_DELETE ((byte)42) /*!< delete a compact record
+ from a page */
+#define MLOG_COMP_LIST_END_DELETE ((byte)43) /*!< delete compact record list
+ end on index page */
+#define MLOG_COMP_LIST_START_DELETE ((byte)44) /*!< delete compact record list
+ start on index page */
+#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45)
+ /*!< copy compact
+ record list end to a
+					newly created index
+ page */
+#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /*!< reorganize an index page */
+#define MLOG_FILE_CREATE2 ((byte)47) /*!< log record about creating
+ an .ibd file, with format */
+#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /*!< write the node pointer of
+ a record on a compressed
+ non-leaf B-tree page */
+#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /*!< write the BLOB pointer
+ of an externally stored column
+ on a compressed page */
+#define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page
+ header */
+#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */
+#define MLOG_ZIP_PAGE_COMPRESS_NO_DATA ((byte)52)/*!< compress an index page
+						without logging its image */
+#define MLOG_ZIP_PAGE_REORGANIZE ((byte)53) /*!< reorganize a compressed
+ page */
+#define MLOG_BIGGEST_TYPE ((byte)53) /*!< biggest value (used in
+ assertions) */
+/* @} */
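
[Editor's note: illustratively, on the parsing side recovery strips MLOG_SINGLE_REC_FLAG from the first byte of a record before interpreting the type, roughly as follows (cf. recv_parse_log_rec() in log0recv.cc); ptr points at the record's first byte:]

	ulint	single_rec = (ulint) *ptr & MLOG_SINGLE_REC_FLAG;
	byte	type = (byte) ((ulint) *ptr & ~MLOG_SINGLE_REC_FLAG);
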
+
+/** @name Flags for MLOG_FILE operations
+(stored in the page number parameter, called log_flags in the
+functions). The page number parameter was originally written as 0. @{ */
+#define MLOG_FILE_FLAG_TEMP 1 /*!< identifies TEMPORARY TABLE in
+ MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
+/* @} */
+
+/* included here because it needs MLOG_LSN defined */
+#include "log0log.h"
+
+/***************************************************************//**
+Starts a mini-transaction. */
+UNIV_INLINE
+void
+mtr_start(
+/*======*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+ __attribute__((nonnull));
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr); /*!< in: mtr */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock); /*!< in: latch to release */
+#else /* !UNIV_HOTBACKUP */
+# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode); /*!< in: logging mode: MTR_LOG_NONE, ... */
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This macro locks an rw-lock in s-mode. */
+#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+This macro locks an rw-lock in x-mode. */
+#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************//**
+Releases an object in the memo stack.
+@return true if released */
+UNIV_INTERN
+bool
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+bool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type) /*!< in: type of object */
+ __attribute__((warn_unused_result, nonnull));
+
+/**********************************************************//**
+Checks if memo contains the given page.
+@return TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* ptr, /*!< in: pointer to buffer frame */
+ ulint type); /*!< in: type of object */
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr); /*!< in: mtr */
+# else /* !UNIV_HOTBACKUP */
+# define mtr_memo_contains(mtr, object, type) TRUE
+# define mtr_memo_contains_page(mtr, ptr, type) TRUE
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+/*######################################################################*/
+
+#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr); /*!< in: mini-transaction */
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t{
+ ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */
+ void* object; /*!< pointer to the object */
+};
+
+/* Mini-transaction handle and buffer */
+struct mtr_t{
+#ifdef UNIV_DEBUG
+ ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
+#endif
+ dyn_array_t memo; /*!< memo stack for locks etc. */
+ dyn_array_t log; /*!< mini-transaction log */
+ unsigned inside_ibuf:1;
+ /*!< TRUE if inside ibuf changes */
+ unsigned modifications:1;
+ /*!< TRUE if the mini-transaction
+ modified buffer pool pages */
+ unsigned made_dirty:1;
+ /*!< TRUE if mtr has made at least
+ one buffer pool page dirty */
+ ulint n_log_recs;
+ /* count of how many page initial log records
+ have been written to the mtr log */
+ ulint n_freed_pages;
+ /* number of pages that have been freed in
+ this mini-transaction */
+ ulint log_mode; /* specifies which operations should be
+ logged; default value MTR_LOG_ALL */
+ lsn_t start_lsn;/* start lsn of the possible log entry for
+ this mtr */
+ lsn_t end_lsn;/* end lsn of the possible log entry for
+ this mtr */
+#ifdef UNIV_DEBUG
+ ulint magic_n;
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_DEBUG
+# define MTR_MAGIC_N 54551
+#endif /* UNIV_DEBUG */
+
+#define MTR_ACTIVE 12231
+#define MTR_COMMITTING 56456
+#define MTR_COMMITTED 34676
+
+#ifndef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic
new file mode 100644
index 00000000000..a9f02430220
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.ic
@@ -0,0 +1,296 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.ic
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mach0data.h"
+
+/***************************************************//**
+Checks if a mini-transaction is dirtying a clean page.
+@return TRUE if the mtr is dirtying a clean page. */
+UNIV_INTERN
+ibool
+mtr_block_dirtied(
+/*==============*/
+ const buf_block_t* block) /*!< in: block being x-fixed */
+ __attribute__((nonnull,warn_unused_result));
+
+/***************************************************************//**
+Starts a mini-transaction. */
+UNIV_INLINE
+void
+mtr_start(
+/*======*/
+ mtr_t* mtr) /*!< out: mini-transaction */
+{
+ UNIV_MEM_INVALID(mtr, sizeof *mtr);
+
+ dyn_array_create(&(mtr->memo));
+ dyn_array_create(&(mtr->log));
+
+ mtr->log_mode = MTR_LOG_ALL;
+ mtr->inside_ibuf = FALSE;
+ mtr->modifications = FALSE;
+ mtr->made_dirty = FALSE;
+ mtr->n_log_recs = 0;
+ mtr->n_freed_pages = 0;
+
+ ut_d(mtr->state = MTR_ACTIVE);
+ ut_d(mtr->magic_n = MTR_MAGIC_N);
+}
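
[Editor's note: a minimal mini-transaction lifecycle, as a hedged sketch; buf_page_get() and mlog_write_ulint() are declared in other headers.]

	mtr_t	mtr;

	mtr_start(&mtr);
	/* ... acquire pages, e.g. buf_page_get(..., RW_X_LATCH, &mtr),
	and modify them through mlog_write_ulint(ptr, val,
	MLOG_4BYTES, &mtr) ... */
	mtr_commit(&mtr);	/* write the log, release all memo latches */
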
+
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ dyn_array_t* memo;
+ mtr_memo_slot_t* slot;
+
+ ut_ad(object);
+ ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+ ut_ad(type <= MTR_MEMO_X_LOCK);
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ /* If this mtr has x-fixed a clean page then we set
+ the made_dirty flag. This tells us if we need to
+ grab log_flush_order_mutex at mtr_commit so that we
+	can insert the dirtied page into the flush list. */
+ if (type == MTR_MEMO_PAGE_X_FIX && !mtr->made_dirty) {
+ mtr->made_dirty =
+ mtr_block_dirtied((const buf_block_t*) object);
+ }
+
+ memo = &(mtr->memo);
+
+ slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot);
+
+ slot->object = object;
+ slot->type = type;
+}
+
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ return(dyn_array_get_data_size(memo));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock) /*!< in: latch to release */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ ut_ad(dyn_array_get_data_size(memo) > savepoint);
+
+ slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint);
+
+ ut_ad(slot->object == lock);
+ ut_ad(slot->type == MTR_MEMO_S_LOCK);
+
+ rw_lock_s_unlock(lock);
+
+ slot->object = NULL;
+}
+
+# ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+bool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type) /*!< in: type of object */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING);
+
+ for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
+ block;
+ block = dyn_array_get_prev_block(&mtr->memo, block)) {
+ const mtr_memo_slot_t* start
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block));
+ mtr_memo_slot_t* slot
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block)
+ + dyn_block_get_used(block));
+
+ ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
+
+ while (slot-- != start) {
+ if (object == slot->object && type == slot->type) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ return(&(mtr->log));
+}
+
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->log_mode >= MTR_LOG_ALL);
+ ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(mtr->log_mode);
+}
+
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode) /*!< in: logging mode: MTR_LOG_NONE, ... */
+{
+ ulint old_mode;
+
+ ut_ad(mtr);
+ ut_ad(mode >= MTR_LOG_ALL);
+ ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
+
+ old_mode = mtr->log_mode;
+
+ if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) {
+ /* Do nothing */
+ } else {
+ mtr->log_mode = mode;
+ }
+
+ ut_ad(old_mode >= MTR_LOG_ALL);
+ ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(old_mode);
+}
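
[Editor's note: typical use is to suppress redo logging around changes that are recovered by other means, restoring the old mode afterwards (sketch):]

	ulint	old_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	/* ... page changes that must not generate redo ... */

	mtr_set_log_mode(mtr, old_mode);
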
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_s_lock_inline(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
+}
+
+/*********************************************************************//**
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_x_lock_inline(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000000..43368c0b726
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+struct mtr_t;
+
+#endif
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000000..ad9b6a9ac10
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1289 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif
+
+/** File node of a tablespace or the log data space */
+struct fil_node_t;
+
+extern ibool os_has_said_disk_full;
+/** Flag: enable debug printout for asynchronous i/o */
+extern ibool os_aio_print_debug;
+
+/** Number of pending os_file_pread() operations */
+extern ulint os_file_n_pending_preads;
+/** Number of pending os_file_pwrite() operations */
+extern ulint os_file_n_pending_pwrites;
+
+/** Number of pending read operations */
+extern ulint os_n_pending_reads;
+/** Number of pending write operations */
+extern ulint os_n_pending_writes;
+
+#ifdef __WIN__
+
+/** We always define WIN_ASYNC_IO, and check at run time whether
+ the OS actually supports it: Win 95 does not, NT does. */
+#define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+#define UNIV_NON_BUFFERED_IO
+
+#endif
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+#ifdef __WIN__
+/** File handle */
+# define os_file_t HANDLE
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+# define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+#else
+/** File handle */
+typedef int os_file_t;
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+# define OS_FILE_FROM_FD(fd) fd
+#endif
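+
+/* Usage sketch (hypothetical, not part of this file): OS_FILE_FROM_FD lets
+a descriptor obtained from the C runtime be used with the os_file_*
+functions on either platform:
+
+	int		fd = innobase_mysql_tmpfile();
+	os_file_t	handle = OS_FILE_FROM_FD(fd);
+*/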
+
+/** Umask for creating files */
+extern ulint os_innodb_umask;
+
+/** The next value should be smaller or equal to the smallest sector size used
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE 512
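+
+/* Sketch (hypothetical arithmetic, not part of the API): a byte count is
+rounded up to a whole number of log blocks like this, since log writes must
+cover full blocks:
+
+	len = ((len + OS_FILE_LOG_BLOCK_SIZE - 1)
+	       / OS_FILE_LOG_BLOCK_SIZE) * OS_FILE_LOG_BLOCK_SIZE;
+*/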
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+	OS_FILE_OPEN = 51,	/*!< to open an existing file (error
+				if it does not exist) */
+	OS_FILE_CREATE,		/*!< to create a new file (error
+				if it already exists) */
+	OS_FILE_OVERWRITE,	/*!< to create a new file, overwriting
+				an existing one */
+ OS_FILE_OPEN_RAW, /*!< to open a raw device or disk
+ partition */
+ OS_FILE_CREATE_PATH, /*!< to create the directories */
+ OS_FILE_OPEN_RETRY, /*!< open with retry */
+
+ /** Flags that can be combined with the above values. Please ensure
+ that the above values stay below 128. */
+
+ OS_FILE_ON_ERROR_NO_EXIT = 128, /*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error;
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+};
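+
+/* Usage sketch (hypothetical call, not part of this file): the flag values
+at 128 and above are ORed into the create mode, e.g. to open a file whose
+absence or failure should not terminate the server:
+
+	file = os_file_create(innodb_file_data_key, name,
+			      OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT
+			      | OS_FILE_ON_ERROR_SILENT,
+			      OS_FILE_NORMAL, OS_DATA_FILE, &success);
+*/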
+
+#define OS_FILE_READ_ONLY 333
+#define OS_FILE_READ_WRITE 444
+#define OS_FILE_READ_ALLOW_DELETE 555 /* for mysqlbackup */
+
+/* Options for file_create */
+#define OS_FILE_AIO 61
+#define OS_FILE_NORMAL 62
+/* @} */
+
+/** Types for file create @{ */
+#define OS_DATA_FILE 100
+#define OS_LOG_FILE 101
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+#define OS_FILE_NOT_FOUND 71
+#define OS_FILE_DISK_FULL 72
+#define OS_FILE_ALREADY_EXISTS 73
+#define OS_FILE_PATH_ERROR 74
+#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources
+ to become available again */
+#define OS_FILE_SHARING_VIOLATION 76
+#define OS_FILE_ERROR_NOT_SPECIFIED 77
+#define OS_FILE_INSUFFICIENT_RESOURCE 78
+#define OS_FILE_AIO_INTERRUPTED 79
+#define OS_FILE_OPERATION_ABORTED 80
+
+#define OS_FILE_ACCESS_VIOLATION 81
+
+#define OS_FILE_ERROR_MAX 100
+/* @} */
+
+/** Types for aio operations @{ */
+#define OS_FILE_READ 10
+#define OS_FILE_WRITE 11
+
+#define OS_FILE_LOG 256 /* This can be ORed to type */
+/* @} */
+
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more
+ than 64 */
+
+/** Modes for aio operations @{ */
+#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf
+ pages or ibuf bitmap pages */
+#define OS_AIO_IBUF 22 /*!< Asynchronous i/o for ibuf pages or ibuf
+ bitmap pages */
+#define OS_AIO_LOG 23 /*!< Asynchronous i/o for the log */
+#define OS_AIO_SYNC 24 /*!< Asynchronous i/o where the calling thread
+ will itself wait for the i/o to complete,
+ doing also the job of the i/o-handler thread;
+ can be used for any pages, ibuf or non-ibuf.
+ This is used to save CPU time, as we can do
+ with fewer thread switches. Plain synchronous
+ i/o is not as good, because it must serialize
+ the file seek and read or write, causing a
+ bottleneck for parallelism. */
+
+#define OS_AIO_SIMULATED_WAKE_LATER 512 /*!< This can be ORed to mode
+ in the call of os_aio(...),
+ if the caller wants to post several i/o
+ requests in a batch, and only after that
+ wake the i/o-handler thread; this has
+ effect only in simulated aio */
+/* @} */
+
+#define OS_WIN31 1 /*!< Microsoft Windows 3.x */
+#define OS_WIN95 2 /*!< Microsoft Windows 95 */
+#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
+#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
+#define OS_WINXP 5 /*!< Microsoft Windows XP
+ or Windows Server 2003 */
+#define OS_WINVISTA 6 /*!< Microsoft Windows Vista
+ or Windows Server 2008 */
+#define OS_WIN7 7 /*!< Microsoft Windows 7
+ or Windows Server 2008 R2 */
+
+
+extern ulint os_n_file_reads;
+extern ulint os_n_file_writes;
+extern ulint os_n_fsyncs;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t innodb_file_data_key;
+extern mysql_pfs_key_t innodb_file_log_key;
+extern mysql_pfs_key_t innodb_file_temp_key;
+
+/* The following macros are instrumentation hooks that register
+various file I/O operations with the performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation, opening, closing and renaming.
+2) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file reads, writes and flushes.
+3) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations. */
+# define register_pfs_file_open_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(start_file_open_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_open_end(locker, file) \
+do { \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(end_file_open_wait_and_bind_to_descriptor)(\
+ locker, file); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_name_locker)( \
+ state, key, op, name, &locker); \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(start_file_close_wait)( \
+ locker, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_close_end(locker, result) \
+do { \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(end_file_close_wait)( \
+ locker, result); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op, \
+ src_file, src_line) \
+do { \
+ locker = PSI_FILE_CALL(get_thread_file_descriptor_locker)( \
+ state, file, op); \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(start_file_wait)( \
+ locker, count, src_file, src_line); \
+ } \
+} while (0)
+
+# define register_pfs_file_io_end(locker, count) \
+do { \
+ if (UNIV_LIKELY(locker != NULL)) { \
+ PSI_FILE_CALL(end_file_wait)(locker, count); \
+ } \
+} while (0)
+#endif /* UNIV_PFS_IO */
+
+/* The following macros/functions are file I/O APIs that are performance
+schema instrumented when "UNIV_PFS_IO" is defined. In that case they
+point to wrapper functions that add the instrumentation.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix "pfs_". */
+
+#ifdef UNIV_PFS_IO
+# define os_file_create(key, name, create, purpose, type, success) \
+ pfs_os_file_create_func(key, name, create, purpose, type, \
+ success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access, success) \
+ pfs_os_file_create_simple_func(key, name, create, access, \
+ success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, success) \
+ pfs_os_file_create_simple_no_error_handling_func( \
+ key, name, create_mode, access, success, __FILE__, __LINE__)
+
+# define os_file_close(file) \
+ pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_aio(type, mode, name, file, buf, offset, \
+ n, message1, message2) \
+ pfs_os_aio_func(type, mode, name, file, buf, offset, \
+ n, message1, message2, __FILE__, __LINE__)
+
+# define os_file_read(file, buf, offset, n) \
+ pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__)
+
+# define os_file_read_no_error_handling(file, buf, offset, n) \
+ pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \
+ __FILE__, __LINE__)
+
+# define os_file_write(name, file, buf, offset, n) \
+ pfs_os_file_write_func(name, file, buf, offset, \
+ n, __FILE__, __LINE__)
+
+# define os_file_flush(file) \
+ pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath) \
+ pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name) \
+ pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name) \
+ pfs_os_file_delete_if_exists_func(key, name, __FILE__, __LINE__)
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to the original, un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, success) \
+ os_file_create_func(name, create, purpose, type, success)
+
+# define os_file_create_simple(key, name, create_mode, access, success) \
+ os_file_create_simple_func(name, create_mode, access, success)
+
+# define os_file_create_simple_no_error_handling( \
+ key, name, create_mode, access, success) \
+ os_file_create_simple_no_error_handling_func( \
+ name, create_mode, access, success)
+
+# define os_file_close(file) os_file_close_func(file)
+
+# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \
+ os_aio_func(type, mode, name, file, buf, offset, n, \
+ message1, message2)
+
+# define os_file_read(file, buf, offset, n) \
+ os_file_read_func(file, buf, offset, n)
+
+# define os_file_read_no_error_handling(file, buf, offset, n) \
+ os_file_read_no_error_handling_func(file, buf, offset, n)
+
+# define os_file_write(name, file, buf, offset, n) \
+ os_file_write_func(name, file, buf, offset, n)
+
+# define os_file_flush(file) os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath) \
+ os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name) os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name) \
+ os_file_delete_if_exists_func(name)
+
+#endif /* UNIV_PFS_IO */
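+
+/* Usage sketch (hypothetical caller, not part of this file): callers always
+pass the performance schema key as the first argument; when UNIV_PFS_IO is
+not defined, the macro simply drops the key, so the identifier never reaches
+the compiler:
+
+	file = os_file_create_simple(innodb_file_data_key, path,
+				     OS_FILE_OPEN, OS_FILE_READ_ONLY,
+				     &success);
+*/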
+
+/* File types for directory entry data type */
+
+enum os_file_type_t {
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK, /* symbolic link */
+ OS_FILE_TYPE_BLOCK /* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH 4000
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+ char name[OS_FILE_MAX_PATH]; /*!< path to a file */
+ os_file_type_t type; /*!< file type */
+ ib_int64_t size; /*!< file size */
+ time_t ctime; /*!< creation time */
+ time_t mtime; /*!< modification time */
+ time_t atime; /*!< access time */
+ bool rw_perm; /*!< true if can be opened
+ in read-write mode. Only valid
+ if type == OS_FILE_TYPE_FILE */
+};
+
+#ifdef __WIN__
+typedef HANDLE os_file_dir_t; /*!< directory stream */
+#else
+typedef DIR* os_file_dir_t; /*!< directory stream */
+#endif
+
+#ifdef __WIN__
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
+UNIV_INTERN
+ulint
+os_get_os_version(void);
+/*===================*/
+#endif /* __WIN__ */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void);
+/*===================*/
+/***********************************************************************//**
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+@return temporary file handle, or NULL on error */
+
+FILE*
+os_file_create_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal);/*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir); /*!< in: directory stream */
+/***********************************************************************//**
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info); /*!< in/out: buffer where the info is returned */
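+
+/* Usage sketch (hypothetical caller, not part of this file): a typical
+directory scan with the three functions above; os_file_readdir_next_file()
+returns 1 at the end of the directory and -1 on error, so the loop below
+stops on either:
+
+	os_file_dir_t	dir = os_file_opendir(path, FALSE);
+	os_file_stat_t	info;
+
+	if (dir != NULL) {
+		while (os_file_readdir_next_file(path, dir, &info) == 0) {
+			... process info.name and info.type ...
+		}
+
+		os_file_closedir(dir);
+	}
+*/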
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+ const char* pathname, /*!< in: directory name as
+ null-terminated string */
+ ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory
+ is treated as an error. */
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_func(
+/*=======================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ __attribute__((nonnull, warn_unused_result));
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name, /*!< in: file name, used in the
+ diagnostic message */
+ const char* operation_name);/*!< in: "open" or "create"; used in the
+ diagnostic message */
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_func(
+/*================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_func(
+/*================*/
+ const char* name); /*!< in: file path as a null-terminated
+ string */
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+ const char* name); /*!< in: file path as a null-terminated
+ string */
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest to
+close the file before calling this function.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename_func(
+/*================*/
+ const char* oldpath, /*!< in: old file path as a
+ null-terminated string */
+ const char* newpath); /*!< in: new file path */
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_func(
+/*===============*/
+ os_file_t file); /*!< in, own: handle to a file */
+
+#ifdef UNIV_PFS_IO
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_func(
+/*===========================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+ __attribute__((nonnull, warn_unused_result));
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+/*=============================================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode, /*!< in: file create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+ __attribute__((nonnull, warn_unused_result));
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: file create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+ __attribute__((nonnull, warn_unused_result));
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_close_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_func(
+/*==================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling() which requests a synchronous
+read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_no_error_handling_func(
+/*====================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_aio(), not directly this
+function!
+Performance schema wrapper function of os_aio() which requests
+an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_aio_func(
+/*============*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_write_func(
+/*===================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ os_offset_t offset, /*!< in: file offset where to write */
+ ulint n, /*!< in: number of bytes to write */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush(), which flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_flush_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_rename_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath,/*!< in: new file path */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: old file path as a null-terminated
+ string */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+/*==============================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: old file path as a null-terminated
+ string */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line);/*!< in: line where the func invoked */
+#endif /* UNIV_PFS_IO */
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+ os_file_t file); /*!< in, own: handle to a file */
+#endif /* UNIV_HOTBACKUP */
+/***********************************************************************//**
+Gets a file size.
+@return file size, or (os_offset_t) -1 on failure */
+UNIV_INTERN
+os_offset_t
+os_file_get_size(
+/*=============*/
+ os_file_t file) /*!< in: handle to a file */
+ __attribute__((warn_unused_result));
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ os_offset_t size) /*!< in: file size */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Truncates a file at its current position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+ FILE* file); /*!< in: file to be truncated */
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush_func(
+/*===============*/
+ os_file_t file); /*!< in, own: handle to a file */
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ bool report_all_errors); /*!< in: TRUE if we want an error message
+ printed of all errors */
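+
+/* Usage sketch (hypothetical caller, not part of this file): the error
+number must be fetched immediately after the failing call, before any other
+OS call can overwrite it:
+
+	if (!os_file_write(name, file, buf, offset, n)) {
+		ulint	err = os_file_get_last_error(true);
+
+		if (err == OS_FILE_DISK_FULL) {
+			... handle out-of-space ...
+		}
+	}
+*/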
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this function!
+Requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_func(
+/*==============*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n); /*!< in: number of bytes to read */
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size); /*!< in: size of buffer */
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling_func(
+/*================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n); /*!< in: number of bytes to read */
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ os_offset_t offset, /*!< in: file offset where to write */
+ ulint n); /*!< in: number of bytes to write */
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type); /*!< out: type of the file (if it exists) */
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from the heap. It is the caller's responsibility
+to free it when it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path); /*!< in: pathname */
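+
+/* Usage sketch (hypothetical caller, not part of this file): the returned
+copy is owned by the caller; assuming it is allocated with the InnoDB
+memory heap (mem0mem.h), it is released with mem_free():
+
+	char*	dir = os_file_dirname(filepath);
+
+	... use dir ...
+
+	mem_free(dir);
+*/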
+/****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value when it is no longer needed.
+
+@return own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+ const char* old_path, /*!< in: pathname */
+ const char* new_name); /*!< in: new file name */
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'. It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided. The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value when it is no longer needed.
+
+@return own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+ const char* data_dir_path, /*!< in: pathname */
+ const char* tablename, /*!< in: tablename */
+	const char*	extention);	/*!< in: file extension: ibd, cfg */
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for the DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+ char* data_dir_path); /*!< in/out: full path/data_dir_path */
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path); /*!< in: path name */
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write, where each
+array is logically divided into n_read_segs and n_write_segs segments,
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array, for
+which no i/o handler thread needs to be created. */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+ ulint n_per_seg, /*<! in: maximum number of pending aio
+ operations allowed per segment */
+ ulint n_read_segs, /*<! in: number of reader threads */
+ ulint n_write_segs, /*<! in: number of writer threads */
+ ulint n_slots_sync); /*<! in: number of slots in the sync aio
+ array */
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void);
+/*=============*/
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not directly this function!
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio_func(
+/*========*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2);/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
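+
+/* Usage sketch (hypothetical caller, not part of this file; bufs, offsets
+and node are placeholders): posting a batch of reads and waking the
+i/o-handler threads only once at the end; the extra flag has an effect only
+in simulated aio:
+
+	for (i = 0; i < n_pages; i++) {
+		os_aio(OS_FILE_READ,
+		       OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER,
+		       name, file, bufs[i], offsets[i],
+		       UNIV_PAGE_SIZE, node, NULL);
+	}
+
+	os_aio_simulated_wake_handler_threads();
+*/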
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /*!< this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void);
+/*=================*/
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void);
+/*======================*/
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+This function returns information about the specified file
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info, /*!< information of a file in a
+ directory */
+ bool check_rw_perm); /*!< in: for testing whether the
+ file can be opened in RW mode */
+
+#if !defined(UNIV_HOTBACKUP)
+/*********************************************************************//**
+Creates a temporary file that will be deleted on close.
+This function is defined in ha_innodb.cc.
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+
+
+#if defined(LINUX_NATIVE_AIO)
+/**************************************************************************
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+ ulint global_seg, /*!< in: segment number in the aio array
+ to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 is log i/o thread,
+ then follow the non-ibuf read threads,
+ and the last are the non-ibuf write
+ threads. */
+	fil_node_t**message1,	/*!< out: the messages passed with the
+				aio request; note that in case the
+				aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation. */
+	void**	message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+#endif /* LINUX_NATIVE_AIO */
+
+#ifndef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic
new file mode 100644
index 00000000000..defd8204ba3
--- /dev/null
+++ b/storage/innobase/include/os0file.ic
@@ -0,0 +1,449 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0file.ic
+The interface to the operating system file io
+
+Created 2/20/2010 Jimmy Yang
+*******************************************************/
+
+#include "univ.i"
+
+#ifdef UNIV_PFS_IO
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_func(
+/*===========================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(&state, locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_simple_func(name, create_mode,
+ access_type, success);
+
+	/* Register the returning "file" value with the system */
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+/*=============================================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode, /*!< in: file create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(&state, locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_simple_no_error_handling_func(
+ name, create_mode, access_type, success);
+
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+os_file_t
+pfs_os_file_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: file create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ os_file_t file;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ /* register a file open or creation depending on "create_mode" */
+ register_pfs_file_open_begin(&state, locker, key,
+ ((create_mode == OS_FILE_CREATE)
+ ? PSI_FILE_CREATE
+ : PSI_FILE_OPEN),
+ name, src_file, src_line);
+
+ file = os_file_create_func(name, create_mode, purpose, type, success);
+
+ register_pfs_file_open_end(locker, file);
+
+ return(file);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_close_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ /* register the file close */
+ register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_CLOSE,
+ src_file, src_line);
+
+ result = os_file_close_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_aio(), not directly this
+function!
+Performance schema instrumented wrapper function of os_aio() which
+requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_aio_func(
+/*============*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ /* Register the read or write I/O depending on "type" */
+ register_pfs_file_io_begin(&state, locker, file, n,
+ (type == OS_FILE_WRITE)
+ ? PSI_FILE_WRITE
+ : PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_aio_func(type, mode, name, file, buf, offset,
+ n, message1, message2);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_func(
+/*==================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_file_read_func(file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro
+os_file_read_no_error_handling(), not directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_read_no_error_handling() which requests a synchronous
+positioned read operation. This function does not do any error
+handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_read_no_error_handling_func(
+/*====================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n, /*!< in: number of bytes to read */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ,
+ src_file, src_line);
+
+ result = os_file_read_no_error_handling_func(file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/*******************************************************************//**
+NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INLINE
+ibool
+pfs_os_file_write_func(
+/*===================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ os_offset_t offset, /*!< in: file offset where to write */
+ ulint n, /*!< in: number of bytes to write */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_WRITE,
+ src_file, src_line);
+
+ result = os_file_write_func(name, file, buf, offset, n);
+
+ register_pfs_file_io_end(locker, n);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_flush_func(
+/*===================*/
+ os_file_t file, /*!< in, own: handle to a file */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_io_begin(&state, locker, file, 0, PSI_FILE_SYNC,
+ src_file, src_line);
+ result = os_file_flush_func(file);
+
+ register_pfs_file_io_end(locker, 0);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename()
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_os_file_rename_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath,/*!< in: new file path */
+ const char* src_file,/*!< in: file name where func invoked */
+ ulint src_line)/*!< in: line where the func invoked */
+{
+ ibool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_open_begin(&state, locker, key, PSI_FILE_RENAME, newpath,
+ src_file, src_line);
+
+ result = os_file_rename_func(oldpath, newpath);
+
+ register_pfs_file_open_end(locker, 0);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: file path as a null-terminated
+ string */
+ const char* src_file, /*!< in: file name where func invoked */
+ ulint src_line) /*!< in: line where the func invoked */
+{
+ bool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE,
+ name, src_file, src_line);
+
+ result = os_file_delete_func(name);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
+
+/***********************************************************************//**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists()
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+/*==============================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema Key */
+ const char* name, /*!< in: file path as a null-terminated
+ string */
+ const char* src_file, /*!< in: file name where func invoked */
+ ulint src_line) /*!< in: line where the func invoked */
+{
+ bool result;
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+
+ register_pfs_file_close_begin(&state, locker, key, PSI_FILE_DELETE,
+ name, src_file, src_line);
+
+ result = os_file_delete_if_exists_func(name);
+
+ register_pfs_file_close_end(locker, 0);
+
+ return(result);
+}
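+
+/* Illustrative sketch (not part of the original code; "file" and "success"
+are hypothetical variables): callers never invoke the pfs_* wrappers above
+directly. Under UNIV_PFS_IO the os_file_* macros, defined in os0file.h,
+expand to these wrappers and splice in the call site:
+
+	ibool	success;
+	byte	page[UNIV_PAGE_SIZE];
+
+	success = os_file_read(file, page, 0, UNIV_PAGE_SIZE);
+	// expands to pfs_os_file_read_func(file, page, 0, UNIV_PAGE_SIZE,
+	//                                  __FILE__, __LINE__)
+
+so every open, read, write and close is timed and attributed to the exact
+source file and line that issued it. */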
+#endif /* UNIV_PFS_IO */
diff --git a/storage/innobase/include/os0once.h b/storage/innobase/include/os0once.h
new file mode 100644
index 00000000000..a8bbaf1d2d4
--- /dev/null
+++ b/storage/innobase/include/os0once.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0once.h
+A class that aids executing a given function exactly once in a multi-threaded
+environment.
+
+Created Feb 20, 2014 Vasil Dimov
+*******************************************************/
+
+#ifndef os0once_h
+#define os0once_h
+
+#include "univ.i"
+
+#include "os0sync.h"
+#include "ut0ut.h"
+
+/** Execute a given function exactly once in a multi-threaded environment
+or wait for the function to be executed by another thread.
+
+Example usage:
+First the user must create a control variable of type os_once::state_t and
+assign it the value os_once::NEVER_DONE.
+Then the user must pass this variable, together with a function to be
+executed, to os_once::do_or_wait_for_done().
+
+Multiple threads can call os_once::do_or_wait_for_done() simultaneously with
+the same (os_once::state_t) control variable. The provided function will be
+called exactly once, and when os_once::do_or_wait_for_done() returns, that
+function is guaranteed to have completed execution, by this or another
+thread. In other words, os_once::do_or_wait_for_done() will either execute
+the provided function, wait for its execution to complete if another thread
+has already started it, or do nothing if the function has already completed
+earlier.
+
+This mimics pthread_once(3), but unfortunately pthread_once(3) does not
+support passing arguments to the init_routine() function. We should use
+std::call_once() when we start compiling with C++11 enabled. */
+class os_once {
+public:
+ /** Control variables' state type */
+ typedef ib_uint32_t state_t;
+
+ /** Not yet executed. */
+ static const state_t NEVER_DONE = 0;
+
+ /** Currently being executed by this or another thread. */
+ static const state_t IN_PROGRESS = 1;
+
+ /** Finished execution. */
+ static const state_t DONE = 2;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	/** Call a given function, or wait for its execution to complete if
+	another thread has already started it.
+ @param[in,out] state control variable
+ @param[in] do_func function to call
+ @param[in,out] do_func_arg an argument to pass to do_func(). */
+ static
+ void
+ do_or_wait_for_done(
+ volatile state_t* state,
+ void (*do_func)(void*),
+ void* do_func_arg)
+ {
+ /* Avoid calling os_compare_and_swap_uint32() in the most
+ common case. */
+ if (*state == DONE) {
+ return;
+ }
+
+ if (os_compare_and_swap_uint32(state,
+ NEVER_DONE, IN_PROGRESS)) {
+ /* We are the first. Call the function. */
+
+ do_func(do_func_arg);
+
+ const bool swapped = os_compare_and_swap_uint32(
+ state, IN_PROGRESS, DONE);
+
+ ut_a(swapped);
+ } else {
+			/* The state is not NEVER_DONE, so it is either
+			IN_PROGRESS (somebody is calling the function right
+			now) or DONE (it has already been called and
+			completed). Wait for it to become DONE. */
+ for (;;) {
+ const state_t s = *state;
+
+ switch (s) {
+ case DONE:
+ return;
+ case IN_PROGRESS:
+ break;
+ case NEVER_DONE:
+ /* fall through */
+ default:
+ ut_error;
+ }
+
+ UT_RELAX_CPU();
+ }
+ }
+ }
+#endif /* HAVE_ATOMIC_BUILTINS */
+};
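+
+/* Example usage (an illustrative sketch; lazy_init(), my_state and
+get_resource() are hypothetical names, not part of this file):
+
+	static os_once::state_t	my_state = os_once::NEVER_DONE;
+
+	static void lazy_init(void* arg)
+	{
+		// ... initialize the object pointed to by arg ...
+	}
+
+	void get_resource(void* obj)
+	{
+		os_once::do_or_wait_for_done(&my_state, lazy_init, obj);
+		// my_state == os_once::DONE here, whichever thread
+		// actually ran lazy_init()
+	}
+*/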
+
+#endif /* os0once_h */
diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h
new file mode 100644
index 00000000000..613e3bd6947
--- /dev/null
+++ b/storage/innobase/include/os0proc.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.h
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+#ifdef UNIV_LINUX
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+
+typedef void* os_process_t;
+typedef unsigned long int os_process_id_t;
+
+extern ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+extern ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. The number is not guaranteed
+to be unique. On Linux this returns the 'process number' of the current
+thread, which is the number one sees in 'top', for example; note that on
+Linux the thread id is not the same as that number.
+@return process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void);
+/*====================*/
+/****************************************************************//**
+Allocates large pages memory.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+ ulint* n); /*!< in/out: number of bytes */
+/****************************************************************//**
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+ void *ptr, /*!< in: pointer returned by
+ os_mem_alloc_large() */
+ ulint size); /*!< in: size returned by
+ os_mem_alloc_large() */
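+
+/* Illustrative sketch (hypothetical variable names): n is passed in/out,
+i.e. the allocator may round the requested size up, so the caller must keep
+the adjusted size and pass it back when freeing:
+
+	ulint	size = 16 * 1024 * 1024;
+	void*	mem = os_mem_alloc_large(&size);
+
+	// ... use mem ...
+
+	os_mem_free_large(mem, size);	// size as updated by the alloc
+*/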
+
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0proc.ic b/storage/innobase/include/os0proc.ic
new file mode 100644
index 00000000000..506f4f8ce0c
--- /dev/null
+++ b/storage/innobase/include/os0proc.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.ic
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h
new file mode 100644
index 00000000000..57b29fff663
--- /dev/null
+++ b/storage/innobase/include/os0sync.h
@@ -0,0 +1,743 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.h
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0sync_h
+#define os0sync_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "sync0types.h"
+
+#ifdef __WIN__
+/** Native event (slow)*/
+typedef HANDLE os_native_event_t;
+/** Native mutex */
+typedef CRITICAL_SECTION fast_mutex_t;
+/** Native condition variable. */
+typedef CONDITION_VARIABLE os_cond_t;
+#else
+/** Native mutex */
+typedef pthread_mutex_t fast_mutex_t;
+/** Native condition variable */
+typedef pthread_cond_t os_cond_t;
+#endif
+
+/** Structure that includes Performance Schema Probe pfs_psi
+in the os_fast_mutex structure if UNIV_PFS_MUTEX is defined */
+struct os_fast_mutex_t {
+ fast_mutex_t mutex; /*!< os_fast_mutex */
+#ifdef UNIV_PFS_MUTEX
+ struct PSI_mutex* pfs_psi;/*!< The performance schema
+ instrumentation hook */
+#endif
+};
+
+/** Operating system event handle */
+typedef struct os_event* os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event {
+#ifdef __WIN__
+ HANDLE handle; /*!< kernel event object, slow,
+ used on older Windows */
+#endif
+	os_fast_mutex_t	os_mutex;	/*!< this mutex protects the
+					following fields */
+ ibool is_set; /*!< this is TRUE when the event is
+ in the signaled state, i.e., a thread
+ does not stop if it tries to wait for
+ this event */
+ ib_int64_t signal_count; /*!< this is incremented each time
+ the event becomes signaled */
+ os_cond_t cond_var; /*!< condition variable is used in
+ waiting for the event */
+ UT_LIST_NODE_T(os_event_t) os_event_list;
+ /*!< list of all created events */
+};
+
+/** Denotes an infinite delay for os_event_wait_time() */
+#define OS_SYNC_INFINITE_TIME ULINT_UNDEFINED
+
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED 1
+
+/** Operating system mutex handle */
+typedef struct os_mutex_t* os_ib_mutex_t;
+
+/** Mutex protecting counts and the event and OS 'slow' mutex lists */
+extern os_ib_mutex_t os_sync_mutex;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+extern ulint os_thread_count;
+
+extern ulint os_event_count;
+extern ulint os_mutex_count;
+extern ulint os_fast_mutex_count;
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void);
+/*==============*/
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void);
+/*==============*/
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two states:
+signaled and nonsignaled. The created event is manual reset: it must be reset
+explicitly by calling os_event_reset().
+@return the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(void);
+/*==================*/
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+ os_event_t event); /*!< in: event to set */
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state. Threads that
+subsequently wait for the event will block.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event); /*!< in: event to reset */
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+ os_event_t event); /*!< in: event to free */
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state.
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ ib_int64_t reset_sig_count);/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
+#define os_event_wait(event) os_event_wait_low(event, 0)
+#define os_event_wait_time(event, t) os_event_wait_time_low(event, t, 0)
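+
+/* Illustrative sketch of the reset/wait protocol described above (the
+names work_available() and event are hypothetical):
+
+	ib_int64_t	sig_count = os_event_reset(event);
+
+	if (!work_available()) {
+		// If os_event_set() fires between the reset above and
+		// this point, the saved sig_count makes the wait return
+		// immediately instead of blocking forever.
+		os_event_wait_low(event, sig_count);
+	}
+*/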
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time_low(
+/*===================*/
+ os_event_t event, /*!< in: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ ib_int64_t reset_sig_count); /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible.
+@return the mutex handle */
+UNIV_INTERN
+os_ib_mutex_t
+os_mutex_create(void);
+/*=================*/
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+ os_ib_mutex_t mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+ os_ib_mutex_t mutex); /*!< in: mutex to release */
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+ os_ib_mutex_t mutex); /*!< in: mutex to free */
+/**********************************************************//**
+Attempts to acquire ownership of a fast mutex without blocking.
+@return 0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+
+/**********************************************************************
+The following os_fast_mutex APIs are performance schema instrumented:
+
+os_fast_mutex_init
+os_fast_mutex_lock
+os_fast_mutex_unlock
+os_fast_mutex_free
+
+These mutex APIs will point to the corresponding wrapper functions that
+contain the performance schema instrumentation.
+
+NOTE! The following macros should be used for mutex operations, not the
+corresponding functions. */
+
+#ifdef UNIV_PFS_MUTEX
+# define os_fast_mutex_init(K, M) \
+ pfs_os_fast_mutex_init(K, M)
+
+# define os_fast_mutex_lock(M) \
+ pfs_os_fast_mutex_lock(M, __FILE__, __LINE__)
+
+# define os_fast_mutex_unlock(M) pfs_os_fast_mutex_unlock(M)
+
+# define os_fast_mutex_free(M) pfs_os_fast_mutex_free(M)
+
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+ PSI_mutex_key key, /*!< in: Performance Schema
+ key */
+ os_fast_mutex_t* fast_mutex); /*!< out: fast mutex */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+A wrapper function for os_fast_mutex_free_func(). Also destroys the
+performance schema probes when freeing the mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+ os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to free */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock. Acquires ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+ os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */
+ const char* file_name, /*!< in: file name where
+ locked */
+ ulint line); /*!< in: line where locked */
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock. Releases ownership of a fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+ os_fast_mutex_t* fast_mutex); /*!< in/out: mutex to release */
+
+#else /* UNIV_PFS_MUTEX */
+
+# define os_fast_mutex_init(K, M) \
+ os_fast_mutex_init_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_lock(M) \
+ os_fast_mutex_lock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_unlock(M) \
+ os_fast_mutex_unlock_func(&((os_fast_mutex_t*)(M))->mutex)
+
+# define os_fast_mutex_free(M) \
+ os_fast_mutex_free_func(&((os_fast_mutex_t*)(M))->mutex)
+#endif /* UNIV_PFS_MUTEX */
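+
+/* Illustrative sketch (my_mutex_key and my_mutex are hypothetical names):
+regardless of whether UNIV_PFS_MUTEX is defined, callers write
+
+	os_fast_mutex_t	my_mutex;
+
+	os_fast_mutex_init(my_mutex_key, &my_mutex);
+	os_fast_mutex_lock(&my_mutex);
+	// ... critical section ...
+	os_fast_mutex_unlock(&my_mutex);
+	os_fast_mutex_free(&my_mutex);
+
+and the macros above select either the Performance Schema wrappers or the
+raw os_fast_mutex_*_func() implementations at compile time. */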
+
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock_func(
+/*======================*/
+ fast_mutex_t* fast_mutex); /*!< in: mutex to release */
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init_func(
+/*====================*/
+ fast_mutex_t* fast_mutex); /*!< in: fast mutex */
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock_func(
+/*====================*/
+ fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free_func(
+/*====================*/
+ fast_mutex_t* fast_mutex); /*!< in: mutex to free */
+
+/**********************************************************//**
+Atomic compare-and-swap and increment for InnoDB. */
+
+#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS)
+
+# define HAVE_ATOMIC_BUILTINS
+
+# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_BYTE
+# define HAVE_ATOMIC_BUILTINS_BYTE
+# endif
+
+# ifdef HAVE_IB_GCC_ATOMIC_BUILTINS_64
+# define HAVE_ATOMIC_BUILTINS_64
+# endif
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap(ptr, old_val, new_val) \
+ __sync_bool_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_uint32(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use GCC atomic builtins"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes use GCC atomic builtins, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment(ptr, amount) \
+ __sync_add_and_fetch(ptr, amount)
+
+# define os_atomic_increment_lint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+# define os_atomic_increment_uint32(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+# define os_atomic_increment_uint64(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement(ptr, amount) \
+ __sync_sub_and_fetch(ptr, amount)
+
+# define os_atomic_decrement_uint32(ptr, amount) \
+ os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_lint(ptr, amount) \
+ os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+ os_atomic_decrement(ptr, amount)
+
+# define os_atomic_decrement_uint64(ptr, amount) \
+ os_atomic_decrement(ptr, amount)
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ __sync_lock_test_and_set(ptr, (byte) new_val)
+
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+ __sync_lock_test_and_set(ptr, new_val)
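+
+/* Illustrative sketch (shared_counter is a hypothetical ulint variable):
+a typical lock-free retry loop built on the primitives above:
+
+	ulint	old_val;
+
+	do {
+		old_val = shared_counter;
+	} while (!os_compare_and_swap_ulint(&shared_counter,
+					    old_val, old_val + 1));
+
+which is equivalent to os_atomic_increment_ulint(&shared_counter, 1). */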
+
+#elif defined(HAVE_IB_SOLARIS_ATOMICS)
+
+# define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS_BYTE
+# define HAVE_ATOMIC_BUILTINS_64
+
+/* If we are not compiling with GCC, or GCC does not support the atomic
+intrinsics, and we are running on Solaris >= 10, use the Solaris atomics */
+
+# include <atomic.h>
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap_uint32(ptr, old_val, new_val) \
+ (atomic_cas_32(ptr, old_val, new_val) == old_val)
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (atomic_cas_ulong(ptr, old_val, new_val) == old_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ ((lint) atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS
+# if SIZEOF_PTHREAD_T == 4
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t) atomic_cas_32(ptr, old_val, new_val) == old_val)
+# elif SIZEOF_PTHREAD_T == 8
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t) atomic_cas_64(ptr, old_val, new_val) == old_val)
+# else
+# error "SIZEOF_PTHREAD_T != 4 or 8"
+# endif /* SIZEOF_PTHREAD_T CHECK */
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use Solaris atomic functions"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes use Solaris atomic functions, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment_uint32(ptr, amount) \
+ atomic_add_32_nv(ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ atomic_add_long_nv(ptr, amount)
+
+# define os_atomic_increment_lint(ptr, amount) \
+ os_atomic_increment_ulint((ulong_t*) ptr, amount)
+
+# define os_atomic_increment_uint64(ptr, amount) \
+ atomic_add_64_nv(ptr, amount)
+
+/* Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. */
+
+# define os_atomic_decrement_uint32(ptr, amount) \
+ os_atomic_increment_uint32(ptr, -(amount))
+
+# define os_atomic_decrement_lint(ptr, amount) \
+ os_atomic_increment_ulint((ulong_t*) ptr, -(amount))
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+ os_atomic_increment_ulint(ptr, -(amount))
+
+# define os_atomic_decrement_uint64(ptr, amount) \
+ os_atomic_increment_uint64(ptr, -(amount))
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ atomic_swap_uchar(ptr, new_val)
+
+# define os_atomic_test_and_set_ulint(ptr, new_val) \
+ atomic_swap_ulong(ptr, new_val)
+
+#elif defined(HAVE_WINDOWS_ATOMICS)
+
+# define HAVE_ATOMIC_BUILTINS
+# define HAVE_ATOMIC_BUILTINS_BYTE
+
+# ifndef _WIN32
+# define HAVE_ATOMIC_BUILTINS_64
+# endif
+
+/**********************************************************//**
+Atomic compare and exchange of signed integers (both 32 and 64 bit).
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+ volatile lint* ptr, /*!< in/out: source/destination */
+ lint new_val, /*!< in: exchange value */
+ lint old_val); /*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+ volatile lint* ptr, /*!< in/out: address of destination */
+ lint val); /*!< in: number to be added */
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+ volatile ulint* ptr, /*!< in/out: source/destination */
+ ulint new_val, /*!< in: exchange value */
+ ulint old_val); /*!< in: value to compare to */
+
+/**********************************************************//**
+Atomic compare and exchange of 32 bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+ volatile DWORD* ptr, /*!< in/out: source/destination */
+ DWORD new_val, /*!< in: exchange value */
+ DWORD old_val); /*!< in: value to compare to */
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap_uint32(ptr, old_val, new_val) \
+ (InterlockedCompareExchange(reinterpret_cast<volatile long*>(ptr), \
+ new_val, old_val) == old_val)
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg_ulint(ptr, new_val, old_val) == old_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg_lint(ptr, new_val, old_val) == old_val)
+
+/* Windows thread objects can always be passed to Windows atomic functions */
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ (win_cmp_and_xchg_dword(ptr, new_val, old_val) == old_val)
+
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use Windows interlocked functions"
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment_lint(ptr, amount) \
+ (win_xchg_and_add(ptr, amount) + amount)
+
+# define os_atomic_increment_uint32(ptr, amount) \
+ ((ulint) InterlockedExchangeAdd((long*) ptr, amount))
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ ((ulint) (win_xchg_and_add((lint*) ptr, (lint) amount) + amount))
+
+# define os_atomic_increment_uint64(ptr, amount) \
+ ((ib_uint64_t) (InterlockedExchangeAdd64( \
+ (ib_int64_t*) ptr, \
+ (ib_int64_t) amount) + amount))
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount to decrement. There is no atomic subtract function on Windows. */
+
+# define os_atomic_decrement_uint32(ptr, amount) \
+ ((ulint) InterlockedExchangeAdd((long*) ptr, (-amount)))
+
+# define os_atomic_decrement_lint(ptr, amount) \
+ (win_xchg_and_add(ptr, -(lint) amount) - amount)
+
+# define os_atomic_decrement_ulint(ptr, amount) \
+ ((ulint) (win_xchg_and_add((lint*) ptr, -(lint) amount) - amount))
+
+# define os_atomic_decrement_uint64(ptr, amount) \
+ ((ib_uint64_t) (InterlockedExchangeAdd64( \
+ (ib_int64_t*) ptr, \
+ -(ib_int64_t) amount) - amount))
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val.
+InterlockedExchange() operates on LONG, and the LONG will be
+clobbered */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ ((byte) InterlockedExchange(ptr, new_val))
+
+# define os_atomic_test_and_set_ulong(ptr, new_val) \
+ InterlockedExchange(ptr, new_val)
+
+#else
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use InnoDB's own implementation"
+#endif
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_atomic_inc_ulint(m,v,d) os_atomic_increment_ulint(v, d)
+#define os_atomic_dec_ulint(m,v,d) os_atomic_decrement_ulint(v, d)
+#else
+#define os_atomic_inc_ulint(m,v,d) os_atomic_inc_ulint_func(m, v, d)
+#define os_atomic_dec_ulint(m,v,d) os_atomic_dec_ulint_func(m, v, d)
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/**********************************************************//**
+Following macros are used to update specified counter atomically
+if HAVE_ATOMIC_BUILTINS defined. Otherwise, use mutex passed in
+for synchronization */
+#ifdef HAVE_ATOMIC_BUILTINS
+#define os_increment_counter_by_amount(mutex, counter, amount) \
+ (void) os_atomic_increment_ulint(&counter, amount)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount) \
+ (void) os_atomic_increment_ulint(&counter, (-((lint) amount)))
+#else
+#define os_increment_counter_by_amount(mutex, counter, amount) \
+ do { \
+ mutex_enter(&(mutex)); \
+ (counter) += (amount); \
+ mutex_exit(&(mutex)); \
+ } while (0)
+
+#define os_decrement_counter_by_amount(mutex, counter, amount) \
+ do { \
+ ut_a(counter >= amount); \
+ mutex_enter(&(mutex)); \
+ (counter) -= (amount); \
+ mutex_exit(&(mutex)); \
+ } while (0)
+#endif /* HAVE_ATOMIC_BUILTINS */
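+
+/* Illustrative sketch (stats_mutex and n_reads are hypothetical names):
+
+	os_increment_counter_by_amount(stats_mutex, n_reads, 1);
+
+compiles to a single atomic add when HAVE_ATOMIC_BUILTINS is defined, and
+to a mutex-protected increment otherwise; callers stay oblivious to which
+variant is in effect. */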
+
+#define os_inc_counter(mutex, counter) \
+ os_increment_counter_by_amount(mutex, counter, 1)
+
+#define os_dec_counter(mutex, counter) \
+	do { \
+		os_decrement_counter_by_amount(mutex, counter, 1); \
+	} while (0)
+
+/** barrier definitions for memory ordering */
+#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__
+/* A performance regression was observed under some conditions on the Intel
+architecture. Disable the memory barrier for Intel for now. */
+# define os_rmb
+# define os_wmb
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "Memory barrier is not used"
+#elif defined(HAVE_IB_GCC_ATOMIC_THREAD_FENCE)
+# define HAVE_MEMORY_BARRIER
+# define os_rmb __atomic_thread_fence(__ATOMIC_ACQUIRE)
+# define os_wmb __atomic_thread_fence(__ATOMIC_RELEASE)
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "GCC builtin __atomic_thread_fence() is used for memory barrier"
+
+#elif defined(HAVE_IB_GCC_SYNC_SYNCHRONISE)
+# define HAVE_MEMORY_BARRIER
+# define os_rmb __sync_synchronize()
+# define os_wmb __sync_synchronize()
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "GCC builtin __sync_synchronize() is used for memory barrier"
+
+#elif defined(HAVE_IB_MACHINE_BARRIER_SOLARIS)
+# define HAVE_MEMORY_BARRIER
+# include <mbarrier.h>
+# define os_rmb __machine_r_barrier()
+# define os_wmb __machine_w_barrier()
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "Solaris memory ordering functions are used for memory barrier"
+
+#elif defined(HAVE_WINDOWS_MM_FENCE) && defined(_WIN64)
+# define HAVE_MEMORY_BARRIER
+# include <mmintrin.h>
+# define os_rmb _mm_lfence()
+# define os_wmb _mm_sfence()
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "_mm_lfence() and _mm_sfence() are used for memory barrier"
+
+#else
+# define os_rmb
+# define os_wmb
+# define IB_MEMORY_BARRIER_STARTUP_MSG \
+ "Memory barrier is not used"
+#endif
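+
+/* Illustrative sketch (data, ready, compute() and consume() are
+hypothetical names): the barriers order a publish/consume pair. The
+writer executes
+
+	data = compute();
+	os_wmb;			// data becomes visible before ready
+	ready = TRUE;
+
+and the reader executes
+
+	if (ready) {
+		os_rmb;		// ready is read before data is read
+		consume(data);
+	}
+
+Note that on x86/x64 both barriers expand to nothing above, so the
+ordering then relies on the stronger hardware memory model of that
+architecture. */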
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic
new file mode 100644
index 00000000000..9a7e520ece6
--- /dev/null
+++ b/storage/innobase/include/os0sync.ic
@@ -0,0 +1,234 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.ic
+The interface to the operating system synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+/**********************************************************//**
+Acquires ownership of a fast mutex.
+@return 0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
+{
+ fast_mutex_t* mutex = &fast_mutex->mutex;
+
+#ifdef __WIN__
+ return(!TryEnterCriticalSection(mutex));
+#else
+ /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
+ so that it returns 0 on success. In the operating system
+ libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and
+ returns 1 on success (but MySQL remaps that to 0), while Linux,
+ FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
+
+ return((ulint) pthread_mutex_trylock(mutex));
+#endif
+}
+
+#ifdef UNIV_PFS_MUTEX
+/*********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_init(), not directly
+this function!
+A wrapper function for os_fast_mutex_init_func(). Initializes an operating
+system fast mutex semaphore. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_init(
+/*===================*/
+ PSI_mutex_key key, /*!< in: Performance Schema
+ key */
+ os_fast_mutex_t* fast_mutex) /*!< out: fast mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+ fast_mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, &fast_mutex->mutex);
+#else
+ fast_mutex->pfs_psi = NULL;
+#endif
+
+ os_fast_mutex_init_func(&fast_mutex->mutex);
+}
+/******************************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_free(), not directly
+this function!
+A wrapper function for os_fast_mutex_free_func(). Also destroys the
+performance schema probes when freeing the mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_free(
+/*===================*/
+ os_fast_mutex_t* fast_mutex) /*!< in/out: mutex */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+ if (fast_mutex->pfs_psi != NULL)
+ PSI_MUTEX_CALL(destroy_mutex)(fast_mutex->pfs_psi);
+#endif
+ fast_mutex->pfs_psi = NULL;
+
+ os_fast_mutex_free_func(&fast_mutex->mutex);
+}
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_lock, not directly
+this function!
+Wrapper function of os_fast_mutex_lock_func. Acquires ownership of a fast
+mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_lock(
+/*===================*/
+ os_fast_mutex_t* fast_mutex, /*!< in/out: mutex to acquire */
+ const char* file_name, /*!< in: file name where
+ locked */
+ ulint line) /*!< in: line where locked */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+ if (fast_mutex->pfs_psi != NULL)
+ {
+ PSI_mutex_locker* locker;
+ PSI_mutex_locker_state state;
+
+ locker = PSI_MUTEX_CALL(start_mutex_wait)(
+ &state, fast_mutex->pfs_psi,
+ PSI_MUTEX_LOCK, file_name,
+ static_cast<uint>(line));
+
+ os_fast_mutex_lock_func(&fast_mutex->mutex);
+
+ if (locker != NULL)
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
+ }
+ else
+#endif
+ {
+ os_fast_mutex_lock_func(&fast_mutex->mutex);
+ }
+
+ return;
+}
+/**********************************************************//**
+NOTE! Please use the corresponding macro os_fast_mutex_unlock, not directly
+this function!
+Wrapper function of os_fast_mutex_unlock_func. Releases ownership of a
+fast mutex. */
+UNIV_INLINE
+void
+pfs_os_fast_mutex_unlock(
+/*=====================*/
+ os_fast_mutex_t* fast_mutex) /*!< in/out: mutex to release */
+{
+#ifdef HAVE_PSI_MUTEX_INTERFACE
+ if (fast_mutex->pfs_psi != NULL)
+ PSI_MUTEX_CALL(unlock_mutex)(fast_mutex->pfs_psi);
+#endif
+
+ os_fast_mutex_unlock_func(&fast_mutex->mutex);
+}
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+
+/* Use inline functions to make 64 and 32 bit versions of the Windows atomic
+functions, so that typecasts are evaluated at compile time. Take advantage
+of the fact that lint is either __int64 or long int, and that the Windows
+atomic functions work on __int64 and LONG. */
+
+/**********************************************************//**
+Atomic compare and exchange of signed integers (both 32 and 64 bit).
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+lint
+win_cmp_and_xchg_lint(
+/*==================*/
+ volatile lint* ptr, /*!< in/out: source/destination */
+ lint new_val, /*!< in: exchange value */
+ lint old_val) /*!< in: value to compare to */
+{
+# ifdef _WIN64
+ return(InterlockedCompareExchange64(ptr, new_val, old_val));
+# else
+ return(InterlockedCompareExchange(ptr, new_val, old_val));
+# endif
+}
+
+/**********************************************************//**
+Atomic addition of signed integers.
+@return Initial value of the variable pointed to by ptr */
+UNIV_INLINE
+lint
+win_xchg_and_add(
+/*=============*/
+ volatile lint* ptr, /*!< in/out: address of destination */
+ lint val) /*!< in: number to be added */
+{
+#ifdef _WIN64
+ return(InterlockedExchangeAdd64(ptr, val));
+#else
+ return(InterlockedExchangeAdd(ptr, val));
+#endif
+}
+
+/**********************************************************//**
+Atomic compare and exchange of unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+ulint
+win_cmp_and_xchg_ulint(
+/*===================*/
+ volatile ulint* ptr, /*!< in/out: source/destination */
+ ulint new_val, /*!< in: exchange value */
+ ulint old_val) /*!< in: value to compare to */
+{
+ return((ulint) win_cmp_and_xchg_lint(
+ (volatile lint*) ptr,
+ (lint) new_val,
+ (lint) old_val));
+}
+
+/**********************************************************//**
+Atomic compare and exchange of 32-bit unsigned integers.
+@return value found before the exchange.
+If it is not equal to old_value the exchange did not happen. */
+UNIV_INLINE
+DWORD
+win_cmp_and_xchg_dword(
+/*===================*/
+ volatile DWORD* ptr, /*!< in/out: source/destination */
+ DWORD new_val, /*!< in: exchange value */
+ DWORD old_val) /*!< in: value to compare to */
+{
+ ut_ad(sizeof(DWORD) == sizeof(LONG)); /* We assume this. */
+ return(InterlockedCompareExchange(
+ (volatile LONG*) ptr,
+ (LONG) new_val,
+ (LONG) old_val));
+}
+
+#endif /* HAVE_WINDOWS_ATOMICS */
+
diff --git a/storage/innobase/include/os0thread.h b/storage/innobase/include/os0thread.h
new file mode 100644
index 00000000000..37c54afae80
--- /dev/null
+++ b/storage/innobase/include/os0thread.h
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Maximum number of threads which can be created in the program;
+this is also the size of the wait slot array for MySQL threads which
+can wait inside InnoDB */
+
+#define OS_THREAD_MAX_N srv_max_n_threads
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE 100
+#define OS_THREAD_PRIORITY_BACKGROUND 1
+#define OS_THREAD_PRIORITY_NORMAL 2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3
+
+#ifdef __WIN__
+typedef void* os_thread_t;
+typedef DWORD os_thread_id_t; /*!< In Windows the thread id
+ is an unsigned long int */
+extern "C" {
+typedef LPTHREAD_START_ROUTINE os_thread_func_t;
+}
+
+/** Macro for specifying a Windows thread start function. */
+#define DECLARE_THREAD(func) WINAPI func
+
+/** Required to get around a build error on Windows. Even though our functions
+are defined/declared as WINAPI f(LPVOID a); the compiler complains that they
+are defined as: os_thread_ret_t (__cdecl*)(void*). Because our functions
+don't access the arguments and don't return any value, we should be safe. */
+#define os_thread_create(f,a,i) \
+ os_thread_create_func(reinterpret_cast<os_thread_func_t>(f), a, i)
+
+#else
+
+typedef pthread_t os_thread_t;
+typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread
+ handle itself as the id of
+ the thread */
+extern "C" { typedef void* (*os_thread_func_t)(void*); }
+
+/** Macro for specifying a POSIX thread start function. */
+#define DECLARE_THREAD(func) func
+#define os_thread_create(f,a,i) os_thread_create_func(f, a, i)
+
+#endif /* __WIN__ */
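+
+/* Illustrative sketch (my_worker is a hypothetical name; os_thread_ret_t
+is the platform-dependent thread return type used elsewhere in InnoDB):
+a thread start function is declared with DECLARE_THREAD so that the same
+code compiles on both Windows and POSIX:
+
+	extern "C" UNIV_INTERN
+	os_thread_ret_t
+	DECLARE_THREAD(my_worker)(void* arg)
+	{
+		// ... do work ...
+		os_thread_exit(NULL);	// never plain return, see below
+	}
+
+	os_thread_id_t	id;
+	os_thread_create(my_worker, NULL, &id);
+*/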
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+#ifdef HAVE_PSI_INTERFACE
+/* Define for performance schema registration key */
+typedef unsigned int mysql_pfs_key_t;
+#endif
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+ os_thread_id_t a, /*!< in: OS thread or thread id */
+ os_thread_id_t b); /*!< in: OS thread or thread id */
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+ os_thread_id_t a); /*!< in: OS thread identifier */
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns a ulint.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit and not use return() to exit.
+@return handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create_func(
+/*==================*/
+ os_thread_func_t func, /*!< in: pointer to function
+ from which to start */
+ void* arg, /*!< in: argument to start
+ function */
+ os_thread_id_t* thread_id); /*!< out: id of the created
+ thread, or NULL */
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+ void* exit_value) /*!< in: exit value; in Windows this void*
+ is cast as a DWORD */
+ UNIV_COLD __attribute__((noreturn));
+/*****************************************************************//**
+Returns the thread identifier of current thread.
+@return current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void);
+/*========================*/
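+
+/* Editor's illustrative sketch (not part of the original header):
+comparing and printing thread ids with the functions declared above;
+my_log and owner_id are hypothetical.
+
+	os_thread_id_t	self = os_thread_get_curr_id();
+
+	if (!os_thread_eq(self, owner_id)) {
+		fprintf(my_log, "thread %lu is not the owner\n",
+			os_thread_pf(self));
+	}
+
+Use os_thread_eq() for comparisons; the os_thread_pf() projection is
+for diagnostics only, since it is not guaranteed to be unique. */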
+/*****************************************************************//**
+Advises the os to give up remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void);
+/*=================*/
+/*****************************************************************//**
+Suspends the current thread for at least the given number of microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+ ulint tm); /*!< in: time in microseconds */
+
+#ifndef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/os0thread.ic b/storage/innobase/include/os0thread.ic
new file mode 100644
index 00000000000..0622d22f2dc
--- /dev/null
+++ b/storage/innobase/include/os0thread.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.ic
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
new file mode 100644
index 00000000000..b1ad49b4915
--- /dev/null
+++ b/storage/innobase/include/page0cur.h
@@ -0,0 +1,387 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "univ.i"
+
+#include "buf0types.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+
+#define PAGE_CUR_ADAPT
+
+/* Page cursor search modes; the values must be in this order! */
+
+#define PAGE_CUR_UNSUPP 0
+#define PAGE_CUR_G 1
+#define PAGE_CUR_GE 2
+#define PAGE_CUR_L 3
+#define PAGE_CUR_LE 4
+/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+#ifdef UNIV_SEARCH_DEBUG
+# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */
+#endif /* UNIV_SEARCH_DEBUG */
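+
+/* Editor's note (summary, not in the original header): after a
+successful page cursor search for a tuple T, the cursor position for
+each mode is, roughly:
+
+	PAGE_CUR_G	first record > T  (supremum if none)
+	PAGE_CUR_GE	first record >= T (supremum if none)
+	PAGE_CUR_L	last record < T   (infimum if none)
+	PAGE_CUR_LE	last record <= T  (infimum if none)
+*/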
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the compressed page descriptor where the cursor is
+positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur); /*!< in: page cursor */
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur) page_align((cur)->rec)
+# define page_cur_get_block(cur) (cur)->block
+# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur) (cur)->rec
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer to NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; not before first */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available;
+otherwise returns NULL. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+ __attribute__((nonnull(1,2,3,4,5), warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
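+
+/* Editor's illustrative sketch (not part of the original header):
+typical use of page_cur_tuple_insert(); entry, index, n_ext, cursor and
+mtr come from the caller.
+
+	mem_heap_t*	heap	= NULL;
+	ulint*		offsets	= NULL;
+	rec_t*		rec	= page_cur_tuple_insert(
+		&cursor, entry, index, &offsets, &heap, n_ext, &mtr);
+
+	if (rec == NULL) {
+		// out of space: the caller typically reorganizes or
+		// splits the page and retries the insert
+	}
+
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+
+Remember the IMPORTANT note above about updating IBUF_BITMAP_FREE for
+compressed leaf pages in secondary indexes. */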
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available;
+otherwise returns NULL. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns a pointer to the inserted record if it succeeds, i.e., if
+enough space is available; otherwise returns NULL. The cursor stays at
+the same position.
+@return pointer to record on success, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ rec_t* current_rec,/*!< in: pointer to current record after
+ which the new record is inserted */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+ __attribute__((nonnull(1,2,3,4), warn_unused_result));
+/***********************************************************//**
+Inserts a record next to the page cursor, updating both the compressed
+page and its uncompressed copy. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available; otherwise
+returns NULL.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+ __attribute__((nonnull(1,2,3,4), warn_unused_result));
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /*!< in/out: index page to copy to */
+ rec_t* rec, /*!< in: first record to copy */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor);/*!< out: page cursor */
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor);/*!< out: page cursor */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor);/*!< out: page cursor */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ ibool is_short,/*!< in: TRUE if short inserts */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/**********************************************************//**
+Parses a log record of copying a record list end to a newly created page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a log record of a record delete on a page.
+@return pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/*******************************************************//**
+Removes the record from a leaf page. This function does not log
+any changes. It is used by the IMPORT tablespace functions.
+@return true on success, i.e., the page did not become too empty */
+UNIV_INTERN
+bool
+page_delete_rec(
+/*============*/
+ const dict_index_t* index, /*!< in: The index that the record
+ belongs to */
+ page_cur_t* pcur, /*!< in/out: page cursor on record
+ to delete */
+ page_zip_des_t* page_zip,/*!< in: compressed page descriptor */
+ const ulint* offsets);/*!< in: offsets for record */
+
+/** Index page cursor */
+
+struct page_cur_t{
+ byte* rec; /*!< pointer to a record on page */
+ buf_block_t* block; /*!< pointer to the block containing rec */
+};
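+
+/* Editor's illustrative sketch (not part of the original header): a
+scan over all user records of a page with the cursor primitives
+declared above.
+
+	page_cur_t	cur;
+
+	page_cur_set_before_first(block, &cur);
+	page_cur_move_to_next(&cur);	// step from infimum to first user rec
+
+	while (!page_cur_is_after_last(&cur)) {
+		rec_t*	rec = page_cur_get_rec(&cur);
+		// ... process rec ...
+		page_cur_move_to_next(&cur);
+	}
+*/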
+
+#ifndef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/page0cur.ic b/storage/innobase/include/page0cur.ic
new file mode 100644
index 00000000000..028d33b17aa
--- /dev/null
+++ b/storage/innobase/include/page0cur.ic
@@ -0,0 +1,317 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0page.h"
+#include "buf0types.h"
+
+#ifdef UNIV_DEBUG
+# include "rem0cmp.h"
+
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(page_align(cur->rec));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(cur->block);
+}
+
+/*********************************************************//**
+Gets pointer to the compressed page descriptor where the cursor is
+positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(cur->rec);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(rec && block && cur);
+ ut_ad(page_align(rec) == block->frame);
+
+ cur->rec = (rec_t*) rec;
+ cur->block = (buf_block_t*) block;
+}
+
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer to NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(cur);
+
+ cur->rec = NULL;
+ cur->block = NULL;
+}
+
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: page cursor, not before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint low_matched_fields = 0;
+ ulint low_matched_bytes = 0;
+ ulint up_matched_fields = 0;
+ ulint up_matched_bytes = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(block, index, tuple, mode,
+ &up_matched_fields,
+ &up_matched_bytes,
+ &low_matched_fields,
+ &low_matched_bytes,
+ cursor);
+ return(low_matched_fields);
+}
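+
+/* Editor's note (illustration, not in the original file): positioning
+a cursor for an insert of tuple; with PAGE_CUR_LE the cursor lands on
+the last record <= tuple, and the return value is the number of fields
+of the lower limit record that matched the tuple completely.
+
+	page_cur_t	cur;
+	ulint		matched = page_cur_search(
+		block, index, tuple, PAGE_CUR_LE, &cur);
+*/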
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available;
+otherwise returns NULL. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint** offsets,/*!< out: offsets on *rec */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ ulint size
+ = rec_get_converted_size(index, tuple, n_ext);
+ rec_t* rec;
+
+ if (!*heap) {
+ *heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof **offsets);
+ }
+
+ rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
+ index, tuple, n_ext);
+ *offsets = rec_get_offsets(
+ rec, index, *offsets, ULINT_UNDEFINED, heap);
+
+ if (buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(
+ cursor, index, rec, *offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor->rec,
+ index, rec, *offsets, mtr);
+ }
+
+ ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
+ return(rec);
+}
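+
+/* Editor's note (assumption based on the code above): the heap is
+sized so that a single allocation can hold both the converted record
+(size bytes) and the rec_get_offsets() array, which needs at most
+REC_OFFS_HEADER_SIZE + dtuple_get_n_fields(tuple) slots plus a few
+extra. A caller performing several inserts may pass the same
+heap/offsets pair each time and free the heap once at the end. */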
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available;
+otherwise returns NULL. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ if (buf_block_get_page_zip(cursor->block)) {
+ return(page_cur_insert_rec_zip(
+ cursor, index, rec, offsets, mtr));
+ } else {
+ return(page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr));
+ }
+}
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
new file mode 100644
index 00000000000..b572f7abb49
--- /dev/null
+++ b/storage/innobase/include/page0page.h
@@ -0,0 +1,1122 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ NULL if this info has been reset by a delete,
+ for example */
+#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified
+ a record on the page; trx_id_t; defined only
+ in secondary indexes and in the insert buffer
+ tree */
+#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page
+ header which are set in a page create */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is the level 0. This field should
+ not be written to after page creation. */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs.
+ This field should not be written to after
+ page creation. */
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
+/*-----------------------------*/
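+
+/* Editor's worked example (assuming the usual values FIL_PAGE_DATA = 38,
+FSEG_HEADER_SIZE = 10, REC_N_NEW_EXTRA_BYTES = 5 and
+REC_N_OLD_EXTRA_BYTES = 6 from the other headers): PAGE_HEADER = 38, so
+PAGE_DATA = 38 + 36 + 2 * 10 = 94; on a compact page the infimum record
+is then at PAGE_NEW_INFIMUM = 94 + 5 = 99 and the supremum at
+PAGE_NEW_SUPREMUM = 94 + 2 * 5 + 8 = 112. */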
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement */
+#define PAGE_LEFT 1
+#define PAGE_RIGHT 2
+#define PAGE_SAME_REC 3
+#define PAGE_SAME_PAGE 4
+#define PAGE_NO_DIRECTION 5
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+typedef page_dir_slot_t page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address the directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define PAGE_DIR_SLOT_SIZE 2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field); /*!< in: PAGE_N_DIR_SLOTS, ... */
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val); /*!< in: value */
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+ __attribute__((nonnull, pure));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field) \
+ (page_header_get_offs(page, field) \
+ ? page + page_header_get_offs(page, field) : NULL)
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in/out: PAGE_FREE, ... */
+ const byte* ptr); /*!< in: pointer or NULL*/
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
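+
+/* Editor's illustrative sketch (not part of the original header):
+reading and updating a header field with the accessors declared above;
+page_zip is NULL for an uncompressed page.
+
+	ulint	n_recs = page_header_get_field(page, PAGE_N_RECS);
+
+	page_header_set_field(page, page_zip, PAGE_N_RECS, n_recs + 1);
+*/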
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INTERN
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+ __attribute__((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+	page_t*	page,	/*!< in: page */
+ ulint nth) /*!< in: nth record */
+ __attribute__((nonnull, warn_unused_result));
+
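+/* Editor's note: page_rec_get_nth() and page_rec_get_n_recs_before()
+(declared further below) are inverses of each other; illustratively,
+with the infimum counting as record 0,
+
+	rec == page_rec_get_nth(page, page_rec_get_n_recs_before(rec))
+*/
+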
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+ __attribute__((nonnull, warn_unused_result));
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in that the record must reside on an
+index page, and the page infimum and supremum records can also be given
+in the parameter rec. These are considered as the negative and
+positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+#endif /* !UNIV_HOTBACKUP */
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+Returns the number of records before the given record in the record list.
+The number includes infimum and supremum records.
+This is the inverse function of page_rec_get_nth().
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec); /*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the heap (infimum and supremum included) */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap);/*!< in: number of records */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots);/*!< in: number of slots */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n); /*!< in: position */
+#else /* UNIV_DEBUG */
+# define page_dir_get_nth_slot(page, n) \
+ ((page) + UNIV_PAGE_SIZE - PAGE_DIR \
+ - (n + 1) * PAGE_DIR_SLOT_SIZE)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot); /*!< in: directory slot */
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec); /*!< in: record on the page */
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n); /*!< in: number of records owned by the slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is the fraction
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED,
+rounded up to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
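+
+/* Editor's worked example: with PAGE_DIR_SLOT_SIZE = 2 and
+PAGE_DIR_SLOT_MIN_N_OWNED = 4, n_recs = 100 reserves
+100 * 2 / 4 = 50 bytes of directory space, while n_recs = 101 reserves
+ceil(101 * 2 / 4) = 51 bytes. */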
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */
+UNIV_INLINE
+bool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+ __attribute__((nonnull, pure));
+/************************************************************//**
+Determine whether the page is empty.
+@return true if the page is empty (PAGE_N_RECS = 0) */
+UNIV_INLINE
+bool
+page_is_empty(
+/*==========*/
+ const page_t* page) /*!< in: page */
+ __attribute__((nonnull, pure));
+/************************************************************//**
+Determine whether the page contains garbage.
+@return true if the page contains garbage (PAGE_GARBAGE is not 0) */
+UNIV_INLINE
+bool
+page_has_garbage(
+/*=============*/
+ const page_t* page) /*!< in: page */
+ __attribute__((nonnull, pure));
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp); /*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ const rec_t* next); /*!< in: pointer to next record,
+ must not be page infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record, must not be page
+ infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record,
+ must not be page infimum */
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec); /*!< in: the physical record */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Write a 32-bit field in a data dictionary record. */
+UNIV_INLINE
+void
+page_rec_write_field(
+/*=================*/
+ rec_t* rec, /*!< in/out: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint val, /*!< in: value to write */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((const));
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec); /*!< in: physical record */
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+Allocates a block of memory from the head of the free list
+of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need); /*!< in: number of bytes allocated */
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ ulint need, /*!< in: total number of bytes needed */
+ ulint* heap_no);/*!< out: this contains the heap number
+ of the allocated record
+ if allocation succeeds */
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page,
+ or NULL */
+ rec_t* rec, /*!< in: pointer to the (origin of)
+ record */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets);/*!< in: array returned by
+ rec_get_offsets() */
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ulint comp); /*!< in: nonzero=compact page format */
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame where the
+ page is created */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull));
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+UNIV_INTERN
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull(1,2)));
+/*************************************************************//**
+Differs from page_copy_rec_list_end because this function does not
+touch the lock table or the max trx id on the page, nor does it
+compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
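+/* A minimal caller sketch of the IBUF_BITMAP_FREE protocol above,
+assuming new_block is a compressed leaf page of a secondary index;
+ibuf_reset_free_bits() is declared in ibuf0ibuf.h:
+
+	rec = page_copy_rec_list_end(new_block, block, rec, index, mtr);
+	if (buf_block_get_page_zip(new_block)
+	    && !dict_index_is_clust(index)
+	    && page_is_leaf(buf_block_get_frame(new_block))) {
+		ibuf_reset_free_bits(new_block);
+	}
+	mtr_commit(mtr);
+*/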
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+				records at the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be written, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/*************************************************************//**
+Tries to balance the given directory slot with too few records
+with the upper neighbor, so that there are at least the minimum number
+of records owned by the slot; this may result in the merging of
+two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ byte type, /*!< in: MLOG_LIST_END_DELETE,
+ MLOG_LIST_START_DELETE,
+ MLOG_COMP_LIST_END_DELETE or
+ MLOG_COMP_LIST_START_DELETE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: record descriptor */
+# ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+	ulint		rn);	/*!< in: print rn first and last records
+				in the record list */
+# endif /* UNIV_BTR_PRINT */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate in that it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also resilient, so it should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also resilient, so it should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+ const page_t* page, /*!< in: index page */
+ dict_index_t* index); /*!< in: data dictionary index containing
+ the page record type definition */
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no);/*!< in: heap number */
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+
+const rec_t*
+page_find_rec_max_not_deleted(
+ const page_t* page);
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+#include "page0page.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/page0page.ic b/storage/innobase/include/page0page.ic
new file mode 100644
index 00000000000..9b81156708f
--- /dev/null
+++ b/storage/innobase/include/page0page.ic
@@ -0,0 +1,1176 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifdef UNIV_DEBUG
+# include "log0recv.h"
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+# include "rem0cmp.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE));
+}
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return(ut_align_offset(ptr, UNIV_PAGE_SIZE));
+}
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
+}
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* During crash recovery, this function may be called on
+ something else than a leaf page of a secondary index or the
+ insert buffer index tree (dict_index_is_sec_or_ibuf() returns
+ TRUE for the dummy indexes constructed during redo log
+ application). In that case, PAGE_MAX_TRX_ID is unused,
+ and trx_id is usually zero. */
+ ut_ad(trx_id || recv_recovery_is_on());
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
+
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_LEVEL, ... */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_INDEX_ID);
+
+ return(mach_read_from_2(page + PAGE_HEADER + field));
+}
+
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val) /*!< in: value */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_N_RECS);
+ ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE);
+ ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
+
+ mach_write_to_2(page + PAGE_HEADER + field, val);
+ if (page_zip) {
+ page_zip_write_header(page_zip,
+ page + PAGE_HEADER + field, 2, NULL);
+ }
+}
+
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ return(offs);
+}
+
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_FREE, ... */
+	const byte*	ptr)	/*!< in: pointer or NULL */
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ if (ptr == NULL) {
+ offs = 0;
+ } else {
+ offs = ptr - page;
+ }
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ page_header_set_field(page, page_zip, field, offs);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes a log
+record of this operation to the mini-transaction log. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page && mtr);
+
+ if (page_zip) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LAST_INSERT),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0,
+ MLOG_2BYTES, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x8000);
+}
+
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_is_comp(page_align(rec)));
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ if (page_rec_is_comp(rec)) {
+ return(rec_get_heap_no_new(rec));
+ } else {
+ return(rec_get_heap_no_old(rec));
+ }
+}
+
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return true if the page is a B-tree leaf (PAGE_LEVEL = 0) */
+UNIV_INLINE
+bool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+{
+ return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL)));
+}
+
+/************************************************************//**
+Determine whether the page is empty.
+@return true if the page is empty (PAGE_N_RECS = 0) */
+UNIV_INLINE
+bool
+page_is_empty(
+/*==========*/
+ const page_t* page) /*!< in: page */
+{
+ return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_N_RECS)));
+}
+
+/************************************************************//**
+Determine whether the page contains garbage.
+@return true if the page contains garbage (PAGE_GARBAGE is not 0) */
+UNIV_INLINE
+bool
+page_has_garbage(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ return(!!*(const uint16*) (page + (PAGE_HEADER + PAGE_GARBAGE)));
+}
+
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_INFIMUM);
+ } else {
+ return(PAGE_OLD_INFIMUM);
+ }
+}
+
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_SUPREMUM);
+ } else {
+ return(PAGE_OLD_SUPREMUM);
+ }
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM
+# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM"
+#endif
+#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM
+# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM"
+#endif
+#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END
+# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END"
+#endif
+#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END
+# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END"
+#endif
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(offset != PAGE_NEW_SUPREMUM
+ && offset != PAGE_NEW_INFIMUM
+ && offset != PAGE_OLD_INFIMUM
+ && offset != PAGE_OLD_SUPREMUM);
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(offset == PAGE_NEW_SUPREMUM
+ || offset == PAGE_OLD_SUPREMUM);
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ ut_ad(page_rec_check(rec));
+
+ return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ ut_ad(page_rec_check(rec));
+
+ return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+{
+ ut_ad(page_rec_check(rec));
+
+ return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INLINE
+rec_t*
+page_rec_get_nth(
+/*=============*/
+ page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ return((rec_t*) page_rec_get_nth_const(page, nth));
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+{
+ ulint middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;
+
+ return(page_rec_get_nth(page, middle));
+}
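+/* A small worked example, assuming PAGE_HEAP_NO_USER_LOW = 2 (the heap
+numbers reserved for the infimum and supremum): with 4 user records,
+middle = (4 + 2) / 2 = 3, and page_rec_get_nth(page, 3) is the third
+user record, i.e. the first record of the upper half-list. */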
+
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+{
+ ulint rec_offset;
+
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec));
+
+ rec_offset = page_offset(rec);
+
+ if (rec_offset == PAGE_NEW_INFIMUM
+ || rec_offset == PAGE_OLD_INFIMUM) {
+
+ return(1);
+
+ } else if (rec_offset == PAGE_NEW_SUPREMUM
+ || rec_offset == PAGE_OLD_SUPREMUM) {
+
+ return(-1);
+ }
+
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ matched_fields,
+ matched_bytes));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_OFFSET));
+}
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+/*************************************************************//**
+Gets the number of user records on the page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+/*************************************************************//**
+Gets the number of dir slots in the directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+/*************************************************************//**
+Sets the number of dir slots in the directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots)/*!< in: number of slots */
+{
+ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots);
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the heap, including infimum and supremum */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap) /*!< in: number of records */
+{
+ ut_ad(n_heap < 0x8000);
+ ut_ad(!page_zip || n_heap
+ == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
+
+ page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap
+ | (0x8000
+ & page_header_get_field(page, PAGE_N_HEAP)));
+}
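+/* A sketch of the PAGE_N_HEAP packing maintained above, where is_comp
+is a hypothetical flag standing for the compact-format bit:
+
+	field = n_heap | (is_comp ? 0x8000 : 0);
+
+page_is_comp() and page_dir_get_n_heap() read the two halves back. */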
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n) /*!< in: position */
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+
+ return((page_dir_slot_t*)
+ page + UNIV_PAGE_SIZE - PAGE_DIR
+ - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot) /*!< in: directory slot */
+{
+ return(page_align(slot) + mach_read_from_2(slot));
+}
+
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec) /*!< in: record on the page */
+{
+ ut_ad(page_rec_check(rec));
+
+ mach_write_to_2(slot, page_offset(rec));
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n) /*!< in: number of records owned by the slot */
+{
+ rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ rec_set_n_owned_new(rec, page_zip, n);
+ } else {
+ ut_ad(!page_zip);
+ rec_set_n_owned_old(rec, n);
+ }
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is the fraction n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, rounded up to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /*!< in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
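+/* A worked example, assuming the usual constants PAGE_DIR_SLOT_SIZE = 2
+and PAGE_DIR_SLOT_MIN_N_OWNED = 4: for n_recs = 100 this reserves
+(2 * 100 + 4 - 1) / 4 = 50 bytes, i.e. one two-byte slot per (at least)
+four records. */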
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ ulint offs;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ offs = rec_get_next_offs(rec, comp);
+
+ if (offs >= UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Next record offset is nonsensical %lu"
+ " in record at offset %lu\n"
+ "InnoDB: rec address %p, space id %lu, page %lu\n",
+ (ulong) offs, (ulong) page_offset(rec),
+ (void*) rec,
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page));
+ buf_page_print(page, 0, 0);
+
+ ut_error;
+ } else if (offs == 0) {
+
+ return(NULL);
+ }
+
+ return(page + offs);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record */
+{
+ return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ const rec_t* r;
+ ulint page_is_compact = page_rec_is_comp(rec);
+
+ for (r = page_rec_get_next_const(rec);
+ !page_rec_is_supremum(r)
+ && rec_get_deleted_flag(r, page_is_compact);
+ r = page_rec_get_next_const(r)) {
+ /* noop */
+ }
+
+ return(r);
+}
+
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ const rec_t* next) /*!< in: pointer to next record,
+ must not be page infimum */
+{
+ ulint offs;
+
+ ut_ad(page_rec_check(rec));
+ ut_ad(!page_rec_is_supremum(rec));
+ ut_ad(rec != next);
+
+ ut_ad(!next || !page_rec_is_infimum(next));
+ ut_ad(!next || page_align(rec) == page_align(next));
+
+ offs = next != NULL ? page_offset(next) : 0;
+
+ if (page_rec_is_comp(rec)) {
+ rec_set_next_offs_new(rec, offs);
+ } else {
+ rec_set_next_offs_old(rec, offs);
+ }
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ const rec_t* rec2;
+ const rec_t* prev_rec = NULL;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ slot_no = page_dir_find_owner_slot(rec);
+
+ ut_a(slot_no != 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+ rec2 = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ }
+ } else {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+ }
+
+ ut_a(prev_rec);
+
+ return(prev_rec);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ return((rec_t*) page_rec_get_prev_const(rec));
+}
+
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec) /*!< in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ if (page_rec_is_comp(rec)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ }
+
+ return(rec);
+}
+
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES
+# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES"
+#endif
+ return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec));
+}
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint ret;
+
+ ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP)
+ - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+/************************************************************//**
+Allocates a block of memory from the free list of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need) /*!< in: number of bytes allocated */
+{
+ ulint garbage;
+
+#ifdef UNIV_DEBUG
+ const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE);
+ ulint next_offs;
+
+ ut_ad(old_rec);
+ next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
+ ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
+#endif
+
+ page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(garbage >= need);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need);
+}
+
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (comp) {
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
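+/* A rough worked example, assuming a 16 KiB page and the usual
+constants (PAGE_NEW_SUPREMUM_END = 120, PAGE_DIR = 8,
+PAGE_DIR_SLOT_SIZE = 2): for a compact page this evaluates to
+16384 - 120 - 8 - 4 = 16252 bytes. */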
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Write a 32-bit field in a data dictionary record. */
+UNIV_INLINE
+void
+page_rec_write_field(
+/*=================*/
+ rec_t* rec, /*!< in/out: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint val, /*!< in: value to write */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ byte* data;
+ ulint len;
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ ut_ad(len == 4);
+
+ mlog_write_ulint(data, val, MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Each user record on a page, including any deleted user records still in
+the heap, occupies its own size plus a PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED fraction of a directory slot. If the sum of
+these exceeds the value of page_get_free_space_of_empty(), the insert is
+impossible; otherwise it is allowed. This function returns the maximum
+combined size of records which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ if (page_is_comp(page)) {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_NEW_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(TRUE);
+ } else {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_OLD_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(FALSE);
+ }
+
+	/* Above, the 'n_recs +' part reserves directory space for the
+	newly inserted records; the '- 2' excludes the page infimum and
+	supremum records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Puts a record to the free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page,
+ or NULL */
+ rec_t* rec, /*!< in: pointer to the
+ (origin of) record */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets) /*!< in: array returned by
+ rec_get_offsets() */
+{
+ rec_t* free;
+ ulint garbage;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ free = page_header_get_ptr(page, PAGE_FREE);
+
+ page_rec_set_next(rec, free);
+ page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE,
+ garbage + rec_offs_size(offsets));
+
+ if (page_zip) {
+ page_zip_dir_delete(page_zip, rec, index, offsets, free);
+ } else {
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ page_get_n_recs(page) - 1);
+ }
+}
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000000..95143a4bb44
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,169 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page types
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+using namespace std;
+
+#include <map>
+
+#include "univ.i"
+#include "dict0types.h"
+#include "mtr0types.h"
+
+/** Eliminates a name collision on HP-UX */
+#define page_t ib_page_t
+/** Type of the index page */
+typedef byte page_t;
+/** Index page cursor */
+struct page_cur_t;
+
+/** Compressed index page */
+typedef byte page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX \
+ (UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/** Compressed page descriptor */
+struct page_zip_des_t
+{
+ page_zip_t* data; /*!< compressed page data */
+
+#ifdef UNIV_DEBUG
+ unsigned m_start:16; /*!< start offset of modification log */
+ bool m_external; /*!< Allocated externally, not from the
+ buffer pool */
+#endif /* UNIV_DEBUG */
+ unsigned m_end:16; /*!< end offset of modification log */
+ unsigned m_nonempty:1; /*!< TRUE if the modification log
+ is not empty */
+ unsigned n_blobs:12; /*!< number of externally stored
+ columns on the page; the maximum
+ is 744 on a 16 KiB page */
+ unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< 0 or compressed page shift size;
+ the size in bytes is
+ (UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+};
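+/* A worked example of the ssize encoding, assuming
+UNIV_ZIP_SIZE_MIN = 1024: the physical page size is
+(UNIV_ZIP_SIZE_MIN >> 1) << ssize = 512 << ssize bytes, so ssize = 1
+denotes a 1 KiB compressed page and ssize = 5 a 16 KiB one;
+ssize = 0 means the descriptor refers to an uncompressed page. */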
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_t {
+ /** Number of page compressions */
+ ulint compressed;
+ /** Number of successful page compressions */
+ ulint compressed_ok;
+ /** Number of page decompressions */
+ ulint decompressed;
+ /** Duration of page compressions in microseconds */
+ ib_uint64_t compressed_usec;
+ /** Duration of page decompressions in microseconds */
+ ib_uint64_t decompressed_usec;
+ page_zip_stat_t() :
+ /* Initialize members to 0 so that when we do
+		stlmap[key].compressed++ and an element with "key" does
+		not exist, it gets inserted with zeroed members. */
+ compressed(0),
+ compressed_ok(0),
+ decompressed(0),
+ compressed_usec(0),
+ decompressed_usec(0)
+ { }
+};
+
+/** Compression statistics types */
+typedef map<index_id_t, page_zip_stat_t> page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t page_zip_stat_per_index;
+extern ib_mutex_t page_zip_stat_per_index_mutex;
+#ifdef HAVE_PSI_INTERFACE
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
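+/* A usage sketch for the per-index statistics map, mirroring how the
+counters might be bumped (assuming the caller takes the mutex declared
+above):
+
+	mutex_enter(&page_zip_stat_per_index_mutex);
+	page_zip_stat_per_index[index->id].compressed++;
+	mutex_exit(&page_zip_stat_per_index_mutex);
+*/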
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Shift the dense page directory when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000000..9d3b78ed2fc
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,538 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "mtr0types.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL 6
+
+/* Whether or not to log entire compressed page images, so that crash
+recovery does not depend on a possibly changed zlib compression
+algorithm. */
+extern my_bool page_zip_log_pages;
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+ __attribute__((nonnull, pure));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size); /*!< in: size in bytes */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+ __attribute__((const));
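+/* An illustrative check, a sketch only; entry, rec_size and zip_size
+are hypothetical caller variables:
+
+	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+				   dtuple_get_n_fields(entry), zip_size)) {
+		... store the longest fields externally as BLOBs ...
+	}
+*/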
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+ __attribute__((const));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip); /*!< in/out: compressed page
+ descriptor */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: compression level */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2,3)));
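+/* A minimal usage sketch, assuming the caller holds the page X-latch;
+page_zip_level is the global declared above:
+
+	if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
+		... page_zip is left intact; the caller may instead
+		reorganize or split the uncompressed page ...
+	}
+*/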
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+ __attribute__((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ __attribute__((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+ __attribute__((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page. */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write an entire record on the compressed page. The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint create) /*!< in: nonzero=insert, zero=update */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in: mini-transaction handle,
+ or NULL if no logging is needed */
+ __attribute__((nonnull(1,2,3,4)));
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint trx_id_col,/*!< in: column number of TRX_ID in rec */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ roll_ptr_t roll_ptr)/*!< in: roll_ptr */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* prev_rec,/*!< in: record after which to insert */
+ const byte* free_rec,/*!< in: record from which rec was
+ allocated, or NULL */
+ byte* rec); /*!< in: record to insert */
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of
+ the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_zip(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< out: copy of src_zip
+ (n_blobs, m_start, m_end,
+ m_nonempty, data[0..size-1]) */
+ page_t* page, /*!< out: copy of src */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< out: compressed page */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size, /*!< in: size of compressed page */
+ srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+ const void* data, /*!< in: compressed page */
+ ulint size); /*!< in: size of compressed page */
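
Illustrative sketch of pairing the two checksum routines; `data` and `size` are assumed, and the CRC32 member of srv_checksum_algorithm_t is chosen purely as an example:

	if (!page_zip_verify_checksum(data, size)) {
		ulint	calc = page_zip_calc_checksum(
			data, size, SRV_CHECKSUM_ALGORITHM_CRC32);

		/* The stored checksum matched no accepted algorithm;
		report the freshly computed CRC32 value. */
		fprintf(stderr,
			"InnoDB: compressed page checksum mismatch,"
			" crc32 = %lu\n", calc);
	}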
+/**********************************************************************//**
+Write a log record of compressing an index page without the data on the page. */
+UNIV_INLINE
+void
+page_zip_compress_write_log_no_data(
+/*================================*/
+ ulint level, /*!< in: compression level */
+ const page_t* page, /*!< in: page that is compressed */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Parses a log record of compressing an index page without the data.
+@return end of log record or NULL */
+UNIV_INLINE
+byte*
+page_zip_parse_compress_no_data(
+/*============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ page_t* page, /*!< in: uncompressed page */
+ page_zip_des_t* page_zip, /*!< out: compressed page */
+ dict_index_t* index) /*!< in: index */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index();
+/*===========================*/
+
+#ifndef UNIV_HOTBACKUP
+/** Check if a pointer to an uncompressed page matches a compressed page.
+When we IMPORT a tablespace, the blocks and accompanying frames are
+allocated from outside the buffer pool.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (((page_zip)->m_external \
+ && (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)) \
+ || buf_frame_get_page_zip(ptr) == (page_zip))
+#else /* !UNIV_HOTBACKUP */
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+
+#endif /* page0zip_h */
diff --git a/storage/innobase/include/page0zip.ic b/storage/innobase/include/page0zip.ic
new file mode 100644
index 00000000000..6c7d8cd32c7
--- /dev/null
+++ b/storage/innobase/include/page0zip.ic
@@ -0,0 +1,456 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "page0zip.h"
+#include "mtr0log.h"
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+ - (heap_no - 1) << 1 (1..2 bytes)
+ - extra bytes backwards
+ - data bytes
+- clear record:
+ - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
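
Illustrative sketch of the 1..2 byte integer encoding described above (the helper name is invented and bounds checking is omitted):

/* Encode v (0..32767) as described above: 0xxxxxxx for 0..127,
else 1xxxxxxx xxxxxxxx with the high bit marking a two-byte value.
Returns the number of bytes written. */
static ulint
modlog_varint_write(byte* b, ulint v)
{
	if (v < 128) {
		b[0] = (byte) v;
		return(1);
	}

	b[0] = (byte) (0x80 | (v >> 8));
	b[1] = (byte) v;
	return(2);
}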
+
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000
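
Illustrative sketch of taking a dense directory entry apart with the masks above (the helper itself is hypothetical):

static void
page_zip_dir_slot_decode(
	ulint	slot,	/* in: raw 2-byte directory entry */
	ulint*	offs,	/* out: record offset within the page */
	ibool*	owned,	/* out: nonzero if the 'owned' flag is set */
	ibool*	del)	/* out: nonzero if the 'deleted' flag is set */
{
	*offs = slot & PAGE_ZIP_DIR_SLOT_MASK;
	*owned = (slot & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
	*del = (slot & PAGE_ZIP_DIR_SLOT_DEL) != 0;
}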
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (!page_zip->ssize) {
+ return(0);
+ }
+
+ size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+ ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ int ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (ulint) (512 << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize;
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
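
To make the ssize encoding concrete, a sketch of the round trip through the two functions above (assuming UNIV_ZIP_SIZE_MIN == 1024, so that size == 512 << ssize):

	page_zip_des_t	pz;

	page_zip_des_init(&pz);
	page_zip_set_size(&pz, 8192);	/* must be a power of two */

	ut_a(pz.ssize == 4);			/* 512 << 4 == 8192 */
	ut_a(page_zip_get_size(&pz) == 8192);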
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+{
+ ut_ad(rec_size > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > REC_MAX_DATA_SIZE
+ if (rec_size >= REC_MAX_DATA_SIZE) {
+ return(TRUE);
+ }
+#endif
+
+ if (zip_size) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. */
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+ >= page_zip_empty_size(n_fields, zip_size)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(page_zip->data)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (is_clust) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ return((page_dir_get_n_heap(page_zip->data) - 2)
+ * uncompressed_size
+ + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+ return((lint) page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+{
+ ulint trailer_len;
+
+ ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+ /* Subtract the fixed extra bytes and add the maximum
+ space needed for identifying the record (encoded heap_no). */
+ length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+ if (create > 0) {
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ return(length + trailer_len + page_zip->m_end
+ < page_zip_get_size(page_zip));
+}
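
Illustrative sketch of the insert-time decision these two predicates support; page_zip, is_clust and rec_size are assumed, and the fallbacks named in the comments are one possible policy:

	if (page_zip_available(page_zip, is_clust, rec_size, TRUE)) {
		/* The modification log can absorb the record:
		page_zip_write_rec() will succeed as is. */
	} else if (page_zip_max_ins_size(page_zip, is_clust)
		   >= (lint) rec_size) {
		/* Recompressing the page, e.g. with
		page_zip_reorganize(), should make room. */
	} else {
		/* Split the page or store the record externally. */
	}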
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip) /*!< in/out: compressed page
+ descriptor */
+{
+ memset(page_zip, 0, sizeof *page_zip);
+}
+
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+ const byte* data,/*!< in: data on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_zip(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ ulint pos;
+
+ ut_ad(PAGE_ZIP_MATCH(str, page_zip));
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ pos = page_offset(str);
+
+ ut_ad(pos < PAGE_DATA);
+
+ memcpy(page_zip->data + pos, str, length);
+
+ /* The following would fail in page_cur_insert_rec_zip(). */
+ /* ut_ad(page_zip_validate(page_zip, str - pos)); */
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ page_zip_write_header_log(str, length, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+/**********************************************************************//**
+Write a log record of compressing an index page without the data on the page. */
+UNIV_INLINE
+void
+page_zip_compress_write_log_no_data(
+/*================================*/
+ ulint level, /*!< in: compression level */
+ const page_t* page, /*!< in: page that is compressed */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr = mlog_open_and_write_index(
+ mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1);
+
+ if (log_ptr) {
+ mach_write_to_1(log_ptr, level);
+ mlog_close(mtr, log_ptr + 1);
+ }
+}
+
+/**********************************************************************//**
+Parses a log record of compressing an index page without the data.
+@return end of log record or NULL */
+UNIV_INLINE
+byte*
+page_zip_parse_compress_no_data(
+/*============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ page_t* page, /*!< in: uncompressed page */
+ page_zip_des_t* page_zip, /*!< out: compressed page */
+ dict_index_t* index) /*!< in: index */
+{
+ ulint level;
+ if (end_ptr == ptr) {
+ return(NULL);
+ }
+
+ level = mach_read_from_1(ptr);
+
+ /* If page compression fails then there must be something wrong
+ because a compress log record is logged only if the compression
+ was successful. Crash in this case. */
+
+ if (page
+ && !page_zip_compress(page_zip, page, index, level, NULL)) {
+ ut_error;
+ }
+
+ return(ptr + 1);
+}
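
Illustrative note on the record layout: the body written by page_zip_compress_write_log_no_data() is a single byte holding the level, so a successful parse consumes exactly one byte. With ptr/end_ptr assumed to delimit such a record:

	byte*	new_ptr = page_zip_parse_compress_no_data(
		ptr, end_ptr, page, page_zip, index);

	ut_a(!new_ptr || new_ptr == ptr + 1);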
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+ mutex_enter(&page_zip_stat_per_index_mutex);
+
+ page_zip_stat_per_index.erase(
+ page_zip_stat_per_index.begin(),
+ page_zip_stat_per_index.end());
+
+ mutex_exit(&page_zip_stat_per_index_mutex);
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000000..8e725fe9545
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,261 @@
+/* A Bison parser, made by GNU Bison 2.3. */
+
+/* Skeleton interface for Bison's Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_FIXBINARY_LIT = 261,
+ PARS_BLOB_LIT = 262,
+ PARS_NULL_LIT = 263,
+ PARS_ID_TOKEN = 264,
+ PARS_AND_TOKEN = 265,
+ PARS_OR_TOKEN = 266,
+ PARS_NOT_TOKEN = 267,
+ PARS_GE_TOKEN = 268,
+ PARS_LE_TOKEN = 269,
+ PARS_NE_TOKEN = 270,
+ PARS_PROCEDURE_TOKEN = 271,
+ PARS_IN_TOKEN = 272,
+ PARS_OUT_TOKEN = 273,
+ PARS_BINARY_TOKEN = 274,
+ PARS_BLOB_TOKEN = 275,
+ PARS_INT_TOKEN = 276,
+ PARS_INTEGER_TOKEN = 277,
+ PARS_FLOAT_TOKEN = 278,
+ PARS_CHAR_TOKEN = 279,
+ PARS_IS_TOKEN = 280,
+ PARS_BEGIN_TOKEN = 281,
+ PARS_END_TOKEN = 282,
+ PARS_IF_TOKEN = 283,
+ PARS_THEN_TOKEN = 284,
+ PARS_ELSE_TOKEN = 285,
+ PARS_ELSIF_TOKEN = 286,
+ PARS_LOOP_TOKEN = 287,
+ PARS_WHILE_TOKEN = 288,
+ PARS_RETURN_TOKEN = 289,
+ PARS_SELECT_TOKEN = 290,
+ PARS_SUM_TOKEN = 291,
+ PARS_COUNT_TOKEN = 292,
+ PARS_DISTINCT_TOKEN = 293,
+ PARS_FROM_TOKEN = 294,
+ PARS_WHERE_TOKEN = 295,
+ PARS_FOR_TOKEN = 296,
+ PARS_DDOT_TOKEN = 297,
+ PARS_READ_TOKEN = 298,
+ PARS_ORDER_TOKEN = 299,
+ PARS_BY_TOKEN = 300,
+ PARS_ASC_TOKEN = 301,
+ PARS_DESC_TOKEN = 302,
+ PARS_INSERT_TOKEN = 303,
+ PARS_INTO_TOKEN = 304,
+ PARS_VALUES_TOKEN = 305,
+ PARS_UPDATE_TOKEN = 306,
+ PARS_SET_TOKEN = 307,
+ PARS_DELETE_TOKEN = 308,
+ PARS_CURRENT_TOKEN = 309,
+ PARS_OF_TOKEN = 310,
+ PARS_CREATE_TOKEN = 311,
+ PARS_TABLE_TOKEN = 312,
+ PARS_INDEX_TOKEN = 313,
+ PARS_UNIQUE_TOKEN = 314,
+ PARS_CLUSTERED_TOKEN = 315,
+ PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+ PARS_ON_TOKEN = 317,
+ PARS_ASSIGN_TOKEN = 318,
+ PARS_DECLARE_TOKEN = 319,
+ PARS_CURSOR_TOKEN = 320,
+ PARS_SQL_TOKEN = 321,
+ PARS_OPEN_TOKEN = 322,
+ PARS_FETCH_TOKEN = 323,
+ PARS_CLOSE_TOKEN = 324,
+ PARS_NOTFOUND_TOKEN = 325,
+ PARS_TO_CHAR_TOKEN = 326,
+ PARS_TO_NUMBER_TOKEN = 327,
+ PARS_TO_BINARY_TOKEN = 328,
+ PARS_BINARY_TO_NUMBER_TOKEN = 329,
+ PARS_SUBSTR_TOKEN = 330,
+ PARS_REPLSTR_TOKEN = 331,
+ PARS_CONCAT_TOKEN = 332,
+ PARS_INSTR_TOKEN = 333,
+ PARS_LENGTH_TOKEN = 334,
+ PARS_SYSDATE_TOKEN = 335,
+ PARS_PRINTF_TOKEN = 336,
+ PARS_ASSERT_TOKEN = 337,
+ PARS_RND_TOKEN = 338,
+ PARS_RND_STR_TOKEN = 339,
+ PARS_ROW_PRINTF_TOKEN = 340,
+ PARS_COMMIT_TOKEN = 341,
+ PARS_ROLLBACK_TOKEN = 342,
+ PARS_WORK_TOKEN = 343,
+ PARS_UNSIGNED_TOKEN = 344,
+ PARS_EXIT_TOKEN = 345,
+ PARS_FUNCTION_TOKEN = 346,
+ PARS_LOCK_TOKEN = 347,
+ PARS_SHARE_TOKEN = 348,
+ PARS_MODE_TOKEN = 349,
+ PARS_LIKE_TOKEN = 350,
+ PARS_LIKE_TOKEN_EXACT = 351,
+ PARS_LIKE_TOKEN_PREFIX = 352,
+ PARS_LIKE_TOKEN_SUFFIX = 353,
+ PARS_LIKE_TOKEN_SUBSTR = 354,
+ PARS_TABLE_NAME_TOKEN = 355,
+ PARS_COMPACT_TOKEN = 356,
+ PARS_BLOCK_SIZE_TOKEN = 357,
+ PARS_BIGINT_TOKEN = 358,
+ NEG = 359
+ };
+#endif
+/* Tokens. */
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
+
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+extern YYSTYPE yylval;
+
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000000..1084d644c90
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order in which they appear in the FROM clause of the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp); /*!< in: expression or condition */
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /*!< in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0opt.ic b/storage/innobase/include/pars0opt.ic
new file mode 100644
index 00000000000..786d911ca3d
--- /dev/null
+++ b/storage/innobase/include/pars0opt.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.ic
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000000..65ff7533828
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,826 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool (*pars_user_func_cb_t)(void* arg, void* user_arg);
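
Illustrative skeleton of such a callback for the FETCH use case (names invented); there the InnoDB-supplied argument is the select node carrying the fetched row, and returning TRUE conventionally lets the fetch loop continue:

static ibool
my_fetch_cb(
	void*	row,		/* in: sel_node_t* carrying the row */
	void*	user_arg)	/* in: caller-supplied counter */
{
	sel_node_t*	node = static_cast<sel_node_t*>(row);
	ulint*		n_rows = static_cast<ulint*>(user_arg);

	/* Columns would be read by walking node->select_list
	with que_node_get_next(). */
	(void) node;
	++*n_rows;

	return(TRUE);
}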
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int yydebug;
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+extern ibool pars_print_lexed;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant. */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_char_token;
+extern pars_res_word_t pars_to_number_token;
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_binary_to_number_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_replstr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_sysdate_token;
+extern pars_res_word_t pars_printf_token;
+extern pars_res_word_t pars_assert_token;
+extern pars_res_word_t pars_rnd_token;
+extern pars_res_word_t pars_rnd_str_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_sum_token;
+extern pars_res_word_t pars_distinct_token;
+extern pars_res_word_t pars_binary_token;
+extern pars_res_word_t pars_blob_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_bigint_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_float_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_share_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str); /*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters for the lexical analyzer.
+@return number of characters copied or 0 on EOF */
+UNIV_INTERN
+int
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ int max_size); /*!< in: maximum number of characters which fit
+ in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+ const char* s); /*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg); /*!< in: first argument in the argument list */
+/*************************************************************//**
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+UNIV_INTERN
+int
+pars_like_rebind(
+/*=============*/
+ sym_node_t* node, /*!< in: the search string node */
+ const byte* ptr, /*!< in: literal to (re)bind */
+ ulint len); /*!< in: length of literal to (re)bind */
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2); /*!< in: second argument or NULL for a unary
+ operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc); /*!< in: &pars_asc_token or &pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list); /*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node); /*!< in: function id node in the symbol
+ table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/*!< in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+ if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select); /*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /*!< in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part); /*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args); /*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func); /*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return fetch statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_unsigned, /*!< in: if not NULL, column
+ is of type UNSIGNED. */
+ void* is_not_null); /*!< in: if not NULL, column
+ is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /*!< in: list of column names */
+ sym_node_t* compact, /*!< in: non-NULL if COMPACT table */
+ sym_node_t* block_size, /*!< in: block size (can be NULL) */
+ void* not_fit_in_memory);
+ /*!< in: a non-NULL pointer means that
+ this is a table which in simulations
+ should be simulated as not fitting
+ in memory; thread is put to sleep
+ to simulate disk accesses; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about non-NULL value if
+ it has to reload the table definition
+ from disk */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /*!< in: parameter declaration list */
+ que_node_t* stat_list); /*!< in: statement list */
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node); /*!< in: stored procedure name */
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ que_node_t* node, /*!< in: root node for an incomplete
+ query graph, or NULL for dummy graph */
+ trx_t* trx, /*!< in: transaction handle */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+ __attribute__((nonnull(2,3), warn_unused_result));
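+
+/* A minimal usage sketch, assuming a started transaction "trx", a
+caller-owned heap "heap" and a hand-built query node "node";
+que_fork_start_command() and que_run_threads() are declared in que0que.h:
+
+ que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(
+   (que_fork_t*) que_node_get_parent(thr)));
+
+ que_run_threads(thr);
+*/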
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info); /*!< in, own: info struct */
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str); /*!< in: string */
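+
+/* A short sketch of typical literal binding; the names "name" and "flags"
+are illustrative and must match the ":name" and ":flags" markers in the
+SQL string later handed to the parser:
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "name", "test/t1");
+ pars_info_add_int4_literal(info, "flags", 0);
+*/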
+/****************************************************************//**
+If the literal value already exists then it rebinds, otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+/****************************************************************//**
+If the literal value already exists then it rebinds, otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint32_t* val); /*!< in: value */
+/****************************************************************//**
+If the literal value already exists then it rebinds, otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val); /*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_bind_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg); /*!< in: user-supplied argument */
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_bind_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ ibool copy_name, /*!< in: make a copy of name if TRUE */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
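+
+/* A sketch of id binding, assuming the SQL string refers to the bound id
+with a '$' prefix; the names used here are illustrative:
+
+ pars_info_bind_id(info, TRUE, "config_table", "test/cfg");
+
+ followed by SQL containing e.g. "SELECT value FROM $config_table".
+*/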
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ lint val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_ull_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val); /*!< in: value */
+
+/****************************************************************//**
+If the literal value already exists then it rebinds, otherwise it
+creates a new entry. */
+UNIV_INTERN
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val) /*!< in: value */
+ __attribute__((nonnull));
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void);
+/*==================*/
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_t {
+ mem_heap_t* heap; /*!< our own memory heap */
+
+ ib_vector_t* funcs; /*!< user functions, or NULL
+ (pars_user_func_t*) */
+ ib_vector_t* bound_lits; /*!< bound literals, or NULL
+ (pars_bound_lit_t*) */
+ ib_vector_t* bound_ids; /*!< bound ids, or NULL
+ (pars_bound_id_t*) */
+
+ ibool graph_owns_us; /*!< if TRUE (which is the default),
+ que_graph_free() will free us */
+};
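+
+/* By default (graph_owns_us == TRUE) the info struct is freed together
+with the query graph in que_graph_free(). A sketch of keeping it alive
+across several executions instead:
+
+ pars_info_t* info = pars_info_create();
+
+ info->graph_owns_us = FALSE;
+
+ (run one or more graphs that reference info)
+
+ pars_info_free(info);
+*/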
+
+/** User-supplied function and argument. */
+struct pars_user_func_t {
+ const char* name; /*!< function name */
+ pars_user_func_cb_t func; /*!< function address */
+ void* arg; /*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_t {
+ const char* name; /*!< name */
+ const void* address; /*!< address */
+ ulint length; /*!< length of data */
+ ulint type; /*!< type, e.g. DATA_FIXBINARY */
+ ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */
+ sym_node_t* node; /*!< symbol node */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_t {
+ const char* name; /*!< name */
+ const char* id; /*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_t{
+ int code; /*!< the token code for the reserved word from
+ pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FUNC */
+ int func; /*!< token code of the function name */
+ ulint fclass; /*!< class of the function */
+ que_node_t* args; /*!< argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /*!< list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /*!< list of function nodes in a parsed
+ query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ORDER */
+ sym_node_t* column; /*!< order-by column */
+ ibool asc; /*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_t{
+ que_common_t common; /*!< type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /*!< procedure name symbol in the symbol
+ table of this same procedure */
+ sym_node_t* param_list; /*!< input and output parameters */
+ que_node_t* stat_list; /*!< statement list */
+ sym_tab_t* sym_tab; /*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ELSIF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_t{
+ que_common_t common; /*!< type: QUE_NODE_IF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+ que_node_t* else_part; /*!< else-part statement list */
+ elsif_node_t* elsif_list; /*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_t{
+ que_common_t common; /*!< type: QUE_NODE_WHILE */
+ que_node_t* cond; /*!< while condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /*!< loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/*!< initial value of loop variable */
+ que_node_t* loop_end_limit; /*!< end value of loop variable */
+ lint loop_end_value; /*!< evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_t{
+ que_common_t common; /*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_t{
+ que_common_t common; /*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /*!< variable to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_t{
+ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /*!< column to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */
+#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */
+#define PARS_FUNC_CMP 3 /*!< comparison operators */
+#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /*!< COUNT, DISTINCT, SUM */
+#define PARS_FUNC_OTHER 6 /*!< these are not real functions,
+ e.g., := */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0pars.ic b/storage/innobase/include/pars0pars.ic
new file mode 100644
index 00000000000..4c88337a265
--- /dev/null
+++ b/storage/innobase/include/pars0pars.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.ic
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h
new file mode 100644
index 00000000000..bcf73639228
--- /dev/null
+++ b/storage/innobase/include/pars0sym.h
@@ -0,0 +1,258 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "dict0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER the parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Also frees SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val); /*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len); /*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Rebinds a literal to a node in the symbol table.
+@return symbol table node */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+ sym_node_t* node, /*!< in: node that is bound to a literal */
+ const void* address, /*!< in: pointer to data */
+ ulint length); /*!< in: length of data */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len); /*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name); /*!< in: name of bound id */
+
+/** Index of sym_node_t::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO 0
+/** Index of sym_node_t::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO 1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+ SYM_UNSET, /*!< Unset entry. */
+ SYM_VAR = 91, /*!< declared parameter or local
+ variable of a procedure */
+ SYM_IMPLICIT_VAR, /*!< storage for an intermediate result
+ of a calculation */
+ SYM_LIT, /*!< literal */
+ SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must
+ be closed explicitly. */
+ SYM_TABLE, /*!< database table name */
+ SYM_COLUMN, /*!< database table column name */
+ SYM_CURSOR, /*!< named cursor */
+ SYM_PROCEDURE_NAME, /*!< stored procedure name */
+ SYM_INDEX, /*!< database index name */
+ SYM_FUNCTION /*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_t{
+ que_common_t common; /*!< node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while
+ 'indirection' does the same only if we should use the primary
+ instance's values for the node's data. This is usually the case, but
+ when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+ t WHERE id = x;"), we copy the values from the primary instance to
+ the cursor's instance so that they are fixed for the duration of the
+ cursor, and set 'indirection' to NULL. If we did not, the value of
+ 'x' could change between fetches and things would break horribly.
+
+ TODO: It would be cleaner to make 'indirection' a boolean field and
+ always use 'alias' to refer to the primary node. */
+
+ sym_node_t* indirection; /*!< pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /*!< pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /*!< TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /*!< if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /*!< TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ enum sym_tab_entry token_type; /*!< type of the
+ parsed token */
+ const char* name; /*!< name of an id */
+ ulint name_len; /*!< id name length */
+ dict_table_t* table; /*!< table definition
+ if a table id or a
+ column id */
+ ulint col_no; /*!< column number if a
+ column */
+ sel_buf_t* prefetch_buf; /*!< NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /*!< cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /*!< PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /*!< back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol
+ nodes */
+ sym_node_t* like_node; /*!< LIKE operator node */
+};
+
+/** Symbol table */
+struct sym_tab_t{
+ que_t* query_graph;
+ /*!< query graph generated by the
+ parser */
+ const char* sql_string;
+ /*!< SQL string to parse */
+ size_t string_len;
+ /*!< SQL string length */
+ int next_char_pos;
+ /*!< position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ pars_info_t* info; /*!< extra information, or NULL */
+ sym_node_list_t sym_list;
+ /*!< list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /*!< list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /*!< memory heap from which we can
+ allocate space */
+};
+
+#ifndef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/pars0sym.ic b/storage/innobase/include/pars0sym.ic
new file mode 100644
index 00000000000..266c1a6310d
--- /dev/null
+++ b/storage/innobase/include/pars0sym.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.ic
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h
new file mode 100644
index 00000000000..47f4b432d20
--- /dev/null
+++ b/storage/innobase/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+struct pars_info_t;
+struct pars_user_func_t;
+struct pars_bound_lit_t;
+struct pars_bound_id_t;
+struct sym_node_t;
+struct sym_tab_t;
+struct pars_res_word_t;
+struct func_node_t;
+struct order_node_t;
+struct proc_node_t;
+struct elsif_node_t;
+struct if_node_t;
+struct while_node_t;
+struct for_node_t;
+struct exit_node_t;
+struct return_node_t;
+struct assign_node_t;
+struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
new file mode 100644
index 00000000000..ba8828623af
--- /dev/null
+++ b/storage/innobase/include/que0que.h
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/* If the following flag is set to TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+extern ibool que_trace_on;
+
+/** Mutex protecting the query threads. */
+extern ib_mutex_t que_thr_mutex;
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent);/*!< in: parent */
+/***********************************************************************//**
+Creates a query graph thread node.
+@return own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+ que_fork_t* parent, /*!< in: parent node, i.e., a fork node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
+has to be reserved.
+@return TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+ que_thr_t* thr, /*!< in: a query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /*!< in: query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put to the lock wait state in lock0lock.cc, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion.
+@return query thread instance of thread to wakeup or NULL */
+UNIV_INTERN
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+ trx_t* trx); /*!< in: transaction in the
+ QUE_THR_LOCK_WAIT state */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork); /*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size); /*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node); /*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a possibly empty list of nodes.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Gets the last node from the list.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ que_node_t* node_list); /*!< in: node list */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list); /*!< in: node list, or NULL */
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the trx_t::mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph); /*!< in: graph */
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node); /*!< in: query graph node */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ ibool reserve_dict_mutex,
+ /*!< in: if TRUE, acquire/release
+ dict_sys->mutex around call to pars_sql. */
+ trx_t* trx); /*!< in: trx */
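+
+/* A minimal sketch of the usual invocation, assuming a started transaction
+"trx"; the procedure body and the ":table_name" marker are illustrative.
+Passing FALSE for reserve_dict_mutex assumes the caller manages
+dict_sys->mutex itself. The info struct is released together with the
+query graph, since pars_info_t::graph_owns_us defaults to TRUE:
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", "test/t1");
+
+ dberr_t err = que_eval_sql(
+  info,
+  "PROCEDURE P () IS\n"
+  "BEGIN\n"
+  "DELETE FROM SYS_TABLES WHERE NAME = :table_name;\n"
+  "END;\n",
+  FALSE, trx);
+*/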
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+ que_fork_t* fork, /*!< in: a query fork */
+ que_thr_t* thr); /*!< in: current pos */
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void);
+/*==========*/
+
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void);
+/*===========*/
+
+/* Query graph query thread node: the fields are protected by the
+trx_t::mutex with the exceptions named below */
+
+struct que_thr_t{
+ que_common_t common; /*!< type: QUE_NODE_THR */
+ ulint magic_n; /*!< magic number to catch memory
+ corruption */
+ que_node_t* child; /*!< graph child node */
+ que_t* graph; /*!< graph where this node belongs */
+ ulint state; /*!< state of the query thread */
+ ibool is_active; /*!< TRUE if the thread has been set
+ to the run state in
+ que_thr_move_to_run_state, but not
+ deactivated in
+ que_thr_dec_reference_count */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by any mutex: */
+
+ que_node_t* run_node; /*!< pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /*!< pointer to the node from which
+ the control came */
+ ulint resource; /*!< resource usage of the query thread
+ thus far */
+ ulint lock_state; /*!< lock state of thread (table or
+ row) */
+ struct srv_slot_t*
+ slot; /*!< the thread slot in the wait
+ array in srv_sys_t */
+ /*------------------------------*/
+ /* The following fields are links for the various lists that
+ this type can be on. */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /*!< list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ trx_thrs; /*!< lists of threads in wait list of
+ the trx */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /*!< list of runnable thread nodes in
+ the server task queue */
+ ulint fk_cascade_depth; /*!< maximum cascading call depth
+ supported for foreign key constraint
+ related delete/updates */
+};
+
+#define QUE_THR_MAGIC_N 8476583
+#define QUE_THR_MAGIC_FREED 123461526
+
+/* Query graph fork node: its fields are protected by the query thread mutex */
+struct que_fork_t{
+ que_common_t common; /*!< type: QUE_NODE_FORK */
+ que_t* graph; /*!< query graph of this node */
+ ulint fork_type; /*!< fork type */
+ ulint n_active_thrs; /*!< if this is the root of a graph, the
+ number of query threads that have been
+ started in que_thr_move_to_run_state
+ but for which que_thr_dec_refer_count
+ has not yet been called */
+ trx_t* trx; /*!< transaction: this is set only in
+ the root node */
+ ulint state; /*!< state of the fork node */
+ que_thr_t* caller; /*!< pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /*!< list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /*!< symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ pars_info_t* info; /*!< info struct, or NULL */
+ /* The following cur_... fields are relevant only in a select graph */
+
+ ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START,
+ QUE_CUR_END */
+ ulint cur_pos; /*!< if there are n rows in the result
+ set, values 0 and n + 1 mean before
+ first row, or after last row, depending
+ on cur_end; values 1...n mean a row
+ index */
+ ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e.,
+ it is not before the first row or
+ after the last row */
+ sel_node_t* last_sel_node; /*!< last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /*!< list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /*!< memory heap where the fork was
+ created */
+
+};
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
+#define QUE_FORK_INSERT 3
+#define QUE_FORK_UPDATE 4
+#define QUE_FORK_ROLLBACK 5
+ /* This is really the undo graph used in rollback,
+ no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE 6
+#define QUE_FORK_EXECUTE 7
+#define QUE_FORK_PROCEDURE 8
+#define QUE_FORK_PROCEDURE_CALL 9
+#define QUE_FORK_MYSQL_INTERFACE 10
+#define QUE_FORK_RECOVERY 11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+#define QUE_FORK_INVALID 3
+#define QUE_FORK_BEING_FREED 4
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT 1024
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+#define QUE_NODE_EXIT 32
+
+/* Query thread states */
+#define QUE_THR_RUNNING 1
+#define QUE_THR_PROCEDURE_WAIT 2
+#define QUE_THR_COMPLETED 3 /* in selects this means that the
+ thread is at the end of its result set
+ (or start, in case of a scroll cursor);
+ in other statements, this means the
+ thread has done its task */
+#define QUE_THR_COMMAND_WAIT 4
+#define QUE_THR_LOCK_WAIT 5
+#define QUE_THR_SUSPENDED 7
+#define QUE_THR_ERROR 8
+
+/* Query thread lock states */
+#define QUE_THR_LOCK_NOLOCK 0
+#define QUE_THR_LOCK_ROW 1
+#define QUE_THR_LOCK_TABLE 2
+
+/* From where the cursor position is counted */
+#define QUE_CUR_NOT_DEFINED 1
+#define QUE_CUR_START 2
+#define QUE_CUR_END 3
+
+#ifndef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/que0que.ic b/storage/innobase/include/que0que.ic
new file mode 100644
index 00000000000..eff5a86d958
--- /dev/null
+++ b/storage/innobase/include/que0que.ic
@@ -0,0 +1,309 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr) /*!< in: query thread */
+{
+ return(trx_is_recv(thr->graph->trx));
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*) node)->type);
+}
+
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*) node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*) node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size) /*!< in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent) /*!< in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*) node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a possibly empty list of nodes.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node) /*!< in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = (que_common_t*) node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = (que_common_t*) node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = (que_common_t*) cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
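+
+/* A sketch of walking the resulting one-way list through the brother
+pointers, using que_node_get_next() declared in que0que.h:
+
+ que_node_t* n;
+
+ for (n = node_list; n != NULL; n = que_node_get_next(n)) {
+  (visit n)
+ }
+*/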
+
+/*********************************************************************//**
+Gets the last node from the list.
+@return last node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_list_get_last(
+/*===================*/
+ que_node_t* node_list) /*!< in: node list */
+{
+ que_common_t* node;
+
+ ut_a(node_list != NULL);
+
+ node = (que_common_t*) node_list;
+
+ /* We need the last element */
+ while (node->brother != NULL) {
+ node = (que_common_t*) node->brother;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in a list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node) /*!< in: node in a list */
+{
+ return(((que_common_t*) node)->brother);
+}
+
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list) /*!< in: node list, or NULL */
+{
+ const que_common_t* cnode;
+ ulint len;
+
+ cnode = (const que_common_t*) node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = (const que_common_t*) cnode->brother;
+ }
+
+ return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node) /*!< in: node */
+{
+ return(((que_common_t*) node)->parent);
+}
+
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the trx mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state != QUE_FORK_ACTIVE
+ || trx->lock.que_state == TRX_QUE_LOCK_WAIT
+ || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
+ && trx->lock.que_state != TRX_QUE_RUNNING)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph) /*!< in: graph */
+{
+ if (graph->fork_type == QUE_FORK_SELECT_SCROLL
+ || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h
new file mode 100644
index 00000000000..0f11cad301a
--- /dev/null
+++ b/storage/innobase/include/que0types.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+#include "dict0types.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+/* Query graph root is a fork node */
+typedef struct que_fork_t que_t;
+
+struct que_thr_t;
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_t{
+ ulint type; /*!< query node type */
+ que_node_t* parent; /*!< back pointer to parent node, or NULL */
+ que_node_t* brother; /*!< pointer to a possible brother node */
+ dfield_t val; /*!< evaluated value for an expression */
+ ulint val_buf_size;
+ /*!< buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val
+ explicitly */
+};
+
+#endif
diff --git a/storage/innobase/include/read0read.h b/storage/innobase/include/read0read.h
new file mode 100644
index 00000000000..980faddf98e
--- /dev/null
+++ b/storage/innobase/include/read0read.h
@@ -0,0 +1,193 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0read_h
+#define read0read_h
+
+#include "univ.i"
+
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "trx0trx.h"
+#include "read0types.h"
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+					transaction, or 0 when used in
+					purge */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, or opens a new one. The
+view must be closed with ..._close.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_purge_open(
+/*=================*/
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Remove a read view from the trx_sys->view_list. */
+UNIV_INLINE
+void
+read_view_remove(
+/*=============*/
+ read_view_t* view, /*!< in: read view, can be 0 */
+ bool own_mutex); /*!< in: true if caller owns the
+ trx_sys_t::mutex */
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at the
+end of an SQL statement if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx); /*!< in: trx which has a read view */
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return true if sees */
+UNIV_INLINE
+bool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id) /*!< in: trx id */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+ const read_view_t* view); /*!< in: read view */
+/*********************************************************************//**
+Create a consistent cursor view for MySQL to be used in cursors. In this
+consistent read view, modifications done by the creating transaction or by
+future transactions are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+ trx_t* cr_trx);/*!< in: trx where cursor view is created */
+/*********************************************************************//**
+Close a given consistent cursor view for MySQL and restore the global read
+view back to the transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+ trx_t* trx, /*!< in: trx */
+ cursor_view_t* curview); /*!< in: cursor view to be closed */
+/*********************************************************************//**
+If the given consistent cursor view is not NULL, this function sets it as
+the transaction read view. Otherwise, it restores the global read view as
+the transaction read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+ trx_t* trx, /*!< in: transaction where cursor is set */
+ cursor_view_t* curview);/*!< in: consistent cursor view to be set */
+
+/** Read view lists the trx ids of those transactions for which a consistent
+read should not see the modifications to the database. */
+
+struct read_view_t{
+ ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */
+	undo_no_t	undo_no;/*!< 0, or if type is
+				VIEW_HIGH_GRANULARITY, the transaction
+				undo_no at the time this high-granularity
+				consistent read view was created */
+ trx_id_t low_limit_no;
+ /*!< The view does not need to see the undo
+ logs for transactions whose transaction number
+ is strictly smaller (<) than this value: they
+ can be removed in purge if not needed by other
+ views */
+ trx_id_t low_limit_id;
+ /*!< The read should not see any transaction
+ with trx id >= this value. In other words,
+ this is the "high water mark". */
+ trx_id_t up_limit_id;
+ /*!< The read should see all trx ids which
+ are strictly smaller (<) than this value.
+ In other words,
+ this is the "low water mark". */
+ ulint n_trx_ids;
+ /*!< Number of cells in the trx_ids array */
+ trx_id_t* trx_ids;/*!< Additional trx ids which the read should
+ not see: typically, these are the read-write
+ active transactions at the time when the read
+ is serialized, except the reading transaction
+ itself; the trx ids in this array are in a
+ descending order. These trx_ids should be
+ between the "low" and "high" water marks,
+ that is, up_limit_id and low_limit_id. */
+ trx_id_t creator_trx_id;
+ /*!< trx id of creating transaction, or
+				0 when used in purge */
+ UT_LIST_NODE_T(read_view_t) view_list;
+ /*!< List of read views in trx_sys */
+};
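+
+/* Worked example (illustrative values only): suppose up_limit_id == 40,
+low_limit_id == 60 and trx_ids == {55, 48} (descending). A consistent
+read through this view then
+	sees trx id 30 (strictly below the low water mark),
+	does not see trx id 70 (at or above the high water mark),
+	does not see trx id 48 (listed in trx_ids),
+	sees trx id 45 (between the marks, but not in trx_ids). */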
+
+/** Read view types @{ */
+#define VIEW_NORMAL 1 /*!< Normal consistent read view
+ where transaction does not see changes
+ made by active transactions except
+ creating transaction. */
+#define VIEW_HIGH_GRANULARITY 2 /*!< High-granularity read view where
+ transaction does not see changes
+ made by active transactions and own
+ changes after a point in time when this
+ read view was created. */
+/* @} */
+
+/** Implements the InnoDB framework for supporting consistent read views
+in cursors. This struct holds both the heap where the consistent read
+view is allocated and a pointer to the read view. */
+
+struct cursor_view_t{
+ mem_heap_t* heap;
+ /*!< Memory heap for the cursor view */
+ read_view_t* read_view;
+ /*!< Consistent read view of the cursor*/
+ ulint n_mysql_tables_in_use;
+ /*!< number of Innobase tables used in the
+ processing of this cursor */
+};
+
+#ifndef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/read0read.ic b/storage/innobase/include/read0read.ic
new file mode 100644
index 00000000000..82c1028f12e
--- /dev/null
+++ b/storage/innobase/include/read0read.ic
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.ic
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates a read view object. */
+static
+bool
+read_view_validate(
+/*===============*/
+ const read_view_t* view) /*!< in: view to validate */
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ /* Check that the view->trx_ids array is in descending order. */
+ for (ulint i = 1; i < view->n_trx_ids; ++i) {
+
+ ut_a(view->trx_ids[i] < view->trx_ids[i - 1]);
+ }
+
+ return(true);
+}
+
+/** Functor to validate the view list. */
+struct ViewCheck {
+
+ ViewCheck() : m_prev_view(0) { }
+
+ void operator()(const read_view_t* view)
+ {
+ ut_a(m_prev_view == NULL
+ || m_prev_view->low_limit_no >= view->low_limit_no);
+
+ m_prev_view = view;
+ }
+
+ const read_view_t* m_prev_view;
+};
+
+/*********************************************************************//**
+Validates a read view list. */
+static
+bool
+read_view_list_validate(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_list_map(trx_sys->view_list, &read_view_t::view_list, ViewCheck());
+
+ return(true);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return true if sees */
+UNIV_INLINE
+bool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id) /*!< in: trx id */
+{
+ if (trx_id < view->up_limit_id) {
+
+ return(true);
+ } else if (trx_id >= view->low_limit_id) {
+
+ return(false);
+ } else {
+ ulint lower = 0;
+ ulint upper = view->n_trx_ids - 1;
+
+ ut_a(view->n_trx_ids > 0);
+
+ do {
+ ulint mid = (lower + upper) >> 1;
+ trx_id_t mid_id = view->trx_ids[mid];
+
+ if (mid_id == trx_id) {
+				return(false);
+ } else if (mid_id < trx_id) {
+ if (mid > 0) {
+ upper = mid - 1;
+ } else {
+ break;
+ }
+ } else {
+ lower = mid + 1;
+ }
+ } while (lower <= upper);
+ }
+
+ return(true);
+}
+
+/*********************************************************************//**
+Remove a read view from the trx_sys->view_list. */
+UNIV_INLINE
+void
+read_view_remove(
+/*=============*/
+ read_view_t* view, /*!< in: read view, can be 0 */
+ bool own_mutex) /*!< in: true if caller owns the
+ trx_sys_t::mutex */
+{
+ if (view != 0) {
+ if (!own_mutex) {
+ mutex_enter(&trx_sys->mutex);
+ }
+
+ ut_ad(read_view_validate(view));
+
+ UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+
+ ut_ad(read_view_list_validate());
+
+ if (!own_mutex) {
+ mutex_exit(&trx_sys->mutex);
+ }
+ }
+}
+
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
new file mode 100644
index 00000000000..969f4ebb637
--- /dev/null
+++ b/storage/innobase/include/read0types.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+struct read_view_t;
+struct cursor_view_t;
+
+#endif
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000000..cb3c85ac2c8
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,301 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets);
+ /*!< in: whether to check charsets */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type to be VARCHAR.
+@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+ const byte* lhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint lhs_len,/* in: data field length or UNIV_SQL_NULL */
+ const byte* rhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint rhs_len);/* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return 1, 0, -1, if lhs is greater, equal, less than rhs, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+ const byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+ const byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /* in: data field length or UNIV_SQL_NULL */
+/*****************************************************************
+This function is used to compare two varchar/char fields. The comparison
+is for the LIKE operator.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+ const byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /* in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2);/*!< in: data field */
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields of the data tuple are
+taken into account! If we denote by n = n_fields_cmp, then rec must
+either have m >= n fields, or it must differ from dtuple in some of
+the m fields it does have. If rec has an externally stored field, we
+do not compare it but return 0 when such a comparison would be made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match_low(
+/*==========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n_cmp, /*!< in: number of fields to compare */
+	ulint*		matched_fields,
+				/*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+	ulint*		matched_bytes)
+				/*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+ __attribute__((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,offsets,fields,bytes) \
+ cmp_dtuple_rec_with_match_low( \
+ tuple,rec,offsets,dtuple_get_n_fields_cmp(tuple),fields,bytes)
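+
+/* Typical call sketch (an assumption about the caller, which reuses the
+match counters across repeated comparisons, as a B-tree cursor search
+does):
+
+	ulint	matched_fields	= 0;
+	ulint	matched_bytes	= 0;
+	int	cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets,
+						&matched_fields,
+						&matched_bytes);
+
+On return, matched_fields and matched_bytes tell how far tuple and rec
+agree; the caller may feed them into the next comparison. */
+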
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval 1 if rec1 (including non-ordering columns) is greater than rec2
+@retval -1 if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ struct TABLE* table) /*!< in: MySQL table, for reporting
+ duplicate key value if applicable,
+ or NULL */
+ __attribute__((nonnull(1,2,3,4), warn_unused_result));
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index, /*!< in: data dictionary index */
+	ibool		nulls_unequal,
+			/*!< in: TRUE if this is for index statistics
+			cardinality estimation, and innodb_stats_method
+			is "nulls_unequal" or "nulls_ignored" */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when the function returns,
+				contains the value for the current
+ comparison */
+ ulint* matched_bytes);/*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index); /*!< in: data dictionary index */
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2);/* in: data field */
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2);/* in: data field */
+
+#ifndef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/rem0cmp.ic b/storage/innobase/include/rem0cmp.ic
new file mode 100644
index 00000000000..67a2dcacba1
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.ic
@@ -0,0 +1,186 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.ic
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /*!< in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2));
+}
+
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_prefix(
+/*======================*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /* in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow_like_prefix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_suffix(
+/*======================*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /* in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow_like_suffix(data1, len1, data2, len2));
+}
+/*****************************************************************
+This function is used to compare two (CHAR) data fields for the LIKE
+operator. */
+UNIV_INLINE
+int
+cmp_data_data_like_substr(
+/*======================*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ byte* data1, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ byte* data2, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /* in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow_like_substr(data1, len1, data2, len2));
+}
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2)/*!< in: data field */
+{
+ const dtype_t* type;
+
+ ut_ad(dfield_check_typed(dfield1));
+
+ type = dfield_get_type(dfield1);
+
+ return(cmp_data_data(type->mtype, type->prtype,
+ (const byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (const byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_suffix(
+/*==========================*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2)/* in: data field */
+{
+ ut_ad(dfield_check_typed(dfield1));
+
+ return(cmp_data_data_like_suffix(
+ (byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
+
+/*****************************************************************
+This function is used to compare two dfields where at least the first
+has its data type field set. */
+UNIV_INLINE
+int
+cmp_dfield_dfield_like_substr(
+/*==========================*/
+ /* out: 1, 0, -1, if dfield1 is greater, equal,
+ less than dfield2, respectively */
+ dfield_t* dfield1,/* in: data field; must have type field set */
+ dfield_t* dfield2)/* in: data field */
+{
+ ut_ad(dfield_check_typed(dfield1));
+
+ return(cmp_data_data_like_substr(
+ (byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index) /*!< in: data dictionary index */
+{
+ ulint match_f = 0;
+ ulint match_b = 0;
+
+ return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
+ FALSE, &match_f, &match_b));
+}
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000000..8e7d5ff2d48
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,988 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG 0x10UL
+/* The deleted flag in info bits */
+#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the
+ record has been delete marked */
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES 6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES 5
+
+/* Record status values */
+#define REC_STATUS_ORDINARY 0
+#define REC_STATUS_NODE_PTR 1
+#define REC_STATUS_INFIMUM 2
+#define REC_STATUS_SUPREMUM 3
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO 4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT 3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE 4
+
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+#define REC_1BYTE_SQL_NULL_MASK 0x80UL
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+#define REC_2BYTE_SQL_NULL_MASK 0x8000UL
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+#define REC_2BYTE_EXTERN_MASK 0x4000UL
+
+#ifdef UNIV_DEBUG
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 4
+#else /* UNIV_DEBUG */
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 2
+#endif /* UNIV_DEBUG */
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+#define REC_OFFS_NORMAL_SIZE 100
+#define REC_OFFS_SMALL_SIZE 10
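+
+/* A sketch of the offsets[] array passed to rec_get_offsets(), deduced
+from the accessors declared below (treat the exact slot layout as an
+implementation detail):
+
+	offsets[0]	number of allocated elements
+			(rec_offs_get_n_alloc())
+	offsets[1]	number of fields (rec_offs_n_fields())
+	...		any remaining header slots up to
+			REC_OFFS_HEADER_SIZE (debug bookkeeping
+			when UNIV_DEBUG is defined)
+	after that	essentially one end offset per field, with
+			the SQL null and extern storage flags ORed
+			into the high bits (rec_get_nth_field_offs()) */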
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+	rec_t*	rec,		/*!< in: old-style physical record */
+	ulint	n_owned)	/*!< in: the number of owned records */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n_owned)/*!< in: the number of owned records */
+ __attribute__((nonnull(1)));
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits) /*!< in: info bits */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits) /*!< in: info bits */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+ __attribute__((nonnull));
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: compact physical record */
+ ulint bits) /*!< in: info bits */
+ __attribute__((nonnull));
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag) /*!< in: nonzero if delete marked */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag) /*!< in: nonzero if delete marked */
+ __attribute__((nonnull(1)));
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no)/*!< in: the heap number */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no)/*!< in: the heap number */
+ __attribute__((nonnull));
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+ __attribute__((nonnull));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array.
+@return the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: array consisting of
+ offsets[0] allocated elements,
+ or an array from rec_get_offsets(),
+ or NULL */
+ ulint n_fields,/*!< in: maximum number of
+ initialized fields
+ (ULINT_UNDEFINED if all fields) */
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name where called */
+ ulint line, /*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+ mem_heap_t** heap) /*!< in/out: memory heap */
+#ifdef UNIV_DEBUG
+ __attribute__((nonnull(1,2,5,7),warn_unused_result));
+#else /* UNIV_DEBUG */
+ __attribute__((nonnull(1,2,5),warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+# define rec_get_offsets(rec, index, offsets, n, heap)	\
+	rec_get_offsets_func(rec, index, offsets, n, __FILE__, __LINE__, heap)
+#else /* UNIV_DEBUG */
+# define rec_get_offsets(rec, index, offsets, n, heap) \
+ rec_get_offsets_func(rec, index, offsets, n, heap)
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ ulint* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+ __attribute__((nonnull(3), warn_unused_result));
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+ __attribute__((nonnull));
+#else
+# define rec_offs_make_valid(rec, index, offsets) ((void) 0)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ __attribute__((nonnull));
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Note that even an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+ __attribute__((nonnull, pure, warn_unused_result));
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+ __attribute__((nonnull));
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
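+
+/* Usage sketch: read the nth field of a record whose offsets were
+computed with rec_get_offsets():
+
+	ulint		len;
+	const byte*	field = rec_get_nth_field(rec, offsets, n, &len);
+
+	if (len == UNIV_SQL_NULL) {
+		... the field is SQL NULL ...
+	}
+*/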
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec) */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+ __attribute__((nonnull, pure, warn_unused_result));
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+ __attribute__((nonnull, pure, warn_unused_result));
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data if not SQL null */
+ ulint len) /*!< in: length of the data or UNIV_SQL_NULL.
+ If not SQL null, must have the same
+ length as the previous value.
+ If SQL null, previous value must be
+ SQL null. */
+ __attribute__((nonnull(1,2)));
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets)/*!< in: array for rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+ __attribute__((nonnull));
+#define rec_offs_init(offsets) \
+ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
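+
+/* Typical caller sketch (an idiom, shown here for exposition; rec and
+index are assumed given): a stack array covers the common case, and
+rec_get_offsets() falls back to a heap only when the record has more
+fields than REC_OFFS_NORMAL_SIZE allows for:
+
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	...
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+*/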
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+Returns the total size of a record minus its data size, that is,
+the extra size. The value returned by the function is the distance
+from the record start to the record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull, pure, warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
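+/* Editorial note: a record thus occupies the half-open byte range
+[rec_get_start(rec, offsets), rec_get_end(rec, offsets)), whose length
+equals rec_offs_size(offsets), i.e. extra size plus data size. */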
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Determines the size of a data tuple prefix in a temporary file.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_temp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+ __attribute__((warn_unused_result, nonnull));
+
+/******************************************************//**
+Determine the offset to each field in temporary file.
+@see rec_convert_dtuple_to_temp() */
+UNIV_INTERN
+void
+rec_init_offsets_temp(
+/*==================*/
+ const rec_t* rec, /*!< in: temporary file record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+ __attribute__((nonnull));
+
+/*********************************************************//**
+Builds a temporary file record out of a data tuple.
+@see rec_init_offsets_temp() */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_temp(
+/*=======================*/
+ rec_t* rec, /*!< out: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields) /*!< in: number of fields */
+ __attribute__((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+ __attribute__((nonnull));
+/************************************************************//**
+Folds a prefix of a physical record to a ulint.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ index_id_t tree_id) /*!< in: index tree id */
+ __attribute__((nonnull, pure, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+ __attribute__((nonnull, warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ __attribute__((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+ __attribute__((warn_unused_result, nonnull(1,2)));
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+ __attribute__((nonnull(1,3)));
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ __attribute__((warn_unused_result, nonnull));
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple.
+The fields are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /*!< out: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ mem_heap_t* heap) /*!< in: memory heap */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull));
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+ __attribute__((nonnull));
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+ __attribute__((nonnull));
+
+# ifdef UNIV_DEBUG
+/************************************************************//**
+Reads the DB_TRX_ID of a clustered index record.
+@return the value of DB_TRX_ID */
+UNIV_INTERN
+trx_id_t
+rec_get_trx_id(
+/*===========*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index) /*!< in: clustered index */
+ __attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
+
+/* The data size of a record must be smaller than this, because we reserve
+the two uppermost bits of a two-byte offset for the SQL null and extern
+flags */
+#define REC_MAX_DATA_SIZE (16 * 1024)
+
+#ifndef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/rem0rec.ic b/storage/innobase/include/rem0rec.ic
new file mode 100644
index 00000000000..a539320dd2a
--- /dev/null
+++ b/storage/innobase/include/rem0rec.ic
@@ -0,0 +1,1718 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */
+#define REC_OFFS_COMPACT ((ulint) 1 << 31)
+/* SQL NULL flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_SQL_NULL ((ulint) 1 << 31)
+/* External flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_EXTERNAL ((ulint) 1 << 30)
+/* Mask for offsets returned by rec_get_offsets() */
+#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1)
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table,
+the most significant bytes and bits are written below the less significant
+ones.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table,
+the most significant bytes and bits are written below the less significant
+ones.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits relative offset of next record
+ 2 8 bits relative offset of next record
+ the relative offset is an unsigned 16-bit
+ integer:
+ (offset_of_next_record
+ - offset_of_this_record) mod 64Ki,
+ where mod is the modulo as a non-negative
+ number;
+ we can calculate the offset of the next
+ record with the formula:
+ (relative_offset + offset_of_this_record)
+ mod UNIV_PAGE_SIZE
+ 3 3 bits status:
+ 000=conventional record
+ 001=node pointer record (inside B-tree)
+ 010=infimum record
+ 011=supremum record
+ 1xx=reserved
+ 5 bits heap number
+ 4 8 bits heap number
+ 5 4 bits n_owned
+ 4 bits info bits
+*/
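+
+/* Editorial worked example (assuming 16 KiB pages): if this record is
+at page offset 16000 and the next record is at page offset 116, the
+stored relative offset is (116 - 16000) mod 64Ki = 49652, and a reader
+recovers the next offset as (16000 + 49652) mod 16384 = 116. */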
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFFUL
+#define REC_NEXT_SHIFT 0
+
+#define REC_OLD_SHORT 3 /* This is a single-byte bit-field */
+#define REC_OLD_SHORT_MASK 0x1UL
+#define REC_OLD_SHORT_SHIFT 0
+
+#define REC_OLD_N_FIELDS 4
+#define REC_OLD_N_FIELDS_MASK 0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT 1
+
+#define REC_NEW_STATUS 3 /* This is a single-byte bit-field */
+#define REC_NEW_STATUS_MASK 0x7UL
+#define REC_NEW_STATUS_SHIFT 0
+
+#define REC_OLD_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
+#define REC_NEW_HEAP_NO 4
+#define REC_HEAP_NO_SHIFT 3
+#endif
+
+#define REC_OLD_N_OWNED 6 /* This is a single-byte bit-field */
+#define REC_NEW_N_OWNED 5 /* This is a single-byte bit-field */
+#define REC_N_OWNED_MASK 0xFUL
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_OLD_INFO_BITS 6 /* This is a single-byte bit-field */
+#define REC_NEW_INFO_BITS 5 /* This is a single-byte bit-field */
+#define REC_INFO_BITS_MASK 0xF0UL
+#define REC_INFO_BITS_SHIFT 0
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+ ^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+ ^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
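+
+/* Editorial note: the two compile-time checks above XOR each bit-field
+mask, shifted to its byte position within the fixed extra bytes, and
+fail the build unless the masks tile the old-style (4-byte) and
+new-style (3-byte) bit-field areas exactly, covering every bit once. */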
+
+/***********************************************************//**
+Sets the SQL null bit of the ith field of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint i, /*!< in: ith field */
+ ibool val); /*!< in: value to set */
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint n); /*!< in: index of the field */
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_1(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_1(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFFUL);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
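+
+/* Editorial worked example: the old-style heap number occupies the top
+13 bits of the two bytes at rec - REC_OLD_HEAP_NO, so
+rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, REC_HEAP_NO_MASK,
+REC_HEAP_NO_SHIFT) reads those two bytes, masks away the low 3 bits
+(which belong to the n_fields bit-field) and shifts right by 3. */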
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+
+ ut_ad(REC_NEXT_MASK == 0xFFFFUL);
+ ut_ad(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (field_value == 0) {
+
+ return(NULL);
+ }
+
+ if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as a signed 16-bit integer in 2's complement arithmetic.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ bytes between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp)));
+}
+
+/******************************************************//**
+The following function is used to get the offset of the next chained record
+on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (comp) {
+#if UNIV_PAGE_SIZE_MAX <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as a signed 16-bit integer in 2's complement arithmetic.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ if (field_value == 0) {
+
+ return(0);
+ }
+
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ bytes between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return(field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ulint field_value;
+
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+
+ if (!next) {
+ field_value = 0;
+ } else {
+ /* The following two statements calculate
+ (next - offset_of_rec) mod 64Ki, where mod is
+ the modulo as a non-negative number */
+
+ field_value = (ulint)
+ ((lint) next
+ - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE));
+ field_value &= REC_NEXT_MASK;
+ }
+
+ mach_write_to_2(rec - REC_NEXT, field_value);
+}
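+
+/* Editorial worked example: with this record at page offset 300 and
+next == 120, the code above stores (120 - 300) & 0xFFFF = 65356;
+rec_get_next_offs() then recovers (300 + 65356) mod 16384 = 120
+(assuming 16 KiB pages). */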
+
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to set the number of fields
+in an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_fields) /*!< in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_1(rec, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+ ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(rec);
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ return(rec_get_n_fields_old(rec));
+ }
+
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ return(dict_index_get_n_fields(index));
+ case REC_STATUS_NODE_PTR:
+ return(dict_index_get_n_unique_in_tree(index) + 1);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ return(1);
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint n_owned) /*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n_owned)/*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (page_zip && rec_get_status(rec) != REC_STATUS_SUPREMUM) {
+ page_zip_rec_set_owned(page_zip, rec, n_owned);
+ }
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return(rec_get_bit_field_1(
+ rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: status bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint bits;
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ if (comp) {
+ bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec);
+ } else {
+ bits = rec_get_info_bits(rec, FALSE);
+ ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ }
+ return(bits);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info and status bits */
+{
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+ rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ if (comp) {
+ return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ } else {
+ return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT));
+ }
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, FALSE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_old(rec, val);
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, TRUE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_new(rec, val);
+
+ if (page_zip) {
+ page_zip_rec_set_deleted(page_zip, rec, flag);
+ }
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+
+ return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1-byte form */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+ ut_ad(flag <= TRUE);
+
+ rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return end offset of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return end offset of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
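+
+/* Editorial note: the old-style end-offset bytes are thus stored below
+the fixed extra bytes, growing away from the record origin; the end
+info of field 0 lives at rec - (REC_N_OLD_EXTRA_BYTES + 1) in the
+1-byte form and at rec - (REC_N_OLD_EXTRA_BYTES + 2) in the 2-byte
+form. */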
+
+/******************************************************//**
+Returns nonzero if the field is stored off-page.
+@retval 0 if the field is stored in-page
+@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */
+UNIV_INLINE
+ulint
+rec_2_is_field_extern(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK);
+}
+
+/* Get the base address of offsets. The extra_size is stored at
+this position, and the following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
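+
+/* Editorial sketch of the layout implied by the accessors below (the
+rec and index pointers exist only under UNIV_DEBUG; see rem0rec.h for
+the authoritative definition of REC_OFFS_HEADER_SIZE):
+ offsets[0] number of allocated elements
+ offsets[1] number of fields
+ offsets[2], offsets[3] rec and index pointers (debug only)
+ rec_offs_base(offsets)[0] extra size, ORed with REC_OFFS_COMPACT
+ and/or REC_OFFS_EXTERNAL
+ rec_offs_base(offsets)[1 + n] end offset of field n, ORed with
+ REC_OFFS_SQL_NULL and/or REC_OFFS_EXTERNAL */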
+
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets)/*!< in: array for rec_get_offsets() */
+{
+ ulint n_alloc;
+ ut_ad(offsets);
+ n_alloc = offsets[0];
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets);
+ return(n_alloc);
+}
+
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+{
+ ut_ad(offsets);
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets);
+ offsets[0] = n_alloc;
+}
+
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ut_ad(offsets);
+ n_fields = offsets[1];
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ return(n_fields);
+}
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ulint i = rec_offs_n_fields(offsets);
+ ulint last = ULINT_MAX;
+ ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT;
+
+ if (rec) {
+ ut_ad((ulint) rec == offsets[2]);
+ if (!comp) {
+ ut_a(rec_get_n_fields_old(rec) >= i);
+ }
+ }
+ if (index) {
+ ulint max_n_fields;
+ ut_ad((ulint) index == offsets[3]);
+ max_n_fields = ut_max(
+ dict_index_get_n_fields(index),
+ dict_index_get_n_unique_in_tree(index) + 1);
+ if (comp && rec) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_NODE_PTR:
+ max_n_fields = dict_index_get_n_unique_in_tree(
+ index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ max_n_fields = 1;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ /* index->n_def == 0 for dummy indexes if !comp */
+ ut_a(!comp || index->n_def);
+ ut_a(!index->n_def || i <= max_n_fields);
+ }
+ while (i--) {
+ ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK;
+ ut_a(curr <= last);
+ last = curr;
+ }
+ return(TRUE);
+}
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ut_ad(rec);
+ ut_ad(index);
+ ut_ad(offsets);
+ ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets));
+ offsets[2] = (ulint) rec;
+ offsets[3] = (ulint) index;
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+{
+ ulint offs;
+ ulint length;
+ ut_ad(n < rec_offs_n_fields(offsets));
+ ut_ad(len);
+
+ if (n == 0) {
+ offs = 0;
+ } else {
+ offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
+ }
+
+ length = rec_offs_base(offsets)[1 + n];
+
+ if (length & REC_OFFS_SQL_NULL) {
+ length = UNIV_SQL_NULL;
+ } else {
+ length &= REC_OFFS_MASK;
+ length -= offs;
+ }
+
+ *len = length;
+ return(offs);
+}
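+
+/* Editorial worked example: for a three-field record whose stored end
+offsets are {4, 4 | REC_OFFS_SQL_NULL, 10}, field 0 starts at offset 0
+with *len == 4, field 1 starts at offset 4 with *len == UNIV_SQL_NULL,
+and field 2 starts at offset 4 with *len == 10 - 4 = 6. */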
+
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(*rec_offs_base(offsets) & REC_OFFS_COMPACT);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec) */
+{
+ ulint i;
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_any_extern(offsets)) {
+ return(NULL);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ const byte* field
+ = rec_get_nth_field(rec, offsets, i, &len);
+
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ if (!memcmp(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE)) {
+ return(field);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(rec_offs_base(offsets)[1 + n] & REC_OFFS_EXTERNAL);
+}
+
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(rec_offs_base(offsets)[1 + n] & REC_OFFS_SQL_NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ if (!n) {
+ return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK);
+ }
+ return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n])
+ & REC_OFFS_MASK);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n = 0;
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ n++;
+ }
+ }
+ }
+
+ return(n);
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field end if the record is stored in the
+1-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value. This function and its 2-byte counterpart are defined here
+because the C compiler could not sum negative and positive constant offsets
+without warning of constant arithmetic overflow.
+@return end offset of the previous field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field end if the record is stored in the
+2-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value.
+@return end offset of the previous field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+An SQL null field may also have a size > 0,
+if its data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < UNIV_PAGE_SIZE);
+
+ return(next_os - os);
+}
+
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data
+ if not SQL null */
+ ulint len) /*!< in: length of the data or UNIV_SQL_NULL */
+{
+ byte* data2;
+ ulint len2;
+
+ ut_ad(rec);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (len == UNIV_SQL_NULL) {
+ if (!rec_offs_nth_sql_null(offsets, n)) {
+ ut_a(!rec_offs_comp(offsets));
+ rec_set_nth_field_sql_null(rec, n);
+ }
+
+ return;
+ }
+
+ data2 = rec_get_nth_field(rec, offsets, n, &len2);
+ if (len2 == UNIV_SQL_NULL) {
+ ut_ad(!rec_offs_comp(offsets));
+ rec_set_nth_field_null_bit(rec, n, FALSE);
+ ut_ad(len == rec_get_nth_field_size(rec, n));
+ } else {
+ ut_ad(len2 == len);
+ }
+
+ ut_memcpy(data2, data, len);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is, the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+ ulint* offsets, /*!< in/out: array returned by
+ rec_get_offsets() */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(offsets);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ offsets[1] = n_fields;
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is, the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)]
+ & REC_OFFS_MASK;
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a record minus its data size, that is, the extra
+size. The value returned by the function is the distance from the record
+start to the record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL);
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ const rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec && buf);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+
+ extra_len = rec_offs_extra_size(offsets);
+ data_len = rec_offs_data_size(offsets);
+
+ ut_memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*) buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_OLD_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
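+
+/* Editorial worked example: 100 bytes of data in 3 fields with
+n_ext == 0 fit the 1-byte offsets form (100 <= REC_1BYTE_OFFS_LIMIT),
+giving REC_N_OLD_EXTRA_BYTES + 3 extra bytes; with 200 bytes of data
+the 2-byte form is required and the extra size is
+REC_N_OLD_EXTRA_BYTES + 6. */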
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(index);
+ ut_ad(dtuple);
+ ut_ad(dtuple_check_typed(dtuple));
+
+ ut_ad(dict_index_is_univ(index)
+ || dtuple_get_n_fields(dtuple)
+ == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ == REC_STATUS_NODE_PTR)
+ ? dict_index_get_n_unique_in_tree(index) + 1
+ : dict_index_get_n_fields(index)));
+
+ if (dict_table_is_comp(index->table)) {
+ return(rec_get_converted_size_comp(index,
+ dtuple_get_info_bits(dtuple)
+ & REC_NEW_STATUS_MASK,
+ dtuple->fields,
+ dtuple->n_fields, NULL));
+ }
+
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+#if 0
+ /* This code is inactive since it may be the wrong place to add
+ in the size of node pointers used in parent pages AND it is not
+ currently needed since ha_innobase::max_supported_key_length()
+ ensures that the key size limit for each page size is well below
+ the actual limit ((free space on page / 4) - record overhead).
+ But those limits will need to be raised when InnoDB can
+ support multiple page sizes. At that time, we will need
+ to consider the node pointer on these universal btrees. */
+
+ if (dict_index_is_univ(index)) {
+ /* This is for the insert buffer B-tree.
+ All fields in the leaf tuple ascend to the
+ parent node plus the child page pointer. */
+
+ /* ibuf cannot contain externally stored fields */
+ ut_ad(n_ext == 0);
+
+ /* Add the data pointer and recompute extra_size
+ based on one more field. */
+ data_size += REC_NODE_PTR_SIZE;
+ extra_size = rec_get_converted_extra_size(
+ data_size,
+ dtuple_get_n_fields(dtuple) + 1,
+ 0);
+
+ /* Be sure dtuple->n_fields has this node ptr
+ accounted for. This function should correspond to
+ what rec_convert_dtuple_to_rec() needs in storage.
+ In optimistic insert or update-not-in-place, we will
+ have to ensure that if the record is converted to a
+ node pointer, it will not become too large.*/
+ }
+#endif
+
+ return(data_size + extra_size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Folds a prefix of a physical record to a ulint. Folds only existing fields,
+that is, checks that we do not run out of the record.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ index_id_t tree_id) /*!< in: index tree id */
+{
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+ ulint n_fields_rec;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(n_fields + n_bytes > 0);
+
+ n_fields_rec = rec_offs_n_fields(offsets);
+ ut_ad(n_fields <= n_fields_rec);
+ ut_ad(n_fields < n_fields_rec || n_bytes == 0);
+
+ if (n_fields > n_fields_rec) {
+ n_fields = n_fields_rec;
+ }
+
+ if (n_fields == n_fields_rec) {
+ n_bytes = 0;
+ }
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
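+
+/* A caller sketch in the style of the adaptive hash index code
+(assuming "offsets" was computed by rec_get_offsets() for "rec" in
+"index", and n_fields/n_bytes describe the hashed prefix):
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+Records that agree on the hashed prefix hash to the same fold value. */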
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h
new file mode 100644
index 00000000000..f8133f77466
--- /dev/null
+++ b/storage/innobase/include/rem0types.h
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* Maximum number of user defined fields/columns. The reserved columns
+are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR.
+We need "* 2" because mlog_parse_index() creates a dummy table object
+possibly, with some of the system columns in it, and then adds the 3
+system columns (again) using dict_table_add_system_columns(). The problem
+is that mlog_parse_index() cannot recognize the system columns by
+just having n_fields, n_uniq and the lengths of the columns. */
+#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2)
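+/* For example, assuming DATA_N_SYS_COLS == 3, this allows
+(1024 - 1) - 3 * 2 = 1017 user-defined columns. */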
+
+/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed field length (or indexed prefix length) for indexes on tables of
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format.
+Until UTF-8 encodings with mbmaxlen = 4 are supported, a UTF-8 character
+takes at most 3 bytes, so the limit was set to 3*256 = 768 bytes. This
+lets one create a column prefix index on 256 characters of a TEXT or
+VARCHAR column also in the UTF-8 charset.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768
+
+/** Maximum indexed field length for table format UNIV_FORMAT_B and
+beyond.
+This (3072) is the maximum allowed index row length, so we cannot create
+an index prefix column longer than that.
+#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072
+
+/** InnoDB row types are a subset of the MySQL global enum row_type.
+They are made into their own enum so that switch statements can account
+for each of them. */
+enum rec_format_enum {
+ REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */
+ REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */
+ REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */
+ REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */
+};
+typedef enum rec_format_enum rec_format_t;
+
+#endif
diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h
new file mode 100644
index 00000000000..a098e2f9b29
--- /dev/null
+++ b/storage/innobase/include/row0ext.h
@@ -0,0 +1,102 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "univ.i"
+#include "row0types.h"
+#include "data0types.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ ulint flags, /*!< in: table->flags */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap); /*!< in: heap where created */
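+
+/* A minimal creation sketch (assuming "n_ext" columns whose column
+numbers are listed in "ext_cols", and a row "tuple" indexed by column
+number):
+
+ row_ext_t* ext = row_ext_create(
+ n_ext, ext_cols, table->flags, tuple, heap);
+*/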
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most the length determined by
+ DICT_MAX_FIELD_LEN_BY_FORMAT() */
+
+/** Prefixes of externally stored columns */
+struct row_ext_t{
+ ulint n_ext; /*!< number of externally stored columns */
+ const ulint* ext; /*!< col_no's of externally stored columns */
+ byte* buf; /*!< backing store of the column prefix cache */
+ ulint max_len;/*!< maximum prefix length, it could be
+ REC_ANTELOPE_MAX_INDEX_COL_LEN or
+ REC_VERSION_56_MAX_INDEX_COL_LEN depending
+ on row format */
+ ulint len[1]; /*!< prefix lengths; 0 if not cached */
+};
+
+#ifndef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0ext.ic b/storage/innobase/include/row0ext.ic
new file mode 100644
index 00000000000..39e150d91d5
--- /dev/null
+++ b/storage/innobase/include/row0ext.ic
@@ -0,0 +1,87 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ut_ad(ext);
+ ut_ad(len);
+ ut_ad(i < ext->n_ext);
+
+ *len = ext->len[i];
+
+ ut_ad(*len <= ext->max_len);
+ ut_ad(ext->max_len > 0);
+
+ if (*len == 0) {
+ /* The BLOB could not be fetched to the cache. */
+ return(field_ref_zero);
+ } else {
+ return(ext->buf + i * ext->max_len);
+ }
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most ext->max_len */
+{
+ ulint i;
+
+ ut_ad(ext);
+ ut_ad(len);
+
+ for (i = 0; i < ext->n_ext; i++) {
+ if (col == ext->ext[i]) {
+ return(row_ext_lookup_ith(ext, i, len));
+ }
+ }
+
+ return(NULL);
+}
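+
+/* A caller sketch (assuming "ext" came from row_ext_create() and "col"
+is a column number in the InnoDB table object):
+
+ ulint len;
+ const byte* data = row_ext_lookup(ext, col, &len);
+
+A NULL result means the column is not stored externally; field_ref_zero
+means the BLOB pointer is unset; otherwise "data" points to "len" bytes
+of cached column prefix. */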
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
new file mode 100644
index 00000000000..4e04a099140
--- /dev/null
+++ b/storage/innobase/include/row0ftsort.h
@@ -0,0 +1,279 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ftsort.h
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#ifndef row0ftsort_h
+#define row0ftsort_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "row0mysql.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "fts0priv.h"
+#include "row0merge.h"
+
+/** This structure defines the information that the scan thread will
+fetch and put on the linked list for the parallel tokenization/sort
+threads to process */
+typedef struct fts_doc_item fts_doc_item_t;
+
+/** Information about temporary files used in merge sort */
+struct fts_doc_item {
+ dfield_t* field; /*!< field contains document string */
+ doc_id_t doc_id; /*!< document ID */
+ UT_LIST_NODE_T(fts_doc_item_t) doc_list;
+ /*!< list of doc items */
+};
+
+/** This defines the list type that scan thread would feed the parallel
+tokenization threads and sort threads. */
+typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t;
+
+#define FTS_NUM_AUX_INDEX 6
+#define FTS_PLL_MERGE 1
+
+/** Sort information passed to each individual parallel sort thread */
+struct fts_psort_t;
+
+/** Common info passed to each parallel sort thread */
+struct fts_psort_common_t {
+ row_merge_dup_t* dup; /*!< descriptor of FTS index */
+ dict_table_t* new_table; /*!< source table */
+ trx_t* trx; /*!< transaction */
+ fts_psort_t* all_info; /*!< all parallel sort info */
+ os_event_t sort_event; /*!< sort event */
+ os_event_t merge_event; /*!< merge event */
+ ibool opt_doc_id_size;/*!< whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort, if
+ Doc ID will not be big enough
+ to use 8 bytes value */
+};
+
+struct fts_psort_t {
+ ulint psort_id; /*!< Parallel sort ID */
+ row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX];
+ /*!< sort buffer */
+ merge_file_t* merge_file[FTS_NUM_AUX_INDEX];
+ /*!< sort file */
+ row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX];
+ /*!< buffer to write to file */
+ row_merge_block_t* block_alloc[FTS_NUM_AUX_INDEX];
+ /*!< allocated buffer backing
+ merge_block */
+ ulint child_status; /*!< child thread status */
+ ulint state; /*!< parent thread state */
+ fts_doc_list_t fts_doc_list; /*!< doc list to process */
+ fts_psort_common_t* psort_common; /*!< ptr to all psort info */
+ os_thread_t thread_hdl; /*!< thread handler */
+ dberr_t error; /*!< db error during psort */
+ ulint memory_used; /*!< memory used by fts_doc_list */
+ ib_mutex_t mutex; /*!< mutex for fts_doc_list */
+};
+
+/** Structure stores information from string tokenization operation */
+struct fts_tokenize_ctx {
+ ulint processed_len; /*!< processed string length */
+ ulint init_pos; /*!< doc start position */
+ ulint buf_used; /*!< the sort buffer (ID) when
+ tokenization stops, which
+ could due to sort buffer full */
+ ulint rows_added[FTS_NUM_AUX_INDEX];
+ /*!< number of rows added for
+ each FTS index partition */
+ ib_rbt_t* cached_stopword;/*!< in: stopword list */
+ dfield_t sort_field[FTS_NUM_FIELDS_SORT];
+ /*!< in: sort field */
+};
+
+typedef struct fts_tokenize_ctx fts_tokenize_ctx_t;
+
+/** Structure stores information needed for the insertion phase of FTS
+parallel sort. */
+struct fts_psort_insert {
+ trx_t* trx; /*!< Transaction used for insertion */
+ que_t** ins_graph; /*!< insert graph */
+ fts_table_t fts_table; /*!< auxiliary table */
+ CHARSET_INFO* charset; /*!< charset info */
+ mem_heap_t* heap; /*!< heap */
+ ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes)
+ integer for Doc ID */
+};
+
+typedef struct fts_psort_insert fts_psort_insert_t;
+
+
+/** status bit used for communication between parent and child thread */
+#define FTS_PARENT_COMPLETE 1
+#define FTS_PARENT_EXITING 2
+#define FTS_CHILD_COMPLETE 1
+#define FTS_CHILD_EXITING 2
+
+/** Print some debug information */
+#define FTSORT_PRINT
+
+#ifdef FTSORT_PRINT
+#define DEBUG_FTS_SORT_PRINT(str) \
+ do { \
+ ut_print_timestamp(stderr); \
+ fprintf(stderr, str); \
+ } while (0)
+#else
+#define DEBUG_FTS_SORT_PRINT(str)
+#endif /* FTSORT_PRINT */
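+
+/* For example, DEBUG_FTS_SORT_PRINT("FTS_SORT: tokenization done\n")
+expands to a timestamped fprintf(stderr, ...) when FTSORT_PRINT is
+defined, and to nothing otherwise. */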
+
+/*************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) the tokenized word,
+2) the Doc ID, and
+3) the word's position in the original document.
+
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ const dict_table_t* table, /*!< in: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size);
+ /*!< out: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+
+/********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ row_merge_dup_t* dup, /*!< in,own: descriptor of
+ FTS index being created */
+ const dict_table_t* new_table,/*!< in: table where indexes are
+ created */
+ ibool opt_doc_id_size,
+ /*!< in: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+ fts_psort_t** psort, /*!< out: parallel sort info to be
+ instantiated */
+ fts_psort_t** merge) /*!< out: parallel merge info
+ to be instantiated */
+ __attribute__((nonnull));
+/********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+temporary merge sort files */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info); /*!< parallel merge info */
+/********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+ void* arg); /*!< in: psort_info for the thread */
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info); /*!< in: parallel sort info */
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+ void* arg); /*!< in: parallel merge info */
+/*********************************************************************//**
+Kick off the parallel merge and insert thread */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info); /*!< in: parallel sort info */
+/********************************************************************//**
+Read the sorted FTS data files and insert the data tuples into the
+auxiliary tables. */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+ fts_psort_insert_t*
+ ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word, /*!< in: last processed
+ tokenized word */
+ ib_vector_t* positions, /*!< in: word position */
+ doc_id_t* in_doc_id, /*!< in: last item doc id */
+ dtuple_t* dtuple); /*!< in: entry to insert */
+/********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent node to which this value propagated */
+UNIV_INTERN
+int
+row_merge_fts_sel_propagate(
+/*========================*/
+ int propogated, /*<! in: tree node propagated */
+ int* sel_tree, /*<! in: selection tree */
+ ulint level, /*<! in: selection tree level */
+ const mrec_t** mrec, /*<! in: sort record */
+ ulint** offsets, /*<! in: record offsets */
+ dict_index_t* index); /*<! in: FTS index */
+/********************************************************************//**
+Read the sorted file containing index data tuples and insert these data
+tuples into the index.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ ulint id) /* !< in: which auxiliary table's data
+ to insert to */
+ __attribute__((nonnull));
+#endif /* row0ftsort_h */
diff --git a/storage/innobase/include/row0import.h b/storage/innobase/include/row0import.h
new file mode 100644
index 00000000000..aa46fdb7c27
--- /dev/null
+++ b/storage/innobase/include/row0import.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.h
+Header file for import tablespace functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0import_h
+#define row0import_h
+
+#include "univ.i"
+#include "db0err.h"
+#include "dict0types.h"
+
+// Forward declarations
+struct trx_t;
+struct dict_table_t;
+struct row_prebuilt_t;
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct
+ in MySQL */
+ __attribute__((nonnull, warn_unused_result));
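+
+/* This is the InnoDB entry point of ALTER TABLE ... IMPORT TABLESPACE;
+a hypothetical caller sketch:
+
+ dberr_t err = row_import_for_mysql(prebuilt->table, prebuilt);
+*/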
+
+/*****************************************************************//**
+Update the DICT_TF2_DISCARDED flag in SYS_TABLES.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_import_update_discarded_flag(
+/*=============================*/
+ trx_t* trx, /*!< in/out: transaction that
+ covers the update */
+ table_id_t table_id, /*!< in: Table for which we want
+ to set the root table->flags2 */
+ bool discarded, /*!< in: set MIX_LEN column bit
+ to discarded, if true */
+ bool dict_locked) /*!< in: Set to true if the
+ caller already owns the
+ dict_sys_t::mutex. */
+ __attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Update the (space, root page) of a table's indexes from the values
+in the data dictionary.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_import_update_index_root(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction that
+ covers the update */
+ const dict_table_t* table, /*!< in: Table for which we want
+ to set the root page_no */
+ bool reset, /*!< in: if true then set to
+ FIL_NUL */
+ bool dict_locked) /*!< in: Set to true if the
+ caller already owns the
+ dict_sys_t::mutex. */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_NONINL
+#include "row0import.ic"
+#endif
+
+#endif /* row0import_h */
diff --git a/storage/innobase/include/row0import.ic b/storage/innobase/include/row0import.ic
new file mode 100644
index 00000000000..c5bbab49f6f
--- /dev/null
+++ b/storage/innobase/include/row0import.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0import.ic
+
+Import tablespace inline functions.
+
+Created 2012-02-08 Sunny Bains
+*******************************************************/
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
new file mode 100644
index 00000000000..2a892d2f5df
--- /dev/null
+++ b/storage/innobase/include/row0ins.h
@@ -0,0 +1,240 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE If we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row); /*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The delete marked record is then
+updated in place to become the inserted entry, and we must write an
+undo log record for the delete marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread or NULL */
+ __attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Tries to insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+row_ins_index_entry_big_rec_func(
+/*=============================*/
+ const dtuple_t* entry, /*!< in/out: index entry to insert */
+ const big_rec_t* big_rec,/*!< in: externally stored fields */
+ ulint* offsets,/*!< in/out: rec offsets */
+ mem_heap_t** heap, /*!< in/out: memory heap */
+ dict_index_t* index, /*!< in: index */
+ const char* file, /*!< in: file name of caller */
+#ifndef DBUG_OFF
+ const void* thd, /*!< in: connection, or NULL */
+#endif /* DBUG_OFF */
+ ulint line) /*!< in: line number of caller */
+ __attribute__((nonnull(1,2,3,4,5,6), warn_unused_result));
+#ifdef DBUG_OFF
+# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \
+ row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,line)
+#else /* DBUG_OFF */
+# define row_ins_index_entry_big_rec(e,big,ofs,heap,index,thd,file,line) \
+ row_ins_index_entry_big_rec_func(e,big,ofs,heap,index,file,thd,line)
+#endif /* DBUG_OFF */
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ __attribute__((nonnull, warn_unused_result));
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Insert node structure */
+
+struct ins_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /*!< row to insert */
+ dict_table_t* table; /*!< table where to insert */
+ sel_node_t* select; /*!< select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index where the index
+ entry should be inserted */
+ dtuple_t* entry; /*!< NULL, or entry to insert in the index;
+ after a successful insert of the entry,
+ this should be reset to NULL */
+ UT_LIST_BASE_NODE_T(dtuple_t)
+ entry_list;/* list of entries, one for each index */
+ byte* row_id_buf;/* buffer for the row id sys field in row */
+ trx_id_t trx_id; /*!< trx id or the last trx which executed the
+ node */
+ byte* trx_id_buf;/* buffer for the trx id sys field in row */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ ulint magic_n;
+};
+
+#define INS_NODE_MAGIC_N 15849075
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0ins.ic b/storage/innobase/include/row0ins.ic
new file mode 100644
index 00000000000..9c191d869a2
--- /dev/null
+++ b/storage/innobase/include/row0ins.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.ic
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
new file mode 100644
index 00000000000..62715fe8808
--- /dev/null
+++ b/storage/innobase/include/row0log.h
@@ -0,0 +1,239 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.h
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#ifndef row0log_h
+#define row0log_h
+
+#include "univ.i"
+#include "mtr0types.h"
+#include "row0types.h"
+#include "rem0types.h"
+#include "data0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@return true on success, false on failure */
+UNIV_INTERN
+bool
+row_log_allocate(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* add_cols,
+ /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map)/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+ __attribute__((nonnull(1), warn_unused_result));
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+UNIV_INTERN
+void
+row_log_free(
+/*=========*/
+ row_log_t*& log) /*!< in,own: row log */
+ __attribute__((nonnull));
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*==============*/
+ dict_index_t* index) /*!< in/out: index (x-latched) */
+ __attribute__((nonnull));
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+UNIV_INTERN
+void
+row_log_online_op(
+/*==============*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+ UNIV_COLD __attribute__((nonnull));
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+UNIV_INTERN
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+ UNIV_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+UNIV_INTERN
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk) /*!< in: row_log_table_get_pk()
+ before the update */
+ UNIV_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+UNIV_INTERN
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index),
+ or NULL */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+ UNIV_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert(). */
+UNIV_INTERN
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */
+ UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_free(
+/*====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+ UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_alloc(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+ UNIV_COLD __attribute__((nonnull));
+/******************************************************//**
+Apply the row_log_table log to a table upon completing rebuild.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_table_apply(
+/*================*/
+ que_thr_t* thr, /*!< in: query graph */
+ dict_table_t* old_table,
+ /*!< in: old table */
+ struct TABLE* table) /*!< in/out: MySQL table
+ (for reporting duplicates) */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+UNIV_INTERN
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+ __attribute__((nonnull, warn_unused_result));
+
+/******************************************************//**
+Merge the row log to the index upon completing index creation.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_apply(
+/*==========*/
+ trx_t* trx, /*!< in: transaction (for checking if
+ the operation was interrupted) */
+ dict_index_t* index, /*!< in/out: secondary index */
+ struct TABLE* table) /*!< in/out: MySQL table
+ (for reporting duplicates) */
+ __attribute__((nonnull, warn_unused_result));
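+
+/* A rough sketch of the online secondary index creation protocol
+(argument values for row_log_allocate() are illustrative assumptions):
+
+ row_log_allocate(index, NULL, true, NULL, NULL);
+ ... build the index while concurrent DML is logged via
+ row_log_online_op() ...
+ err = row_log_apply(trx, index, table);
+ row_log_free(index->online_log);
+*/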
+
+#ifndef UNIV_NONINL
+#include "row0log.ic"
+#endif
+
+#endif /* row0log_h */
diff --git a/storage/innobase/include/row0log.ic b/storage/innobase/include/row0log.ic
new file mode 100644
index 00000000000..b0f37dbd8e7
--- /dev/null
+++ b/storage/innobase/include/row0log.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0log.ic
+Modification log for online index creation and online table rebuild
+
+Created 2012-10-18 Marko Makela
+*******************************************************/
+
+#include "dict0dict.h"
+
+/******************************************************//**
+Free the row log for an index on which online creation was aborted. */
+UNIV_INLINE
+void
+row_log_abort_sec(
+/*===============*/
+ dict_index_t* index) /*!< in/out: index (x-latched) */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(!dict_index_is_clust(index));
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ row_log_free(index->online_log);
+}
+
+/******************************************************//**
+Try to log an operation to a secondary index that is
+(or was) being created.
+@retval true if the operation was logged or can be ignored
+@retval false if online index creation is not taking place */
+UNIV_INLINE
+bool
+row_log_online_op_try(
+/*==================*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+ || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ /* This is a normal index. Do not log anything.
+ The caller must perform the operation on the
+ index tree directly. */
+ return(false);
+ case ONLINE_INDEX_CREATION:
+ /* The index is being created online. Log the
+ operation. */
+ row_log_online_op(index, tuple, trx_id);
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* The index was created online, but the operation was
+ aborted. Do not log the operation and tell the caller
+ to skip the operation. */
+ break;
+ }
+
+ return(true);
+}
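+
+/* A typical caller sketch (assuming "entry" is the index tuple of a row
+being modified, under the required index latch):
+
+ if (!row_log_online_op_try(index, entry, trx_id)) {
+ ... apply the operation to the index tree directly ...
+ }
+*/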
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
new file mode 100644
index 00000000000..2b9e9f7711c
--- /dev/null
+++ b/storage/innobase/include/row0merge.h
@@ -0,0 +1,430 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "read0types.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2. */
+typedef byte row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+ dfield_t* fields; /*!< data fields */
+};
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_t {
+ mem_heap_t* heap; /*!< memory heap where allocated */
+ dict_index_t* index; /*!< the index the tuples belong to */
+ ulint total_size; /*!< total amount of data bytes */
+ ulint n_tuples; /*!< number of data tuples */
+ ulint max_tuples; /*!< maximum number of data tuples */
+ mtuple_t* tuples; /*!< array of data tuples */
+ mtuple_t* tmp_tuples; /*!< temporary copy of tuples,
+ for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+ int fd; /*!< file descriptor */
+ ulint offset; /*!< file offset (end of file) */
+ ib_uint64_t n_rec; /*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+ ulint col_no; /*!< column offset */
+ ulint prefix_len; /*!< column prefix length, or 0
+ if indexing the whole column */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+ const char* name; /*!< index name */
+ ulint ind_type; /*!< 0, DICT_UNIQUE,
+ or DICT_CLUSTERED */
+ ulint key_number; /*!< MySQL key number,
+ or ULINT_UNDEFINED if none */
+ ulint n_fields; /*!< number of fields in index */
+ index_field_t* fields; /*!< field definitions */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+ dict_index_t* index; /*!< index being sorted */
+ struct TABLE* table; /*!< MySQL table object */
+ const ulint* col_map;/*!< mapping of column numbers
+ in table to the rebuilt table
+ (index->table), or NULL if not
+ rebuilding table */
+ ulint n_dup; /*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ table_id_t table_id)/*!< in: table identifier */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in/out: table containing the indexes */
+ ibool locked) /*!< in: TRUE=table locked,
+ FALSE=may need to do a lazy drop */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+
+/*********************************************************************//**
+Create a temporary merge file and, if UNIV_PFS_IO is defined, register
+the file descriptor with Performance Schema.
+@return file descriptor */
+UNIV_INTERN
+int
+row_merge_file_create_low(void)
+/*===========================*/
+ __attribute__((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file and, if UNIV_PFS_IO is defined, de-register the
+file descriptor from Performance Schema. */
+UNIV_INTERN
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ int fd); /*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Provide a new pathname for a table that is being renamed if it belongs to
+a file-per-table tablespace. The caller is responsible for freeing the
+memory allocated for the return value.
+@return new pathname of tablespace file, or NULL if space = 0 */
+UNIV_INTERN
+char*
+row_make_new_pathname(
+/*==================*/
+ dict_table_t* table, /*!< in: table to be renamed */
+ const char* new_name); /*!< in: new name */
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_rename_tables_dict(
+/*=========================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Rename a newly created index in the dictionary. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const index_def_t* index_def);
+ /*!< in: the index definition */
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return TRUE if the index can be used by the transaction, else FALSE */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index); /*!< in: index to check */
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table instance to drop */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ bool online, /*!< in: true if creating indexes
+ online */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ const ulint* key_numbers, /*!< in: MySQL key numbers */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ struct TABLE* table, /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+ const dtuple_t* add_cols, /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map, /*!< in: mapping of old column
+ numbers to new ones, or NULL
+ if old_table == new_table */
+ ulint add_autoinc, /*!< in: number of added
+ AUTO_INCREMENT column, or
+ ULINT_UNDEFINED if none is added */
+ ib_sequence_t& sequence) /*!< in/out: autoinc sequence */
+ __attribute__((nonnull(1,2,3,5,6,8), warn_unused_result));
+/********************************************************************//**
+Write a buffer to a block. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+ const merge_file_t* of, /*!< in: output file */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+ __attribute__((nonnull));
+/********************************************************************//**
+Sort a buffer. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ __attribute__((nonnull(1)));
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if the request was successful, FALSE on failure */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf); /*!< in: data */
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+ __attribute__((warn_unused_result, nonnull));
+/*********************************************************************//**
+Create a merge file.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create(
+/*==================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_sort(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const row_merge_dup_t* dup, /*!< in: descriptor of
+ index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd) /*!< in/out: temporary file handle */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+ __attribute__((warn_unused_result, nonnull, malloc));
+/*********************************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Destroy a merge file. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+ __attribute__((nonnull));
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if the request was successful, FALSE on failure */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf); /*!< out: data */
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ ulint* offsets)/*!< out: offsets of mrec */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* row0merge.h */
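Taken together, these declarations form a small external merge-sort API:
index entries are collected into a row_merge_buf_t, sorted in memory,
serialized through a row_merge_block_t into a merge_file_t, and the runs
are then merged on disk by row_merge_sort(). The sketch below shows one
plausible sequence of calls; it is illustrative only. Error handling and
the tuple-filling loop are omitted, and alloc_sort_blocks() is a
hypothetical stand-in for the real allocation of the three I/O blocks.

/* Illustrative sketch of the merge-sort pipeline declared above;
not part of the header. alloc_sort_blocks() is hypothetical. */
static dberr_t
build_index_sketch(trx_t* trx, dict_index_t* index, row_merge_dup_t* dup)
{
	merge_file_t		file;
	row_merge_buf_t*	buf = row_merge_buf_create(index);
	row_merge_block_t*	block = alloc_sort_blocks();	/* 3 blocks */
	int			tmpfd = row_merge_file_create_low();

	if (row_merge_file_create(&file) < 0) {
		return(DB_OUT_OF_MEMORY);
	}

	/* Fill buf with entries read from the clustered index (omitted),
	then sort the in-memory run and append it to the merge file. */
	row_merge_buf_sort(buf, dup);
	row_merge_buf_write(buf, &file, block);
	if (!row_merge_write(file.fd, file.offset++, block)) {
		return(DB_CORRUPTION);
	}
	buf = row_merge_buf_empty(buf);	/* reuse the buffer for the next run */

	/* Merge the runs on disk until a single sorted run remains;
	row_merge_build_indexes() would then insert it into the B-tree. */
	dberr_t	err = row_merge_sort(trx, dup, &file, block, &tmpfd);

	row_merge_buf_free(buf);
	row_merge_file_destroy(&file);
	row_merge_file_destroy_low(tmpfd);
	return(err);
}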
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000000..06c07002c2b
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,915 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+
+// Forward declaration
+struct SysIndexCallback;
+
+extern ibool row_rollback_on_timeout;
+
+struct row_prebuilt_t;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data; we skip the 1 or 2 bytes at the start
+that are used to store the length */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data; we skip the 1 or 2 bytes at the start
+that are used to store the length */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen);/*!< in: storage length of len: either 1
+ or 2 bytes */
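The encoding behind this store/read pair is simple: the length is stored
low byte first, in one byte when lenlen == 1 and in two bytes when
lenlen == 2 (required once the value can exceed 255). A stand-alone
illustration of the convention, not the InnoDB implementation itself:

/* Sketch of the >= 5.0.3 true VARCHAR length prefix. */
byte	dest[2 + 300];
ulint	len = 300;			/* 300 > 255, so lenlen must be 2 */

dest[0] = (byte) (len & 0xFF);		/* low byte first */
dest[1] = (byte) (len >> 8);		/* present only when lenlen == 2 */
/* the payload would follow at dest + lenlen */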
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len); /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len); /*!< in: BLOB reference length
+ (not BLOB length) */
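A MySQL BLOB reference is a fixed-width column value: the leading
col_len - sizeof(byte*) bytes carry the data length, low byte first, and
the trailing bytes carry a pointer to the data itself. The sketch below
decodes one under that assumption; it mirrors what
row_mysql_read_blob_ref() has to do, but is not the actual implementation.

#include <string.h>	/* memcpy; for the sketch only */

/* Sketch: decode a BLOB reference in the MySQL format. */
static const byte*
read_blob_ref_sketch(ulint* len, const byte* ref, ulint col_len)
{
	ulint		lenlen = col_len - sizeof(byte*);
	const byte*	data;

	*len = 0;
	for (ulint i = 0; i < lenlen; i++) {
		*len |= ((ulint) ref[i]) << (8 * i);	/* low byte first */
	}

	memcpy(&data, ref + lenlen, sizeof(data));	/* unaligned-safe */
	return(data);
}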
+/**************************************************************//**
+Pad a column with spaces. */
+UNIV_INTERN
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len); /*!< in: number of bytes to pad */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp); /*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread */
+UNIV_INTERN
+bool
+row_mysql_handle_errors(
+/*====================*/
+	dberr_t*	new_err,/*!< out: possible new error encountered in
+				rollback, or the old error that was set
+				at function entry */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+ __attribute__((nonnull(1,2)));
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len); /*!< in: length in bytes of a row in
+ the MySQL format */
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode) /*!< in: lock mode of table
+ (ignored if table==NULL) */
+ __attribute__((nonnull(1)));
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table); /*!< in: table */
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set. */
+UNIV_INTERN
+void
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs)/*!< in: TRUE if called
+ so that we have the latches on
+ the records under pcur and
+ clust_pcur, and we do not need
+ to reposition the cursors. */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return true if temporary table */
+UNIV_INTERN
+bool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ const char* name) __attribute__((warn_unused_result));
+ /*!< in: table name in the form
+ 'database/tablename' */
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap); /*!< in: mem heap from which allocated */
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_lock_data_dictionary(trx) \
+ row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
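The _func/macro split exists so that every exclusive dictionary lock
acquisition is tagged with its source location, which helps when
diagnosing latching problems. A sketched call site, paired with the
unlock function declared just below; the DDL work in the middle is a
placeholder:

/* Sketched DDL call site; illustrative only. */
row_mysql_lock_data_dictionary(trx);	/* records __FILE__ and __LINE__ */

/* ... create, drop, or rename tables and indexes here ... */

row_mysql_unlock_data_dictionary(trx);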
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Locks the data dictionary in shared mode to protect it from modifications,
+for performing a foreign key check, rollback, or another operation
+invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_freeze_data_dictionary(trx) \
+ row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate(). On failure the transaction will
+be rolled back.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit) /*!< in: if true, commit the transaction */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+ __attribute__((nonnull(1,2), warn_unused_result));
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+The master thread in srv0srv.cc calls this regularly to drop tables that
+we must drop in the background after queries on them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void);
+/*=========================================*/
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void);
+/*======================================*/
+/*********************************************************************//**
+Sets an exclusive lock on a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
+ const char* op_info) /*!< in: string for trx->op_info */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx) /*!< in: transaction handle */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: dictionary transaction handle */
+ bool drop_db,/*!< in: true=dropping whole database */
+ bool nonatomic = true)
+ /*!< in: whether it is permitted
+ to release and reacquire dict_operation_lock */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void);
+/*============================*/
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file.
+Discarding means that this function deletes the .ibd file and assigns a new
+table id for the table. The flag table->ibd_file_missing is also set to TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+ __attribute__((nonnull, warn_unused_result));
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_import_tablespace_for_mysql(
+/*============================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_drop_database_for_mysql(
+/*========================*/
+	const char* name, /*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit) /*!< in: whether to commit trx */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Checks that the index contains entries in ascending order, that the
+unique constraint is not broken, and calculates the number of index
+entries in the read view of the current transaction.
+@return true if ok */
+UNIV_INTERN
+bool
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return true if monitor table */
+UNIV_INTERN
+bool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name) /*!< in: name of the table, in the
+ form database/table_name */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void);
+/*================*/
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void);
+/*=================*/
+
+/*********************************************************************//**
+Reassigns the table identifier of a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_table_id_reassign(
+/*========================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t* new_id) /*!< out: new table id */
+ __attribute__((nonnull, warn_unused_result));
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+struct mysql_row_templ_t {
+ ulint col_no; /*!< column number of the column */
+ ulint rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint clust_rec_field_no; /*!< field number of the column in an
+ Innobase record in the clustered index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint icp_rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined unless
+ index condition pushdown is used */
+ ulint mysql_col_offset; /*!< offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /*!< length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /*!< column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint mysql_type; /*!< MySQL type code; this is always
+ < 256 */
+ ulint mysql_length_bytes; /*!< if mysql_type
+ == DATA_MYSQL_TRUE_VARCHAR, this tells
+ whether we should use 1 or 2 bytes to
+ store the MySQL true VARCHAR data
+ length at the start of row in the MySQL
+ format (NOTE that the MySQL key value
+ format always uses 2 bytes for the data
+ len) */
+ ulint charset; /*!< MySQL charset-collation code
+ of the column, or zero */
+ ulint mbminlen; /*!< minimum length of a char, in bytes,
+ or zero if not a char type */
+ ulint mbmaxlen; /*!< maximum length of a char, in bytes,
+ or zero if not a char type */
+ ulint is_unsigned; /*!< if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+};
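To see how these precomputed offsets are meant to be consumed, consider
locating one column in a row that is in the MySQL format: the NULL bit is
tested with mysql_null_byte_offset and mysql_null_bit_mask, and the data
is found with mysql_col_offset and mysql_col_len. A sketch, not the
actual conversion code in row0sel.cc:

/* Sketch: locate one column described by a template in a
MySQL-format row. */
static const byte*
templ_read_col_sketch(
	const mysql_row_templ_t*	templ,
	const byte*			mysql_rec,
	ulint*				len)
{
	if (templ->mysql_null_bit_mask
	    && (mysql_rec[templ->mysql_null_byte_offset]
		& templ->mysql_null_bit_mask)) {

		*len = UNIV_SQL_NULL;	/* the column is SQL NULL */
		return(NULL);
	}

	*len = templ->mysql_col_len;
	return(mysql_rec + templ->mysql_col_offset);
}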
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+#define ROW_PREBUILT_ALLOCATED 78540783
+#define ROW_PREBUILT_FREED 26423527
+
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_t {
+ ulint magic_n; /*!< this magic number is set to
+ ROW_PREBUILT_ALLOCATED when created,
+ or ROW_PREBUILT_FREED when the
+ struct has been freed */
+ dict_table_t* table; /*!< Innobase table handle */
+ dict_index_t* index; /*!< current index for a search, if
+ any */
+ trx_t* trx; /*!< current transaction handle */
+ unsigned sql_stat_start:1;/*!< TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ unsigned mysql_has_locked:1;/*!< this is set TRUE when MySQL
+ calls external_lock on this handle
+ with a lock flag, and set FALSE when
+ with the F_UNLOCK flag */
+ unsigned clust_index_was_generated:1;
+ /*!< if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ unsigned index_usable:1; /*!< caches the value of
+ row_merge_is_index_usable(trx,index) */
+ unsigned read_just_key:1;/*!< set to 1 when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this
+ handle in a MySQL HANDLER low level
+ index cursor command: then we must
+ store the pcur position even in a
+ unique search from a clustered index,
+ because HANDLER allows NEXT and PREV
+ in such a situation */
+ unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS,
+ ROW_MYSQL_DUMMY_TEMPLATE, or
+ ROW_MYSQL_NO_TEMPLATE */
+ unsigned n_template:10; /*!< number of elements in the
+ template */
+ unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ unsigned need_to_access_clustered:1; /*!< if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE */
+ unsigned templ_contains_blob:1;/*!< TRUE if the template contains
+ a column with DATA_BLOB ==
+ get_innobase_type_from_mysql_type();
+ not to be confused with InnoDB
+ externally stored columns
+ (VARCHAR can be off-page too) */
+ mysql_row_templ_t* mysql_template;/*!< template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /*!< memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /*!< Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/*!< buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ const byte* default_rec; /*!< the default values of all columns
+ (a "default row") in MySQL format */
+ ulint hint_need_to_fetch_extra_cols;
+ /*!< normally this is set to 0; if this
+ is set to ROW_RETRIEVE_PRIMARY_KEY,
+ then we should at least retrieve all
+ columns in the primary key; if this
+ is set to ROW_RETRIEVE_ALL_COLS, then
+ we must retrieve all columns in the
+ key (if read_just_key == 1), or all
+ columns in the table */
+ upd_node_t* upd_node; /*!< Innobase SQL update node used
+ to perform updates and deletes */
+ trx_id_t trx_id; /*!< The table->def_trx_id when
+ ins_graph was built */
+ que_fork_t* ins_graph; /*!< Innobase SQL query graph used
+ in inserts. Will be rebuilt on
+ trx_id or n_indexes mismatch. */
+ que_fork_t* upd_graph; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t pcur; /*!< persistent cursor used in selects
+ and updates */
+ btr_pcur_t clust_pcur; /*!< persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /*!< dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /*!< if the clustered index was
+ generated, the row id of the
+ last row fetched is stored
+ here */
+ doc_id_t fts_doc_id; /* if the table has an FTS index on
+ it then we fetch the doc_id.
+ FTS-FIXME: Currently we fetch it always
+ but in the future we must only fetch
+ it when FTS columns are being
+ updated */
+ dtuple_t* clust_ref; /*!< prebuilt dtuple used in
+ sel/upd/del */
+ ulint select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ ulint stored_select_lock_type;/*!< this field is used to
+ remember the original select_lock_type
+ that was decided in ha_innodb.cc,
+ ::store_lock(), ::external_lock(),
+ etc. */
+	ulint		row_read_type;	/*!< ROW_READ_WITH_LOCKS if row locks
+					should be obtained for records
+					under an UPDATE or DELETE cursor.
+ If innodb_locks_unsafe_for_binlog
+ is TRUE, this can be set to
+ ROW_READ_TRY_SEMI_CONSISTENT, so that
+ if the row under an UPDATE or DELETE
+ cursor was locked by another
+ transaction, InnoDB will resort
+ to reading the last committed value
+ ('semi-consistent read'). Then,
+ this field will be set to
+ ROW_READ_DID_SEMI_CONSISTENT to
+ indicate that. If the row does not
+ match the WHERE condition, MySQL will
+ invoke handler::unlock_row() to
+ clear the flag back to
+ ROW_READ_TRY_SEMI_CONSISTENT and
+ to simply skip the row. If
+ the row matches, the next call to
+ row_search_for_mysql() will lock
+ the row.
+ This eliminates lock waits in some
+ cases; note that this breaks
+ serializability. */
+ ulint new_rec_locks; /*!< normally 0; if
+ srv_locks_unsafe_for_binlog is
+ TRUE or session is using READ
+ COMMITTED or READ UNCOMMITTED
+ isolation level, set in
+ row_search_for_mysql() if we set a new
+ record lock on the secondary
+ or clustered index; this is
+ used in row_unlock_for_mysql()
+ when releasing the lock under
+ the cursor if we determine
+ after retrieving the row that
+ it does not need to be locked
+ ('mini-rollback') */
+ ulint mysql_prefix_len;/*!< byte offset of the end of
+ the last requested column */
+ ulint mysql_row_len; /*!< length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /*!< number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /*!< a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row; these
+ pointers point 4 bytes past the
+ allocated mem buf start, because
+ there is a 4 byte magic number at the
+ start and at the end */
+	ibool		keep_other_fields_on_keyread; /*!< when using fetch
+					cache with HA_EXTRA_KEYREAD, don't
+					overwrite other fields in the MySQL
+					row buffer. */
+ ulint fetch_cache_first;/*!< position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /*!< number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /*!< memory heap where a previous
+ version is built in consistent read */
+ bool in_fts_query; /*!< Whether we are in a FTS query */
+ /*----------------------*/
+ ulonglong autoinc_last_value;
+ /*!< last value of AUTO-INC interval */
+ ulonglong autoinc_increment;/*!< The increment step of the auto
+ increment column. Value must be
+ greater than or equal to 1. Required to
+ calculate the next value */
+ ulonglong autoinc_offset; /*!< The offset passed to
+ get_auto_increment() by MySQL. Required
+ to calculate the next value */
+ dberr_t autoinc_error; /*!< The actual error code encountered
+ while trying to init or read the
+ autoinc value from the table. We
+ store it here so that we can return
+ it to MySQL */
+ /*----------------------*/
+ void* idx_cond; /*!< In ICP, pointer to a ha_innobase,
+ passed to innobase_index_cond().
+ NULL if index condition pushdown is
+ not used. */
+ ulint idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+ 0 if and only if idx_cond == NULL. */
+ /*----------------------*/
+ ulint magic_n2; /*!< this should be the same as
+ magic_n */
+ /*----------------------*/
+	unsigned	innodb_api:1;	/*!< whether this is an InnoDB API
+ query */
+ const rec_t* innodb_api_rec; /*!< InnoDB API search result */
+ byte* srch_key_val1; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format.*/
+ byte* srch_key_val2; /*!< buffer used in converting
+ search key values from MySQL format
+ to InnoDB format.*/
+ uint srch_key_val_len; /*!< Size of search key */
+
+};
+
+/** Callback for row_mysql_sys_index_iterate() */
+struct SysIndexCallback {
+ virtual ~SysIndexCallback() { }
+
+ /** Callback method
+ @param mtr - current mini transaction
+ @param pcur - persistent cursor. */
+ virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0;
+};
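A user of this interface derives from SysIndexCallback and overrides
operator(); the iterator (row_mysql_sys_index_iterate(), declared
elsewhere) then invokes the callback once per record it positions the
persistent cursor on, inside the supplied mini-transaction. A
hypothetical implementation that merely counts the records it visits:

/* Hypothetical SysIndexCallback that counts visited records. */
struct CountingCallback : public SysIndexCallback {
	CountingCallback() : n_visited(0) { }

	virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw()
	{
		(void) mtr;	/* the mini-transaction is still active */
		(void) pcur;	/* the cursor is positioned on a record */
		n_visited++;
	}

	ulint	n_visited;
};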
+
+#define ROW_PREBUILT_FETCH_MAGIC_N 465765687
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
+ row_scan_and_check_index */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY 1
+#define ROW_RETRIEVE_ALL_COLS 2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS 0
+#define ROW_READ_TRY_SEMI_CONSISTENT 1
+#define ROW_READ_DID_SEMI_CONSISTENT 2
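The long comment on row_prebuilt_t::row_read_type above describes a small
state machine; restated compactly below as illustrative pseudo-logic (this
is not code from row0sel.cc):

/* Semi-consistent read states, as described for row_read_type. */
switch (prebuilt->row_read_type) {
case ROW_READ_WITH_LOCKS:
	/* default: obtain record locks as usual */
	break;
case ROW_READ_TRY_SEMI_CONSISTENT:
	/* the row is locked by another transaction: read the last
	committed version instead of waiting, and remember doing so */
	prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
	break;
case ROW_READ_DID_SEMI_CONSISTENT:
	/* if the row did not match the WHERE condition, MySQL calls
	handler::unlock_row(), which resets the state and skips the row */
	prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
	break;
}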
+
+#ifndef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0mysql.ic b/storage/innobase/include/row0mysql.ic
new file mode 100644
index 00000000000..2eb60898c46
--- /dev/null
+++ b/storage/innobase/include/row0mysql.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.ic
+MySQL interface for Innobase
+
+Created 1/23/2001 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
new file mode 100644
index 00000000000..93dcf9cf49b
--- /dev/null
+++ b/storage/innobase/include/row0purge.h
@@ -0,0 +1,128 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "row0purge.h"
+#include "ut0vec.h"
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node, i.e., a
+ thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************//**
+Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+non-delete-marked version of a clustered index record whose DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@return true if the secondary index record can be purged */
+UNIV_INTERN
+bool
+row_purge_poss_sec(
+/*===============*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ dict_index_t* index, /*!< in: secondary index */
+ const dtuple_t* entry) /*!< in: secondary index entry */
+ __attribute__((nonnull, warn_unused_result));
+/***************************************************************
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+
+/* Purge node structure */
+
+struct purge_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_PURGE */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ roll_ptr_t roll_ptr;/* roll pointer to undo log record */
+ ib_vector_t* undo_recs;/*!< Undo recs to purge */
+
+ undo_no_t undo_no;/* undo number of the record */
+
+ ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ dict_table_t* table; /*!< table where purge is done */
+
+ ulint cmpl_info;/* compiler analysis info of an update */
+
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< NULL, or row reference to the next row to
+ handle */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the indexed fields of the row to
+ handle */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ purge of a row */
+ ibool found_clust;/* TRUE if the clustered index record
+ determined by ref was found in the clustered
+ index, and we were able to position pcur on
+ it */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ ibool done; /* Debug flag */
+
+};
+
+#ifndef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0purge.ic b/storage/innobase/include/row0purge.ic
new file mode 100644
index 00000000000..700106d1048
--- /dev/null
+++ b/storage/innobase/include/row0purge.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+
+/**************************************************//**
+@file include/row0purge.ic
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h
new file mode 100644
index 00000000000..1d6d11291b8
--- /dev/null
+++ b/storage/innobase/include/row0quiesce.h
@@ -0,0 +1,74 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.h
+
+Header file for tablespace quiesce functions.
+
+Created 2012-02-08 by Sunny Bains
+*******************************************************/
+
+#ifndef row0quiesce_h
+#define row0quiesce_h
+
+#include "univ.i"
+#include "dict0types.h"
+
+struct trx_t;
+
+/** The version number of the export meta-data text file. */
+#define IB_EXPORT_CFG_VERSION_V1 0x1UL
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+UNIV_INTERN
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ __attribute__((nonnull));
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+UNIV_INTERN
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+ __attribute__((nonnull));
+
+#ifndef UNIV_NONINL
+#include "row0quiesce.ic"
+#endif
+
+#endif /* row0quiesce_h */
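The three declarations above form a start / set-state / complete lifecycle
around copying tablespace files out of a running server. A minimal sketch of
how a caller might drive it, assuming "table" and "trx" are valid handles
obtained elsewhere and that QUIESCE_COMPLETE is one of the ib_quiesce_t
values pulled in via dict0types.h (error handling abbreviated):

	/* Sketch only: quiesce a table, copy its files, then resume. */
	row_quiesce_table_start(table, trx);

	dberr_t	err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx);

	if (err == DB_SUCCESS) {
		/* ... the .ibd and .cfg files may be copied here ... */
	}

	row_quiesce_table_complete(table, trx);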
diff --git a/storage/innobase/include/row0quiesce.ic b/storage/innobase/include/row0quiesce.ic
new file mode 100644
index 00000000000..f570a6aed05
--- /dev/null
+++ b/storage/innobase/include/row0quiesce.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0quiesce.ic
+
+Quiesce a tablespace.
+
+Created 2012-02-08 Sunny Bains
+*******************************************************/
+
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
new file mode 100644
index 00000000000..a4e5e0dd2fa
--- /dev/null
+++ b/storage/innobase/include/row0row.h
@@ -0,0 +1,343 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: record offsets */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+ __attribute__((nonnull, warn_unused_result));
+/*****************************************************************//**
+When an insert into or a purge of a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or the row is missing some needed columns. */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory for the index entry
+ is allocated */
+ __attribute__((warn_unused_result, nonnull(1,3,4)));
+/*****************************************************************//**
+When an insert into or a purge of a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory for the index entry
+ is allocated */
+ __attribute__((warn_unused_result, nonnull(1,3,4)));
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+					heap while the former only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead; the user
+ columns in this table should be
+ the same columns as in index->table */
+ const dtuple_t* add_cols,
+ /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ __attribute__((nonnull(2,3,9)));
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in/out: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx) /*!< in: transaction or NULL */
+ __attribute__((nonnull(1,2,3)));
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+ __attribute__((nonnull, warn_unused_result));
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL if no record is found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull, warn_unused_result));
+
+/** Result of row_search_index_entry */
+enum row_search_result {
+ ROW_FOUND = 0, /*!< the record was found */
+ ROW_NOT_FOUND, /*!< record not found */
+ ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or
+ BTR_DELETE_MARK was specified, the
+ secondary index leaf page was not in
+ the buffer pool, and the operation was
+ enqueued in the insert/delete buffer */
+ ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and
+ row_purge_poss_sec() failed */
+};
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+UNIV_INTERN
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull, warn_unused_result));
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#endif
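row_build() and row_build_index_entry() above are meant to be used as a
pair. A hypothetical sketch of deriving a secondary index entry from a
clustered index record; "rec", "clust_index" and "sec_index" are
placeholders assumed to be positioned and latched as the NOTEs above
require:

	/* Sketch: the page of "rec" must stay s-latched while "row" is
	used, because ROW_COPY_POINTERS makes the row fields point
	straight into the record. */
	mem_heap_t*	heap = mem_heap_create(1024);
	row_ext_t*	ext;

	dtuple_t*	row = row_build(ROW_COPY_POINTERS, clust_index, rec,
					NULL,	/* offsets computed inside */
					NULL,	/* col_table */
					NULL,	/* add_cols */
					NULL,	/* col_map */
					&ext, heap);

	dtuple_t*	entry = row_build_index_entry(row, ext, sec_index,
						      heap);
	if (entry == NULL) {
		/* some externally stored columns were unavailable */
	}

	mem_heap_free(heap);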
diff --git a/storage/innobase/include/row0row.ic b/storage/innobase/include/row0row.ic
new file mode 100644
index 00000000000..ac62422be1f
--- /dev/null
+++ b/storage/innobase/include/row0row.ic
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INLINE
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: record offsets */
+{
+ ulint pos;
+ ulint offset;
+ ulint len;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+
+ pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offset = rec_get_nth_field_offs(offsets, pos, &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_trx_id(rec + offset));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+}
+
+/*****************************************************************//**
+When an insert into or a purge of a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INLINE
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory for the index entry
+ is allocated */
+{
+ dtuple_t* entry;
+
+ ut_ad(dtuple_check_typed(row));
+ entry = row_build_index_entry_low(row, ext, index, heap);
+ ut_ad(!entry || dtuple_check_typed(entry));
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
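The two readers above rely on DB_ROLL_PTR being stored immediately after the
DATA_TRX_ID_LEN bytes of DB_TRX_ID, with index->trx_id_offset caching the
offset when it is fixed for the index. A short sketch of reading both system
columns, assuming "rec", "index" and "offsets" satisfy the same debug
assertions as above:

	/* Sketch: read the system columns of a clustered index record.
	"offsets" must come from rec_get_offsets(rec, index, ...). */
	trx_id_t	id = row_get_rec_trx_id(rec, index, offsets);
	roll_ptr_t	ptr = row_get_rec_roll_ptr(rec, index, offsets);

	/* "id" is the last transaction to modify this record version;
	"ptr" locates the undo log record that can roll it back. */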
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
new file mode 100644
index 00000000000..c8be80f89d9
--- /dev/null
+++ b/storage/innobase/include/row0sel.h
@@ -0,0 +1,409 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "read0read.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i); /*!< in: get ith plan node */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: not used */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed-length field: hence
+the parameter key_len. However, we currently do not allow search keys where
+the last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx); /*!< in: transaction */
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+dberr_t
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Checks whether MySQL is currently allowed to retrieve a consistent read
+result for this table, or to store one in the query cache.
+@return TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name); /*!< in: concatenation of database name,
+ '/' char, table name */
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all OK, else error code */
+UNIV_INTERN
+dberr_t
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: autoinc column name */
+ ib_uint64_t* value) /*!< out: AUTOINC value read */
+ __attribute__((nonnull, warn_unused_result));
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_t{
+ byte* data; /*!< data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /*!< data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /*!< size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+/** Query plan */
+struct plan_t{
+ dict_table_t* table; /*!< table struct in the dictionary
+ cache */
+ dict_index_t* index; /*!< table index used in the search */
+ btr_pcur_t pcur; /*!< persistent cursor used to search
+ the index */
+ ibool asc; /*!< TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /*!< TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /*!< TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /*!< TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /*!< array of expressions
+ which are used to calculate
+ the field values in the search
+ tuple: there is one expression
+ for each field in the search
+ tuple */
+ dtuple_t* tuple; /*!< search tuple */
+ ulint mode; /*!< search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /*!< number of first fields in
+ the search tuple which must be
+ exactly matched */
+ ibool unique_search; /*!< TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /*!< number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/*!< number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/*!< index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /*!< no prefetch for this table */
+ sym_node_list_t columns; /*!< symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /*!< conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /*!< the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /*!< TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /*!< map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /*!< the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /*!< memory heap used in building an old
+ version of a row, or NULL */
+};
+
+/** Select node states */
+enum sel_node_state {
+ SEL_NODE_CLOSED, /*!< it is a declared cursor which is not
+ currently open */
+ SEL_NODE_OPEN, /*!< intention locks not yet set on tables */
+ SEL_NODE_FETCH, /*!< intention locks have been set */
+ SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_SELECT */
+ enum sel_node_state
+ state; /*!< node state */
+ que_node_t* select_list; /*!< select list */
+ sym_node_t* into_list; /*!< variables list or NULL */
+ sym_node_t* table_list; /*!< table list */
+ ibool asc; /*!< TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /*!< TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ ulint row_lock_mode; /*!< LOCK_X or LOCK_S */
+ ulint n_tables; /*!< number of tables */
+ ulint fetch_table; /*!< number of the next table to access
+ in the join */
+ plan_t* plans; /*!< array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /*!< search condition */
+ read_view_t* read_view; /*!< if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/*!< TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /*!< order by column definition, or
+ NULL */
+ ibool is_aggregate; /*!< TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /*!< TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ of an explicit cursor if it
+ can get updated, the parser
+ checks from a stored procedure
+ if it contains positioned
+ update or delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_t{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+					true. If it returns non-NULL,
+ continue normally. See
+ row_fetch_print() for an example
+ (and a useful debugging tool). */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_t{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_t{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
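row_search_for_mysql() above doubles as cursor open and fetch: the first
call (direction == 0) positions the cursor, and later calls pass
ROW_SEL_NEXT or ROW_SEL_PREV. A hedged sketch of a forward full-index scan,
assuming "buf" and "prebuilt" are placeholders prepared by the handler layer
(see ha_innodb.cc) and the search tuple is empty:

	/* Sketch: open at the start of the index, then scan forward.
	"buf" receives each row in the MySQL row format. */
	dberr_t	err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt,
					   0,	/* match_mode: none */
					   0);	/* direction 0: open cursor */

	while (err == DB_SUCCESS) {
		/* ... process the row stored in buf ... */

		err = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt,
					   0, ROW_SEL_NEXT);
	}

	if (err != DB_RECORD_NOT_FOUND && err != DB_END_OF_INDEX) {
		/* a real error (deadlock, lock table full, ...) */
	}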
diff --git a/storage/innobase/include/row0sel.ic b/storage/innobase/include/row0sel.ic
new file mode 100644
index 00000000000..d83a3448832
--- /dev/null
+++ b/storage/innobase/include/row0sel.ic
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.ic
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i) /*!< in: get ith plan node */
+{
+ ut_ad(i < node->n_tables);
+
+ return(node->plans + i);
+}
+
+/*********************************************************************//**
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ node->state = SEL_NODE_OPEN;
+}
+
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ open_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = (open_node_t*) thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+ sel_node = node->cursor_def;
+
+ err = DB_SUCCESS;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+ /* if (sel_node->state == SEL_NODE_CLOSED) { */
+
+ sel_node_reset_cursor(sel_node);
+ /* } else {
+ err = DB_ERROR;
+ } */
+ } else {
+ if (sel_node->state != SEL_NODE_CLOSED) {
+
+ sel_node->state = SEL_NODE_CLOSED;
+ } else {
+ err = DB_ERROR;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+ ut_error;
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h
new file mode 100644
index 00000000000..52c89cb01fa
--- /dev/null
+++ b/storage/innobase/include/row0types.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0types_h
+#define row0types_h
+
+struct plan_t;
+
+struct upd_t;
+struct upd_field_t;
+struct upd_node_t;
+struct del_node_t;
+struct ins_node_t;
+struct sel_node_t;
+struct open_node_t;
+struct fetch_node_t;
+
+struct row_printf_node_t;
+struct sel_buf_t;
+
+struct undo_node_t;
+
+struct purge_node_t;
+
+struct row_ext_t;
+
+/** Buffer for logging modifications during online index creation */
+struct row_log_t;
+
+/* MySQL data types */
+struct TABLE;
+
+#endif
diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h
new file mode 100644
index 00000000000..ebf4881208a
--- /dev/null
+++ b/storage/innobase/include/row0uins.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row into a table. A fresh insert means that
+the same clustered index unique key did not match any record, even a
+delete-marked one, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node) /*!< in: row undo node */
+ __attribute__((nonnull, warn_unused_result));
+#ifndef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0uins.ic b/storage/innobase/include/row0uins.ic
new file mode 100644
index 00000000000..54da2e49874
--- /dev/null
+++ b/storage/innobase/include/row0uins.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.ic
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h
new file mode 100644
index 00000000000..f89d5a334fc
--- /dev/null
+++ b/storage/innobase/include/row0umod.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifndef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0umod.ic b/storage/innobase/include/row0umod.ic
new file mode 100644
index 00000000000..00a8cd86e01
--- /dev/null
+++ b/storage/innobase/include/row0umod.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.ic
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h
new file mode 100644
index 00000000000..5dddfb4eae1
--- /dev/null
+++ b/storage/innobase/include/row0undo.h
@@ -0,0 +1,135 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node); /*!< in: row undo node */
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution that is being rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+	If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next one. This situation can
+occur only in the case where the transaction modified the same record several
+times and another thread is currently doing the undo for successive versions
+of that index record. */
+
+/** Execution state of an undo node */
+enum undo_exec {
+ UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next
+ undo log record */
+ UNDO_NODE_INSERT, /*!< undo a fresh insert of a
+ row to a table */
+ UNDO_NODE_MODIFY /*!< undo a modify operation
+ (DELETE or UPDATE) on a row
+ of a table */
+};
+
+/** Undo node structure */
+struct undo_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ enum undo_exec state; /*!< node execution state */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
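The undo_exec states above classify each fetched undo log record before it
is dispatched to row_undo_ins() or row_undo_mod() (declared in row0uins.h
and row0umod.h earlier in this diff). A simplified sketch of that dispatch,
assuming "node" and "thr" are placeholders set up by the fetch phase; the
real loop lives in row0undo.cc:

	/* Sketch: dispatch one undo log record by node->state.
	Fetching the record and detecting the end of the undo log
	are omitted here. */
	dberr_t	err = DB_SUCCESS;

	switch (node->state) {
	case UNDO_NODE_INSERT:
		err = row_undo_ins(node);	/* remove a fresh insert */
		break;
	case UNDO_NODE_MODIFY:
		err = row_undo_mod(node, thr);	/* undo DELETE/UPDATE */
		break;
	case UNDO_NODE_FETCH_NEXT:
	default:
		break;			/* nothing fetched to undo yet */
	}

	if (err != DB_SUCCESS) {
		/* the rollback cannot proceed; report err upwards */
	}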
diff --git a/storage/innobase/include/row0undo.ic b/storage/innobase/include/row0undo.ic
new file mode 100644
index 00000000000..b97ffca590e
--- /dev/null
+++ b/storage/innobase/include/row0undo.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.ic
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
new file mode 100644
index 00000000000..27dedeb65a7
--- /dev/null
+++ b/storage/innobase/include/row0upd.h
@@ -0,0 +1,540 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "btr0pcur.h"
+# include "que0types.h"
+# include "pars0types.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap); /*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector, which equals the number
+of columns to be updated by the update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update); /*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n); /*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+ __attribute__((nonnull, pure));
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< in: pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record,
+ can be 0 during IMPORT */
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /*!< in/out: index entry, where the memory
+ buffers for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ ib_uint64_t val); /*!< in: value to write */
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr); /*!< in: mtr into whose log to write */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update);/*!< in: update vector */
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+UNIV_INTERN
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+ const upd_t* update) /*!< in: update vector */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Applies the new column values stored in the update vector to the given
+record. No field size changes are allowed. This function is
+usually invoked on a clustered index. The only use case for a
+secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page(). */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip);/*!< in: compressed page with enough space
+ available, or NULL */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields of a secondary index entry that
+differ from a record with equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+ __attribute__((warn_unused_result, nonnull));
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, of an index entry that differ from a record with
+equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+const upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */
+ bool no_sys, /*!< in: skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR */
+ trx_t* trx, /*!< in: transaction (for diagnostics),
+ or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+ __attribute__((nonnull(1,2,3,7), warn_unused_result));
+/***********************************************************//**
+Applies the new column values stored in the update vector to the given
+index entry. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Applies the new column values stored in the update vector to the given
+index entry. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Applies the new column values stored in the update vector to the row. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap); /*!< in: memory heap */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext) /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ __attribute__((nonnull(1,2), warn_unused_result));
+#ifdef UNIV_DEBUG
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,thr,row,ext)
+#else /* UNIV_DEBUG */
+# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \
+ row_upd_changes_ord_field_binary_func(index,update,row,ext)
+#endif /* UNIV_DEBUG */
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if FTS indexed column updated else
+ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field); /*!< in: field to check */
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether Doc ID column is affected */
+UNIV_INTERN
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr);/*!< out: roll ptr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out);/*!< out: update vector */
+
+
+/* Update vector field */
+struct upd_field_t{
+ unsigned field_no:16; /*!< field number in an index, usually
+ the clustered index, but in updating
+ a secondary index record in btr0cur.cc
+ this is the position in the secondary
+ index */
+#ifndef UNIV_HOTBACKUP
+ unsigned orig_len:16; /*!< original length of the locally
+ stored part of an externally stored
+ column, or 0 */
+ que_node_t* exp; /*!< expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+#endif /* !UNIV_HOTBACKUP */
+ dfield_t new_val; /*!< new value for the column */
+};
+
+/* Update vector structure */
+struct upd_t{
+ ulint info_bits; /*!< new value of info bits to record;
+ default is 0 */
+ ulint n_fields; /*!< number of update fields */
+ upd_field_t* fields; /*!< array of update fields */
+};
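+
+/* A minimal sketch (not part of the original header) of building a
+one-field update vector with the functions declared above; "heap",
+"index", "trx", "field_no", "new_data" and "new_data_len" are assumed
+to be supplied by the caller:
+
+	upd_t*		update = upd_create(1, heap);
+	upd_field_t*	ufield = upd_get_nth_field(update, 0);
+
+	upd_field_set_field_no(ufield, field_no, index, trx);
+	dfield_set_data(&ufield->new_val, new_data, new_data_len);
+*/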
+
+#ifndef UNIV_HOTBACKUP
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_UPDATE */
+ ibool is_delete;/* TRUE if delete, FALSE if update */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ ibool in_mysql_interface;
+ /* TRUE if the update node was created
+ for the MySQL interface */
+ dict_foreign_t* foreign;/* NULL or pointer to a foreign key
+ constraint if this update node is used in
+ doing an ON DELETE or ON UPDATE operation */
+ upd_node_t* cascade_node;/* NULL or an update node template which
+ is used to implement ON DELETE/UPDATE CASCADE
+ or ... SET NULL for foreign keys */
+ mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade
+ node is created */
+ sel_node_t* select; /*!< query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /*!< table where updated */
+ upd_t* update; /*!< update vector for the row */
+ ulint update_n_fields;
+ /* when this struct is used to implement
+ a cascade operation for foreign keys, we store
+ here the size of the buffer allocated for use
+ as the update vector */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ dtuple_t* upd_row;/* NULL, or a copy of the updated row */
+ row_ext_t* upd_ext;/* NULL, or prefixes of the externally
+ stored columns in upd_row */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage;
+ this must be emptied after a successful
+ update */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_INSERT_BLOB 4 /* clustered index record should be
+ inserted, old record is already
+ delete-marked; non-updated BLOBs
+ should be inherited by the new record
+ and disowned by the old record */
+#define UPD_NODE_UPDATE_ALL_SEC 5 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 6 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0upd.ic b/storage/innobase/include/row0upd.ic
new file mode 100644
index 00000000000..618a77fa4bf
--- /dev/null
+++ b/storage/innobase/include/row0upd.ic
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "trx0undo.h"
+# include "row0row.h"
+# include "lock0lock.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap) /*!< in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = (upd_t*) mem_heap_zalloc(heap, sizeof(upd_t));
+
+ update->n_fields = n;
+ update->fields = (upd_field_t*)
+ mem_heap_zalloc(heap, sizeof(upd_field_t) * n);
+
+ return(update);
+}
+
+/*********************************************************************//**
+Returns the number of fields in the update vector, i.e., the number of
+columns to be updated by it.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update) /*!< in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n) /*!< in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+
+ if (field_no >= dict_index_get_n_fields(index)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index only has %lu fields\n",
+ (ulong) dict_index_get_n_fields(index));
+ ut_ad(0);
+ }
+
+ dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+ dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+{
+ ulint i;
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+
+ if (uf->field_no == no) {
+
+ return(uf);
+ }
+ }
+
+ return(NULL);
+}
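+
+/* A minimal usage sketch, assuming the caller knows the clustered-index
+field number it is interested in (3 here is an arbitrary example):
+
+	const upd_field_t* uf = upd_get_field_by_field_no(update, 3);
+
+	if (uf != NULL) {
+		... the new value is in uf->new_val ...
+	}
+*/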
+
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record,
+ can be 0 during IMPORT */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (page_zip) {
+ ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+ page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
+ pos, trx->id, roll_ptr);
+ } else {
+ ulint offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(index, offsets);
+ }
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ /* During IMPORT the trx id in the record can be in the
+ future, if the .ibd file is being imported from another
+ instance. During IMPORT roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0
+ || lock_check_trx_id_sanity(
+ trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ trx_write_trx_id(rec + offset, trx->id);
+ trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000000..1df5b4d3e98
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,146 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "read0types.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INTERN
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+Finds out if we must preserve a delete-marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr); /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@return TRUE if an earlier version should have ientry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry);/*!< in: the secondary index entry */
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ __attribute__((nonnull(1,2,3,4,5,6,7)));
+
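+/* A minimal usage sketch, assuming a consistent read has found that rec
+is too new for its read view:
+
+	rec_t* old_vers;
+	dberr_t err = row_vers_build_for_consistent_read(
+		rec, mtr, index, &offsets, view, &offset_heap,
+		heap, &old_vers);
+
+The caller then uses old_vers, or treats the row as not existing in the
+view when old_vers is NULL. */
+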
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+UNIV_INTERN
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+ __attribute__((nonnull(1,2,3,4,5)));
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/row0vers.ic b/storage/innobase/include/row0vers.ic
new file mode 100644
index 00000000000..ef43a55bf70
--- /dev/null
+++ b/storage/innobase/include/row0vers.ic
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.ic
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
diff --git a/storage/innobase/include/srv0conc.h b/storage/innobase/include/srv0conc.h
new file mode 100644
index 00000000000..cf61ef5528d
--- /dev/null
+++ b/storage/innobase/include/srv0conc.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0conc.h
+
+InnoDB concurrency manager header file
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#ifndef srv_conc_h
+#define srv_conc_h
+
+/** We are prepared for a situation in which this many threads are waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+extern ulint srv_max_n_threads;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted in the number because otherwise
+we could get a deadlock. A value of 0 disables the concurrency check. */
+
+extern ulong srv_thread_concurrency;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void);
+/*===============*/
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void);
+/*===============*/
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx); /*!< in: transaction object associated
+ with the thread */
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx); /*!< in: transaction object associated with
+ the thread */
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx); /*!< in: transaction object associated with
+ the thread */
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void);
+/*==============================*/
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void);
+/*==============================*/
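+
+/* A minimal usage sketch: a thread entering InnoDB through the
+concurrency manager pairs the enter call with a forced exit at the end
+of the SQL statement, per the comments above:
+
+	srv_conc_enter_innodb(trx);
+	... work inside InnoDB ...
+	srv_conc_force_exit_innodb(trx);
+*/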
+
+#endif /* srv_conc_h */
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000000..e2ab81bf53a
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,896 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+ MONITOR_STARTED = 1, /*!< Monitor has been turned on */
+ MONITOR_STOPPED = 2 /*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status monitor_running_t;
+
+/** Monitor counter value type */
+typedef ib_int64_t mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t", which contains the dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined;
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into the above two structures for the counter's
+dynamic and static information.
+Developers who intend to add new counters are required to
+fill in the counter information as described in "monitor_info_t" and
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+ ib_time_t mon_start_time; /*!< Start time of monitoring */
+ ib_time_t mon_stop_time; /*!< Stop time of monitoring */
+ ib_time_t mon_reset_time; /*!< Time the counter was reset */
+ mon_type_t mon_value; /*!< Current counter Value */
+ mon_type_t mon_max_value; /*!< Current Max value */
+ mon_type_t mon_min_value; /*!< Current Min value */
+ mon_type_t mon_value_reset;/*!< value at last reset */
+ mon_type_t mon_max_value_start; /*!< Max value since start */
+ mon_type_t mon_min_value_start; /*!< Min value since start */
+ mon_type_t mon_start_value;/*!< Value at the start time */
+ mon_type_t mon_last_value; /*!< Last set of values */
+ monitor_running_t mon_status; /* whether monitor still running */
+};
+
+/** The following are possible values for the "monitor_type" field in
+"struct monitor_info_t" */
+enum monitor_type_t {
+ MONITOR_NONE = 0, /*!< No monitoring */
+ MONITOR_MODULE = 1, /*!< This is a monitor module type,
+ not a counter */
+ MONITOR_EXISTING = 2, /*!< The monitor carries information from
+ an existing system status variable */
+ MONITOR_NO_AVERAGE = 4, /*!< Set this status if we don't want to
+ calculate the average value for the counter */
+ MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+ counter, rather than incremental value
+ over the period. Mostly for counters
+ displaying current resource usage */
+ MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+ only as a module, but not individually */
+ MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+ server start up */
+ MONITOR_SET_OWNER = 64, /*!< Owner of "monitor set", a set of
+ monitor counters */
+ MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+ MONITOR_HIDDEN = 256 /*!< Do not display this monitor in the
+ metrics table */
+};
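+
+/* Note: the values above are powers of two, so a counter's monitor_type
+can combine several properties by ORing them; for example, a counter
+marked (MONITOR_EXISTING | MONITOR_DEFAULT_ON) mirrors an existing
+status variable and is enabled at server startup. */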
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (ib_int64_t) */
+#define MIN_RESERVED ((mon_type_t) (IB_UINT64_MAX >> 1))
+#define MAX_RESERVED (~MIN_RESERVED)
+
+/** This enumeration defines the internal monitor identifiers used
+to identify each particular counter. Each value indexes into two arrays:
+the "innodb_counter_value" array, which records the actual monitor
+counter values, and the "innodb_counter_info" array, which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules apply:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information on each monitor counter */
+
+enum monitor_id_t {
+ /* This is to identify the default value set by the metrics
+ control global variables */
+ MONITOR_DEFAULT_START = 0,
+
+ /* Start of Metadata counter */
+ MONITOR_MODULE_METADATA,
+ MONITOR_TABLE_OPEN,
+ MONITOR_TABLE_CLOSE,
+ MONITOR_TABLE_REFERENCE,
+ MONITOR_OVLD_META_MEM_POOL,
+
+ /* Lock manager related counters */
+ MONITOR_MODULE_LOCK,
+ MONITOR_DEADLOCK,
+ MONITOR_TIMEOUT,
+ MONITOR_LOCKREC_WAIT,
+ MONITOR_TABLELOCK_WAIT,
+ MONITOR_NUM_RECLOCK_REQ,
+ MONITOR_RECLOCK_CREATED,
+ MONITOR_RECLOCK_REMOVED,
+ MONITOR_NUM_RECLOCK,
+ MONITOR_TABLELOCK_CREATED,
+ MONITOR_TABLELOCK_REMOVED,
+ MONITOR_NUM_TABLELOCK,
+ MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+ MONITOR_OVLD_LOCK_WAIT_TIME,
+ MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+ MONITOR_OVLD_ROW_LOCK_WAIT,
+ MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+ /* Buffer and I/O related counters. */
+ MONITOR_MODULE_BUFFER,
+ MONITOR_OVLD_BUFFER_POOL_SIZE,
+ MONITOR_OVLD_BUF_POOL_READS,
+ MONITOR_OVLD_BUF_POOL_READ_REQUESTS,
+ MONITOR_OVLD_BUF_POOL_WRITE_REQUEST,
+ MONITOR_OVLD_BUF_POOL_WAIT_FREE,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD,
+ MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED,
+ MONITOR_OVLD_BUF_POOL_PAGE_TOTAL,
+ MONITOR_OVLD_BUF_POOL_PAGE_MISC,
+ MONITOR_OVLD_BUF_POOL_PAGES_DATA,
+ MONITOR_OVLD_BUF_POOL_BYTES_DATA,
+ MONITOR_OVLD_BUF_POOL_PAGES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_BYTES_DIRTY,
+ MONITOR_OVLD_BUF_POOL_PAGES_FREE,
+ MONITOR_OVLD_PAGE_CREATED,
+ MONITOR_OVLD_PAGES_WRITTEN,
+ MONITOR_OVLD_PAGES_READ,
+ MONITOR_OVLD_BYTE_READ,
+ MONITOR_OVLD_BYTE_WRITTEN,
+ MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
+ MONITOR_FLUSH_HP_RESCAN,
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_PAGES,
+ MONITOR_FLUSH_N_TO_FLUSH_REQUESTED,
+ MONITOR_FLUSH_AVG_PAGE_RATE,
+ MONITOR_FLUSH_LSN_AVG_RATE,
+ MONITOR_FLUSH_PCT_FOR_DIRTY,
+ MONITOR_FLUSH_PCT_FOR_LSN,
+ MONITOR_FLUSH_SYNC_WAITS,
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES,
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL,
+ MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_COUNT,
+ MONITOR_LRU_BATCH_PAGES,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
+ MONITOR_LRU_GET_FREE_SEARCH,
+ MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
+
+ /* Buffer Page I/O specific counters. */
+ MONITOR_MODULE_BUF_PAGE,
+ MONITOR_INDEX_LEAF_PAGE_READ,
+ MONITOR_INDEX_NON_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_READ,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ,
+ MONITOR_UNDO_LOG_PAGE_READ,
+ MONITOR_INODE_PAGE_READ,
+ MONITOR_IBUF_FREELIST_PAGE_READ,
+ MONITOR_IBUF_BITMAP_PAGE_READ,
+ MONITOR_SYSTEM_PAGE_READ,
+ MONITOR_TRX_SYSTEM_PAGE_READ,
+ MONITOR_FSP_HDR_PAGE_READ,
+ MONITOR_XDES_PAGE_READ,
+ MONITOR_BLOB_PAGE_READ,
+ MONITOR_ZBLOB_PAGE_READ,
+ MONITOR_ZBLOB2_PAGE_READ,
+ MONITOR_OTHER_PAGE_READ,
+ MONITOR_INDEX_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN,
+ MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN,
+ MONITOR_UNDO_LOG_PAGE_WRITTEN,
+ MONITOR_INODE_PAGE_WRITTEN,
+ MONITOR_IBUF_FREELIST_PAGE_WRITTEN,
+ MONITOR_IBUF_BITMAP_PAGE_WRITTEN,
+ MONITOR_SYSTEM_PAGE_WRITTEN,
+ MONITOR_TRX_SYSTEM_PAGE_WRITTEN,
+ MONITOR_FSP_HDR_PAGE_WRITTEN,
+ MONITOR_XDES_PAGE_WRITTEN,
+ MONITOR_BLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB_PAGE_WRITTEN,
+ MONITOR_ZBLOB2_PAGE_WRITTEN,
+ MONITOR_OTHER_PAGE_WRITTEN,
+
+ /* OS level counters (I/O) */
+ MONITOR_MODULE_OS,
+ MONITOR_OVLD_OS_FILE_READ,
+ MONITOR_OVLD_OS_FILE_WRITE,
+ MONITOR_OVLD_OS_FSYNC,
+ MONITOR_OS_PENDING_READS,
+ MONITOR_OS_PENDING_WRITES,
+ MONITOR_OVLD_OS_LOG_WRITTEN,
+ MONITOR_OVLD_OS_LOG_FSYNC,
+ MONITOR_OVLD_OS_LOG_PENDING_FSYNC,
+ MONITOR_OVLD_OS_LOG_PENDING_WRITES,
+
+ /* Transaction related counters */
+ MONITOR_MODULE_TRX,
+ MONITOR_TRX_RW_COMMIT,
+ MONITOR_TRX_RO_COMMIT,
+ MONITOR_TRX_NL_RO_COMMIT,
+ MONITOR_TRX_COMMIT_UNDO,
+ MONITOR_TRX_ROLLBACK,
+ MONITOR_TRX_ROLLBACK_SAVEPOINT,
+ MONITOR_TRX_ROLLBACK_ACTIVE,
+ MONITOR_TRX_ACTIVE,
+ MONITOR_RSEG_HISTORY_LEN,
+ MONITOR_NUM_UNDO_SLOT_USED,
+ MONITOR_NUM_UNDO_SLOT_CACHED,
+ MONITOR_RSEG_CUR_SIZE,
+
+ /* Purge related counters */
+ MONITOR_MODULE_PURGE,
+ MONITOR_N_DEL_ROW_PURGE,
+ MONITOR_N_UPD_EXIST_EXTERN,
+ MONITOR_PURGE_INVOKED,
+ MONITOR_PURGE_N_PAGE_HANDLED,
+ MONITOR_DML_PURGE_DELAY,
+ MONITOR_PURGE_STOP_COUNT,
+ MONITOR_PURGE_RESUME_COUNT,
+
+ /* Recovery related counters */
+ MONITOR_MODULE_RECOVERY,
+ MONITOR_NUM_CHECKPOINT,
+ MONITOR_OVLD_LSN_FLUSHDISK,
+ MONITOR_OVLD_LSN_CHECKPOINT,
+ MONITOR_OVLD_LSN_CURRENT,
+ MONITOR_LSN_CHECKPOINT_AGE,
+ MONITOR_OVLD_BUF_OLDEST_LSN,
+ MONITOR_OVLD_MAX_AGE_ASYNC,
+ MONITOR_OVLD_MAX_AGE_SYNC,
+ MONITOR_PENDING_LOG_WRITE,
+ MONITOR_PENDING_CHECKPOINT_WRITE,
+ MONITOR_LOG_IO,
+ MONITOR_OVLD_LOG_WAITS,
+ MONITOR_OVLD_LOG_WRITE_REQUEST,
+ MONITOR_OVLD_LOG_WRITES,
+
+ /* Page Manager related counters */
+ MONITOR_MODULE_PAGE,
+ MONITOR_PAGE_COMPRESS,
+ MONITOR_PAGE_DECOMPRESS,
+ MONITOR_PAD_INCREMENTS,
+ MONITOR_PAD_DECREMENTS,
+
+ /* Index related counters */
+ MONITOR_MODULE_INDEX,
+ MONITOR_INDEX_SPLIT,
+ MONITOR_INDEX_MERGE_ATTEMPTS,
+ MONITOR_INDEX_MERGE_SUCCESSFUL,
+ MONITOR_INDEX_REORG_ATTEMPTS,
+ MONITOR_INDEX_REORG_SUCCESSFUL,
+ MONITOR_INDEX_DISCARD,
+
+ /* Adaptive Hash Index related counters */
+ MONITOR_MODULE_ADAPTIVE_HASH,
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
+ MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
+ MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
+ MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_ADDED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVED,
+ MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND,
+ MONITOR_ADAPTIVE_HASH_ROW_UPDATED,
+
+ /* Tablespace related counters */
+ MONITOR_MODULE_FIL_SYSTEM,
+ MONITOR_OVLD_N_FILE_OPENED,
+
+ /* InnoDB Change Buffer related counters */
+ MONITOR_MODULE_IBUF_SYSTEM,
+ MONITOR_OVLD_IBUF_MERGE_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_PURGE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+ MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+ MONITOR_OVLD_IBUF_MERGES,
+ MONITOR_OVLD_IBUF_SIZE,
+
+ /* Counters for server operations */
+ MONITOR_MODULE_SERVER,
+ MONITOR_MASTER_THREAD_SLEEP,
+ MONITOR_OVLD_SERVER_ACTIVITY,
+ MONITOR_MASTER_ACTIVE_LOOPS,
+ MONITOR_MASTER_IDLE_LOOPS,
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+ MONITOR_SRV_IBUF_MERGE_MICROSECOND,
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+ MONITOR_SRV_MEM_VALIDATE_MICROSECOND,
+ MONITOR_SRV_PURGE_MICROSECOND,
+ MONITOR_SRV_DICT_LRU_MICROSECOND,
+ MONITOR_SRV_CHECKPOINT_MICROSECOND,
+ MONITOR_OVLD_SRV_DBLWR_WRITES,
+ MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+ MONITOR_OVLD_SRV_PAGE_SIZE,
+ MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
+ MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
+ MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
+ MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
+ MONITOR_OVLD_RWLOCK_S_OS_WAITS,
+ MONITOR_OVLD_RWLOCK_X_OS_WAITS,
+
+ /* Data DML related counters */
+ MONITOR_MODULE_DML_STATS,
+ MONITOR_OLVD_ROW_READ,
+ MONITOR_OLVD_ROW_INSERTED,
+ MONITOR_OLVD_ROW_DELETED,
+ MONITOR_OLVD_ROW_UPDTATED,
+
+ /* Data DDL related counters */
+ MONITOR_MODULE_DDL_STATS,
+ MONITOR_BACKGROUND_DROP_INDEX,
+ MONITOR_BACKGROUND_DROP_TABLE,
+ MONITOR_ONLINE_CREATE_INDEX,
+ MONITOR_PENDING_ALTER_TABLE,
+
+ MONITOR_MODULE_ICP,
+ MONITOR_ICP_ATTEMPTS,
+ MONITOR_ICP_NO_MATCH,
+ MONITOR_ICP_OUT_OF_RANGE,
+ MONITOR_ICP_MATCH,
+
+ /* This is used only for control system to turn
+ on/off and reset all monitor counters */
+ MONITOR_ALL_COUNTER,
+
+ /* This must be the last member */
+ NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wildcard matching */
+#define MONITOR_WILDCARD_MATCH (NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define MONITOR_NO_MATCH (NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+ const char* monitor_name; /*!< Monitor name */
+ const char* monitor_module; /*!< Sub Module the monitor
+ belongs to */
+ const char* monitor_desc; /*!< Brief desc of monitor counter */
+ monitor_type_t monitor_type; /*!< Type of Monitor Info */
+ monitor_id_t monitor_related_id;/*!< Monitor ID of counter that
+ related to this monitor. This is
+ set when the monitor belongs to
+ a "monitor set" */
+ monitor_id_t monitor_id; /*!< Monitor ID as defined in enum
+ monitor_id_t */
+};
+
+/** The following are the "set_option" values allowed for
+srv_mon_process_existing_counter() and related monitor control
+functions, used to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+ MONITOR_TURN_ON = 1, /*!< Turn on the counter */
+ MONITOR_TURN_OFF, /*!< Turn off the counter */
+ MONITOR_RESET_VALUE, /*!< Reset current values */
+ MONITOR_RESET_ALL_VALUE, /*!< Reset all values */
+ MONITOR_GET_VALUE /*!< Option for
+ srv_mon_process_existing_counter()
+ function */
+};
+
+/** Number of bits in a ulint datatype */
+#define NUM_BITS_ULINT (sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap records whether a particular monitor
+counter has been turned on or off */
+extern ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) /
+ NUM_BITS_ULINT];
+
+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option. */
+#define MONITOR_ON(monitor) \
+ (monitor_set_tbl[monitor / NUM_BITS_ULINT] |= \
+ ((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+#define MONITOR_OFF(monitor) \
+ (monitor_set_tbl[monitor / NUM_BITS_ULINT] &= \
+ ~((ulint)1 << (monitor % NUM_BITS_ULINT)))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor) \
+ (monitor_set_tbl[monitor / NUM_BITS_ULINT] & \
+ ((ulint)1 << (monitor % NUM_BITS_ULINT)))
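+
+/* Worked example: with a 64-bit ulint, NUM_BITS_ULINT == 64, so monitor
+id 70 maps to word 70 / 64 == 1 and bit 70 % 64 == 6; MONITOR_ON(70)
+sets bit 6 of monitor_set_tbl[1] and MONITOR_IS_ON(70) tests it. */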
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t innodb_counter_value[NUM_MONITOR];
+
+/** The following are macro defines for basic monitor counter manipulation.
+Note that we do not provide any synchronization for these monitor
+operations, for performance reasons. Most counters can
+be placed under the existing mutex protection of their respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field) \
+ (innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor) \
+ MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor) \
+ MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor) \
+ MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor) \
+ (MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
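+
+/* Example (assuming mon_value restarts from zero at a reset): if a
+counter reached 100 before a reset (mon_value_reset == 100) and has
+counted 25 events since (mon_value == 25), then
+MONITOR_VALUE_SINCE_START() yields 125. */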
+
+#define MONITOR_STATUS(monitor) \
+ MONITOR_FIELD(monitor, mon_status)
+
+#define MONITOR_SET_START(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STARTED; \
+ MONITOR_FIELD((monitor), mon_start_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_SET_OFF(monitor) \
+ do { \
+ MONITOR_STATUS(monitor) = MONITOR_STOPPED; \
+ MONITOR_FIELD((monitor), mon_stop_time) = time(NULL); \
+ } while (0)
+
+#define MONITOR_INIT_ZERO_VALUE 0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor) \
+ (MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE \
+ && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor) \
+ if (MONITOR_MAX_MIN_NOT_INIT(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ }
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define MONITOR_INC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
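+
+/* A minimal usage sketch; MONITOR_INC assumes the surrounding code path
+is already serialized, e.g. a lock-module counter bumped while the lock
+system mutex is held:
+
+	MONITOR_INC(MONITOR_DEADLOCK);
+
+Without such protection, use the MONITOR_ATOMIC_INC()/MONITOR_MUTEX_INC()
+variants defined below. */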
+
+/** Increment a monitor counter under mutex protection.
+Use MONITOR_INC if appropriate mutex protection already exists.
+@param monitor monitor to be incremented by 1
+@param mutex mutex to acquire and release */
+# define MONITOR_MUTEX_INC(mutex, monitor) \
+ ut_ad(!mutex_own(mutex)); \
+ if (MONITOR_IS_ON(monitor)) { \
+ mutex_enter(mutex); \
+ if (++MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor); \
+ } \
+ mutex_exit(mutex); \
+ }
+/** Decrement a monitor counter under mutex protection.
+Use MONITOR_DEC if appropriate mutex protection already exists.
+@param monitor monitor to be decremented by 1
+@param mutex mutex to acquire and release */
+# define MONITOR_MUTEX_DEC(mutex, monitor) \
+ ut_ad(!mutex_own(mutex)); \
+ if (MONITOR_IS_ON(monitor)) { \
+ mutex_enter(mutex); \
+ if (--MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor); \
+ } \
+ mutex_exit(mutex); \
+ }
+
+#if defined HAVE_ATOMIC_BUILTINS_64
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ ib_uint64_t value; \
+ value = os_atomic_increment_uint64( \
+ (ib_uint64_t*) &MONITOR_VALUE(monitor), 1); \
+ /* Note: This is not 100% accurate because of the \
+ inherent race, we ignore it due to performance. */ \
+ if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = value; \
+ } \
+ }
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ ib_uint64_t value; \
+ value = os_atomic_decrement_uint64( \
+ (ib_uint64_t*) &MONITOR_VALUE(monitor), 1); \
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it for performance reasons. */	\
+ if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = value; \
+ } \
+ }
+# define srv_mon_create() ((void) 0)
+# define srv_mon_free() ((void) 0)
+#else /* HAVE_ATOMIC_BUILTINS_64 */
+/** Mutex protecting atomic operations on platforms that lack
+built-in operations for atomic memory access */
+extern ib_mutex_t monitor_mutex;
+/****************************************************************//**
+Initialize the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_create(void);
+/*================*/
+/****************************************************************//**
+Close the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_free(void);
+/*==============*/
+
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor monitor to be incremented by 1 */
+# define MONITOR_ATOMIC_INC(monitor) MONITOR_MUTEX_INC(&monitor_mutex, monitor)
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor monitor to be decremented by 1 */
+# define MONITOR_ATOMIC_DEC(monitor) MONITOR_MUTEX_DEC(&monitor_mutex, monitor)
+#endif /* HAVE_ATOMIC_BUILTINS_64 */
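+
+/* Illustrative sketch (editor's addition): call sites look the same on
+both paths above; MONITOR_ATOMIC_INC resolves to a lock-free atomic when
+64-bit atomic builtins exist and to monitor_mutex otherwise.
+MONITOR_EXAMPLE is a hypothetical placeholder counter id:
+
+	MONITOR_ATOMIC_INC(MONITOR_EXAMPLE);
+	MONITOR_ATOMIC_DEC(MONITOR_EXAMPLE);
+*/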
+
+#define MONITOR_DEC(monitor) \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+#ifdef UNIV_DEBUG_VALGRIND
+# define MONITOR_CHECK_DEFINED(value) do { \
+ mon_type_t m = value; \
+ UNIV_MEM_ASSERT_RW(&m, sizeof m); \
+} while (0)
+#else /* UNIV_DEBUG_VALGRIND */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#define MONITOR_INC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+#define MONITOR_DEC_VALUE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+ MONITOR_VALUE(monitor) -= (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/* Increment/decrement the counter without checking the monitor on/off bit,
+which could already have been checked as a module group */
+#define MONITOR_INC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)++; \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+	} while (0)
+
+#define MONITOR_DEC_NOCHECK(monitor) \
+ do { \
+ MONITOR_VALUE(monitor)--; \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ } while (0)
+
+/** Directly set a monitor counter's value */
+#define MONITOR_SET(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) { \
+ MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Add the time difference between now and the input "value"
+(in microseconds) to the monitor counter
+@param monitor monitor to update for the time difference
+@param value the start time value */
+#define MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ ullint old_time = (value); \
+ value = ut_time_us(NULL); \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+ }
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor' to see whether it is on or off, to
+decide whether to do the update.
+@param monitor the main monitor counter to update. It accumulates
+		the total value for the counter.
+@param monitor_n_calls counter that counts the number of times this macro
+		is called
+@param monitor_per_call counter that records the current and max value of
+		each incremental value
+@param value incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE( \
+ monitor, monitor_n_calls, monitor_per_call, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor_n_calls)++; \
+ MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor_per_call) \
+ > MONITOR_MAX_VALUE(monitor_per_call)) { \
+ MONITOR_MAX_VALUE(monitor_per_call) = \
+ (mon_type_t) (value); \
+ } \
+ MONITOR_VALUE(monitor) += (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
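+
+/* Illustrative sketch (editor's addition): updating an accumulated total,
+a call count and a per-call value in a single statement. The three counter
+ids below are hypothetical placeholders for a related triple registered in
+innodb_counter_info:
+
+	MONITOR_INC_VALUE_CUMULATIVE(
+		MONITOR_EXAMPLE_TOTAL, MONITOR_EXAMPLE_COUNT,
+		MONITOR_EXAMPLE_PER_CALL, n_pages);
+*/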
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only max value needs to be updated */
+#define MONITOR_SET_UPD_MAX_ONLY(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) { \
+ MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+ } \
+ }
+
+/** Some values, such as the log sequence number, are monotonically
+increasing numbers and do not need max/min values recorded */
+#define MONITOR_SET_SIMPLE(monitor, value) \
+ MONITOR_CHECK_DEFINED(value); \
+ if (MONITOR_IS_ON(monitor)) { \
+ MONITOR_VALUE(monitor) = (mon_type_t) (value); \
+ }
+
+/** Reset the monitor value and max/min value to zero. The reset
+operation would only be conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor) \
+ do { \
+ MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED; \
+ MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED; \
+ MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED; \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_start_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_stop_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ MONITOR_FIELD(monitor, mon_reset_time) = \
+ MONITOR_INIT_ZERO_VALUE; \
+ } while (0)
+
+/** The following macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to mon_start_value field of monitor
+counters */
+#define MONITOR_SAVE_START(monitor, value) do { \
+ MONITOR_CHECK_DEFINED(value); \
+ (MONITOR_START_VALUE(monitor) = \
+ (mon_type_t) (value) - MONITOR_VALUE_RESET(monitor)); \
+ } while (0)
+
+/** Save the passed-in value to mon_last_value field of monitor
+counters */
+#define MONITOR_SAVE_LAST(monitor) \
+ do { \
+ MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor); \
+ MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor); \
+ } while (0)
+
+/** Set monitor value to the difference of value and mon_start_value
+compensated by mon_last_value if accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value) \
+ MONITOR_SET_UPD_MAX_ONLY(monitor, ((value) \
+ - MONITOR_VALUE_RESET(monitor) \
+ - MONITOR_FIELD(monitor, mon_start_value) \
+ + MONITOR_FIELD(monitor, mon_last_value)))
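+
+/* Illustrative sketch (editor's addition) of the consolidation flow for
+an existing status variable, as described above: remember the raw value
+when the monitor is turned on, publish the difference on each fetch, and
+fold the value back in when the monitor is turned off. MONITOR_EXAMPLE
+and raw_status_value are hypothetical placeholders:
+
+	MONITOR_SAVE_START(MONITOR_EXAMPLE, raw_status_value);
+	MONITOR_SET_DIFF(MONITOR_EXAMPLE, raw_status_value);
+	MONITOR_SAVE_LAST(MONITOR_EXAMPLE);
+*/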
+
+/****************************************************************//**
+Get monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array).
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+/****************************************************************//**
+Get monitor's name by its monitor id (index into the
+innodb_counter_info array).
+@return corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+ monitor_id_t monitor_id); /*!< id index into the
+ innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is NUM_MONITOR then turn on all monitor counters. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+ monitor_id_t module_id, /*!< in: Module ID as in
+ monitor_counter_id. If it is
+ set to NUM_MONITOR, this means
+ we shall turn on all the counters */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and do the appropriate
+mathematics to deduce the actual value. */
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+ monitor_id_t monitor_id, /*!< in: the monitor's ID as in
+ monitor_counter_id */
+ mon_option_t set_option); /*!< in: Turn on/off reset the
+ counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id */
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Reset a monitor, create a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor); /*!< in: monitor id*/
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+UNIV_INTERN
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#ifndef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+#else /* !UNIV_HOTBACKUP */
+# define MONITOR_INC(x) ((void) 0)
+# define MONITOR_DEC(x) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/srv0mon.ic b/storage/innobase/include/srv0mon.ic
new file mode 100644
index 00000000000..225390c6b6f
--- /dev/null
+++ b/storage/innobase/include/srv0mon.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.ic
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+ /* MONITOR_MAX_VALUE_START has not yet been
+	initialized; the max value since start is the
+ max count in MONITOR_MAX_VALUE */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor);
+
+ } else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+ && (MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ > MONITOR_MAX_VALUE_START(monitor))) {
+
+ /* If the max value since reset (as specified
+ in MONITOR_MAX_VALUE) plus the reset value is
+ larger than MONITOR_MAX_VALUE_START, reset
+ MONITOR_MAX_VALUE_START to this new max value */
+ MONITOR_MAX_VALUE_START(monitor) =
+ MONITOR_MAX_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) {
+
+ /* MONITOR_MIN_VALUE_START has not yet been
+	initialized; the min value since start is the
+ min count in MONITOR_MIN_VALUE */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor);
+
+ } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED
+ && (MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor)
+ < MONITOR_MIN_VALUE_START(monitor))) {
+
+ /* If the min value since reset (as specified
+ in MONITOR_MIN_VALUE) plus the reset value is
+ less than MONITOR_MIN_VALUE_START, reset
+ MONITOR_MIN_VALUE_START to this new min value */
+ MONITOR_MIN_VALUE_START(monitor) =
+ MONITOR_MIN_VALUE(monitor)
+ + MONITOR_VALUE_RESET(monitor);
+ }
+
+ return(MONITOR_MIN_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ /* Do not reset all counter values if monitor is still on. */
+ if (MONITOR_IS_ON(monitor)) {
+ fprintf(stderr, "InnoDB: Cannot reset all values for "
+ "monitor counter %s while it is on. Please "
+			"turn it off and retry.\n",
+ srv_mon_get_name(monitor));
+ } else {
+ MONITOR_RESET_ALL(monitor);
+ }
+}
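+
+/* Illustrative sketch (editor's addition): a counter (or its module) has
+to be switched off before srv_mon_reset_all() will clear its values.
+MONITOR_EXAMPLE is a hypothetical placeholder id, and MONITOR_TURN_OFF is
+assumed to be one of the mon_option_t values declared in srv0mon.h:
+
+	srv_mon_set_module_control(MONITOR_EXAMPLE, MONITOR_TURN_OFF);
+	srv_mon_reset_all(MONITOR_EXAMPLE);
+*/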
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
new file mode 100644
index 00000000000..7a6c9f93e3d
--- /dev/null
+++ b/storage/innobase/include/srv0srv.h
@@ -0,0 +1,888 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "log0log.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "srv0conc.h"
+#include "buf0checksum.h"
+#include "ut0counter.h"
+
+/* Global counters used inside InnoDB. */
+struct srv_stats_t {
+ typedef ib_counter_t<lsn_t, 1, single_indexer_t> lsn_ctr_1_t;
+ typedef ib_counter_t<ulint, 1, single_indexer_t> ulint_ctr_1_t;
+ typedef ib_counter_t<lint, 1, single_indexer_t> lint_ctr_1_t;
+ typedef ib_counter_t<ulint, 64> ulint_ctr_64_t;
+ typedef ib_counter_t<ib_int64_t, 1, single_indexer_t> ib_int64_ctr_1_t;
+
+ /** Count the amount of data written in total (in bytes) */
+ ulint_ctr_1_t data_written;
+
+	/** Number of log write requests done */
+ ulint_ctr_1_t log_write_requests;
+
+ /** Number of physical writes to the log performed */
+ ulint_ctr_1_t log_writes;
+
+ /** Amount of data written to the log files in bytes */
+ lsn_ctr_1_t os_log_written;
+
+ /** Number of writes being done to the log files */
+ lint_ctr_1_t os_log_pending_writes;
+
+	/** We increase this counter when we don't have enough
+ space in the log buffer and have to flush it */
+ ulint_ctr_1_t log_waits;
+
+ /** Count the number of times the doublewrite buffer was flushed */
+ ulint_ctr_1_t dblwr_writes;
+
+ /** Store the number of pages that have been flushed to the
+ doublewrite buffer */
+ ulint_ctr_1_t dblwr_pages_written;
+
+ /** Store the number of write requests issued */
+ ulint_ctr_1_t buf_pool_write_requests;
+
+ /** Store the number of times when we had to wait for a free page
+ in the buffer pool. It happens when the buffer pool is full and we
+ need to make a flush, in order to be able to read or create a page. */
+ ulint_ctr_1_t buf_pool_wait_free;
+
+ /** Count the number of pages that were written from buffer
+ pool to the disk */
+ ulint_ctr_1_t buf_pool_flushed;
+
+ /** Number of buffer pool reads that led to the reading of
+ a disk page */
+ ulint_ctr_1_t buf_pool_reads;
+
+	/** Amount of data read in total (in bytes) */
+ ulint_ctr_1_t data_read;
+
+ /** Wait time of database locks */
+ ib_int64_ctr_1_t n_lock_wait_time;
+
+ /** Number of database lock waits */
+ ulint_ctr_1_t n_lock_wait_count;
+
+ /** Number of threads currently waiting on database locks */
+ lint_ctr_1_t n_lock_wait_current_count;
+
+ /** Number of rows read. */
+ ulint_ctr_64_t n_rows_read;
+
+ /** Number of rows updated */
+ ulint_ctr_64_t n_rows_updated;
+
+ /** Number of rows deleted */
+ ulint_ctr_64_t n_rows_deleted;
+
+ /** Number of rows inserted */
+ ulint_ctr_64_t n_rows_inserted;
+};
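+
+/* Illustrative sketch (editor's addition): the members above are
+ib_counter_t instances from ut0counter.h; assuming that template exposes
+inc() and add() accessors as the typedefs suggest, call sites would look
+like:
+
+	srv_stats.log_write_requests.inc();
+	srv_stats.n_rows_read.add(n_rows);
+*/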
+
+extern const char* srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char srv_mysql50_table_name_prefix[10];
+
+/* The monitor thread waits on this event. */
+extern os_event_t srv_monitor_event;
+
+/* The error monitor thread waits on this event. */
+extern os_event_t srv_error_event;
+
+/** The buffer pool dump/load thread waits on this event. */
+extern os_event_t srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT "ib_buffer_pool"
+extern char* srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char srv_buffer_pool_dump_at_shutdown;
+extern char srv_buffer_pool_load_at_startup;
+
+/* Whether to disable file system cache if it is defined */
+extern char srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT \
+ (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
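+
+/* Worked example (editor's addition): with UNIV_PAGE_SIZE = 16384 and
+srv_auto_extend_increment = 8 (megabytes), the macro above yields
+8 * (1048576 / 16384) = 8 * 64 = 512 pages, i.e. 8 MB per extension. */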
+
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+extern ib_mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE* srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern ib_mutex_t srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE* srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern ib_mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE* srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+
+#ifdef UNIV_LOG_ARCHIVE
+extern char* srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool srv_read_only_mode;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool srv_file_per_table;
+/** Sleep delay for threads waiting to enter InnoDB. In micro-seconds. */
+extern ulong srv_thread_sleep_delay;
+#if defined(HAVE_ATOMIC_BUILTINS)
+/** Maximum sleep delay (in micro-seconds), value of 0 disables it.*/
+extern ulong srv_adaptive_max_sleep_delay;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/** The file format to use on new *.ibd files. */
+extern ulint srv_file_format;
+/** Whether to check file format during startup. A value of
+UNIV_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+extern ulint srv_max_file_format_at_startup;
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+extern ibool srv_locks_unsafe_for_binlog;
+
+/** Sort buffer size in index creation */
+extern ulong srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use the simulated aio that we build below with threads.
+Currently we support native aio on Windows and Linux */
+extern my_bool srv_use_native_aio;
+#ifdef __WIN__
+extern ibool srv_use_native_conditions;
+#endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Server undo tablespaces directory, can be absolute path. */
+extern char* srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern ulong srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern ulint srv_undo_tablespaces_open;
+
+/* The number of undo segments to use */
+extern ulong srv_undo_logs;
+
+extern ulint srv_n_data_files;
+extern char** srv_data_file_names;
+extern ulint* srv_data_file_sizes;
+extern ulint* srv_data_file_is_raw_partition;
+
+extern ibool srv_auto_extend_last_data_file;
+extern ulint srv_last_file_size_max;
+extern char* srv_log_group_home_dir;
+#ifndef UNIV_HOTBACKUP
+extern ulong srv_auto_extend_increment;
+
+extern ibool srv_created_new_raw;
+
+/** Maximum number of srv_n_log_files, or innodb_log_files_in_group */
+#define SRV_N_LOG_FILES_MAX 100
+extern ulong srv_n_log_files;
+extern ib_uint64_t srv_log_file_size;
+extern ib_uint64_t srv_log_file_size_requested;
+extern ulint srv_log_buffer_size;
+extern ulong srv_flush_log_at_trx_commit;
+extern uint srv_flush_log_at_timeout;
+extern char srv_adaptive_flushing;
+
+/* If this flag is TRUE, then we will load the indexes' (and tables') metadata
+even if they are marked as "corrupted". Mostly it is for DBAs to process
+corrupted indexes and tables */
+extern my_bool srv_load_corrupted;
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+extern const byte* srv_latin1_ordering;
+#ifndef UNIV_HOTBACKUP
+extern my_bool srv_use_sys_malloc;
+#else
+extern ibool srv_use_sys_malloc;
+#endif /* UNIV_HOTBACKUP */
+extern ulint srv_buf_pool_size; /*!< requested size in bytes */
+extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */
+extern ulong srv_n_page_hash_locks; /*!< number of locks to
+ protect buf_pool->page_hash */
+extern ulong srv_LRU_scan_depth; /*!< Scan depth for LRU
+ flush batch */
+extern ulong srv_flush_neighbors; /*!< whether or not to flush
+ neighbors of a block */
+extern ulint srv_buf_pool_old_size; /*!< previously requested size */
+extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+extern ulint srv_n_file_io_threads;
+extern my_bool srv_random_read_ahead;
+extern ulong srv_read_ahead_threshold;
+extern ulint srv_n_read_io_threads;
+extern ulint srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulong srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT (~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT (~0UL)
+extern ulong srv_max_io_capacity;
+/* Returns the number of IO operations that is X percent of the
+capacity. PCT_IO(5) -> returns the number of IO operations that
+is 5% of the max where max is srv_io_capacity. */
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0)))
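+
+/* Worked example (editor's addition): with srv_io_capacity = 200,
+PCT_IO(5) = (ulong) (200 * 0.05) = 10 I/O operations. */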
+
+/* The "innodb_stats_method" setting decides how InnoDB treats
+NULL values when collecting statistics. It is not defined
+as an enum type because the configure option takes an unsigned integer type. */
+extern ulong srv_innodb_stats_method;
+
+#ifdef UNIV_LOG_ARCHIVE
+extern ibool srv_log_archive_on;
+extern ibool srv_archive_recovery;
+extern ib_uint64_t srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+extern char* srv_file_flush_method_str;
+extern ulint srv_unix_file_flush_method;
+extern ulint srv_win_file_flush_method;
+
+extern ulint srv_max_n_open_files;
+
+extern ulong srv_max_dirty_pages_pct;
+extern ulong srv_max_dirty_pages_pct_lwm;
+
+extern ulong srv_adaptive_flushing_lwm;
+extern ulong srv_flushing_avg_loops;
+
+extern ulong srv_force_recovery;
+#ifndef DBUG_OFF
+extern ulong srv_force_recovery_crash;
+#endif /* !DBUG_OFF */
+
+extern ulint srv_fast_shutdown; /*!< If this is 1, do not do a
+ purge and index buffer merge.
+					If this is 2, do not even flush the
+ buffer pool to data files at the
+ shutdown: we effectively 'crash'
+ InnoDB (but lose no committed
+ transactions). */
+extern ibool srv_innodb_status;
+
+extern unsigned long long srv_stats_transient_sample_pages;
+extern my_bool srv_stats_persistent;
+extern unsigned long long srv_stats_persistent_sample_pages;
+extern my_bool srv_stats_auto_recalc;
+
+extern ibool srv_use_doublewrite_buf;
+extern ulong srv_doublewrite_batch_size;
+extern ulong srv_checksum_algorithm;
+
+extern ulong srv_max_buf_pool_modified_pct;
+extern ulong srv_max_purge_lag;
+extern ulong srv_max_purge_lag_delay;
+
+extern ulong srv_replication_delay;
+/*-------------------------------------------*/
+
+extern my_bool srv_print_innodb_monitor;
+extern my_bool srv_print_innodb_lock_monitor;
+extern ibool srv_print_innodb_tablespace_monitor;
+extern ibool srv_print_verbose_log;
+#define DEPRECATED_MSG_INNODB_TABLE_MONITOR \
+ "Using innodb_table_monitor is deprecated and it may be removed " \
+ "in future releases. Please use the InnoDB INFORMATION_SCHEMA " \
+ "tables instead, see " REFMAN "innodb-i_s-tables.html"
+extern ibool srv_print_innodb_table_monitor;
+
+extern ibool srv_monitor_active;
+extern ibool srv_error_monitor_active;
+
+/* TRUE during the lifetime of the buffer pool dump/load thread */
+extern ibool srv_buf_dump_thread_active;
+
+/* TRUE during the lifetime of the stats thread */
+extern ibool srv_dict_stats_thread_active;
+
+extern ulong srv_n_spin_wait_rounds;
+extern ulong srv_n_free_tickets_to_enter;
+extern ulong srv_thread_sleep_delay;
+extern ulong srv_spin_wait_delay;
+extern ibool srv_priority_boost;
+
+extern ulint srv_truncated_status_writes;
+extern ulint srv_available_undo_logs;
+
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern ibool srv_print_thread_releases;
+extern ibool srv_print_lock_waits;
+extern ibool srv_print_buf_io;
+extern ibool srv_print_log_io;
+extern ibool srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases FALSE
+# define srv_print_lock_waits FALSE
+# define srv_print_buf_io FALSE
+# define srv_print_log_io FALSE
+# define srv_print_latch_waits FALSE
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+extern my_bool srv_ibuf_disable_background_merge;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+extern my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+extern ulint srv_fatal_semaphore_wait_threshold;
+#define SRV_SEMAPHORE_WAIT_EXTENSION 7200
+extern ulint srv_dml_needed_delay;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+extern ib_mutex_t server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+#define SRV_MAX_N_IO_THREADS 130
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* the number of purge threads to use from the worker pool (currently 0 or 1) */
+extern ulong srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* the number of sync wait arrays */
+extern ulong srv_sync_array_size;
+
+/* print all user-level transactions deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool srv_cmp_per_index_enabled;
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t srv_stats;
+
+# ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+extern mysql_pfs_key_t buf_page_cleaner_thread_key;
+extern mysql_pfs_key_t trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t io_handler_thread_key;
+extern mysql_pfs_key_t srv_lock_timeout_thread_key;
+extern mysql_pfs_key_t srv_error_monitor_thread_key;
+extern mysql_pfs_key_t srv_monitor_thread_key;
+extern mysql_pfs_key_t srv_master_thread_key;
+extern mysql_pfs_key_t srv_purge_thread_key;
+extern mysql_pfs_key_t recv_writer_thread_key;
+
+/* This macro registers the current thread and its key with the
+performance schema */
+# define pfs_register_thread(key) \
+do { \
+ struct PSI_thread* psi = PSI_THREAD_CALL(new_thread)(key, NULL, 0);\
+ PSI_THREAD_CALL(set_thread)(psi); \
+} while (0)
+
+/* This macro delists the current thread from the performance schema */
+# define pfs_delete_thread() \
+do { \
+ PSI_THREAD_CALL(delete_current_thread)(); \
+} while (0)
+# endif /* UNIV_PFS_THREAD */
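+
+/* Illustrative sketch (editor's addition): a background thread would
+register itself with the performance schema on entry and delist itself
+before exiting; example_thread is a hypothetical placeholder name:
+
+	os_thread_ret_t
+	DECLARE_THREAD(example_thread)(void* arg)
+	{
+		pfs_register_thread(srv_monitor_thread_key);
+		... thread body ...
+		pfs_delete_thread();
+		OS_THREAD_DUMMY_RETURN;
+	}
+*/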
+
+#endif /* !UNIV_HOTBACKUP */
+
+/** Types of raw partitions in innodb_data_file_path */
+enum {
+ SRV_NOT_RAW = 0, /*!< Not a raw partition */
+ SRV_NEW_RAW, /*!< A 'newraw' partition, only to be
+ initialized */
+ SRV_OLD_RAW /*!< An initialized raw partition */
+};
+
+/** Alternatives for the file flush option in Unix; see the InnoDB manual
+about what these mean */
+enum {
+ SRV_UNIX_FSYNC = 1, /*!< fsync, the default */
+ SRV_UNIX_O_DSYNC, /*!< open log files in O_SYNC mode */
+ SRV_UNIX_LITTLESYNC, /*!< do not call os_file_flush()
+ when writing data files, but do flush
+ after writing to log files */
+ SRV_UNIX_NOSYNC, /*!< do not flush after writing */
+ SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on
+ data files. This implies using
+ non-buffered IO but still using fsync,
+ the reason for which is that some FS
+ do not flush meta-data when
+ unbuffered IO happens */
+ SRV_UNIX_O_DIRECT_NO_FSYNC
+ /*!< do not use fsync() when using
+ direct IO i.e.: it can be set to avoid
+ the fsync() call that we make when
+ using SRV_UNIX_O_DIRECT. However, in
+ this case user/DBA should be sure about
+ the integrity of the meta-data */
+};
+
+/** Alternatives for file i/o in Windows */
+enum {
+ SRV_WIN_IO_NORMAL = 1, /*!< buffered I/O */
+ SRV_WIN_IO_UNBUFFERED /*!< unbuffered I/O; this is the default */
+};
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that he can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+ SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it
+ detects a corrupt page */
+ SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
+ running: if a crash would occur
+ in purge, this prevents it */
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ recovery */
+ SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
+ if they would cause a crash, better
+ not do them */
+ SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
+ starting the database: InnoDB will
+ treat even incomplete transactions
+ as committed */
+ SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward
+ in connection with recovery */
+};
+
+/* Alternatives for srv_innodb_stats_method, which could be changed by
+setting innodb_stats_method */
+enum srv_stats_method_name_enum {
+ SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as
+ equal. This is the default setting
+ for innodb_stats_method */
+ SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as
+ NOT equal. */
+ SRV_STATS_NULLS_IGNORED /* NULL values are ignored */
+};
+
+typedef enum srv_stats_method_name_enum srv_stats_method_name_t;
+
+#ifndef UNIV_HOTBACKUP
+/** Types of threads existing in the system. */
+enum srv_thread_type {
+ SRV_NONE, /*!< None */
+ SRV_WORKER, /*!< threads serving parallelized
+ queries and queries released from
+ lock wait */
+ SRV_PURGE, /*!< Purge coordinator thread */
+ SRV_MASTER /*!< the master thread, (whose type
+ number must be biggest) */
+};
+
+/*********************************************************************//**
+Boots Innobase server. */
+UNIV_INTERN
+void
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /*!< in: the 'segment' of the i/o thread */
+ const char* str); /*!< in: constant char string describing the
+ state */
+/*********************************************************************//**
+Resets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_reset_io_thread_op_info();
+/*=========================*/
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our operation with the srv_sys_t::mutex, for
+performance reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void);
+/*=====================================*/
+/*******************************************************************//**
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for the
+ lock_sys_t::mutex */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end); /*!< out: file position of the end of
+ the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void);
+/*==========================*/
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void);
+/*========================*/
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+ ulint old_activity_count); /*!< old activity count */
+/******************************************************************//**
+Increment the server activity counter. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void);
+/*=========================*/
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Check whether any background thread is active. If so, return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
+UNIV_INTERN
+enum srv_thread_type
+srv_get_active_thread_type(void);
+/*============================*/
+
+extern "C" {
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+ void* arg __attribute__((unused))); /*!< in: a dummy parameter
+ required by os_thread_create */
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+ void* arg __attribute__((unused))); /*!< in: a dummy parameter
+ required by os_thread_create */
+} /* extern "C" */
+
+/**********************************************************************//**
+Get count of tasks in the queue.
+@return number of tasks in queue */
+UNIV_INTERN
+ulint
+srv_get_task_queue_length(void);
+/*===========================*/
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+ enum srv_thread_type type, /*!< in: thread type */
+ ulint n); /*!< in: number of threads to release */
+
+/**********************************************************************//**
+Check whether any background threads are active. If so, print which thread
+is active and send the threads a wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void);
+/*=======================================*/
+
+/**********************************************************************//**
+Wakeup the purge threads. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void);
+/*==================*/
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+ ulint innodb_data_pending_reads; /*!< Pending reads */
+ ulint innodb_data_pending_writes; /*!< Pending writes */
+ ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */
+ ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */
+ ulint innodb_data_read; /*!< Data bytes read */
+ ulint innodb_data_writes; /*!< I/O write requests */
+ ulint innodb_data_written; /*!< Data bytes written */
+ ulint innodb_data_reads; /*!< I/O read requests */
+ char innodb_buffer_pool_dump_status[512];/*!< Buf pool dump status */
+ char innodb_buffer_pool_load_status[512];/*!< Buf pool load status */
+ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
+ ulint innodb_buffer_pool_pages_data; /*!< Data pages */
+ ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */
+ ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
+ ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+ ulint innodb_buffer_pool_pages_free; /*!< Free pages */
+#ifdef UNIV_DEBUG
+ ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
+#endif /* UNIV_DEBUG */
+ ulint innodb_buffer_pool_read_requests; /*!< buf_pool->stat.n_page_gets */
+ ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
+ ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */
+ ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */
+ ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+ ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
+ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
+ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
+ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
+ ibool innodb_have_atomic_builtins; /*!< HAVE_ATOMIC_BUILTINS */
+ ulint innodb_log_waits; /*!< srv_log_waits */
+ ulint innodb_log_write_requests; /*!< srv_log_write_requests */
+ ulint innodb_log_writes; /*!< srv_log_writes */
+ lsn_t innodb_os_log_written; /*!< srv_os_log_written */
+ ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */
+ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */
+ ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */
+ ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */
+ ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */
+ ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read */
+ ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */
+ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */
+ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */
+ ib_int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time
+ / 1000 */
+ ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time
+ / 1000
+ / srv_n_lock_wait_count */
+ ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time
+ / 1000 */
+ ulint innodb_rows_read; /*!< srv_n_rows_read */
+ ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */
+ ulint innodb_rows_updated; /*!< srv_n_rows_updated */
+ ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
+ ulint innodb_num_open_files; /*!< fil_n_file_opened */
+ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */
+ ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */
+#ifdef UNIV_DEBUG
+ ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */
+ ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id
+ - purged view's min trx_id */
+#endif /* UNIV_DEBUG */
+};
+
+/** Thread slot in the thread table. */
+struct srv_slot_t{
+ srv_thread_type type; /*!< thread type: user,
+ utility etc. */
+ ibool in_use; /*!< TRUE if this slot
+ is in use */
+ ibool suspended; /*!< TRUE if the thread is
+ waiting for the event of this
+ slot */
+ ib_time_t suspend_time; /*!< time when the thread was
+ suspended. Initialized by
+ lock_wait_table_reserve_slot()
+ for lock wait */
+ ulong wait_timeout; /*!< wait time that if exceeded
+ the thread will be timed out.
+ Initialized by
+ lock_wait_table_reserve_slot()
+ for lock wait */
+ os_event_t event; /*!< event used in suspending
+ the thread when it has nothing
+ to do */
+ que_thr_t* thr; /*!< suspended query thread
+ (only used for user threads) */
+};
+
+#else /* !UNIV_HOTBACKUP */
+# define srv_use_adaptive_hash_indexes FALSE
+# define srv_use_native_aio FALSE
+# define srv_force_recovery 0UL
+# define srv_set_io_thread_op_info(t,info) ((void) 0)
+# define srv_reset_io_thread_op_info() ((void) 0)
+# define srv_is_being_started 0
+# define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED
+# define srv_unix_file_flush_method SRV_UNIX_O_DSYNC
+# define srv_start_raw_disk_in_use 0
+# define srv_file_per_table 1
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/srv0srv.ic b/storage/innobase/include/srv0srv.ic
new file mode 100644
index 00000000000..53405c06f97
--- /dev/null
+++ b/storage/innobase/include/srv0srv.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.ic
+Server main program
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000000..40d502f4459
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0start_h
+#define srv0start_h
+
+#include "univ.i"
+#include "log0log.h"
+#include "ut0byte.h"
+
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR '\\'
+#else
+#define SRV_PATH_SEPARATOR '/'
+#endif
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+ char* str); /*!< in/out: null-terminated character string */
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+ char* str); /*!< in/out: the data file path string */
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void);
+/*==========================*/
+/*********************************************************************//**
+Adds a slash or a backslash to the end of a string if it is missing
+and the string is not empty.
+@return string which has the separator if the string is not empty */
+UNIV_INTERN
+char*
+srv_add_path_separator_if_needed(
+/*=============================*/
+ char* str); /*!< in: null-terminated character string */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Starts Innobase and creates a new database if database files
+are not found and the user wants one.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+/****************************************************************//**
+Shuts down the Innobase database.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+innobase_shutdown_for_mysql(void);
+
+/********************************************************************//**
+Signal all per-table background threads to shut down, and wait for them to
+do so. */
+UNIV_INTERN
+void
+srv_shutdown_table_bg_threads(void);
+/*=============================*/
+
+/*************************************************************//**
+Copy the file path component of the physical file to the parameter. It will
+copy up to and including the terminating path separator.
+@return number of bytes copied or ULINT_UNDEFINED if destination buffer
+ is smaller than the path to be copied. */
+UNIV_INTERN
+ulint
+srv_path_copy(
+/*==========*/
+ char* dest, /*!< out: destination buffer */
+ ulint dest_len, /*!< in: max bytes to copy */
+ const char* basedir, /*!< in: base directory */
+ const char* table_name) /*!< in: source table name */
+ __attribute__((nonnull, warn_unused_result));
+
+/*****************************************************************//**
+Get the meta-data filename from the table name. */
+UNIV_INTERN
+void
+srv_get_meta_data_filename(
+/*======================*/
+ dict_table_t* table, /*!< in: table */
+ char* filename, /*!< out: filename */
+ ulint max_len) /*!< in: filename max length */
+ __attribute__((nonnull));
+
+/** Log sequence number at shutdown */
+extern lsn_t srv_shutdown_lsn;
+/** Log sequence number immediately after startup */
+extern lsn_t srv_start_lsn;
+
+#ifdef HAVE_DARWIN_THREADS
+/** TRUE if the F_FULLFSYNC option is available */
+extern ibool srv_have_fullfsync;
+#endif
+
+/** TRUE if the server is being started */
+extern ibool srv_is_being_started;
+/** TRUE if the server was successfully started */
+extern ibool srv_was_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern ibool srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern ibool srv_start_raw_disk_in_use;
+
+
+/** Shutdown state */
+enum srv_shutdown_state {
+ SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */
+ SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in
+ logs_empty_and_mark_files_at_shutdown() */
+ SRV_SHUTDOWN_FLUSH_PHASE,/*!< At this phase the master and the
+ purge threads must have completed their
+ work. Once we enter this phase the
+ page_cleaner can clean up the buffer
+ pool and exit */
+ SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+ the buffer pool can be freed: flush
+ all file spaces and close all files */
+ SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
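+
+/* Illustrative sketch (editor's addition): background threads typically
+poll srv_shutdown_state in their main loop and exit once it has advanced
+past the phase they are allowed to run in, e.g.:
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		... one unit of background work ...
+	}
+*/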
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern enum srv_shutdown_state srv_shutdown_state;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log 'spaces' have id's >= this */
+#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL
+
+#endif
diff --git a/storage/innobase/include/sync0arr.h b/storage/innobase/include/sync0arr.h
new file mode 100644
index 00000000000..15dbdcb540d
--- /dev/null
+++ b/storage/innobase/include/sync0arr.h
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+/** Synchronization wait array cell */
+struct sync_cell_t;
+/** Synchronization wait array */
+struct sync_array_t;
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in that instance fails, try another instance
+until an empty cell can be reserved.
+@return the instance found, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+/*============================*/
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index); /*!< out: index of the reserved cell */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state.
+@return true if free cell is found, otherwise false */
+UNIV_INTERN
+bool
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index); /*!< out: index of the reserved cell */
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the reserved cell */
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the cell in array */
+/**********************************************************************//**
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(void);
+/*=============================*/
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+ __attribute__((nonnull));
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr); /*!< in: sync wait array */
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+ FILE* file); /*!< in: file where to print */
+
+/**********************************************************************//**
+Create the primary system wait array(s), they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+ ulint n_threads); /*!< in: Number of slots to create */
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void);
+/*==================*/
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void);
+/*================*/
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
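Editorial note: taken together, the declarations above form a reserve / re-check / wait protocol. The sketch below models it on the mutex spin-wait code; ib_mutex_t, SYNC_MUTEX, mutex_test_and_set() and mutex_set_waiters() come from sync0sync.h and are assumptions here, not part of this patch:

	/* Editorial sketch, not part of the patch. */
	static void
	sync_array_wait_sketch(ib_mutex_t* mutex)
	{
		sync_array_t*	sync_arr;
		ulint		index;

		/* Reserve a cell; its event is reset to nonsignalled. */
		sync_arr = sync_array_get_and_reserve_cell(
			mutex, SYNC_MUTEX, __FILE__, __LINE__, &index);

		/* Re-check the condition after reserving, so that a
		wakeup which has already happened is not slept through. */
		if (mutex_test_and_set(mutex) == 0) {
			/* Got the lock after all: give the cell back. */
			sync_array_free_cell(sync_arr, index);
			return;
		}

		/* Tell unlockers that someone is waiting, then sleep.
		Note that sync_array_wait_event() frees the cell itself. */
		mutex_set_waiters(mutex, 1);
		sync_array_wait_event(sync_arr, index);
	}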
diff --git a/storage/innobase/include/sync0arr.ic b/storage/innobase/include/sync0arr.ic
new file mode 100644
index 00000000000..18a46dd0a41
--- /dev/null
+++ b/storage/innobase/include/sync0arr.ic
@@ -0,0 +1,64 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/** User configured sync array size */
+extern ulong srv_sync_array_size;
+
+/******************************************************************//**
+Get an instance of the sync wait array and reserve a wait array cell
+in the instance for waiting for an object. The event of the cell is
+reset to nonsignalled state.
+If reserving a cell in that instance fails, try another instance
+until an empty cell can be reserved.
+@return the instance found, never NULL. */
+UNIV_INLINE
+sync_array_t*
+sync_array_get_and_reserve_cell(
+/*============================*/
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index) /*!< out: index of the reserved cell */
+{
+ sync_array_t* sync_arr;
+ bool reserved = false;
+
+ for (ulint i = 0; i < srv_sync_array_size && !reserved; ++i) {
+ sync_arr = sync_array_get();
+ reserved = sync_array_reserve_cell(sync_arr, object, type,
+ file, line, index);
+ }
+
+	/* The loop above visits each of the srv_sync_array_size
+	instances at most once, so in theory the reservation could
+	fail if every cell of every instance were occupied. The
+	arrays are sized for the maximum number of threads, so a
+	free cell is always expected; this assertion makes that
+	expectation explicit. */
+ ut_a(reserved);
+
+ return sync_arr;
+}
+
diff --git a/storage/innobase/include/sync0rw.h b/storage/innobase/include/sync0rw.h
new file mode 100644
index 00000000000..fdcbb1b6fa5
--- /dev/null
+++ b/storage/innobase/include/sync0rw.h
@@ -0,0 +1,813 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0lst.h"
+#include "ut0counter.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+#endif /* !UNIV_HOTBACKUP */
+
+/** Counters for RW locks. */
+struct rw_lock_stats_t {
+ typedef ib_counter_t<ib_int64_t, IB_N_SLOTS> ib_int64_counter_t;
+
+ /** number of spin waits on rw-latches,
+ resulted during shared (read) locks */
+ ib_int64_counter_t rw_s_spin_wait_count;
+
+ /** number of spin loop rounds on rw-latches,
+ resulted during shared (read) locks */
+ ib_int64_counter_t rw_s_spin_round_count;
+
+ /** number of OS waits on rw-latches,
+ resulted during shared (read) locks */
+ ib_int64_counter_t rw_s_os_wait_count;
+
+ /** number of unlocks (that unlock shared locks),
+ set only when UNIV_SYNC_PERF_STAT is defined */
+ ib_int64_counter_t rw_s_exit_count;
+
+ /** number of spin waits on rw-latches,
+ resulted during exclusive (write) locks */
+ ib_int64_counter_t rw_x_spin_wait_count;
+
+ /** number of spin loop rounds on rw-latches,
+ resulted during exclusive (write) locks */
+ ib_int64_counter_t rw_x_spin_round_count;
+
+ /** number of OS waits on rw-latches,
+ resulted during exclusive (write) locks */
+ ib_int64_counter_t rw_x_os_wait_count;
+
+ /** number of unlocks (that unlock exclusive locks),
+ set only when UNIV_SYNC_PERF_STAT is defined */
+ ib_int64_counter_t rw_x_exit_count;
+};
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30, and keep them in the order shown below! */
+#define RW_S_LATCH 1
+#define RW_X_LATCH 2
+#define RW_NO_LATCH 3
+
+#ifndef UNIV_HOTBACKUP
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR 0x00100000
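Editorial note: for concreteness, the sketch below (not part of this patch) decodes a lock_word value under this scheme; it mirrors rw_lock_get_writer(), rw_lock_get_reader_count() and rw_lock_get_x_lock_count() defined in sync0rw.ic:

	#include <cstdio>

	/* Editorial sketch, not part of the patch. */
	static void
	decode_lock_word_sketch(long lock_word)
	{
		const long	decr = 0x00100000L;	/* X_LOCK_DECR */

		if (lock_word == decr) {
			printf("unlocked\n");
		} else if (lock_word > 0) {
			printf("s-locked by %ld readers\n",
			       decr - lock_word);
		} else if (lock_word == 0) {
			printf("x-locked once\n");
		} else if (lock_word > -decr) {
			printf("s-locked, one x-waiter queued, %ld readers"
			       " still to exit\n", -lock_word);
		} else {
			printf("x-locked %ld times recursively\n",
			       2 - (lock_word + decr));
		}
	}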
+
+struct rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+struct rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t;
+
+extern rw_lock_list_t rw_lock_list;
+extern ib_mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+extern ib_mutex_t rw_lock_debug_mutex;
+extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does
+ not get immediately the mutex it
+ may wait for this event */
+extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if
+ there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Counters for RW locks. */
+extern rw_lock_stats_t rw_lock_stats;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Following are rwlock keys used to register with MySQL
+performance schema */
+# ifdef UNIV_LOG_ARCHIVE
+extern mysql_pfs_key_t archive_lock_key;
+# endif /* UNIV_LOG_ARCHIVE */
+extern mysql_pfs_key_t btr_search_latch_key;
+extern mysql_pfs_key_t buf_block_lock_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t buf_block_debug_latch_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t checkpoint_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t fts_cache_rw_lock_key;
+extern mysql_pfs_key_t fts_cache_init_rw_lock_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t dict_table_stats_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+extern mysql_pfs_key_t hash_table_rw_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+
+#ifndef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed.
+If MySQL performance schema is enabled and "UNIV_PFS_RWLOCK" is
+defined, the rw-locks are instrumented with performance schema probes. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(K, L, level) \
+ rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(K, L, level) \
+ rw_lock_create_func((L), #L, __FILE__, __LINE__)
+# endif/* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
+ rw_lock_create_func((L), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_inline(M, P, F, L) \
+ rw_lock_s_lock_func((M), (P), (F), (L))
+
+# define rw_lock_s_lock_gen(M, P) \
+ rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_gen_nowait(M, P) \
+ rw_lock_s_lock_low((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+# endif
+
+
+# define rw_lock_x_lock(M) \
+ rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_inline(M, P, F, L) \
+ rw_lock_x_lock_func((M), (P), (F), (L))
+
+# define rw_lock_x_lock_gen(M, P) \
+ rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_func_nowait_inline(M, F, L) \
+ rw_lock_x_lock_func_nowait((M), (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) rw_lock_free_func(M)
+
+#else /* !UNIV_PFS_RWLOCK */
+
+/* Following macros point to Performance Schema instrumented functions. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), #L, __FILE__, __LINE__)
+# endif/* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define rw_lock_create(K, L, level) \
+ pfs_rw_lock_create_func((K), (L), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+/******************************************************************
+NOTE! The following macros should be used in rw locking and
+unlocking, not the corresponding function. */
+
+# define rw_lock_s_lock(M) \
+ pfs_rw_lock_s_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_s_lock_inline(M, P, F, L) \
+ pfs_rw_lock_s_lock_func((M), (P), (F), (L))
+
+# define rw_lock_s_lock_gen(M, P) \
+ pfs_rw_lock_s_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_gen_nowait(M, P) \
+ pfs_rw_lock_s_lock_low((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_s_lock_nowait(M, F, L) \
+ pfs_rw_lock_s_lock_low((M), 0, (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(P, L)
+# else
+# define rw_lock_s_unlock_gen(L, P) pfs_rw_lock_s_unlock_func(L)
+# endif
+
+# define rw_lock_x_lock(M) \
+ pfs_rw_lock_x_lock_func((M), 0, __FILE__, __LINE__)
+
+# define rw_lock_x_lock_inline(M, P, F, L) \
+ pfs_rw_lock_x_lock_func((M), (P), (F), (L))
+
+# define rw_lock_x_lock_gen(M, P) \
+ pfs_rw_lock_x_lock_func((M), (P), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_nowait(M) \
+ pfs_rw_lock_x_lock_func_nowait((M), __FILE__, __LINE__)
+
+# define rw_lock_x_lock_func_nowait_inline(M, F, L) \
+ pfs_rw_lock_x_lock_func_nowait((M), (F), (L))
+
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(P, L)
+# else
+# define rw_lock_x_unlock_gen(L, P) pfs_rw_lock_x_unlock_func(L)
+# endif
+
+# define rw_lock_free(M) pfs_rw_lock_free_func(M)
+
+#endif /* UNIV_PFS_RWLOCK */
+
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free_func(
+/*==============*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_DEBUG */
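Editorial note: a minimal sketch of the create / lock / free lifecycle through the macros above. The PFS key my_latch_key and the SYNC_NO_ORDER_CHECK latching level (sync0sync.h) are illustrative assumptions, not part of this patch:

	static rw_lock_t	my_latch;	/* hypothetical latch */

	/* Editorial sketch, not part of the patch. */
	static void
	latch_lifecycle_sketch()
	{
		rw_lock_create(my_latch_key, &my_latch,
			       SYNC_NO_ORDER_CHECK);

		rw_lock_s_lock(&my_latch);
		/* ... read the state protected by the latch ... */
		rw_lock_s_unlock(&my_latch);

		/* Obligatory only before the memory holding the
		latch is freed. */
		rw_lock_free(&my_latch);
	}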
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
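Editorial note: a sketch of the recursion rule just stated, using the public macros (not part of this patch):

	/* Editorial sketch, not part of the patch. */
	static void
	x_lock_recursion_sketch(rw_lock_t* lock)
	{
		rw_lock_x_lock(lock);	/* first x-lock */
		rw_lock_x_lock(lock);	/* relock by the same thread
					with pass == 0: succeeds */

		rw_lock_x_unlock(lock);
		rw_lock_x_unlock(lock);	/* fully released */

		/* Forbidden: requesting the x-lock while this thread
		holds an s-lock on the same latch would deadlock
		(see the NOTE above). */
	}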
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current
+thread to be able to acquire a second x-latch, which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock); /*!< in: lock which was x-locked in the
+ buffer read */
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than 0.
+This is used by both s_lock and x_lock operations.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to decrement */
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to increment */
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive); /*!< in: TRUE if recursion
+ allowed */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode. */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file); /*!< in: file where to print */
+/***************************************************************//**
+Returns the number of currently locked rw-locks.
+Works only in the debug version.
+@return number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void);
+/*===========================*/
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+ FILE* f, /*!< in: output stream */
+ rw_lock_debug_t* info); /*!< in: debug struct */
+#endif /* UNIV_SYNC_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! */
+
+/** The structure used in the spin lock implementation of a read-write
+lock. Several threads may have a shared lock simultaneously in this
+lock, but only one writer may have an exclusive lock, in which case no
+shared locks are allowed. To prevent starving of a writer blocked by
+readers, a writer may queue for x-lock by decrementing lock_word: no
+new readers will be let in while the thread waits for readers to
+exit. */
+struct rw_lock_t {
+ volatile lint lock_word;
+ /*!< Holds the state of the lock. */
+ volatile ulint waiters;/*!< 1: there are waiters */
+ volatile ibool recursive;/*!< Default value FALSE which means the lock
+ is non-recursive. The value is typically set
+ to TRUE making normal rw_locks recursive. In
+ case of asynchronous IO, when a non-zero
+ value of 'pass' is passed then we keep the
+ lock non-recursive.
+ This flag also tells us about the state of
+ writer_thread field. If this flag is set
+ then writer_thread MUST contain the thread
+ id of the current x-holder or wait-x thread.
+ This flag must be reset in x_unlock
+ functions before incrementing the lock_word */
+ volatile os_thread_id_t writer_thread;
+ /*!< Thread id of writer thread. Is only
+ guaranteed to have sane and non-stale
+ value iff recursive flag is set. */
+ os_event_t event; /*!< Used by sync0arr.cc for thread queueing */
+ os_event_t wait_ex_event;
+ /*!< Event for next-writer to wait on. A thread
+ must decrement lock_word before waiting. */
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ ib_mutex_t mutex; /*!< The mutex protecting rw_lock_t */
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ UT_LIST_NODE_T(rw_lock_t) list;
+ /*!< All allocated rw locks are put into a
+ list */
+#ifdef UNIV_SYNC_DEBUG
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+ /*!< In the debug version: pointer to the debug
+ info list of the lock */
+ ulint level; /*!< Level in the global latching order. */
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_PFS_RWLOCK
+ struct PSI_rwlock *pfs_psi;/*!< The instrumentation hook */
+#endif
+ ulint count_os_wait; /*!< Count of os_waits. May not be accurate */
+ const char* cfile_name;/*!< File name where lock created */
+ /* last s-lock file/line is not guaranteed to be correct */
+ const char* last_s_file_name;/*!< File name where last s-locked */
+ const char* last_x_file_name;/*!< File name where last x-locked */
+ ibool writer_is_wait_ex;
+ /*!< This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+ are at the start of this struct, thus we can
+ peek this field without causing much memory
+ bus traffic */
+ unsigned cline:14; /*!< Line where created */
+ unsigned last_s_line:14; /*!< Line number where last time s-locked */
+ unsigned last_x_line:14; /*!< Line number where last time x-locked */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< RW_LOCK_MAGIC_N */
+/** Value of rw_lock_t::magic_n */
+#define RW_LOCK_MAGIC_N 22643
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_SYNC_DEBUG
+/** The structure for storing debug info of an rw-lock. All access to this
+structure must be protected by rw_lock_debug_mutex_enter(). */
+struct rw_lock_debug_t {
+
+ os_thread_id_t thread_id; /*!< The thread id of the thread which
+ locked the rw-lock */
+ ulint pass; /*!< Pass value given in the lock operation */
+ ulint lock_type; /*!< Type of the lock: RW_LOCK_EX,
+ RW_LOCK_SHARED, RW_LOCK_WAIT_EX */
+ const char* file_name;/*!< File name where the lock was obtained */
+ ulint line; /*!< Line where the rw-lock was locked */
+ UT_LIST_NODE_T(rw_lock_debug_t) list;
+ /*!< Debug structs are linked in a two-way
+ list */
+};
+#endif /* UNIV_SYNC_DEBUG */
+
+/* For performance schema instrumentation, a new set of rwlock
+wrap functions is created if "UNIV_PFS_RWLOCK" is defined.
+The instrumentation is not planted directly into the original
+functions, so the underlying functions are kept as they are.
+If a user wants to "take out" some rwlock from instrumentation
+even though performance schema (UNIV_PFS_RWLOCK) is defined,
+they can do so by making the APIs link directly to the
+original underlying functions.
+The instrumented function names have the prefix "pfs_rw_lock_",
+vs. the original prefix "rw_lock_". The following functions have
+been instrumented:
+
+rw_lock_create()
+rw_lock_x_lock()
+rw_lock_x_lock_gen()
+rw_lock_x_lock_nowait()
+rw_lock_x_unlock_gen()
+rw_lock_s_lock()
+rw_lock_s_lock_gen()
+rw_lock_s_lock_nowait()
+rw_lock_s_unlock_gen()
+rw_lock_free()
+*/
+
+#ifdef UNIV_PFS_RWLOCK
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func()
+NOTE! Please use the corresponding macro rw_lock_create(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+ PSI_rwlock_key key, /*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in: rw lock */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro, not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not directly
+this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_PFS_RWLOCK */
+
+
+#ifndef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/sync0rw.ic b/storage/innobase/include/sync0rw.ic
new file mode 100644
index 00000000000..bb05ae7daf1
--- /dev/null
+++ b/storage/innobase/include/sync0rw.ic
@@ -0,0 +1,797 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.ic
+The read-write lock (for threads)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type); /*!< in: lock type */
+#endif /* UNIV_SYNC_DEBUG */
+
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ return(lock->waiters);
+}
+
+/********************************************************************//**
+Sets lock->waiters to 1. It is not an error if lock->waiters is already
+1. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_set_waiter_flag(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ (void) os_compare_and_swap_ulint(&lock->waiters, 0, 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 1;
+ os_wmb;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/********************************************************************//**
+Resets lock->waiters to 0. It is not an error if lock->waiters is already
+0. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_reset_waiter_flag(
+/*======================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ (void) os_compare_and_swap_ulint(&lock->waiters, 1, 0);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 0;
+ os_wmb;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* return NOT_LOCKED in s-lock state, like the writer
+ member of the old lock implementation. */
+ return(RW_LOCK_NOT_LOCKED);
+ } else if ((lock_word == 0) || (lock_word <= -X_LOCK_DECR)) {
+ return(RW_LOCK_EX);
+ } else {
+ ut_ad(lock_word > -X_LOCK_DECR);
+ return(RW_LOCK_WAIT_EX);
+ }
+}
+
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* s-locked, no x-waiters */
+ return(X_LOCK_DECR - lock_word);
+ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
+ /* s-locked, with x-waiters */
+ return((ulint)(-lock_word));
+ }
+ return(0);
+}
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+UNIV_INLINE
+ib_mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+ rw_lock_t* lock)
+{
+ return(&(lock->mutex));
+}
+#endif
+
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_copy = lock->lock_word;
+ if ((lock_copy != 0) && (lock_copy > -X_LOCK_DECR)) {
+ return(0);
+ }
+ return((lock_copy == 0) ? 1 : (2 - (lock_copy + X_LOCK_DECR)));
+}
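Editorial note: a cross-check of the formula above against the relock steps in rw_lock_x_lock_func_nowait() further down in this file (the first relock moves lock_word from 0 to -X_LOCK_DECR, and each further relock subtracts 1). The re-statement over plain longs is not part of this patch:

	#include <cassert>

	/* Editorial sketch: rw_lock_get_x_lock_count() over longs. */
	static long
	x_lock_count_sketch(long lock_word)
	{
		const long	decr = 0x00100000L;	/* X_LOCK_DECR */

		if (lock_word != 0 && lock_word > -decr) {
			return(0);	/* unlocked or s-locked */
		}

		return(lock_word == 0 ? 1 : 2 - (lock_word + decr));
	}

	static void
	x_lock_count_demo()
	{
		const long	decr = 0x00100000L;

		assert(x_lock_count_sketch(decr) == 0);	    /* unlocked */
		assert(x_lock_count_sketch(0) == 1);	    /* one x-lock */
		assert(x_lock_count_sketch(-decr) == 2);    /* 1st relock */
		assert(x_lock_count_sketch(-decr - 1) == 3);/* 2nd relock */
	}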
+
+/******************************************************************//**
+Two different implementations for decrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others. This does
+does not support recusive x-locks: they should be handled by the caller and
+need not be atomic since they are performed by the current lock holder.
+Returns true if the decrement was made, false if not.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount to decrement */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ lint local_lock_word;
+
+ os_rmb;
+ local_lock_word = lock->lock_word;
+ while (local_lock_word > 0) {
+ if (os_compare_and_swap_lint(&lock->lock_word,
+ local_lock_word,
+ local_lock_word - amount)) {
+ return(TRUE);
+ }
+ local_lock_word = lock->lock_word;
+ }
+ return(FALSE);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ ibool success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word > 0) {
+ lock->lock_word -= amount;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+ return(success);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount of increment */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ return(os_atomic_increment_lint(&lock->lock_word, amount));
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lint local_lock_word;
+
+ mutex_enter(&(lock->mutex));
+
+ lock->lock_word += amount;
+ local_lock_word = lock->lock_word;
+
+ mutex_exit(&(lock->mutex));
+
+ return(local_lock_word);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive) /*!< in: TRUE if recursion
+ allowed */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_thread_id_t local_thread;
+ ibool success;
+
+ /* Prevent Valgrind warnings about writer_thread being
+ uninitialized. It does not matter if writer_thread is
+ uninitialized, because we are comparing writer_thread against
+ itself, and the operation should always succeed. */
+ UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread);
+
+ local_thread = lock->writer_thread;
+ success = os_compare_and_swap_thread_id(
+ &lock->writer_thread, local_thread, curr_thread);
+ ut_a(success);
+ lock->recursive = recursive;
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ mutex_enter(&lock->mutex);
+ lock->writer_thread = curr_thread;
+ lock->recursive = recursive;
+ mutex_exit(&lock->mutex);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ if (!rw_lock_lock_word_decr(lock, 1)) {
+ /* Locking did not succeed */
+ return(FALSE);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
+#endif
+ /* These debugging values are not set safely: they may be incorrect
+ or even refer to a line that is invalid for the file name. */
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+
+ return(TRUE); /* locking succeeded */
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ /* NOTE: As we do not know the thread ids for threads which have
+ s-locked a latch, and s-lockers will be served only after waiting
+ x-lock requests have been fulfilled, then if this thread already
+ owns an s-lock here, it may end up in a deadlock with another thread
+ which requests an x-lock here. Therefore, we will forbid recursive
+ s-locking of a latch: the following assert will warn the programmer
+ of the possibility of this kind of a deadlock. If we want to implement
+ safe recursive s-locking, we should keep in a list the thread ids of
+ the threads which have s-locked a latch. This would use some CPU
+ time. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+ ut_ad(!rw_lock_own(lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+ return;
+ }
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ibool success;
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0);
+#else
+
+ success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word == X_LOCK_DECR) {
+ lock->lock_word = 0;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+
+#endif
+ if (success) {
+ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+
+ } else if (lock->recursive
+ && os_thread_eq(lock->writer_thread,
+ os_thread_get_curr_id())) {
+ /* Relock: this lock_word modification is safe since no other
+ threads can modify (lock, unlock, or reserve) lock_word while
+ there is an exclusive writer and this is the writer thread. */
+ if (lock->lock_word == 0) {
+ lock->lock_word = -X_LOCK_DECR;
+ } else {
+ lock->lock_word--;
+ }
+
+ /* Watch for too many recursive locks */
+ ut_ad(lock->lock_word < 0);
+
+ } else {
+ /* Failure */
+ return(FALSE);
+ }
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+
+ ut_ad(rw_lock_validate(lock));
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(lock->lock_word > -X_LOCK_DECR);
+ ut_ad(lock->lock_word != 0);
+ ut_ad(lock->lock_word < X_LOCK_DECR);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+#endif
+
+ /* Increment lock_word to indicate 1 less reader */
+ if (rw_lock_lock_word_incr(lock, 1) == 0) {
+
+ /* wait_ex waiter exists. It may not be asleep, but we signal
+ anyway. We do not wake other waiters, because they can't
+ exist without wait_ex waiter and wait_ex waiter goes first.*/
+ os_event_set(lock->wait_ex_event);
+ sync_array_object_signalled();
+
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(lock->lock_word == 0 || lock->lock_word <= -X_LOCK_DECR);
+
+ /* lock->recursive flag also indicates if lock->writer_thread is
+ valid or stale. If we are the last of the recursive callers
+ then we must unset lock->recursive flag to indicate that the
+ lock->writer_thread is now stale.
+ Note that since we still hold the x-lock we can safely read the
+ lock_word. */
+ if (lock->lock_word == 0) {
+ /* Last caller in a possible recursive chain. */
+ lock->recursive = FALSE;
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+#endif
+
+ ulint x_lock_incr;
+ if (lock->lock_word == 0) {
+ x_lock_incr = X_LOCK_DECR;
+ } else if (lock->lock_word == -X_LOCK_DECR) {
+ x_lock_incr = X_LOCK_DECR;
+ } else {
+ ut_ad(lock->lock_word < -X_LOCK_DECR);
+ x_lock_incr = 1;
+ }
+
+ if (rw_lock_lock_word_incr(lock, x_lock_incr) == X_LOCK_DECR) {
+ /* Lock is now free. May have to signal read/write waiters.
+ We do not need to signal wait_ex waiters, since they cannot
+ exist when there is a writer. */
+ if (lock->waiters) {
+ rw_lock_reset_waiter_flag(lock);
+ os_event_set(lock->event);
+ sync_array_object_signalled();
+ }
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
+
+#ifdef UNIV_PFS_RWLOCK
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_create_func().
+NOTE! Please use the corresponding macro rw_lock_create(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_create_func(
+/*====================*/
+ mysql_pfs_key_t key, /*!< in: key registered with
+ performance schema */
+ rw_lock_t* lock, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ /* Initialize the rwlock for performance schema */
+ lock->pfs_psi = PSI_RWLOCK_CALL(init_rwlock)(key, lock);
+
+ /* The actual function to initialize an rwlock */
+ rw_lock_create_func(lock,
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ level,
+# endif /* UNIV_SYNC_DEBUG */
+ cmutex_name,
+# endif /* UNIV_DEBUG */
+ cfile_name,
+ cline);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_lock_func()
+NOTE! Please use the corresponding macro rw_lock_x_lock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Record the entry of the rw x-lock request in the performance schema */
+ locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK,
+ file_name, static_cast<uint>(line));
+
+ rw_lock_x_lock_func(
+ lock, pass, file_name, static_cast<uint>(line));
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_wrwait)(locker, 0);
+ }
+ }
+ else
+ {
+ rw_lock_x_lock_func(lock, pass, file_name, line);
+ }
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for
+rw_lock_x_lock_func_nowait()
+NOTE! Please use the corresponding macro rw_lock_x_lock_nowait(),
+not directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_x_lock_func_nowait(
+/*===========================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Record the entry of the rw x-lock request in the performance schema */
+ locker = PSI_RWLOCK_CALL(start_rwlock_wrwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_WRITELOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_wrwait)(
+ locker, static_cast<int>(ret));
+ }
+ }
+ else
+ {
+ ret = rw_lock_x_lock_func_nowait(lock, file_name, line);
+ }
+
+ return(ret);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_free_func()
+NOTE! Please use the corresponding macro rw_lock_free(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_free_func(
+/*==================*/
+ rw_lock_t* lock) /*!< in: pointer to rw-lock */
+{
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(lock->pfs_psi);
+ lock->pfs_psi = NULL;
+ }
+
+ rw_lock_free_func(lock);
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_lock_func(
+/*====================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name,/*!< in: file name where lock
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform that we are acquiring a shared rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_READLOCK,
+ file_name, static_cast<uint>(line));
+
+ rw_lock_s_lock_func(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ }
+ else
+ {
+ rw_lock_s_lock_func(lock, pass, file_name, line);
+ }
+
+ return;
+}
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_lock_func()
+NOTE! Please use the corresponding macro rw_lock_s_lock(), not
+directly this function!
+@return TRUE if success */
+UNIV_INLINE
+ibool
+pfs_rw_lock_s_lock_low(
+/*===================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock will be passed to another
+ thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ibool ret;
+
+ if (lock->pfs_psi != NULL)
+ {
+ PSI_rwlock_locker* locker;
+ PSI_rwlock_locker_state state;
+
+		/* Instrumented to inform that we are acquiring a shared rwlock */
+ locker = PSI_RWLOCK_CALL(start_rwlock_rdwait)(
+ &state, lock->pfs_psi, PSI_RWLOCK_READLOCK,
+ file_name, static_cast<uint>(line));
+
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+
+ if (locker != NULL) {
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(
+ locker, static_cast<int>(ret));
+ }
+ }
+ else
+ {
+ ret = rw_lock_s_lock_low(lock, pass, file_name, line);
+ }
+
+ return(ret);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_x_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_x_unlock(), not directly
+this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_x_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+	/* Inform the performance schema that we are unlocking the lock */
+ if (lock->pfs_psi != NULL)
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+
+ rw_lock_x_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+}
+
+/******************************************************************//**
+Performance schema instrumented wrap function for rw_lock_s_unlock_func()
+NOTE! Please use the corresponding macro rw_lock_s_unlock(), not
+directly this function! */
+UNIV_INLINE
+void
+pfs_rw_lock_s_unlock_func(
+/*======================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the
+ lock may have been passed to another
+ thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+	/* Inform the performance schema that we are unlocking the lock */
+ if (lock->pfs_psi != NULL)
+ PSI_RWLOCK_CALL(unlock_rwlock)(lock->pfs_psi);
+
+ rw_lock_s_unlock_func(
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ lock);
+
+}
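+
+/* Usage sketch (illustrative; the latch and key names are made up):
+callers do not invoke the pfs_ wrappers directly. With UNIV_PFS_RWLOCK
+defined, the rw_lock_create() / rw_lock_x_lock() / rw_lock_x_unlock() /
+rw_lock_free() macros expand to the wrappers above; otherwise they expand
+to the plain functions, so caller code is identical in both builds:
+
+	rw_lock_t	example_latch;
+
+	rw_lock_create(example_latch_key, &example_latch,
+		       SYNC_NO_ORDER_CHECK);
+	rw_lock_x_lock(&example_latch);
+	... writer critical section ...
+	rw_lock_x_unlock(&example_latch);
+	rw_lock_free(&example_latch);
+*/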
+#endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
new file mode 100644
index 00000000000..82fb353a41b
--- /dev/null
+++ b/storage/innobase/include/sync0sync.h
@@ -0,0 +1,845 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2012, Facebook Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.h
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+#include "sync0types.h"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+#include "os0sync.h"
+#include "sync0arr.h"
+
+#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
+extern "C" my_bool timed_mutexes;
+#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates
+				on a LONG variable */
+#elif defined(HAVE_ATOMIC_BUILTINS) && !defined(HAVE_ATOMIC_BUILTINS_BYTE)
+typedef ulint lock_word_t;
+#else
+typedef byte lock_word_t;
+#endif
+
+#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
+
+/* By default, buffer mutexes and rwlocks will be excluded from
+instrumentation due to their large number of instances. */
+# define PFS_SKIP_BUFFER_MUTEX_RWLOCK
+
+/* By default, event->mutex will also be excluded from instrumentation */
+# define PFS_SKIP_EVENT_MUTEX
+
+#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key defines to register InnoDB mutexes with performance schema */
+extern mysql_pfs_key_t autoinc_mutex_key;
+extern mysql_pfs_key_t buffer_block_mutex_key;
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t buf_pool_zip_mutex_key;
+extern mysql_pfs_key_t cache_last_read_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t dict_sys_mutex_key;
+extern mysql_pfs_key_t file_format_max_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_bg_threads_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_optimize_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t fts_pll_tokenize_mutex_key;
+extern mysql_pfs_key_t hash_table_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t log_sys_mutex_key;
+extern mysql_pfs_key_t log_flush_order_mutex_key;
+# ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
+# ifdef UNIV_MEM_DEBUG
+extern mysql_pfs_key_t mem_hash_mutex_key;
+# endif /* UNIV_MEM_DEBUG */
+extern mysql_pfs_key_t mem_pool_mutex_key;
+extern mysql_pfs_key_t mutex_list_mutex_key;
+extern mysql_pfs_key_t purge_sys_bh_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t recv_writer_mutex_key;
+extern mysql_pfs_key_t rseg_mutex_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t rw_lock_list_mutex_key;
+extern mysql_pfs_key_t rw_lock_mutex_key;
+extern mysql_pfs_key_t srv_dict_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+# ifdef UNIV_SYNC_DEBUG
+extern mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_SYNC_DEBUG */
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_undo_mutex_key;
+extern mysql_pfs_key_t trx_mutex_key;
+extern mysql_pfs_key_t lock_sys_mutex_key;
+extern mysql_pfs_key_t lock_sys_wait_mutex_key;
+extern mysql_pfs_key_t trx_sys_mutex_key;
+extern mysql_pfs_key_t srv_sys_mutex_key;
+extern mysql_pfs_key_t srv_sys_tasks_mutex_key;
+#ifndef HAVE_ATOMIC_BUILTINS
+extern mysql_pfs_key_t srv_conc_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+#ifndef HAVE_ATOMIC_BUILTINS_64
+extern mysql_pfs_key_t monitor_mutex_key;
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
+extern mysql_pfs_key_t event_os_mutex_key;
+extern mysql_pfs_key_t ut_list_mutex_key;
+extern mysql_pfs_key_t os_mutex_key;
+extern mysql_pfs_key_t zip_pad_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void);
+/*===========*/
+/******************************************************************//**
+Frees the resources in synchronization data structures. */
+UNIV_INTERN
+void
+sync_close(void);
+/*===========*/
+
+#undef mutex_free /* Fix for MacOS X */
+
+#ifdef UNIV_PFS_MUTEX
+/**********************************************************************
+The following mutex APIs are performance schema instrumented
+if "UNIV_PFS_MUTEX" is defined:
+
+mutex_create
+mutex_enter
+mutex_exit
+mutex_enter_nowait
+mutex_free
+
+These mutex APIs will point to corresponding wrapper functions that contain
+the performance schema instrumentation if "UNIV_PFS_MUTEX" is defined.
+The instrumented wrapper functions have the prefix "pfs_".
+
+NOTE! The following macros should be used for mutex operations, not the
+corresponding functions. */
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), #M, (level), __FILE__, __LINE__)
+# else
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), #M, __FILE__, __LINE__)
+# endif/* UNIV_SYNC_DEBUG */
+# else
+# define mutex_create(K, M, level) \
+ pfs_mutex_create_func((K), (M), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+# define mutex_enter(M) \
+ pfs_mutex_enter_func((M), __FILE__, __LINE__)
+
+# define mutex_enter_nowait(M) \
+ pfs_mutex_enter_nowait_func((M), __FILE__, __LINE__)
+
+# define mutex_exit(M) pfs_mutex_exit_func(M)
+
+# define mutex_free(M) pfs_mutex_free_func(M)
+
+#else /* UNIV_PFS_MUTEX */
+
+/* If "UNIV_PFS_MUTEX" is not defined, the mutex APIs point to
+original non-instrumented functions */
+# ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(K, M, level) \
+ mutex_create_func((M), #M, (level), __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define mutex_create(K, M, level) \
+ mutex_create_func((M), #M, __FILE__, __LINE__)
+# endif /* UNIV_SYNC_DEBUG */
+# else /* UNIV_DEBUG */
+# define mutex_create(K, M, level) \
+ mutex_create_func((M), __FILE__, __LINE__)
+# endif /* UNIV_DEBUG */
+
+# define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
+
+# define mutex_enter_nowait(M) \
+ mutex_enter_nowait_func((M), __FILE__, __LINE__)
+
+# define mutex_exit(M) mutex_exit_func(M)
+
+# define mutex_free(M) mutex_free_func(M)
+
+#endif /* UNIV_PFS_MUTEX */
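+
+/* Lifecycle sketch (illustrative; the mutex and key names are made up).
+Whichever branch above is compiled in, caller code looks the same:
+
+	static ib_mutex_t	example_mutex;
+
+	mutex_create(example_mutex_key, &example_mutex,
+		     SYNC_NO_ORDER_CHECK);
+	mutex_enter(&example_mutex);
+	... critical section ...
+	mutex_exit(&example_mutex);
+	mutex_free(&example_mutex);
+*/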
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+ ib_mutex_t* mutex, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free_func(
+/*============*/
+ ib_mutex_t* mutex); /*!< in: mutex */
+/**************************************************************//**
+NOTE! The following macro should be used for mutex locking, not the
+corresponding function. */
+
+/* NOTE! currently same as mutex_enter! */
+
+#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a mutex for the current thread. If the mutex is reserved
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return 0 if succeeded, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_mutex_t* mutex); /*!< in: pointer to mutex */
+
+
+#ifdef UNIV_PFS_MUTEX
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func() that registers the mutex
+with the performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex. */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ PSI_mutex_key key, /*!< in: Performance Schema key */
+ ib_mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeeded, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrapper for mutex_exit_func() with performance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_mutex_t* mutex); /*!< in: pointer to mutex */
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+ ib_mutex_t* mutex); /*!< in: mutex */
+
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version.
+@return TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void);
+/*================*/
+#endif /* UNIV_SYNC_DEBUG */
+/*#####################################################################
+FUNCTION PROTOTYPES FOR DEBUGGING */
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+ FILE* file); /*!< in: file where to print */
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+ const ib_mutex_t* mutex); /*!< in: mutex */
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only
+in the debug version.
+@return TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+ const ib_mutex_t* mutex) /*!< in: mutex */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /*!< in: pointer to a mutex or an rw-lock */
+ ulint level, /*!< in: level in the latching order; if
+ SYNC_LEVEL_VARYING, nothing is done */
+ ibool relock) /*!< in: TRUE if re-entering an x-lock */
+ __attribute__((nonnull));
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation that the program makes */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+ void* latch); /*!< in: pointer to a mutex or an rw-lock */
+/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level); /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return a latch, or NULL if empty except for the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+ ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is
+ allowed to be owned by the thread */
+ __attribute__((warn_unused_result));
+/******************************************************************//**
+Checks if the level array for the current thread is empty,
+except for data dictionary latches. */
+#define sync_thread_levels_empty_except_dict() \
+ (!sync_thread_levels_nonempty_gen(TRUE))
+/******************************************************************//**
+Checks if the level array for the current thread is empty,
+except for the btr_search_latch.
+@return a latch, or NULL if empty except for the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_trx(
+/*============================*/
+ ibool has_search_latch)
+ /*!< in: TRUE if and only if the thread
+ is supposed to hold btr_search_latch */
+ __attribute__((warn_unused_result));
+
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ const char** file_name, /*!< out: file where requested */
+ ulint* line, /*!< out: line where requested */
+ os_thread_id_t* thread_id); /*!< out: id of the thread which owns
+ the mutex */
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void);
+/*==================*/
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the value
+of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const ib_mutex_t* mutex); /*!< in: mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the waiters
+field in a mutex.
+@return value of the waiters field */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const ib_mutex_t* mutex); /*!< in: mutex */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*
+ LATCHING ORDER WITHIN THE DATABASE
+ ==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object Notes
+---------------------- -----
+
+Dictionary mutex If we have a pointer to a dictionary
+| object, e.g., a table, it can be
+| accessed without reserving the
+| dictionary mutex. We must have a
+| reservation, a memoryfix, to the
+| appropriate table object in this case,
+| and the table must be explicitly
+| released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch The tree latch protects also all
+| the B-tree non-leaf pages. These
+V can be read with the page only
+Secondary index non-leaf bufferfixed to save CPU time,
+| no s-latch is needed on the page.
+| Modification of a page requires an
+| x-latch on the page, however. If a
+| thread owns an x-latch to the tree,
+| it is allowed to latch non-leaf pages
+| even after it has acquired the fsp
+| latch.
+V
+Secondary index leaf The latch on the secondary index leaf
+| can be kept while accessing the
+| clustered index, to save CPU time.
+V
+Clustered index tree latch To increase concurrency, the tree
+| latch is usually released when the
+| leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Transaction undo mutex The undo log entry must be written
+| before any index page is modified.
+| Transaction undo mutex is for the undo
+| logs the analogue of the tree latch
+| for a B-tree. If a thread has the
+| trx undo mutex reserved, it is allowed
+| to latch the undo log pages in any
+| order, and also after it has acquired
+| the fsp latch.
+V
+Rollback segment mutex The rollback segment mutex must be
+| reserved, if, e.g., a new page must
+| be added to an undo log. The rollback
+| segment and the undo logs in its
+| history list can be seen as an
+| analogue of a B-tree, and the latches
+| reserved similarly, using a version of
+| lock-coupling. If an undo log must be
+| extended by a page when inserting an
+| undo log record, this corresponds to
+| a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages If a thread owns the trx undo mutex,
+| or for a log in the history list, the
+| rseg mutex, it is allowed to latch
+| undo log pages in any order, and even
+| after it has acquired the fsp latch.
+| If a thread does not have the
+| appropriate mutex, it is allowed to
+| latch only a single undo log page in
+| a mini-transaction.
+V
+File space management latch If a mini-transaction must allocate
+| several file pages, it can do that,
+| because it keeps the x-latch to the
+| file space management in its memo.
+V
+File system pages
+|
+V
+lock_sys_wait_mutex Mutex protecting lock timeout data
+|
+V
+lock_sys_mutex Mutex protecting lock_sys_t
+|
+V
+trx_sys->mutex Mutex protecting trx_sys_t
+|
+V
+Threads mutex Background thread scheduling mutex
+|
+V
+query_thr_mutex Mutex protecting query threads
+|
+V
+trx_mutex Mutex protecting trx_t fields
+|
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+V
+Any other latch
+|
+V
+Memory pool mutex */
+
+/* Latching order levels. If you modify these, you have to also update
+sync_thread_add_level(). */
+
+/* User transaction locks are higher than any of the latch levels below:
+no latches are allowed when a thread goes to wait for a normal table
+or row lock! */
+#define SYNC_USER_TRX_LOCK 9999
+#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
+ latching order checking */
+#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with
+ buffer pool page locks, which do not
+ have a fixed level, but instead have
+ their level set after the page is
+ locked; see e.g.
+ ibuf_bitmap_get_map_page(). */
+#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for
+ trx_i_s_cache_t::rw_lock */
+#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for
+ trx_i_s_cache_t::last_read_mutex */
+#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the
+ file format tag */
+#define SYNC_DICT_OPERATION 1010 /* table create, drop, etc. reserve
+					this in X-mode; implicit or background
+					operations (purge, rollback, foreign
+					key checks) reserve this in S-mode */
+#define SYNC_FTS_CACHE 1005 /* FTS cache rwlock */
+#define SYNC_DICT 1000
+#define SYNC_DICT_AUTOINC_MUTEX 999
+#define SYNC_STATS_AUTO_RECALC 997
+#define SYNC_DICT_HEADER 995
+#define SYNC_IBUF_HEADER 914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+/*-------------------------------*/
+#define SYNC_INDEX_TREE 900
+#define SYNC_TREE_NODE_NEW 892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE 890
+#define SYNC_PURGE_LATCH 800
+#define SYNC_TRX_UNDO 700
+#define SYNC_RSEG 600
+#define SYNC_RSEG_HEADER_NEW 591
+#define SYNC_RSEG_HEADER 590
+#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_EXTERN_STORAGE 500
+#define SYNC_FSP 400
+#define SYNC_FSP_PAGE 395
+/*------------------------------------- Change buffer headers */
+#define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */
+/*------------------------------------- Change buffer tree */
+#define SYNC_IBUF_INDEX_TREE 360
+#define SYNC_IBUF_TREE_NODE_NEW 359
+#define SYNC_IBUF_TREE_NODE 358
+#define SYNC_IBUF_BITMAP_MUTEX 351
+#define SYNC_IBUF_BITMAP 350
+/*------------------------------------- Change log for online create index */
+#define SYNC_INDEX_ONLINE_LOG 340
+/*------------------------------------- MySQL query cache mutex */
+/*------------------------------------- MySQL binlog mutex */
+/*-------------------------------*/
+#define SYNC_LOCK_WAIT_SYS 300
+#define SYNC_LOCK_SYS 299
+#define SYNC_TRX_SYS 298
+#define SYNC_TRX 297
+#define SYNC_THREADS 295
+#define SYNC_REC_LOCK 294
+#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_PURGE_QUEUE 200
+#define SYNC_LOG 170
+#define SYNC_LOG_FLUSH_ORDER 147
+#define SYNC_RECV 168
+#define SYNC_FTS_TOKENIZE 167
+#define SYNC_FTS_CACHE_INIT 166 /* Used for FTS cache initialization */
+#define SYNC_FTS_BG_THREADS 165
+#define SYNC_FTS_OPTIMIZE	164	/* FIXME: is this the correct
+					number? Needs testing. */
+#define SYNC_WORK_QUEUE 162
+#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory
+ heap that can be extended to the
+ buffer pool, its logical level is
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_POOL 150 /* Buffer pool mutex */
+#define SYNC_BUF_PAGE_HASH 149 /* buf_pool->page_hash rw_lock */
+#define SYNC_BUF_BLOCK 146 /* Block mutex */
+#define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */
+#define SYNC_DOUBLEWRITE 140
+#define SYNC_ANY_LATCH 135
+#define SYNC_MEM_HASH 131
+#define SYNC_MEM_POOL 130
+
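+/* Worked example (illustrative): under UNIV_SYNC_DEBUG,
+sync_thread_add_level() in general only admits a latch whose level is
+strictly lower than every latch the thread already holds (with a few
+documented exceptions). A thread holding the buffer pool mutex
+(SYNC_BUF_POOL, 150) may therefore take a block mutex (SYNC_BUF_BLOCK,
+146) or the flush list mutex (SYNC_BUF_FLUSH_LIST, 145), but an attempt
+to take the log mutex (SYNC_LOG, 170) would fire a latching order
+assertion. */
+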
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED 350
+#define RW_LOCK_EX 351
+#define RW_LOCK_EXCLUSIVE 351
+#define RW_LOCK_SHARED 352
+#define RW_LOCK_WAIT_EX 353
+#define SYNC_MUTEX 354
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! This structure is used in the spin-lock
+implementation of a mutual exclusion semaphore. */
+
+/** InnoDB mutex */
+struct ib_mutex_t {
+ os_event_t event; /*!< Used by sync0arr.cc for the wait queue */
+ volatile lock_word_t lock_word; /*!< lock_word is the target
+ of the atomic test-and-set instruction when
+ atomic operations are enabled. */
+
+#if !defined(HAVE_ATOMIC_BUILTINS)
+ os_fast_mutex_t
+ os_fast_mutex; /*!< We use this OS mutex in place of lock_word
+ when atomic operations are not enabled */
+#endif
+ ulint waiters; /*!< This ulint is set to 1 if there are (or
+ may be) threads waiting in the global wait
+ array for this mutex to be released.
+ Otherwise, this is 0. */
+	UT_LIST_NODE_T(ib_mutex_t)	list;	/*!< All allocated mutexes are put into
+					a list; these are the pointers to the
+					next and previous mutex in that list. */
+#ifdef UNIV_SYNC_DEBUG
+ const char* file_name; /*!< File where the mutex was locked */
+ ulint line; /*!< Line where the mutex was locked */
+ ulint level; /*!< Level in the global latching order */
+#endif /* UNIV_SYNC_DEBUG */
+ const char* cfile_name;/*!< File name where mutex created */
+ ulint cline; /*!< Line where created */
+ ulong count_os_wait; /*!< count of os_wait */
+#ifdef UNIV_DEBUG
+
+/** Value of mutex_t::magic_n */
+# define MUTEX_MAGIC_N 979585UL
+
+ os_thread_id_t thread_id; /*!< The thread id of the thread
+ which locked the mutex. */
+ ulint magic_n; /*!< MUTEX_MAGIC_N */
+ const char* cmutex_name; /*!< mutex name */
+ ulint ib_mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_PFS_MUTEX
+ struct PSI_mutex* pfs_psi; /*!< The performance schema
+ instrumentation hook */
+#endif
+};
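+
+/* State sketch (inferred from the functions in sync0sync.ic;
+illustrative): lock_word is 0 when the mutex is free and 1 when it is
+held; waiters is set to 1 before a contending thread goes to sleep in
+the sync array, so that mutex_exit_func() can tell whether it must call
+mutex_signal_object(). */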
+
+/** Constant determining how long a spin wait is continued before suspending
+the thread. A value of 600 rounds on a 1995 100 MHz Pentium seems to
+correspond to 20 microseconds. */
+
+#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds
+
+/** The number of mutex_exit calls. Intended for performance monitoring. */
+extern ib_int64_t mutex_exit_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+extern ibool sync_order_checks_on;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** This variable is set to TRUE when sync_init is called */
+extern ibool sync_initialized;
+
+/** Base node type for the global list of database mutexes (not OS mutexes). */
+typedef UT_LIST_BASE_NODE_T(ib_mutex_t) ut_list_base_node_t;
+/** Global list of database mutexes (not OS mutexes) created. */
+extern ut_list_base_node_t mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+extern ib_mutex_t mutex_list_mutex;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+ ib_mutex_t* mutex, /*!< in: mutex guarding the
+ decrement */
+ volatile ulint* var, /*!< in/out: variable to
+ decrement */
+ ulint delta); /*!< in: delta to decrement */
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+ ib_mutex_t* mutex, /*!< in: mutex guarding the
+ increment */
+ volatile ulint* var, /*!< in/out: variable to
+ increment */
+ ulint delta); /*!< in: delta to increment */
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+#ifndef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic
new file mode 100644
index 00000000000..616e53d4aac
--- /dev/null
+++ b/storage/innobase/include/sync0sync.ic
@@ -0,0 +1,414 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.ic
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ ulint n); /*!< in: value to set */
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+ ib_mutex_t* mutex); /*!< in: mutex */
+
+/******************************************************************//**
+Performs an atomic test-and-set instruction on the lock_word field of a
+mutex.
+@return the previous value of lock_word: 0 or 1 */
+UNIV_INLINE
+byte
+ib_mutex_test_and_set(
+/*===============*/
+ ib_mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+# if defined(HAVE_ATOMIC_BUILTINS_BYTE)
+ return(os_atomic_test_and_set_byte(&mutex->lock_word, 1));
+# else
+ return(os_atomic_test_and_set_ulint(&mutex->lock_word, 1));
+# endif
+#else
+ ibool ret;
+
+ ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex));
+
+ if (ret == 0) {
+		/* We check that os_fast_mutex_trylock does not leak
+		the lock or allow race conditions */
+ ut_a(mutex->lock_word == 0);
+
+ mutex->lock_word = 1;
+ os_wmb;
+ }
+
+ return((byte) ret);
+#endif
+}
+
+/******************************************************************//**
+Performs a reset instruction on the lock_word field of a mutex. This
+instruction also serializes memory operations in program order.
+UNIV_INLINE
+void
+mutex_reset_lock_word(
+/*==================*/
+ ib_mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ /* In theory __sync_lock_release should be used to release the lock.
+	Unfortunately, it does not work properly alone. The workaround is
+	to use the more conservative __sync_lock_test_and_set instead. */
+# if defined(HAVE_ATOMIC_BUILTINS_BYTE)
+ os_atomic_test_and_set_byte(&mutex->lock_word, 0);
+# else
+ os_atomic_test_and_set_ulint(&mutex->lock_word, 0);
+# endif
+#else
+ mutex->lock_word = 0;
+
+ os_fast_mutex_unlock(&(mutex->os_fast_mutex));
+#endif
+}
+
+/******************************************************************//**
+Gets the value of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const ib_mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex);
+
+ return(mutex->lock_word);
+}
+
+/******************************************************************//**
+Gets the waiters field in a mutex.
+@return value of the waiters field */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const ib_mutex_t* mutex) /*!< in: mutex */
+{
+ const volatile ulint* ptr; /*!< declared volatile to ensure that
+ the value is read from memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ return(*ptr); /* Here we assume that the read of a single
+ word from memory is atomic */
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_exit(), not directly this function!
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit_func(
+/*============*/
+ ib_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+ ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(mutex);
+#endif
+ mutex_reset_lock_word(mutex);
+
+	/* A problem: we assume that mutex_reset_lock_word
+	is a memory barrier, that is, when we read the waiters
+	field next, the read must be serialized in memory
+	after the reset. A speculative processor might
+	perform the read first, which could leave a waiting
+	thread hanging indefinitely.
+
+	Our current solution is to call
+	sync_arr_wake_threads_if_sema_free() every second
+	to wake up any hanging threads that were
+	missed by mutex_signal_object(). */
+
+ if (mutex_get_waiters(mutex) != 0) {
+
+ mutex_signal_object(mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_ad(!mutex_own(mutex));
+
+ /* Note that we do not peek at the value of lock_word before trying
+ the atomic test_and_set; we could peek, and possibly save time. */
+
+ if (!ib_mutex_test_and_set(mutex)) {
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+ return; /* Succeeded! */
+ }
+
+ mutex_spin_wait(mutex, file_name, line);
+}
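+
+/* Protocol sketch (illustrative) of how the pieces above fit together
+as a test-and-set spin lock:
+
+	enter:	if (!ib_mutex_test_and_set(m))	fast path, now owner
+		else mutex_spin_wait(m, ...)	spin, then sleep in the
+						sync array
+	exit:	mutex_reset_lock_word(m)	release + memory barrier
+		if (mutex_get_waiters(m) != 0)	wake a sleeper via
+			mutex_signal_object(m)
+
+The barrier implied by mutex_reset_lock_word() is what makes the
+subsequent read of the waiters field safe, as discussed in
+mutex_exit_func() above. */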
+
+#ifdef UNIV_PFS_MUTEX
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_func(). */
+UNIV_INLINE
+void
+pfs_mutex_enter_func(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ if (mutex->pfs_psi != NULL) {
+ PSI_mutex_locker* locker;
+ PSI_mutex_locker_state state;
+
+ locker = PSI_MUTEX_CALL(start_mutex_wait)(
+ &state, mutex->pfs_psi,
+ PSI_MUTEX_LOCK, file_name,
+ static_cast<uint>(line));
+
+ mutex_enter_func(mutex, file_name, line);
+
+ if (locker != NULL) {
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, 0);
+ }
+ } else {
+ mutex_enter_func(mutex, file_name, line);
+ }
+}
+
+/********************************************************************//**
+NOTE! Please use the corresponding macro mutex_enter_nowait(), not directly
+this function!
+This is a performance schema instrumented wrapper function for
+mutex_enter_nowait_func().
+@return 0 if succeeded, 1 if not */
+UNIV_INLINE
+ulint
+pfs_mutex_enter_nowait_func(
+/*========================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint ret;
+
+ if (mutex->pfs_psi != NULL) {
+ PSI_mutex_locker* locker;
+ PSI_mutex_locker_state state;
+
+ locker = PSI_MUTEX_CALL(start_mutex_wait)(
+ &state, mutex->pfs_psi,
+ PSI_MUTEX_TRYLOCK, file_name,
+ static_cast<uint>(line));
+
+ ret = mutex_enter_nowait_func(mutex, file_name, line);
+
+ if (locker != NULL) {
+ PSI_MUTEX_CALL(end_mutex_wait)(locker, (int) ret);
+ }
+ } else {
+ ret = mutex_enter_nowait_func(mutex, file_name, line);
+ }
+
+ return(ret);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_exit(), not directly
+this function!
+A wrapper for mutex_exit_func() with performance schema instrumentation.
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+pfs_mutex_exit_func(
+/*================*/
+ ib_mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ if (mutex->pfs_psi != NULL) {
+ PSI_MUTEX_CALL(unlock_mutex)(mutex->pfs_psi);
+ }
+
+ mutex_exit_func(mutex);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_create(), not directly
+this function!
+A wrapper function for mutex_create_func(), registers the mutex
+with performance schema if "UNIV_PFS_MUTEX" is defined when
+creating the mutex */
+UNIV_INLINE
+void
+pfs_mutex_create_func(
+/*==================*/
+ mysql_pfs_key_t key, /*!< in: Performance Schema key */
+ ib_mutex_t* mutex, /*!< in: pointer to memory */
+# ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ mutex->pfs_psi = PSI_MUTEX_CALL(init_mutex)(key, mutex);
+
+ mutex_create_func(mutex,
+# ifdef UNIV_DEBUG
+ cmutex_name,
+# ifdef UNIV_SYNC_DEBUG
+ level,
+# endif /* UNIV_SYNC_DEBUG */
+# endif /* UNIV_DEBUG */
+ cfile_name,
+ cline);
+}
+
+/******************************************************************//**
+NOTE! Please use the corresponding macro mutex_free(), not directly
+this function!
+Wrapper function for mutex_free_func(). Also destroys the performance
+schema probes when freeing the mutex */
+UNIV_INLINE
+void
+pfs_mutex_free_func(
+/*================*/
+ ib_mutex_t* mutex) /*!< in: mutex */
+{
+ if (mutex->pfs_psi != NULL) {
+ PSI_MUTEX_CALL(destroy_mutex)(mutex->pfs_psi);
+ mutex->pfs_psi = NULL;
+ }
+
+ mutex_free_func(mutex);
+}
+
+#endif /* UNIV_PFS_MUTEX */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/**********************************************************//**
+Function that uses a mutex to decrement a variable atomically */
+UNIV_INLINE
+void
+os_atomic_dec_ulint_func(
+/*=====================*/
+ ib_mutex_t* mutex, /*!< in: mutex guarding the dec */
+ volatile ulint* var, /*!< in/out: variable to decrement */
+ ulint delta) /*!< in: delta to decrement */
+{
+ mutex_enter(mutex);
+
+ /* I don't think we will encounter a situation where
+ this check will not be required. */
+ ut_ad(*var >= delta);
+
+ *var -= delta;
+
+ mutex_exit(mutex);
+}
+
+/**********************************************************//**
+Function that uses a mutex to increment a variable atomically */
+UNIV_INLINE
+void
+os_atomic_inc_ulint_func(
+/*=====================*/
+ ib_mutex_t* mutex, /*!< in: mutex guarding the increment */
+ volatile ulint* var, /*!< in/out: variable to increment */
+ ulint delta) /*!< in: delta to increment */
+{
+ mutex_enter(mutex);
+
+ *var += delta;
+
+ mutex_exit(mutex);
+}
+#endif /* !HAVE_ATOMIC_BUILTINS */
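+
+/* These fallbacks are mutex-guarded equivalents of the atomic builtins
+used elsewhere; e.g. (illustrative; the variable and mutex names are
+made up):
+
+	os_atomic_inc_ulint_func(&stats_mutex, &n_pages_read, 1);
+
+increments n_pages_read under stats_mutex where a build with atomics
+would use a single atomic add. */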
diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h
new file mode 100644
index 00000000000..0d143004a7a
--- /dev/null
+++ b/storage/innobase/include/sync0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0types.h
+Global types for sync
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+struct ib_mutex_t;
+
+#endif
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000000..662971a7841
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,311 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#include "ut0ut.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN 8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN 1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_operation_state */
+#define TRX_I_S_TRX_OP_STATE_MAX_LEN 64
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN 256
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_isolation_level */
+#define TRX_I_S_TRX_ISOLATION_LEVEL_MAX_LEN 16
+
+/** Safely copy strings into the INNODB_TRX table's
+string-based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache) \
+do { \
+ if (strlen(data) > constraint) { \
+ char buff[constraint + 1]; \
+ strncpy(buff, data, constraint); \
+ buff[constraint] = '\0'; \
+ \
+ field = static_cast<const char*>( \
+ ha_storage_put_memlim( \
+ (tcache)->storage, buff, constraint + 1,\
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } else { \
+ field = static_cast<const char*>( \
+ ha_storage_put_str_memlim( \
+ (tcache)->storage, data, \
+ MAX_ALLOWED_FOR_STORAGE(tcache))); \
+ } \
+} while (0)
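+
+/* Usage sketch (illustrative; the row and transaction fields shown are
+assumptions about the caller in trx0i_s.cc):
+
+	TRX_I_S_STRING_COPY(trx->op_info, row->trx_operation_state,
+			    TRX_I_S_TRX_OP_STATE_MAX_LEN, cache);
+
+truncates the copied string to the column limit and stores it in the
+cache's string storage, subject to the cache memory limit. */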
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+ i_s_locks_row_t* value; /*!< row of
+ INFORMATION_SCHEMA.innodb_locks*/
+ i_s_hash_chain_t* next; /*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+ trx_id_t lock_trx_id; /*!< transaction identifier */
+ const char* lock_mode; /*!< lock mode from
+ lock_get_mode_str() */
+ const char* lock_type; /*!< lock type from
+ lock_get_type_str() */
+ const char* lock_table; /*!< table name from
+ lock_get_table_name() */
+ const char* lock_index; /*!< index name from
+ lock_rec_get_index_name() */
+ /** Information for record locks. All these are
+ ULINT_UNDEFINED for table locks. */
+ /* @{ */
+ ulint lock_space; /*!< tablespace identifier */
+	ulint		lock_page;	/*!< page number within the space */
+ ulint lock_rec; /*!< heap number of the record
+ on the page */
+ const char* lock_data; /*!< (some) content of the record */
+ /* @} */
+
+ /** The following are auxiliary and not included in the table */
+ /* @{ */
+ table_id_t lock_table_id;
+ /*!< table identifier from
+ lock_get_table_id */
+ i_s_hash_chain_t hash_chain; /*!< hash table chain node for
+ trx_i_s_cache_t::locks_hash */
+ /* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+ trx_id_t trx_id; /*!< transaction identifier */
+ const char* trx_state; /*!< transaction state from
+ trx_get_que_state_str() */
+ ib_time_t trx_started; /*!< trx_t::start_time */
+ const i_s_locks_row_t* requested_lock_row;
+ /*!< pointer to a row
+ in innodb_locks if trx
+ is waiting, or NULL */
+ ib_time_t trx_wait_started; /*!< trx_t::wait_started */
+ ullint trx_weight; /*!< TRX_WEIGHT() */
+ ulint trx_mysql_thread_id; /*!< thd_get_thread_id() */
+ const char* trx_query; /*!< MySQL statement being
+ executed in the transaction */
+	struct charset_info_st* trx_query_cs;
+					/*!< charset used to encode the
+					MySQL statement */
+ const char* trx_operation_state; /*!< trx_t::op_info */
+ ulint trx_tables_in_use;/*!< n_mysql_tables_in_use in
+ trx_t */
+ ulint trx_tables_locked;
+ /*!< mysql_n_tables_locked in
+ trx_t */
+ ulint trx_lock_structs;/*!< list len of trx_locks in
+ trx_t */
+ ulint trx_lock_memory_bytes;
+ /*!< mem_heap_get_size(
+ trx->lock_heap) */
+ ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */
+ ullint trx_rows_modified;/*!< trx_t::undo_no */
+ ulint trx_concurrency_tickets;
+ /*!< n_tickets_to_enter_innodb in
+ trx_t */
+ const char* trx_isolation_level;
+ /*!< isolation_level in trx_t */
+ ibool trx_unique_checks;
+ /*!< check_unique_secondary in trx_t*/
+ ibool trx_foreign_key_checks;
+ /*!< check_foreigns in trx_t */
+ const char* trx_foreign_key_error;
+ /*!< detailed_error in trx_t */
+ ibool trx_has_search_latch;
+ /*!< has_search_latch in trx_t */
+ ulint trx_search_latch_timeout;
+ /*!< search_latch_timeout in trx_t */
+ ulint trx_is_read_only;
+ /*!< trx_t::read_only */
+ ulint trx_is_autocommit_non_locking;
+ /*!< trx_is_autocommit_non_locking(trx)
+ */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+ const i_s_locks_row_t* requested_lock_row; /*!< requested lock */
+ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+ I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */
+ I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */
+ I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t* trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table); /*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n); /*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache */
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
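+
+/* A rough sketch of the intended calling pattern on the handler side
+(the real code is in handler/i_s.cc; locking order and error handling
+abbreviated):
+
+	trx_i_s_cache_start_write(trx_i_s_cache);
+	trx_i_s_possibly_fetch_data_into_cache(trx_i_s_cache);
+	trx_i_s_cache_end_write(trx_i_s_cache);
+
+	trx_i_s_cache_start_read(trx_i_s_cache);
+	n = trx_i_s_cache_get_rows_used(trx_i_s_cache, I_S_INNODB_TRX);
+	for (i = 0; i < n; i++) {
+		row = trx_i_s_cache_get_nth_row(
+			trx_i_s_cache, I_S_INNODB_TRX, i);
+		... copy *row into the server's table fields ...
+	}
+	trx_i_s_cache_end_read(trx_i_s_cache);
+*/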
+
+/** The maximum length of a lock id produced by
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Provide a buffer of at least TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes
+to guarantee that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size);/*!< in: size of the lock id
+ buffer */
+
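+/* A minimal sketch of safe usage, following the size contract above
+(the row pointer is hypothetical):
+
+	char	lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+
+	trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+*/
+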
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000000..1e13c883800
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,218 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "que0types.h"
+#include "page0page.h"
+#include "usr0sess.h"
+#include "fil0fil.h"
+
+/** The global data structure coordinating a purge */
+extern trx_purge_t* purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t trx_purge_dummy_rec;
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr); /*!< in: file address of the history
+ list node of the log */
+/********************************************************************//**
+Creates the global purge system control structure and initializes the
+history mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(
+/*=================*/
+ ulint n_purge_threads,/*!< in: number of purge threads */
+ ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void);
+/*======================*/
+/************************************************************************
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(
+/*======*/
+	ulint	n_purge_threads,	/*!< in: number of purge tasks to
+					submit to the task queue. */
+ ulint limit, /*!< in: the maximum number of
+ records to purge in one batch */
+ bool truncate); /*!< in: truncate history if true */
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
+UNIV_INTERN
+void
+trx_purge_stop(void);
+/*================*/
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
+UNIV_INTERN
+void
+trx_purge_run(void);
+/*================*/
+
+/** Purge states */
+enum purge_state_t {
+ PURGE_STATE_INIT, /*!< Purge instance created */
+ PURGE_STATE_RUN, /*!< Purge should be running */
+ PURGE_STATE_STOP, /*!< Purge should be stopped */
+ PURGE_STATE_EXIT, /*!< Purge has been shutdown */
+ PURGE_STATE_DISABLED /*!< Purge was never started */
+};
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void);
+/*=================*/
+
+/** This is the purge pointer/iterator. We need both the undo no and the
+transaction no up to which purge has parsed and applied the records. */
+struct purge_iter_t {
+ trx_id_t trx_no; /*!< Purge has advanced past all
+ transactions whose number is less
+ than this */
+ undo_no_t undo_no; /*!< Purge has advanced past all records
+ whose undo number is less than this */
+};
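+
+/* The two fields are compared lexicographically as the pair
+(trx_no, undo_no); the debug check trx_purge_check_limit() in
+trx0purge.ic asserts exactly this order between 'limit' and 'iter'.
+As a sketch:
+
+	limit <= iter  <=>  limit.trx_no < iter.trx_no
+			    || (limit.trx_no == iter.trx_no
+				&& limit.undo_no <= iter.undo_no)
+*/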
+
+/** The control structure used in the purge operation */
+struct trx_purge_t{
+ sess_t* sess; /*!< System session running the purge
+ query */
+ trx_t* trx; /*!< System transaction running the
+ purge query: this trx is not in the
+ trx list of the trx system and it
+ never ends */
+ rw_lock_t latch; /*!< The latch protecting the purge
+ view. A purge operation must acquire an
+ x-latch here for the instant at which
+ it changes the purge view: an undo
+ log operation can prevent this by
+ obtaining an s-latch here. It also
+ protects state and running */
+ os_event_t event; /*!< State signal event */
+	ulint		n_stop;		/*!< Counter to track the number
+					of purge stops */
+ volatile bool running; /*!< true, if purge is active,
+ we check this without the latch too */
+ volatile purge_state_t state; /*!< Purge coordinator thread states,
+ we check this in several places
+ without holding the latch. */
+ que_t* query; /*!< The query graph which will do the
+ parallelized purge operation */
+ read_view_t* view; /*!< The purge will not remove undo logs
+ which are >= this view (purge view) */
+ volatile ulint n_submitted; /*!< Count of total tasks submitted
+ to the task queue */
+ volatile ulint n_completed; /*!< Count of total tasks completed */
+
+ /*------------------------------*/
+ /* The following two fields form the 'purge pointer' which advances
+ during a purge, and which is used in history list truncation */
+
+ purge_iter_t iter; /* Limit up to which we have read and
+ parsed the UNDO log records. Not
+ necessarily purged from the indexes.
+ Note that this can never be less than
+					the limit below; we check for this
+ invariant in trx0purge.cc */
+ purge_iter_t limit; /* The 'purge pointer' which advances
+ during a purge, and which is used in
+ history list truncation */
+#ifdef UNIV_DEBUG
+	purge_iter_t	done;	/* The 'purge pointer' up to which
+				records have actually been purged */
+#endif /* UNIV_DEBUG */
+ /*-----------------------------*/
+	ibool		next_stored;	/*!< TRUE if the info of the next record
+					to purge is stored below: if yes, then
+					the transaction number and the undo
+					number of the record are also stored
+					in iter above */
+ trx_rseg_t* rseg; /*!< Rollback segment for the next undo
+ record to purge */
+ ulint page_no; /*!< Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ ulint offset; /*!< Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ ulint hdr_page_no; /*!< Header page of the undo log where
+ the next record to purge belongs */
+ ulint hdr_offset; /*!< Header byte offset on the page */
+ /*-----------------------------*/
+ mem_heap_t* heap; /*!< Temporary storage used during a
+ purge: can be emptied after purge
+ completes */
+ /*-----------------------------*/
+ ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on
+ rseg_queue_t::trx_no. It is protected
+ by the bh_mutex */
+ ib_mutex_t bh_mutex; /*!< Mutex protecting ib_bh */
+};
+
+/** Info required to purge a record */
+struct trx_purge_rec_t {
+ trx_undo_rec_t* undo_rec; /*!< Record to purge */
+	roll_ptr_t	roll_ptr;	/*!< File pointer to UNDO record */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0purge.ic b/storage/innobase/include/trx0purge.ic
new file mode 100644
index 00000000000..ca9cc1fb894
--- /dev/null
+++ b/storage/innobase/include/trx0purge.ic
@@ -0,0 +1,62 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.ic
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr) /*!< in: file address of the history
+ list node of the log */
+{
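+	/* The history list node is embedded TRX_UNDO_HISTORY_NODE bytes
+	into the undo log header, so stepping back by that offset yields
+	the file address of the header itself. */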
+ node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
+
+ return(node_addr);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Checks the invariant that the purge limit never advances past the purge
+iterator.
+@return TRUE if purge_sys_t::limit <= purge_sys_t::iter */
+UNIV_INLINE
+ibool
+trx_purge_check_limit(void)
+/*=======================*/
+{
+ ut_ad(purge_sys->limit.trx_no <= purge_sys->iter.trx_no);
+
+ if (purge_sys->limit.trx_no == purge_sys->iter.trx_no) {
+ ut_ad(purge_sys->limit.undo_no <= purge_sys->iter.undo_no);
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000000..50da55d2ea3
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,326 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap); /*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+ __attribute__((const));
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no) \
+ ((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+ __attribute__((nonnull));
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index); /*!< in: clustered index */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits); /*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering columns in any index
+of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+ __attribute__((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the
+ inserted undo log record,
+					0 if BTR_NO_UNDO_LOG_FLAG
+					was specified */
+ __attribute__((nonnull(3,4,10), warn_unused_result));
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+which means that it may have been removed */
+UNIV_INTERN
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr,/*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers)/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+ __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page); /*!< in: page or NULL */
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+#ifndef UNIV_HOTBACKUP
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
+ this and ORed to the type above */
+#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
+
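+/* Putting the above together, the type byte of an undo record is packed
+roughly as follows (the variable names are illustrative):
+
+	type_cmpl = type			one of the *_REC values above
+		    | cmpl_info * TRX_UNDO_CMPL_INFO_MULT
+		    | TRX_UNDO_UPD_EXTERN	only for external updates
+
+trx_undo_rec_get_type() and trx_undo_rec_get_cmpl_info() in trx0rec.ic
+invert this packing. */
+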
+/* Operation type flags used in trx_undo_report_row_operation */
+#define TRX_UNDO_INSERT_OP 1
+#define TRX_UNDO_MODIFY_OP 2
+
+#ifndef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0rec.ic b/storage/innobase/include/trx0rec.ic
new file mode 100644
index 00000000000..08704f6b821
--- /dev/null
+++ b/storage/innobase/include/trx0rec.ic
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.ic
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
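+
+/* Reading aid, as implied by the accessors below (not an authoritative
+layout spec): bytes 0..1 of an undo record hold the page offset of the
+next record, byte 2 holds the packed type/compiler-info byte, and the
+"much compressed" undo number starts at byte 3. */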
+/**********************************************************************//**
+Reads from an undo log record the record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
+}
+
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ const byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_ull_read_much_compressed(ptr));
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+{
+ return(3 + mach_ull_get_much_compressed_size(undo_no));
+}
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap) /*!< in: heap where copied */
+{
+ ulint len;
+
+ len = mach_read_from_2(undo_rec)
+ - ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
+ ut_ad(len < UNIV_PAGE_SIZE);
+ return((trx_undo_rec_t*) mem_heap_dup(heap, undo_rec, len));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
new file mode 100644
index 00000000000..d5ab83d7767
--- /dev/null
+++ b/storage/innobase/include/trx0roll.h
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+extern bool trx_rollback_or_clean_is_active;
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr); /*!< in: undo number array */
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n); /*!< in: position */
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread gets the undo log record not using the pop
+function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number of the record */
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+ ibool all); /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
+ void* arg __attribute__((unused)));
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+ __attribute__((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+ __attribute__((nonnull));
+/*******************************************************************//**
+Rollback a transaction to a given savepoint or do a complete rollback.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_to_savepoint(
+/*======================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+ __attribute__((nonnull(1)));
+/*******************************************************************//**
+Rolls back a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+ __attribute__((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+ __attribute__((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+ __attribute__((nonnull, warn_unused_result));
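+
+/* A rough mapping from SQL statements to the calls above (handler glue
+omitted; "trx" and "pos" are placeholders):
+
+	SAVEPOINT s1		->  trx_savepoint_for_mysql(trx, "s1", pos)
+	ROLLBACK TO s1		->  trx_rollback_to_savepoint_for_mysql(
+					trx, "s1", &pos)
+	RELEASE SAVEPOINT s1	->  trx_release_savepoint_for_mysql(trx, "s1")
+*/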
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+
+/** A cell of trx_undo_arr_t; used during a rollback and a purge */
+struct trx_undo_inf_t{
+ ibool in_use; /*!< true if cell is being used */
+ trx_id_t trx_no; /*!< transaction number: not defined during
+ a rollback */
+ undo_no_t undo_no;/*!< undo number of an undo record */
+};
+
+/** During a rollback and a purge, undo numbers of undo records currently being
+processed are stored in this array */
+
+struct trx_undo_arr_t{
+ ulint n_cells; /*!< number of cells in the array */
+ ulint n_used; /*!< number of cells in use */
+ trx_undo_inf_t* infos; /*!< the array of undo infos */
+ mem_heap_t* heap; /*!< memory heap from which allocated */
+};
+
+/** Rollback node states */
+enum roll_node_state {
+ ROLL_NODE_NONE = 0, /*!< Unknown state */
+ ROLL_NODE_SEND, /*!< about to send a rollback signal to
+ the transaction */
+ ROLL_NODE_WAIT /*!< rollback signal sent to the
+ transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */
+ enum roll_node_state state; /*!< node execution state */
+ ibool partial;/*!< TRUE if we want a partial
+ rollback */
+ trx_savept_t savept; /*!< savepoint to which to
+ roll back, in the case of a
+ partial rollback */
+ que_thr_t* undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+ char* name; /*!< savepoint name */
+ trx_savept_t savept; /*!< the undo number corresponding to
+ the savepoint */
+ ib_int64_t mysql_binlog_cache_pos;
+ /*!< the MySQL binlog cache position
+ corresponding to this savepoint, not
+ defined if the MySQL binlogging is not
+ enabled */
+ UT_LIST_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< the list of savepoints of a
+ transaction */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0roll.ic b/storage/innobase/include/trx0roll.ic
new file mode 100644
index 00000000000..178e9bb730a
--- /dev/null
+++ b/storage/innobase/include/trx0roll.ic
@@ -0,0 +1,40 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.ic
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n) /*!< in: position */
+{
+ ut_ad(arr);
+ ut_ad(n < arr->n_cells);
+
+ return(arr->infos + n);
+}
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000000..185b05876b4
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,230 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "trx0sys.h"
+#include "ut0bh.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr); /*!< in: mtr */
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return rollback segment */
+UNIV_INLINE
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id); /*!< in: rollback segment id */
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_array_init(
+/*================*/
+ trx_sysf_t* sys_header, /*!< in/out: trx system header */
+ ib_bh_t* ib_bh, /*!< in: rseg queue */
+ mtr_t* mtr); /*!< in/out: mtr */
+/***************************************************************************
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+ trx_rseg_t* rseg); /*!< in, own: instance to free */
+
+/*********************************************************************
+Creates a rollback segment. */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ ulint space); /*!< in: id of UNDO tablespace */
+
+/********************************************************************
+Get the number of unique rollback tablespaces in use, excluding space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have room for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+ ulint* space_ids); /*!< out: array of space ids of
+ UNDO tablespaces */
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (UNIV_PAGE_SIZE / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
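+
+/* For the default 16KB UNIV_PAGE_SIZE this gives 16384 / 16 = 1024 undo
+log slots, i.e. at most 512 concurrent transactions per rollback segment. */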
+
+/* The rollback segment memory object */
+struct trx_rseg_t{
+ /*--------------------------------------------------------*/
+ ulint id; /*!< rollback segment id == the index of
+ its slot in the trx system file copy */
+ ib_mutex_t mutex; /*!< mutex protecting the fields in this
+ struct except id, which is constant */
+	ulint		space;	/*!< space where the rollback segment
+				header is placed */
+ ulint zip_size;/* compressed page size of space
+ in bytes, or 0 for uncompressed spaces */
+ ulint page_no;/* page number of the rollback segment
+ header */
+ ulint max_size;/* maximum allowed size in pages */
+ ulint curr_size;/* current size in pages */
+ /*--------------------------------------------------------*/
+ /* Fields for update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+ /* List of update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+ /* List of update undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ /* Fields for insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+ /* List of insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+ /* List of insert undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+	ulint		last_page_no;	/*!< Page number of the last not yet
+					purged log header in the history
+					list; FIL_NULL if the whole list
+					has been purged */
+ ulint last_offset; /*!< Byte offset of the last not yet
+ purged log header */
+ trx_id_t last_trx_no; /*!< Transaction number of the last not
+ yet purged log */
+ ibool last_del_marks; /*!< TRUE if the last not yet purged log
+ needs purging */
+};
+
+/** For prioritising the rollback segments for purge. */
+struct rseg_queue_t {
+ trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */
+ trx_rseg_t* rseg; /*!< Rollback segment */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback
+ segment in pages */
+#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied
+ by the logs in the history list */
+#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed
+ transactions */
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/*-------------------------------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.ic b/storage/innobase/include/trx0rseg.ic
new file mode 100644
index 00000000000..30743da9b8c
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.ic
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.ic
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "mtr0log.h"
+#include "trx0sys.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (n >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to get slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS
+ + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr));
+}
+
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (n >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to set slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0;
+#ifndef UNIV_DEBUG
+ i < TRX_RSEG_N_SLOTS;
+#else
+ i < (trx_rseg_n_slots_debug ? trx_rseg_n_slots_debug : TRX_RSEG_N_SLOTS);
+#endif
+ i++) {
+
+ page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
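+
+/* A minimal usage sketch (kept compiled out) combining
+trx_rsegf_undo_find_free() and trx_rsegf_set_nth_undo() above: claim the
+first free undo slot by installing an undo segment header page number.
+Assumes the caller holds the rseg header page x-latched within mtr; the
+page_no argument is a placeholder. */
+#if 0
+static ulint
+trx_rsegf_undo_slot_claim_sketch(
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	ulint		page_no,/*!< in: undo segment header page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	slot_no = trx_rsegf_undo_find_free(rsegf, mtr);
+
+	if (slot_no != ULINT_UNDEFINED) {
+		/* A slot is free while it holds FIL_NULL; writing a
+		real page number marks it as used. */
+		trx_rsegf_set_nth_undo(rsegf, slot_no, page_no, mtr);
+	}
+
+	return(slot_no);
+}
+#endif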
+
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return rollback segment */
+UNIV_INLINE
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id) /*!< in: rollback segment id */
+{
+ ut_a(id < TRX_SYS_N_RSEGS);
+
+ return(trx_sys->rseg_array[id]);
+}
+
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
new file mode 100644
index 00000000000..70f214d1ac7
--- /dev/null
+++ b/storage/innobase/include/trx0sys.h
@@ -0,0 +1,674 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "univ.i"
+
+#include "trx0types.h"
+#include "fsp0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "ut0bh.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "ut0bh.h"
+
+typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t;
+
+/** In a MySQL replication slave, during crash recovery we store the master
+binlog file name and position here. */
+/* @{ */
+/** Master binlog file name */
+extern char trx_sys_mysql_master_log_name[];
+/** Master binlog file position. We have successfully applied the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB. */
+extern ib_int64_t trx_sys_mysql_master_log_pos;
+/* @} */
+
+/** If this MySQL server uses binary logging, after InnoDB has been
+initialized, and if it has performed crash recovery, we store the binlog
+file name and position here. */
+/* @{ */
+/** Binlog file name */
+extern char trx_sys_mysql_bin_log_name[];
+/** Binlog file position, or -1 if unknown */
+extern ib_int64_t trx_sys_mysql_bin_log_pos;
+/* @} */
+
+/** The transaction system */
+extern trx_sys_t* trx_sys;
+
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
+UNIV_INTERN
+ib_bh_t*
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*****************************************************************//**
+Creates the trx_sys instance and initializes ib_bh and mutex. */
+UNIV_INTERN
+void
+trx_sys_create(void);
+/*================*/
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create_sys_pages(void);
+/*==========================*/
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n); /*!< in: index of slot */
+/**********************************************************************//**
+Gets a pointer to the transaction system file copy and x-locks its page.
+@return pointer to system file copy, page x-locked */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+file copy.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if
+ the slot is reset to unused */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void);
+/*========================*/
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void);
+/*========================*/
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint trx_rseg_n_slots_debug;
+#endif
+
+/*****************************************************************//**
+Writes a trx id to an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id); /*!< in: id */
+/*****************************************************************//**
+Reads a trx id from an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+/****************************************************************//**
+Looks for the trx instance with the given id in the rw trx_list.
+The caller must be holding trx_sys->mutex.
+@return the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_get_rw_trx_by_id(
+/*=================*/
+ trx_id_t trx_id);/*!< in: trx id to search for */
+/****************************************************************//**
+Returns the minimum trx id in the rw trx list. This is the smallest id for
+which the trx can possibly be active. (But you must look at trx->state to
+find out whether the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_rw_min_trx_id(void);
+/*===================*/
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex in shared mode. If the caller is not holding
+lock_sys->mutex, the transaction may already have been committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active_low(
+/*=================*/
+ trx_id_t trx_id, /*!< in: trx id of the transaction */
+ ibool* corrupt); /*!< in: NULL or pointer to a flag
+ that will be set if corrupt */
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active(
+/*=============*/
+ trx_id_t trx_id, /*!< in: trx id of the transaction */
+ ibool* corrupt); /*!< in: NULL or pointer to a flag
+ that will be set if corrupt */
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Checks whether a trx is on either rw_trx_list or ro_trx_list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ const trx_t* in_trx) /*!< in: transaction */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/***********************************************************//**
+Assert that a transaction has been recovered.
+@return TRUE */
+UNIV_INLINE
+ibool
+trx_assert_recovered(
+/*=================*/
+ trx_id_t trx_id) /*!< in: transaction identifier */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this updates the latest master binlog position up to
+which replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void);
+/*===================================*/
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void);
+/*==========================*/
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void);
+/*===========================*/
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during startup, and only AFTER redo
+log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void);
+/*==============================*/
+/*****************************************************************//**
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void);
+/*===============*/
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+/*****************************************************************//**
+Set the file format id unconditionally, unless it already has the
+same value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name); /*!< out: max file format name or
+ NULL if not needed. */
+/*********************************************************************
+Creates the rollback segments.
+@return number of rollback segments that are active. */
+UNIV_INTERN
+ulint
+trx_sys_create_rsegs(
+/*=================*/
+ ulint n_spaces, /*!< number of tablespaces for UNDO logs */
+ ulint n_rsegs); /*!< number of rollback segments to create */
+/*****************************************************************//**
+Get the number of transactions in the system, independent of their state.
+@return count of transactions in trx_sys_t::rw_trx_list */
+UNIV_INLINE
+ulint
+trx_sys_get_n_rw_trx(void);
+/*======================*/
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
+UNIV_INTERN
+ulint
+trx_sys_any_active_transactions(void);
+/*=================================*/
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page); /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id); /*!< out: file format of the system table
+ space */
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id); /*!< out: file format of the per-table
+ data file */
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id); /*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id); /*!< in: file format identifier */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_sys_t::trx_list. */
+UNIV_INTERN
+ibool
+trx_sys_validate_trx_list(void);
+/*===========================*/
+#endif /* UNIV_DEBUG */
+
+/* The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID 0
+
+/* Space id and page no where the trx system file copy resides */
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+#include "fsp0fsp.h"
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/* The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx
+ number modulo
+ TRX_SYS_TRX_ID_WRITE_MARGIN
+ written to a file page by any
+ transaction; the assignment of
+ transaction ids continues from
+ this number rounded up to a
+ multiple of
+ TRX_SYS_TRX_ID_WRITE_MARGIN,
+ plus one further
+ TRX_SYS_TRX_ID_WRITE_MARGIN,
+ when the database is
+ started */
+#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
+ tablespace segment the trx
+ system is created into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /*!< the start of the array of
+ rollback segment specification
+ slots */
+/*------------------------------------------------------------- @} */
+
+/* Max number of rollback segments: the number of segment specification slots
+in the transaction system array; a rollback segment id must fit in one
+(signed) byte, therefore 128; each slot is currently 8 bytes in size. If you
+want to raise the limit to 256, you will need to fix some assertions that
+impose the 7-bit restriction, e.g., mach_write_to_3(). */
+#define TRX_SYS_N_RSEGS 128
+/* Originally, InnoDB defined TRX_SYS_N_RSEGS as 256 but created only one
+rollback segment. It initialized some arrays with this number of entries.
+We must remember this limit in order to keep file compatibility. */
+#define TRX_SYS_OLD_N_RSEGS 256
+
+/** Maximum length of MySQL binlog file name, in bytes.
+@see trx_sys_mysql_master_log_name
+@see trx_sys_mysql_bin_log_name */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
+
+#if UNIV_PAGE_SIZE_MIN < 4096
+# error "UNIV_PAGE_SIZE_MIN < 4096"
+#endif
+/** The offset of the MySQL replication info in the trx system header;
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
+#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000)
+
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000)
+#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
+ TRX_SYS_MYSQL_LOG_MAGIC_N
+ if we have valid data in the
+ MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /*!< high 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /*!< low 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
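+
+/* A minimal sketch (kept compiled out) of how the two 4-byte words above
+combine: reassembling the 64-bit binlog position from the trx sys header,
+assuming mach_read_from_4() from mach0data.h. */
+#if 0
+static ib_int64_t
+trx_sys_binlog_pos_sketch(
+	const trx_sysf_t*	sys_header)	/*!< in: trx sys header */
+{
+	return(((ib_int64_t) mach_read_from_4(
+			sys_header + TRX_SYS_MYSQL_LOG_INFO
+			+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH) << 32)
+	       | (ib_int64_t) mach_read_from_4(
+			sys_header + TRX_SYS_MYSQL_LOG_INFO
+			+ TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+}
+#endif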
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /*!< 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
+ TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+ TRX_SYS_DOUBLEWRITE_BLOCK2
+ so that if the trx sys
+ header is half-written
+ to disk, we still may
+ be able to recover the
+ information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
+
+/** Size of the doublewrite block in pages */
+#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
+/* @} */
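+
+/* A minimal sketch (kept compiled out): reading the page number of the
+first doublewrite block from a trx sys header page frame, assuming
+mach_read_from_4() from mach0data.h. */
+#if 0
+static ulint
+trx_sys_dblwr_block1_sketch(
+	const byte*	page)	/*!< in: trx sys header page frame */
+{
+	return(mach_read_from_4(page + TRX_SYS_DOUBLEWRITE
+				+ TRX_SYS_DOUBLEWRITE_BLOCK1));
+}
+#endif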
+
+/** File format tag */
+/* @{ */
+/** The offset of the file format tag on the trx system header page
+(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */
+#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
+
+/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format
+identifier is added to this constant. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
+/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
+/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format
+identifier is added to this 64-bit constant. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N \
+ ((ib_uint64_t) TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH << 32 \
+ | TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW)
+/* @} */
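+
+/* A minimal sketch (kept compiled out) of the tag arithmetic: the value
+stored at TRX_SYS_FILE_FORMAT_TAG is this magic constant plus the file
+format id, so the id can be recovered by subtraction once the stored
+value is known to be in the valid range. */
+#if 0
+static ib_uint64_t
+trx_sys_file_format_tag_sketch(
+	ulint	format_id)	/*!< in: file format identifier */
+{
+	return(TRX_SYS_FILE_FORMAT_TAG_MAGIC_N + format_id);
+}
+#endif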
+
+#ifndef UNIV_HOTBACKUP
+/** The transaction system central memory data structure. */
+struct trx_sys_t{
+
+ ib_mutex_t mutex; /*!< mutex protecting most fields in
+ this structure except when noted
+ otherwise */
+ ulint n_prepared_trx; /*!< Number of transactions currently
+ in the XA PREPARED state */
+ ulint n_prepared_recovered_trx; /*!< Number of transactions
+ currently in XA PREPARED state that are
+ also recovered. Such transactions cannot
+ be added during runtime. They can only
+ occur after recovery if mysqld crashed
+ while there were XA PREPARED
+ transactions. We disable query cache
+ if such transactions exist. */
+ trx_id_t max_trx_id; /*!< The smallest number not yet
+ assigned as a transaction id or
+ transaction number */
+#ifdef UNIV_DEBUG
+ trx_id_t rw_max_trx_id; /*!< Max trx id of read-write transactions
+ which exist or existed */
+#endif
+ trx_list_t rw_trx_list; /*!< List of active and committed in
+ memory read-write transactions, sorted
+ on trx id, biggest first. Recovered
+ transactions are always on this list. */
+ trx_list_t ro_trx_list; /*!< List of active and committed in
+ memory read-only transactions, sorted
+ on trx id, biggest first. NOTE:
+ the ordering is not required for
+ read-only transactions; we should
+ exploit this and increase concurrency
+ during add/remove. */
+ trx_list_t mysql_trx_list; /*!< List of transactions created
+ for MySQL. All transactions on
+ ro_trx_list are on mysql_trx_list. The
+ rw_trx_list can contain system
+ transactions and recovered transactions
+ that will not be in the mysql_trx_list.
+ There can be active non-locking
+ auto-commit read only transactions that
+ are on this list but not on ro_trx_list.
+ mysql_trx_list may additionally contain
+ transactions that have not yet been
+ started in InnoDB. */
+ trx_rseg_t* const rseg_array[TRX_SYS_N_RSEGS];
+ /*!< Pointer array to rollback
+ segments; NULL if slot not in use;
+ created and destroyed in
+ single-threaded mode; not protected
+ by any mutex, because it is read-only
+ during multi-threaded operation */
+ ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
+ list (update undo logs for committed
+ transactions), protected by
+ rseg->mutex */
+ UT_LIST_BASE_NODE_T(read_view_t) view_list;
+ /*!< List of read views sorted
+ on trx no, biggest first */
+};
+
+/** When a trx id which is zero modulo this number (which must be a power of
+two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
+page is updated */
+#define TRX_SYS_TRX_ID_WRITE_MARGIN 256
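+
+/* A minimal sketch (kept compiled out) of the restart arithmetic this
+margin enables, assuming ut_uint64_align_up() from ut0byte.h: the id read
+from TRX_SYS_TRX_ID_STORE is rounded up to the margin and padded by one
+further margin, so ids issued after a restart cannot collide with ids
+issued before it. */
+#if 0
+static trx_id_t
+trx_sys_restart_id_sketch(
+	trx_id_t	stored_id)	/*!< in: TRX_SYS_TRX_ID_STORE value */
+{
+	return(2 * TRX_SYS_TRX_ID_WRITE_MARGIN
+	       + ut_uint64_align_up(stored_id,
+				    TRX_SYS_TRX_ID_WRITE_MARGIN));
+}
+#endif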
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic
new file mode 100644
index 00000000000..e097e29b551
--- /dev/null
+++ b/storage/innobase/include/trx0sys.ic
@@ -0,0 +1,512 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.ic
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "mtr0log.h"
+
+/* The typedef for rseg slot in the file copy */
+typedef byte trx_sysf_rseg_t;
+
+/* Rollback segment specification slot offsets */
+/*-------------------------------------------------------------*/
+#define TRX_SYS_RSEG_SPACE 0 /* space where the segment
+ header is placed; starting with
+ MySQL/InnoDB 5.1.7, this is
+ ULINT_UNDEFINED if the slot is unused */
+#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment
+ header is placed; this is FIL_NULL
+ if the slot is unused */
+/*-------------------------------------------------------------*/
+/* Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void);
+/*==========================*/
+
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n) /*!< in: index of slot */
+{
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ return(sys->rseg_array[n]);
+}
+
+/**********************************************************************//**
+Gets a pointer to the transaction system header and x-latches its page.
+@return pointer to system header, page x-latched. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_sysf_t* header;
+
+ ut_ad(mtr);
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ header = TRX_SYS + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+header.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE,
+ space,
+ MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+header. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if the
+ slot is reset to unused */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO,
+ page_no,
+ MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a trx id to an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id) /*!< in: id */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ mach_write_to_6(ptr, id);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ return(mach_read_from_6(ptr));
+}
+
+/****************************************************************//**
+Looks for the trx handle with the given id in rw_trx_list.
+The caller must be holding trx_sys->mutex.
+@return the trx handle or NULL if not found;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_get_rw_trx_by_id(
+/*=================*/
+ trx_id_t trx_id) /*!< in: trx id to search for */
+{
+ trx_t* trx;
+ ulint len;
+ trx_t* first;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ len = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ if (len == 0) {
+ return(NULL);
+ }
+
+	/* Because the list is sorted on trx id in descending order,
+	we check both ends of the list before scanning the interior. */
+
+ trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ assert_trx_in_rw_list(trx);
+
+ if (trx_id == trx->id) {
+ return(trx);
+ } else if (len == 1 || trx_id > trx->id) {
+ return(NULL);
+ }
+
+ first = trx;
+
+ trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+ assert_trx_in_rw_list(trx);
+
+ if (trx_id == trx->id) {
+ return(trx);
+ } else if (len == 2 || trx_id < trx->id) {
+ return(NULL);
+ }
+
+	/* The list is sorted on id in descending order, so the tail holds
+	the smallest id. If trx_id is below the midpoint of the first and
+	last ids, scan upwards from the tail; otherwise scan downwards
+	from the head. E.g., in the list [90, 70, 40, 10], a search for
+	20 starts from the tail, because 20 < (90 + 10) / 2. */
+ if (trx_id < (first->id + trx->id) >> 1) {
+ for (trx = UT_LIST_GET_PREV(trx_list, trx);
+ trx != NULL && trx_id > trx->id;
+ trx = UT_LIST_GET_PREV(trx_list, trx)) {
+ assert_trx_in_rw_list(trx);
+ }
+ } else {
+ for (trx = UT_LIST_GET_NEXT(trx_list, first);
+ trx != NULL && trx_id < trx->id;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+ assert_trx_in_rw_list(trx);
+ }
+ }
+
+ return((trx != NULL && trx->id == trx_id) ? trx : NULL);
+}
+
+/****************************************************************//**
+Returns the minimum trx id in the rw trx list. This is the smallest id for
+which the trx can possibly be active. (But you must look at trx->state to
+find out whether the minimum trx id transaction itself is active, or already
+committed.) The caller must be holding the trx_sys_t::mutex in shared mode.
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_rw_min_trx_id_low(void)
+/*=======================*/
+{
+ trx_id_t id;
+ const trx_t* trx;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ trx = UT_LIST_GET_LAST(trx_sys->rw_trx_list);
+
+ if (trx == NULL) {
+ id = trx_sys->max_trx_id;
+ } else {
+ assert_trx_in_rw_list(trx);
+ id = trx->id;
+ }
+
+ return(id);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+/***********************************************************//**
+Assert that a transaction has been recovered.
+@return TRUE */
+UNIV_INLINE
+ibool
+trx_assert_recovered(
+/*=================*/
+ trx_id_t trx_id) /*!< in: transaction identifier */
+{
+ const trx_t* trx;
+
+ mutex_enter(&trx_sys->mutex);
+
+ trx = trx_get_rw_trx_by_id(trx_id);
+ ut_a(trx->is_recovered);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+/****************************************************************//**
+Returns the minimum trx id in the rw trx list. This is the smallest id for
+which the rw trx can possibly be active. (But you must look at trx->state to
+find out whether the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if rw trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_rw_min_trx_id(void)
+/*===================*/
+{
+ trx_id_t id;
+
+ mutex_enter(&trx_sys->mutex);
+
+ id = trx_rw_min_trx_id_low();
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(id);
+}
+
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. Caller must hold
+trx_sys->mutex. If the caller is not holding lock_sys->mutex, the
+transaction may already have been committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active_low(
+/*=================*/
+ trx_id_t trx_id, /*!< in: trx id of the transaction */
+ ibool* corrupt) /*!< in: NULL or pointer to a flag
+ that will be set if corrupt */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ if (trx_id < trx_rw_min_trx_id_low()) {
+
+ trx = NULL;
+ } else if (trx_id >= trx_sys->max_trx_id) {
+
+ /* There must be corruption: we let the caller handle the
+ diagnostic prints in this case. */
+
+ trx = NULL;
+ if (corrupt != NULL) {
+ *corrupt = TRUE;
+ }
+ } else {
+ trx = trx_get_rw_trx_by_id(trx_id);
+
+ if (trx != NULL
+ && trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) {
+
+ trx = NULL;
+ }
+ }
+
+ return(trx);
+}
+
+/****************************************************************//**
+Checks if a rw transaction with the given id is active. If the caller is
+not holding lock_sys->mutex, the transaction may already have been
+committed.
+@return transaction instance if active, or NULL;
+the pointer must not be dereferenced unless lock_sys->mutex was
+acquired before calling this function and is still being held */
+UNIV_INLINE
+trx_t*
+trx_rw_is_active(
+/*=============*/
+ trx_id_t trx_id, /*!< in: trx id of the transaction */
+ ibool* corrupt) /*!< in: NULL or pointer to a flag
+ that will be set if corrupt */
+{
+ trx_t* trx;
+
+ mutex_enter(&trx_sys->mutex);
+
+ trx = trx_rw_is_active_low(trx_id, corrupt);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(trx);
+}
+
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void)
+/*========================*/
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+	/* VERY important: after the database is started, the max_trx_id
+	value is divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, so the following
+	if-condition evaluates to TRUE the first time this function is
+	called, and the trx id value is written to the disk-based header!
+	Thus trx id values will not overlap when the database is
+	repeatedly started! */
+
+ if (!(trx_sys->max_trx_id % (trx_id_t) TRX_SYS_TRX_ID_WRITE_MARGIN)) {
+
+ trx_sys_flush_max_trx_id();
+ }
+
+ return(trx_sys->max_trx_id++);
+}
+
+/*****************************************************************//**
+Determines the maximum transaction id.
+@return maximum currently allocated trx id; will be stale after the
+next call to trx_sys_get_new_trx_id() */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_max_trx_id(void)
+/*========================*/
+{
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+ trx_id_t max_trx_id;
+#endif
+
+ ut_ad(!mutex_own(&trx_sys->mutex));
+
+#if UNIV_WORD_SIZE < DATA_TRX_ID_LEN
+ /* Avoid torn reads. */
+ mutex_enter(&trx_sys->mutex);
+ max_trx_id = trx_sys->max_trx_id;
+ mutex_exit(&trx_sys->mutex);
+ return(max_trx_id);
+#else
+ /* Perform a dirty read. Callers should be prepared for stale
+ values, and we know that the value fits in a machine word, so
+ that it will be read and written atomically. */
+ return(trx_sys->max_trx_id);
+#endif
+}
+
+/*****************************************************************//**
+Get the number of transactions in the system, independent of their state.
+@return count of transactions in trx_sys_t::rw_trx_list */
+UNIV_INLINE
+ulint
+trx_sys_get_n_rw_trx(void)
+/*======================*/
+{
+ ulint n_trx;
+
+ mutex_enter(&trx_sys->mutex);
+
+ n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(n_trx);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000000..144e1803975
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1116 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "lock0types.h"
+#include "log0log.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "read0types.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+
+/** Dummy session currently used in the MySQL interface */
+extern sess_t* trx_dummy_sess;
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INLINE
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+ trx_t* trx); /*!< in: transaction */
+/******************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg); /*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file); /*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx); /*!< in: trx object */
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void);
+/*========================*/
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void);
+/*=============================*/
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx); /*!< in, own: trx object */
+/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+ trx_t* trx) /*!< in, own: trx object */
+ UNIV_COLD __attribute__((nonnull));
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx); /*!< in, own: trx object */
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void);
+/*============================*/
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started_xa(t) \
+ { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_xa_low((t)); \
+ }
+#else
+#define trx_start_if_not_started_xa(t) \
+ trx_start_if_not_started_xa_low((t))
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx); /*!< in: transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx); /*!< in: transaction */
+
+#ifdef UNIV_DEBUG
+#define trx_start_if_not_started(t) \
+ { \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_if_not_started_low((t)); \
+ }
+#else
+#define trx_start_if_not_started(t) \
+ trx_start_if_not_started_low((t))
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+UNIV_INTERN
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op) /*!< in: dictionary operation type */
+ __attribute__((nonnull));
+
+#ifdef UNIV_DEBUG
+#define trx_start_for_ddl(t, o) \
+ { \
+ ut_ad((t)->start_file == 0); \
+ (t)->start_line = __LINE__; \
+ (t)->start_file = __FILE__; \
+ trx_start_for_ddl_low((t), (o)); \
+ }
+#else
+#define trx_start_for_ddl(t, o) \
+ trx_start_for_ddl_low((t), (o))
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit(
+/*=======*/
+ trx_t* trx) /*!< in/out: transaction */
+ __attribute__((nonnull));
+/****************************************************************//**
+Commits a transaction and a mini-transaction. */
+UNIV_INTERN
+void
+trx_commit_low(
+/*===========*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction (will be committed),
+ or NULL if trx made no modifications */
+ __attribute__((nonnull(1)));
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx); /*!< in/out: transaction */
+/**********************************************************************//**
+Does the transaction prepare for MySQL. */
+UNIV_INTERN
+void
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx); /*!< in/out: trx handle */
+/**********************************************************************//**
+This function is used to find the number of prepared transactions and
+their transaction objects for recovery.
+@return number of prepared transactions */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state.
+@return trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+UNIV_INTERN
+trx_t *
+trx_get_trx_by_xid(
+/*===============*/
+ const XID* xid); /*!< in: X/Open XA transaction identifier */
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+UNIV_INTERN
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a newly started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx); /*!< in: active transaction */
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+UNIV_INTERN
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys->mutex and trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op_t
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((pure));
+/**********************************************************************//**
+Flag a transaction a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op_t op); /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+ibool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state) /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+ __attribute__((nonnull, warn_unused_result));
+# ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((nonnull, warn_unused_result));
+# endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ const trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx); /*!< in: transaction */
+#else /* !UNIV_HOTBACKUP */
+#define trx_is_interrupted(trx) FALSE
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction
+@return transaction weight */
+#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+UNIV_INTERN
+ibool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b); /*!< in: the second transaction to be compared */
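
Together, TRX_WEIGHT() and trx_weight_ge() drive deadlock-victim selection: the lighter transaction (fewer altered plus locked rows, no edits to non-transactional tables) is the cheaper one to roll back. A hedged sketch of that policy (the helper is illustrative; the real selection logic lives in lock0lock.cc):

	static const trx_t*
	pick_deadlock_victim(const trx_t* a, const trx_t* b)
	{
		/* trx_weight_ge(a, b) == TRUE means a is at least as
		heavy as b, so b is the preferred victim. */
		return(trx_weight_ge(a, b) ? b : a);
	}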
+
+/* Maximum length of a string that can be returned by
+trx_get_que_state_str(). */
+#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
+
+/*******************************************************************//**
+Retrieves the transaction's que state as a human-readable string. The string
+must not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx); /*!< in: transaction */
+
+/****************************************************************//**
+Assign a read-only transaction a rollback-segment, if it is attempting
+to write to a TEMPORARY table. */
+UNIV_INTERN
+void
+trx_assign_rseg(
+/*============*/
+ trx_t* trx); /*!< A read-only transaction that
+ needs to be assigned a rollback
+ segment. */
+/*******************************************************************//**
+Transactions that aren't started by the MySQL server don't set
+the trx_t::mysql_thd field. For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(trx) \
+ ((trx)->mysql_thd != NULL \
+ ? thd_lock_wait_timeout((trx)->mysql_thd) \
+ : 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+(implied read-only).
+@param t transaction
+@return true if non-locking autocommit select transaction. */
+#define trx_is_autocommit_non_locking(t) \
+((t)->auto_commit && (t)->will_lock == 0)
+
+/*******************************************************************//**
+Determine if the transaction is a non-locking autocommit select
+with an explicit check for the read-only status.
+@param t transaction
+@return true if non-locking autocommit read-only transaction. */
+#define trx_is_ac_nl_ro(t) \
+((t)->read_only && trx_is_autocommit_non_locking((t)))
+
+/*******************************************************************//**
+Assert that the transaction is in the trx_sys_t::rw_trx_list */
+#define assert_trx_in_rw_list(t) do { \
+ ut_ad(!(t)->read_only); \
+ assert_trx_in_list(t); \
+} while (0)
+
+/*******************************************************************//**
+Assert that the transaction is either in trx_sys->ro_trx_list or
+trx_sys->rw_trx_list (but not both) and that it is not an autocommit
+non-locking select. Note that the continue below is deliberate: it
+exits the enclosing do-while(0) loop and thereby skips the ut_error
+that follows the switch for the valid states. */
+#define assert_trx_in_list(t) do { \
+ ut_ad((t)->in_ro_trx_list == (t)->read_only); \
+ ut_ad((t)->in_rw_trx_list == !(t)->read_only); \
+ ut_ad(!trx_is_autocommit_non_locking((t))); \
+ switch ((t)->state) { \
+ case TRX_STATE_PREPARED: \
+ /* fall through */ \
+ case TRX_STATE_ACTIVE: \
+ case TRX_STATE_COMMITTED_IN_MEMORY: \
+ continue; \
+ case TRX_STATE_NOT_STARTED: \
+ break; \
+ } \
+ ut_error; \
+} while (0)
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(t) \
+ do { \
+ if (trx_is_autocommit_non_locking(t)) { \
+ trx_state_t t_state = (t)->state; \
+ ut_ad((t)->read_only); \
+ ut_ad(!(t)->is_recovered); \
+ ut_ad(!(t)->in_ro_trx_list); \
+ ut_ad(!(t)->in_rw_trx_list); \
+ ut_ad((t)->in_mysql_trx_list); \
+ ut_ad(t_state == TRX_STATE_NOT_STARTED \
+ || t_state == TRX_STATE_ACTIVE); \
+ } else { \
+ assert_trx_in_list(t); \
+ } \
+ } while (0)
+#else /* UNIV_DEBUG */
+/*******************************************************************//**
+Assert that an autocommit non-locking select cannot be in the
+ro_trx_list nor the rw_trx_list and that it is a read-only transaction.
+The transaction must be in the mysql_trx_list. */
+# define assert_trx_nonlocking_or_in_list(trx) ((void)0)
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state
+captures the state of the query thread during the execution of a query.
+This is different from a transaction state. The query state of a transaction
+can be updated asynchronously by other threads. The other threads can be
+system threads, like the timeout monitor thread or user threads executing
+other queries. Another thing to be mindful of is that there is a delay between
+when a query thread is put into the LOCK_WAIT state and when it actually starts
+waiting. Between these two events it is possible that the query thread is
+granted the lock it was waiting for, which implies that the state can change
+asynchronously.
+
+All these operations take place within the context of locking. Therefore state
+changes within the locking code must acquire both the lock mutex and the
+trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
+trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
+to only acquire the trx->mutex.
+To query the state either of the mutexes is sufficient within the locking
+code and no mutex is required when the query thread is no longer waiting. */
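
A sketch of the write side of this protocol, assuming the lock_mutex_enter()/lock_mutex_exit() wrappers for lock_sys->mutex from lock0lock.h (the helper name is illustrative):

	static void
	enter_lock_wait(trx_t* trx, lock_t* lock)
	{
		/* Entering a lock wait requires both mutexes; per the
		rules above, ending the wait later needs only trx->mutex. */
		lock_mutex_enter();
		trx_mutex_enter(trx);

		trx->lock.que_state = TRX_QUE_LOCK_WAIT;
		trx->lock.wait_lock = lock;
		trx->lock.wait_started = ut_time();

		trx_mutex_exit(trx);
		lock_mutex_exit();
	}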
+
+/** The locks and state of an active transaction. Protected by
+lock_sys->mutex, trx->mutex or both. */
+struct trx_lock_t {
+ ulint n_active_thrs; /*!< number of active query threads */
+
+ trx_que_t que_state; /*!< valid when trx->state
+ == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
+ TRX_QUE_LOCK_WAIT, ... */
+
+ lock_t* wait_lock; /*!< if trx execution state is
+ TRX_QUE_LOCK_WAIT, this points to
+ the lock request, otherwise this is
+ NULL; set to non-NULL when holding
+ both trx->mutex and lock_sys->mutex;
+ set to NULL when holding
+ lock_sys->mutex; readers should
+ hold lock_sys->mutex, except when
+ they are holding trx->mutex and
+ wait_lock==NULL */
+ ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
+ to, and checked against,
+ lock_mark_counter by
+ lock_deadlock_recursive(). */
+ ibool was_chosen_as_deadlock_victim;
+ /*!< when the transaction decides to
+ wait for a lock, it sets this to FALSE;
+ if another transaction chooses this
+ transaction as a victim in deadlock
+ resolution, it sets this to TRUE.
+ Protected by trx->mutex. */
+ time_t wait_started; /*!< lock wait started at this time,
+ protected only by lock_sys->mutex */
+
+ que_thr_t* wait_thr; /*!< query thread belonging to this
+ trx that is in QUE_THR_LOCK_WAIT
+ state. For threads suspended in a
+ lock wait, this is protected by
+ lock_sys->mutex. Otherwise, this may
+ only be modified by the thread that is
+ serving the running transaction. */
+
+ mem_heap_t* lock_heap; /*!< memory heap for trx_locks;
+ protected by lock_sys->mutex */
+
+ UT_LIST_BASE_NODE_T(lock_t)
+ trx_locks; /*!< locks requested
+ by the transaction;
+ insertions are protected by trx->mutex
+ and lock_sys->mutex; removals are
+ protected by lock_sys->mutex */
+
+ ib_vector_t* table_locks; /*!< All table locks requested by this
+ transaction, including AUTOINC locks */
+
+ ibool cancel; /*!< TRUE if the transaction is being
+ rolled back either via deadlock
+ detection or due to lock timeout. The
+ caller has to acquire the trx_t::mutex
+ in order to cancel the locks. In
+ lock_trx_table_locks_remove() we
+ check for this cancellation of a transaction's
+ locks and avoid reacquiring the trx
+ mutex to prevent recursive deadlocks.
+ Protected by both the lock sys mutex
+ and the trx_t::mutex. */
+};
+
+#define TRX_MAGIC_N 91118598
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_operation_lock before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have os_thread_get_curr_id() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_resurrected() may access resurrected (connectionless)
+transactions while the system is already processing new user
+transactions. The trx_sys->mutex prevents a race condition between it
+and lock_trx_release_locks() [invoked by trx_commit()].
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding trx_sys->mutex and lock_sys->mutex.
+
+* When a transaction handle is in the trx_sys->mysql_trx_list or
+trx_sys->trx_list, some of its fields must not be modified without
+holding trx_sys->mutex exclusively.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys->mutex and sometimes by trx->mutex. */
+
+struct trx_t{
+ ulint magic_n;
+
+ ib_mutex_t mutex; /*!< Mutex protecting the fields
+ state and lock
+ (except some fields of lock, which
+ are protected by lock_sys->mutex) */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
+
+ Possible states:
+
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+ Valid state transitions are:
+
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+ XA (2PC) (shutdown before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
+
+ Latching and various transaction lists membership rules:
+
+ XA (2PC) transactions are always treated as non-autocommit.
+
+ Transitions to ACTIVE or NOT_STARTED occur when
+ !in_rw_trx_list and !in_ro_trx_list (no trx_sys->mutex needed).
+
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are !in_rw_trx_list, !in_ro_trx_list.
+
+ When a transaction is NOT_STARTED, it can be in_mysql_trx_list if
+ it is a user transaction. It cannot be in ro_trx_list or rw_trx_list.
+
+ ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.
+ The transition ACTIVE->PREPARED is protected by trx_sys->mutex.
+
+ ACTIVE->COMMITTED is possible when the transaction is in
+ ro_trx_list or rw_trx_list.
+
+ Transitions to COMMITTED are protected by both lock_sys->mutex
+ and trx->mutex.
+
+ NOTE: Some of these state change constraints are overkill,
+ currently only required for a consistent view for printing stats.
+ This unnecessarily adds a huge cost for the general case.
+
+ NOTE: In the future we should add read only transactions to the
+ ro_trx_list the first time they try to acquire a lock, i.e. by default
+ we treat all read-only transactions as non-locking. */
+ trx_state_t state;
+
+ trx_lock_t lock; /*!< Information about the transaction
+ locks and state. Protected by
+ trx->mutex or lock_sys->mutex
+ or both */
+ ulint is_recovered; /*!< 0=normal transaction,
+ 1=recovered, must be rolled back,
+ protected by trx_sys->mutex when
+ trx->in_rw_trx_list holds */
+
+ /* These fields are not protected by any mutex. */
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ ulint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ... */
+ ulint check_foreigns; /*!< normally TRUE, but if the user
+ wants to suppress foreign key checks,
+ (in table imports, for example) we
+ set this FALSE */
+ /*------------------------------*/
+ /* MySQL has a transaction coordinator to coordinate two phase
+ commit between multiple storage engines and the binary log. When
+ an engine participates in a transaction, it's responsible for
+ registering itself using the trans_register_ha() API. */
+ unsigned is_registered:1;/* This flag is set to 1 after the
+ transaction has been registered with
+ the coordinator using the XA API, and
+ is set to 0 after commit or rollback. */
+ unsigned owns_prepare_mutex:1;/* 1 if owns prepare mutex, if
+ this is set to 1 then registered should
+ also be set to 1. This is used in the
+ XA code */
+ /*------------------------------*/
+ ulint check_unique_secondary;
+ /*!< normally TRUE, but if the user
+ wants to speed up inserts by
+ suppressing unique key checks
+ for secondary indexes when we decide
+ if we can use the insert buffer for
+ them, we set this FALSE */
+ ulint support_xa; /*!< normally we do the XA two-phase
+ commit steps, but by setting this to
+ FALSE, one can save CPU time and about
+ 150 bytes in the undo log size as then
+ we skip XA steps */
+ ulint flush_log_later;/* In 2PC, we hold the
+ prepare_commit mutex across
+ both phases. In that case, we
+ defer flush of the logs to disk
+ until after we release the
+ mutex. */
+ ulint must_flush_log_later;/*!< this flag is set to TRUE in
+ trx_commit() if flush_log_later was
+ TRUE, and there were modifications by
+ the transaction; in that case we must
+ flush the log in
+ trx_commit_complete_for_mysql() */
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ ulint has_search_latch;
+ /*!< TRUE if this trx has latched the
+ search system latch in S-mode */
+ ulint search_latch_timeout;
+ /*!< If we notice that someone is
+ waiting for our S-lock on the search
+ latch to be released, we wait in
+ row0sel.cc for BTR_SEA_TIMEOUT new
+ searches until we try to keep
+ the search latch again over
+ calls from MySQL; this is intended
+ to reduce contention on the search
+ latch */
+ trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */
+
+ /* Fields protected by the srv_conc_mutex. */
+ ulint declared_to_be_inside_innodb;
+ /*!< this is TRUE if we have declared
+ this transaction in
+ srv_conc_enter_innodb to be inside the
+ InnoDB engine */
+ ulint n_tickets_to_enter_innodb;
+ /*!< this can be > 0 only when
+ declared_to_... is TRUE; when we come
+ to srv_conc_innodb_enter, if the value
+ here is > 0, we decrement this by 1 */
+ ulint dict_operation_lock_mode;
+ /*!< 0, RW_S_LATCH, or RW_X_LATCH:
+ the latch mode trx currently holds
+ on dict_operation_lock. Protected
+ by dict_operation_lock. */
+
+ trx_id_t no; /*!< transaction serialization number:
+ max trx id shortly before the
+ transaction is moved to
+ COMMITTED_IN_MEMORY state.
+ Protected by trx_sys_t::mutex
+ when trx->in_rw_trx_list. Initially
+ set to TRX_ID_MAX. */
+
+ time_t start_time; /*!< time when the trx state last
+ became TRX_STATE_ACTIVE */
+ trx_id_t id; /*!< transaction id */
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ lsn_t commit_lsn; /*!< lsn at the time of the commit */
+ table_id_t table_id; /*!< Table to drop iff dict_operation
+ == TRX_DICT_OP_TABLE, or 0. */
+ /*------------------------------*/
+ THD* mysql_thd; /*!< MySQL thread handle corresponding
+ to this trx, or NULL */
+ const char* mysql_log_file_name;
+ /*!< if MySQL binlog is used, this field
+ contains a pointer to the latest file
+ name; this is NULL if binlog is not
+ used */
+ ib_int64_t mysql_log_offset;
+ /*!< if MySQL binlog is used, this
+ field contains the end offset of the
+ binlog entry */
+ /*------------------------------*/
+ ulint n_mysql_tables_in_use; /*!< number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ ulint mysql_n_tables_locked;
+ /*!< how many tables the current SQL
+ statement uses, except those
+ in consistent read */
+ /*------------------------------*/
+ UT_LIST_NODE_T(trx_t)
+ trx_list; /*!< list of transactions;
+ protected by trx_sys->mutex.
+ The same node is used for both
+ trx_sys_t::ro_trx_list and
+ trx_sys_t::rw_trx_list */
+#ifdef UNIV_DEBUG
+ /** The following two fields are mutually exclusive. */
+ /* @{ */
+
+ ibool in_ro_trx_list; /*!< TRUE if in trx_sys->ro_trx_list */
+ ibool in_rw_trx_list; /*!< TRUE if in trx_sys->rw_trx_list */
+ /* @} */
+#endif /* UNIV_DEBUG */
+ UT_LIST_NODE_T(trx_t)
+ mysql_trx_list; /*!< list of transactions created for
+ MySQL; protected by trx_sys->mutex */
+#ifdef UNIV_DEBUG
+ ibool in_mysql_trx_list;
+ /*!< TRUE if in
+ trx_sys->mysql_trx_list */
+#endif /* UNIV_DEBUG */
+ /*------------------------------*/
+ dberr_t error_state; /*!< 0 if no error, otherwise error
+ number; NOTE that ONLY the thread
+ doing the transaction is allowed to
+ set this field: this is NOT protected
+ by any mutex */
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if the index creation fails due to
+ a duplicate key error, the mysql key
+ number of that index is stored here */
+ sess_t* sess; /*!< session of the trx, NULL if none */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ mem_heap_t* global_read_view_heap;
+ /*!< memory heap for the global read
+ view */
+ read_view_t* global_read_view;
+ /*!< consistent read view associated
+ to a transaction or NULL */
+ read_view_t* read_view; /*!< consistent read view used in the
+ transaction, or NULL; if set, this can
+ be either the normal read view
+ associated with the transaction (i.e.
+ the same as global_read_view) or a
+ read view associated with a cursor */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ ib_mutex_t undo_mutex; /*!< mutex protecting the fields in this
+ section (down to undo_no_arr), EXCEPT
+ last_sql_stat_start, which can be
+ accessed only when we know that there
+ cannot be any activity in the undo
+ logs! */
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /*!< undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this undo
+ number; see note at undo_mutex! */
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* insert_undo; /*!< pointer to the insert undo log, or
+ NULL if no inserts performed yet */
+ trx_undo_t* update_undo; /*!< pointer to the update undo log, or
+ NULL if no update performed yet */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ trx_undo_arr_t* undo_no_arr; /*!< array of undo numbers of undo log
+ records which are currently processed
+ by a rollback operation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx instance is destroyed.
+ Protected by lock_sys->mutex. */
+ /*------------------------------*/
+ ibool read_only; /*!< TRUE if transaction is flagged
+ as a READ-ONLY transaction.
+ if !auto_commit || will_lock > 0
+ then it will be added to the list
+ trx_sys_t::ro_trx_list. A read-only
+ transaction will not be assigned an
+ UNDO log. Non-locking auto-commit
+ read-only transactions will not be on
+ either list. */
+ ibool auto_commit; /*!< TRUE if it is an autocommit
+ transaction */
+ ulint will_lock; /*!< Will acquire some locks. Increment
+ each time we determine that a lock will
+ be acquired by the MySQL layer. */
+ bool ddl; /*!< true if it is a transaction that
+ is being started for a DDL operation */
+ /*------------------------------*/
+ fts_trx_t* fts_trx; /*!< FTS information, or NULL if
+ transaction hasn't modified tables
+ with FTS indexes (yet). */
+ doc_id_t fts_next_doc_id;/* The document id used for updates */
+ /*------------------------------*/
+ ulint flush_tables; /*!< if "covering" a FLUSH TABLES,
+ the count of tables being flushed */
+
+ /*------------------------------*/
+#ifdef UNIV_DEBUG
+ ulint start_line; /*!< Track where it was started from */
+ const char* start_file; /*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+ /*------------------------------*/
+ bool api_trx; /*!< trx started by InnoDB API */
+ bool api_auto_commit;/*!< automatic commit */
+ bool read_write; /*!< if read and write operation */
+
+ /*------------------------------*/
+ char detailed_error[256]; /*!< detailed error message for last
+ error, or empty. */
+};
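
As the comment on error_state notes, only the thread serving the transaction may touch that field, which permits a mutex-free consume-and-reset pattern. A sketch (the helper is illustrative):

	static dberr_t
	consume_trx_error(trx_t* trx)
	{
		/* Safe without a latch: error_state is owned by the
		thread that is serving the transaction. */
		dberr_t	err = trx->error_state;

		trx->error_state = DB_SUCCESS;

		return(err);
	}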
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE 1 /* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE 2 /* duplicate rows are to be replaced */
+
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL 0
+#define TRX_SIG_TOTAL_ROLLBACK 1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT 2
+#define TRX_SIG_COMMIT 3
+#define TRX_SIG_BREAK_EXECUTION 5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF 0 /* sent by the session itself, or
+ by an error occurring within this
+ session */
+#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which
+ must hold rights to this) */
+
+/** Commit node states */
+enum commit_node_state {
+ COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to
+ the transaction */
+ COMMIT_NODE_WAIT /*!< commit signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_t{
+ que_common_t common; /*!< node type: QUE_NODE_COMMIT */
+ enum commit_node_state
+ state; /*!< node execution state */
+};
+
+
+/** Test if trx->mutex is owned. */
+#define trx_mutex_own(t) mutex_own(&t->mutex)
+
+/** Acquire the trx->mutex. */
+#define trx_mutex_enter(t) do { \
+ mutex_enter(&t->mutex); \
+} while (0)
+
+/** Release the trx->mutex. */
+#define trx_mutex_exit(t) do { \
+ mutex_exit(&t->mutex); \
+} while (0)
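
A sketch of the canonical pattern for fields documented as protected by trx->mutex, such as trx_lock_t::was_chosen_as_deadlock_victim (the helper is illustrative):

	static void
	flag_as_deadlock_victim(trx_t* trx)
	{
		trx_mutex_enter(trx);

		ut_ad(trx_mutex_own(trx));
		trx->lock.was_chosen_as_deadlock_victim = TRUE;

		trx_mutex_exit(trx);
	}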
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t* btr_search_latch_temp;
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch (*btr_search_latch_temp)
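
A sketch of a reader of the adaptive hash index, pairing the S-latch with the trx->has_search_latch bookkeeping that trx_search_latch_release_if_reserved() later relies on (the helper is illustrative):

	static void
	probe_hash_index(trx_t* trx)
	{
		rw_lock_s_lock(&btr_search_latch);
		trx->has_search_latch = TRUE;

		/* ... look up records through the hash index; remember
		that (3) and (4) above are NOT protected by this latch ... */

		trx_search_latch_release_if_reserved(trx);
	}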
+
+#ifndef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/innobase/include/trx0trx.ic b/storage/innobase/include/trx0trx.ic
new file mode 100644
index 00000000000..69ee17ea98b
--- /dev/null
+++ b/storage/innobase/include/trx0trx.ic
@@ -0,0 +1,180 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Determines if a transaction is in the given state.
+The caller must hold trx_sys->mutex, or it must be the thread
+that is serving a running transaction.
+A running transaction must be in trx_sys->ro_trx_list or trx_sys->rw_trx_list
+unless it is a non-locking autocommit read only transaction, which is only
+in trx_sys->mysql_trx_list.
+@return TRUE if trx->state == state */
+UNIV_INLINE
+ibool
+trx_state_eq(
+/*=========*/
+ const trx_t* trx, /*!< in: transaction */
+ trx_state_t state) /*!< in: state;
+ if state != TRX_STATE_NOT_STARTED
+ asserts that
+ trx->state != TRX_STATE_NOT_STARTED */
+{
+#ifdef UNIV_DEBUG
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ ut_ad(!trx_is_autocommit_non_locking(trx));
+ return(trx->state == state);
+
+ case TRX_STATE_ACTIVE:
+ assert_trx_nonlocking_or_in_list(trx);
+ return(state == trx->state);
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ assert_trx_in_list(trx);
+ return(state == trx->state);
+
+ case TRX_STATE_NOT_STARTED:
+ /* This state is not allowed for running transactions. */
+ ut_a(state == TRX_STATE_NOT_STARTED);
+ ut_ad(!trx->in_rw_trx_list);
+ ut_ad(!trx->in_ro_trx_list);
+ return(state == trx->state);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(trx->state == state);
+}
+
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
+
+/*******************************************************************//**
+Retrieves the transaction's que state as a human-readable string. The string
+must not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ return("RUNNING");
+ case TRX_QUE_LOCK_WAIT:
+ return("LOCK WAIT");
+ case TRX_QUE_ROLLING_BACK:
+ return("ROLLING BACK");
+ case TRX_QUE_COMMITTING:
+ return("COMMITTING");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op_t
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ return(op);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(op);
+}
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op_t op) /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+ enum trx_dict_op_t old_op = trx_get_dict_operation(trx);
+
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ break;
+ case TRX_DICT_OP_TABLE:
+ switch (old_op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_INDEX:
+ case TRX_DICT_OP_TABLE:
+ goto ok;
+ }
+ ut_error;
+ break;
+ case TRX_DICT_OP_INDEX:
+ ut_ad(old_op == TRX_DICT_OP_NONE);
+ break;
+ }
+ok:
+#endif /* UNIV_DEBUG */
+
+ trx->ddl = true;
+ trx->dict_operation = op;
+}
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INLINE
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ trx->has_search_latch = FALSE;
+ }
+}
+
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000000..7ca95131328
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,147 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "ut0byte.h"
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+#define TRX_ID_MAX_LEN 17
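
A sketch of formatting an id with these constants, assuming the ut_snprintf() wrapper from ut0ut.h (the helper is illustrative):

	static void
	format_trx_id(char* buf, trx_id_t id)
	{
		/* buf must provide TRX_ID_MAX_LEN + 1 bytes: the
		formatted id plus the terminating NUL. */
		ut_snprintf(buf, TRX_ID_MAX_LEN + 1, TRX_ID_FMT, id);
	}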
+
+/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
+enum trx_que_t {
+ TRX_QUE_RUNNING, /*!< transaction is running */
+ TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for
+ a lock */
+ TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */
+ TRX_QUE_COMMITTING /*!< transaction is committing */
+};
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+ TRX_STATE_NOT_STARTED,
+ TRX_STATE_ACTIVE,
+ TRX_STATE_PREPARED, /* Support for 2PC/XA */
+ TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Type of data dictionary operation */
+enum trx_dict_op_t {
+ /** The transaction is not modifying the data dictionary. */
+ TRX_DICT_OP_NONE = 0,
+ /** The transaction is creating a table or an index, or
+ dropping a table. The table must be dropped in crash
+ recovery. This and TRX_DICT_OP_NONE are the only possible
+ operation modes in crash recovery. */
+ TRX_DICT_OP_TABLE = 1,
+ /** The transaction is creating or dropping an index in an
+ existing table. In crash recovery, the data dictionary
+ must be locked, but the table must not be dropped. */
+ TRX_DICT_OP_INDEX = 2
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Transaction system */
+struct trx_sys_t;
+/** Signal */
+struct trx_sig_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Array of undo numbers of undo records being rolled back or purged */
+struct trx_undo_arr_t;
+/** A cell of trx_undo_arr_t */
+struct trx_undo_inf_t;
+/** The control structure used in the purge operation */
+struct trx_purge_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Rollback contexts */
+enum trx_rb_ctx {
+ RB_NONE = 0, /*!< no rollback */
+ RB_NORMAL, /*!< normal rollback */
+ RB_RECOVERY_PURGE_REC,
+ /*!< rolling back an incomplete transaction,
+ in crash recovery, rolling back an
+ INSERT that was performed by updating a
+ delete-marked record; if the delete-marked record
+ no longer exists in an active read view, it will
+ be purged */
+ RB_RECOVERY /*!< rolling back an incomplete transaction,
+ in crash recovery */
+};
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t roll_ptr_t;
+/** Undo number */
+typedef ib_id_t undo_no_t;
+
+/** Maximum transaction identifier */
+#define TRX_ID_MAX IB_ID_MAX
+
+/** Transaction savepoint */
+struct trx_savept_t{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Transaction system header */
+typedef byte trx_sysf_t;
+/** Rollback segment header */
+typedef byte trx_rsegf_t;
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000000..61b0dabb1e6
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,604 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "page0types.h"
+#include "trx0xa.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset); /*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset); /*!< out: offset of the undo
+ entry within page */
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr); /*!< in: roll pointer */
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+ __attribute__((nonnull, pure, warn_unused_result));
+#endif /* !UNIV_HOTBACKUP */
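
A sketch of the round trip through the build/decode pair above. The packed layout (insert flag, rollback segment id, page number and page offset in one 7-byte integer) is an assumption based on the field widths InnoDB traditionally uses; the helper is illustrative:

	static void
	roll_ptr_round_trip(ulint rseg_id, ulint page_no, ulint offset)
	{
		roll_ptr_t	roll_ptr;
		ibool		is_insert;
		ulint		rseg_id2;
		ulint		page_no2;
		ulint		offset2;

		roll_ptr = trx_undo_build_roll_ptr(
			TRUE, rseg_id, page_no, offset);

		trx_undo_decode_roll_ptr(
			roll_ptr, &is_insert, &rseg_id2, &page_no2, &offset2);

		/* Decoding must return exactly what was encoded. */
		ut_ad(is_insert);
		ut_ad(rseg_id2 == rseg_id);
		ut_ad(page_no2 == page_no);
		ut_ad(offset2 == offset);
	}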
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr); /*!< in: roll ptr */
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset); /*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ bool shared, /*!< in: true=S-latch, false=X-latch */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return X-latched block if success, else NULL */
+UNIV_INTERN
+buf_block_t*
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+ __attribute__((nonnull, warn_unused_result));
+/********************************************************************//**
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
+void
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in/out: undo log memory copy */
+ mtr_t* mtr) /*!< in/out: mini-transaction which does not
+ have a latch to any undo log page or which
+ has allocated the undo log page */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_free_last_page(trx,undo,mtr) \
+ trx_undo_free_last_page_func(trx,undo,mtr)
+#else /* UNIV_DEBUG */
+# define trx_undo_free_last_page(trx,undo,mtr) \
+ trx_undo_free_last_page_func(undo,mtr)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in/out: undo log */
+ undo_no_t limit) /*!< in: all undo records with undo number
+ >= this value should be truncated */
+ __attribute__((nonnull));
+#ifdef UNIV_DEBUG
+# define trx_undo_truncate_end(trx,undo,limit) \
+ trx_undo_truncate_end_func(trx,undo,limit)
+#else /* UNIV_DEBUG */
+# define trx_undo_truncate_end(trx,undo,limit) \
+ trx_undo_truncate_end_func(undo,limit)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit); /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy.
+This function is only called when the database is started or a new
+rollback segment created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg); /*!< in: rollback segment memory object */
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if the undo log assignment was successful; possible
+error codes are DB_TOO_MANY_CONCURRENT_TRXS, DB_OUT_OF_FILE_SPACE,
+DB_READ_ONLY and DB_OUT_OF_MEMORY */
+UNIV_INTERN
+dberr_t
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+ __attribute__((nonnull, warn_unused_result));
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx); /*!< in: transaction handle */
+
+/********************************************************************//**
+At shutdown, frees the undo logs of a PREPARED transaction. */
+UNIV_INTERN
+void
+trx_undo_free_prepared(
+/*===================*/
+ trx_t* trx) /*!< in/out: PREPARED transaction */
+ UNIV_COLD __attribute__((nonnull));
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/************************************************************************//**
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo); /*!< in: the undo object to be freed */
+
+/* Types of an undo log segment */
+#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */
+#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates
+ and delete markings: in short,
+ modifies (the name 'UPDATE' is a
+ historical relic) */
+/* States of an undo log segment */
+#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active
+ transaction */
+#define TRX_UNDO_CACHED 2 /* cached for quick reuse */
+#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */
+#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be
+ reused: it can be freed in purge when
+ all undo data in it is removed */
+#define TRX_UNDO_PREPARED 5 /* contains an undo log of a
+ prepared transaction */
+
+#ifndef UNIV_HOTBACKUP
+/** Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_t{
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint type; /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ ibool del_marks; /*!< relevant only in an update undo
+ log: this is TRUE if the transaction may
+ have delete marked records, because of
+ a delete of a row or an update of an
+ indexed field; purge is then
+ necessary; also TRUE if the transaction
+ has updated an externally stored
+ field */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ ibool dict_operation; /*!< TRUE if a dict operation trx */
+ table_id_t table_id; /*!< if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ ulint space; /*!< space id where the undo log
+ placed */
+ ulint zip_size; /*!< compressed page size of space
+ in bytes, or 0 for uncompressed */
+ ulint hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ ulint hdr_offset; /*!< header offset of the undo log on
+ the page */
+ ulint last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ ulint size; /*!< current size in pages */
+ /*-----------------------------*/
+ ulint empty; /*!< TRUE if the stack of undo log
+ records is currently empty */
+ ulint top_page_no; /*!< page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ ulint top_offset; /*!< offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ undo_no_t top_undo_no; /*!< undo number of the latest record */
+ buf_block_t* guess_block; /*!< guess for the buffer block where
+ the top page might reside */
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /*!< undo log objects in the rollback
+ segment are chained into lists */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
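
A sketch of consuming these offsets with the byte-order helpers from mach0data.h (the helper is illustrative):

	static ulint
	undo_page_free_offset(const page_t* undo_page)
	{
		/* The page header starts at TRX_UNDO_PAGE_HDR; two-byte
		fields such as TRX_UNDO_PAGE_FREE are read with
		mach_read_from_2(). */
		return(mach_read_from_2(
			       undo_page
			       + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE));
	}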
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave at least enough space for one
+new undo log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4)
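
With the default UNIV_PAGE_SIZE of 16384 bytes this limit works out to 3 * 16384 / 4 = 12288 bytes, i.e. at least a quarter of the page is kept free for the new undo log header and its records.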
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */
+#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /*!< Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */
+#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the
+ transaction; defined only if the log
+ is in a history list */
+#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo
+ log: TRUE if the transaction may have
+ done delete markings of records, and
+ thus purge is necessary */
+#define	TRX_UNDO_LOG_START	18	/*!< Offset of the first undo log record
+					of this log on the header page; purge
+					may remove undo log records from the
+					log start, and therefore this is not
+					necessarily the same as the end offset
+					of this log header */
+#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes
+ X/Open XA transaction identification
+ XID */
+#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/* Note: the writing of the undo log old header is coded by a log record
+MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the
+header is logged separately. In this sense, the XID is not really a member
+of the undo log header. TODO: do not append the XID to the log header if XA
+is not needed by the user. The XID wastes about 150 bytes of space in every
+undo log. In the history list we may have millions of undo logs, which means
+quite a large overhead. */
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+ /*!< Total size of the undo log header
+ with the XA XID */
+/* @} */
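+
+/* Size arithmetic (editor's note, assuming FLST_NODE_SIZE == 12 as defined
+in fut0lst.h): TRX_UNDO_LOG_OLD_HDR_SIZE == 34 + 12 == 46 bytes, and with
+the 4 + 4 + 4 bytes of XID length fields plus XIDDATASIZE (128) bytes of
+data, TRX_UNDO_LOG_XA_HDR_SIZE == 46 + 12 + 128 == 186 bytes. */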
+
+#ifndef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic
new file mode 100644
index 00000000000..577759d6c3d
--- /dev/null
+++ b/storage/innobase/include/trx0undo.ic
@@ -0,0 +1,363 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset) /*!< in: offset of the undo entry within page */
+{
+ roll_ptr_t roll_ptr;
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ ut_ad(is_insert == 0 || is_insert == 1);
+ ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+ ut_ad(offset < 65536);
+
+ roll_ptr = (roll_ptr_t) is_insert << 55
+ | (roll_ptr_t) rseg_id << 48
+ | (roll_ptr_t) page_no << 16
+ | offset;
+ return(roll_ptr);
+}
+
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset) /*!< out: offset of the undo
+ entry within page */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ ut_ad(roll_ptr < (1ULL << 56));
+ *offset = (ulint) roll_ptr & 0xFFFF;
+ roll_ptr >>= 16;
+ *page_no = (ulint) roll_ptr & 0xFFFFFFFF;
+ roll_ptr >>= 32;
+ *rseg_id = (ulint) roll_ptr & 0x7F;
+ roll_ptr >>= 7;
+ *is_insert = (ibool) roll_ptr; /* TRUE==1 */
+}
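+
+/* Illustrative round trip (editor's sketch, hypothetical values): the
+decoder recovers exactly the fields given to the builder:
+
+	roll_ptr_t	roll_ptr = trx_undo_build_roll_ptr(TRUE, 3, 47, 520);
+	ibool		is_insert;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
+				 &page_no, &offset);
+	ut_ad(is_insert && rseg_id == 3 && page_no == 47 && offset == 520);
+*/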
+
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr) /*!< in: roll pointer */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ ut_ad(roll_ptr < (1ULL << 56));
+ return((ibool) (roll_ptr >> 55));
+}
+
+/***********************************************************************//**
+Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */
+UNIV_INLINE
+bool
+trx_undo_trx_id_is_insert(
+/*======================*/
+ const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */
+{
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error
+#endif
+ return(static_cast<bool>(trx_id[DATA_TRX_ID_LEN] >> 7));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case the size changes in some
+future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr) /*!< in: roll ptr */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ mach_write_to_7(ptr, roll_ptr);
+}
+
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case the roll ptr size changes
+in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ return(mach_read_from_7(ptr));
+}
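+
+/* Illustrative usage (editor's sketch): a roll pointer survives a write
+to and a read from its DATA_ROLL_PTR_LEN (7) byte on-page format:
+
+	byte	buf[DATA_ROLL_PTR_LEN];
+
+	trx_write_roll_ptr(buf, roll_ptr);
+	ut_ad(trx_read_roll_ptr(buf) == roll_ptr);
+*/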
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_S_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Returns the start offset of the undo log records of the specified undo
+log on the page.
+@return start offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_start(
+/*====================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ start = mach_read_from_2(offset + undo_page
+ + TRX_UNDO_LOG_START);
+ } else {
+ start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+ }
+
+ return(start);
+}
+
+/******************************************************************//**
+Returns the end offset of the undo log records of the specified undo
+log on the page.
+@return end offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_end(
+/*==================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+
+ end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (end == 0) {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+ } else {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+
+ return(end);
+}
+
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint start;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+
+ if (start + undo_page == rec) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(rec - 2));
+}
+
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint end;
+ ulint next;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ next = mach_read_from_2(rec);
+
+ if (next == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + next);
+}
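+
+/* Illustrative iteration (editor's sketch): walking all records of one
+undo log on a page, assuming the page is latched in a mini-transaction
+and page_no/offset identify the undo log header:
+
+	trx_undo_rec_t*	rec = trx_undo_page_get_first_rec(
+		undo_page, page_no, offset);
+
+	while (rec != NULL) {
+		... process rec ...
+		rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+	}
+*/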
+
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(undo_page + end - 2));
+}
+
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + start);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h
new file mode 100644
index 00000000000..7caddfb7ba4
--- /dev/null
+++ b/storage/innobase/include/trx0xa.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef XA_H
+#define XA_H
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define XIDDATASIZE 128 /*!< maximum size of a transaction
+ identifier, in bytes */
+#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */
+
+/** X/Open XA distributed transaction identifier */
+struct xid_t {
+ long formatID; /*!< format identifier; -1
+ means that the XID is null */
+ long gtrid_length; /*!< value from 1 through 64 */
+ long bqual_length; /*!< value from 1 through 64 */
+ char data[XIDDATASIZE]; /*!< distributed transaction
+ identifier */
+};
+/** X/Open XA distributed transaction identifier */
+typedef struct xid_t XID;
+#endif
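+
+/* Illustrative example (editor's sketch, hypothetical values): a minimal
+XID carrying a 3-byte global transaction id and no branch qualifier:
+
+	XID	xid;
+
+	xid.formatID = 1;
+	xid.gtrid_length = 3;
+	xid.bqual_length = 0;
+	memcpy(xid.data, "abc", 3);
+*/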
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define XA_OK 0 /*!< normal execution */
+#define XAER_ASYNC -2 /*!< asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /*!< a resource manager error
+ occurred in the transaction
+ branch */
+#define XAER_NOTA -4 /*!< the XID is not valid */
+#define XAER_INVAL -5 /*!< invalid arguments were given */
+#define XAER_PROTO -6 /*!< routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /*!< resource manager unavailable */
+#define XAER_DUPID -8 /*!< the XID already exists */
+#define XAER_OUTSIDE -9 /*!< resource manager doing
+ work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
new file mode 100644
index 00000000000..8c325ecc88c
--- /dev/null
+++ b/storage/innobase/include/univ.i
@@ -0,0 +1,667 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#ifdef UNIV_HOTBACKUP
+#include "hb_univ.i"
+#endif /* UNIV_HOTBACKUP */
+
+/* aux macros to convert M into "123" (string) if M is defined like
+#define M 123 */
+#define _IB_TO_STR(s) #s
+#define IB_TO_STR(s) _IB_TO_STR(s)
+
+#define INNODB_VERSION_MAJOR MYSQL_VERSION_MAJOR
+#define INNODB_VERSION_MINOR MYSQL_VERSION_MINOR
+#define INNODB_VERSION_BUGFIX MYSQL_VERSION_PATCH
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+it is calculated in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+Because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N. */
+#define INNODB_VERSION_SHORT \
+ (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
+
+#define INNODB_VERSION_STR \
+ IB_TO_STR(INNODB_VERSION_MAJOR) "." \
+ IB_TO_STR(INNODB_VERSION_MINOR) "." \
+ IB_TO_STR(INNODB_VERSION_BUGFIX)
+
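+/* Worked example (editor's note, hypothetical version 5.6.20):
+INNODB_VERSION_SHORT == (5 << 8 | 6) == 1286, INNODB_VERSION_STR is
+"5.6.20", and make_version_string() renders 1286 back as "5.6". */
+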
+#define REFMAN "http://dev.mysql.com/doc/refman/" \
+ IB_TO_STR(MYSQL_VERSION_MAJOR) "." \
+ IB_TO_STR(MYSQL_VERSION_MINOR) "/en/"
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__)
+# undef __WIN__
+# define __WIN__
+
+# include <windows.h>
+
+# ifdef _NT_
+# define __NT__
+# endif
+
+#else
+/* The defines used with MySQL */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
+#ifndef UNIV_HOTBACKUP
+# include <my_global.h>
+# include <my_pthread.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
+# include <sys/stat.h>
+# if !defined(__WIN__)
+# include <sys/mman.h> /* mmap() for os0proc.cc */
+# endif
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+# ifndef UNIV_HOTBACKUP
+# include "config.h"
+# endif /* UNIV_HOTBACKUP */
+# endif
+
+# ifdef HAVE_SCHED_H
+# include <sched.h>
+# endif
+
+/* We only try to do explicit inlining of functions with gcc and
+Sun Studio */
+
+# ifdef HAVE_PREAD
+# define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+#ifndef __WIN__
+#define __STDC_FORMAT_MACROS /* Enable C99 printf format macros */
+#include <inttypes.h>
+#endif /* !__WIN__ */
+
+/* The following defines enable performance schema
+instrumentation in each of four InnoDB modules if
+HAVE_PSI_INTERFACE is defined. */
+#if defined HAVE_PSI_INTERFACE && !defined UNIV_HOTBACKUP
+# define UNIV_PFS_MUTEX
+# define UNIV_PFS_RWLOCK
+/* For I/O instrumentation, the performance schema relies
+on a native descriptor to identify the file; this
+descriptor could conflict with our OS-level descriptor.
+Disable I/O instrumentation on Windows until this is
+resolved. */
+# ifndef __WIN__
+# define UNIV_PFS_IO
+# endif
+# define UNIV_PFS_THREAD
+
+/* There are mutexes/rwlocks that we want to exclude from
+instrumentation even if their corresponding performance schema
+define is set. And this PFS_NOT_INSTRUMENTED is used
+as the key value to identify those objects that would
+be excluded from instrumentation. */
+# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
+
+# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
+
+#endif /* HAVE_PSI_INTERFACE */
+
+#ifdef __WIN__
+# define YY_NO_UNISTD_H 1
+#endif /* __WIN__ */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* When this macro is defined, additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have a "test_" prefix. They are not called from anywhere in the
+code, but they can be called from gdb with the 'call' command after
+innobase_start_or_create_for_mysql() has executed. Not tested on
+Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if defined HAVE_VALGRIND
+# define UNIV_DEBUG_VALGRIND
+#endif /* HAVE_VALGRIND */
+#if 0
+#define UNIV_DEBUG_VALGRIND /* Enable extra
+ Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG /* Enable buffer pool
+ debugging without UNIV_DEBUG */
+#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG /* Enable ut_ad() assertions
+ and disable UNIV_INLINE */
+#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
+ ut_ad(lock_rec_validate_page())
+ assertions. */
+#define UNIV_DEBUG_FILE_ACCESSES /* Enable freed block access
+ debugging without UNIV_DEBUG */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */
+#define UNIV_LOG_LSN_DEBUG /* write LSN to the redo log;
+this will break redo log file compatibility, but it may be useful when
+debugging redo log application problems. */
+#define UNIV_MEM_DEBUG /* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_BLOB_DEBUG /* track BLOB ownership;
+assumes that no BLOBs survive server restart */
+#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_PERF_DEBUG /* debug flag that enables
+ light weight performance
+ related stuff. */
+#define UNIV_SYNC_DEBUG /* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG /* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT /* operation counts for
+ rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
+ in sync0sync.cc */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#define UNIV_AIO_DEBUG /* prints info about
+ submitted and reaped AIO
+ requests to the log. */
+#define UNIV_STATS_DEBUG /* prints various stats
+ related debug info from
+ dict0stats.c */
+#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging
+ info output */
+#endif
+
+#define UNIV_BTR_DEBUG /* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+			/* the above option prevents forcing the log to disk
+			at a buffer page write: it should be tested with this
+			option off; also some ibuf tests are suppressed */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL. We disable the
+GCC visibility directive on all Sun operating systems because there is no
+easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+#if defined(INNODB_COMPILER_HINTS) \
+ && defined __GNUC__ \
+ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+/** Starting with GCC 4.3, the "cold" attribute informs the
+compiler that a function is unlikely to be executed. The function is
+optimized for size rather than speed, and on many targets it is placed
+into a special subsection of the text section, so all cold functions
+appear close together, improving the code locality of the non-cold parts
+of the program. The paths leading to calls of cold functions within the
+code are marked as unlikely by the branch prediction mechanism; in short,
+this optimizes a rarely invoked function for size instead of speed. */
+# define UNIV_COLD __attribute__((cold))
+#else
+# define UNIV_COLD /* empty */
+#endif
+
+#ifndef UNIV_MUST_NOT_INLINE
+/* Definition for inline version */
+
+#define UNIV_INLINE static inline
+
+#else /* !UNIV_MUST_NOT_INLINE */
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE UNIV_INTERN
+
+#endif /* !UNIV_MUST_NOT_INLINE */
+
+/* Note: _WIN32 is defined on 64-bit Windows as well, so _WIN64 must be
+tested first. */
+#ifdef _WIN64
+#define UNIV_WORD_SIZE 8
+#elif defined(_WIN32)
+#define UNIV_WORD_SIZE 4
+#else
+/** MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+#define UNIV_WORD_SIZE SIZEOF_LONG
+#endif
+
+/** The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/** The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/** There are currently two InnoDB file formats which are used to group
+features with similar restrictions and dependencies. Using an enum allows
+switch statements to give a compiler warning when a new one is introduced. */
+enum innodb_file_formats_enum {
+ /** Antelope File Format: InnoDB/MySQL up to 5.1.
+ This format includes REDUNDANT and COMPACT row formats */
+ UNIV_FORMAT_A = 0,
+
+ /** Barracuda File Format: Introduced in InnoDB plugin for 5.1:
+ This format includes COMPRESSED and DYNAMIC row formats. It
+ includes the ability to create secondary indexes from data that
+ is not on the clustered index page and the ability to store more
+ data off the clustered index page. */
+ UNIV_FORMAT_B = 1
+};
+
+typedef enum innodb_file_formats_enum innodb_file_formats_t;
+
+/** Minimum supported file format */
+#define UNIV_FORMAT_MIN UNIV_FORMAT_A
+
+/** Maximum supported file format */
+#define UNIV_FORMAT_MAX UNIV_FORMAT_B
+
+/** The 2-logarithm of UNIV_PAGE_SIZE: */
+#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift
+
+/** The universal page size of the database */
+#define UNIV_PAGE_SIZE ((ulint) srv_page_size)
+
+/** log2 of smallest compressed page size (1<<10 == 1024 bytes)
+Note: This must never change! */
+#define UNIV_ZIP_SIZE_SHIFT_MIN 10
+
+/** log2 of largest compressed page size (1<<14 == 16384 bytes).
+A compressed page directory entry reserves 14 bits for the start offset
+and 2 bits for flags. This limits the uncompressed page size to 16k.
+Even though a 16k uncompressed page can theoretically be compressed
+into a larger compressed page, it is not a useful feature so we will
+limit both with this same constant. */
+#define UNIV_ZIP_SIZE_SHIFT_MAX 14
+
+/* Define the Min, Max, Default page sizes. */
+/** Minimum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MIN 12
+/** Maximum Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_MAX 14
+/** Default Page Size Shift (power of 2) */
+#define UNIV_PAGE_SIZE_SHIFT_DEF 14
+/** Original 16k InnoDB Page Size Shift, in case the default changes */
+#define UNIV_PAGE_SIZE_SHIFT_ORIG 14
+
+/** Minimum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MIN (1 << UNIV_PAGE_SIZE_SHIFT_MIN)
+/** Maximum page size InnoDB currently supports. */
+#define UNIV_PAGE_SIZE_MAX (1 << UNIV_PAGE_SIZE_SHIFT_MAX)
+/** Default page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_DEF (1 << UNIV_PAGE_SIZE_SHIFT_DEF)
+/** Original 16k page size for InnoDB tablespaces. */
+#define UNIV_PAGE_SIZE_ORIG (1 << UNIV_PAGE_SIZE_SHIFT_ORIG)
+
+/** Smallest compressed page size */
+#define UNIV_ZIP_SIZE_MIN (1 << UNIV_ZIP_SIZE_SHIFT_MIN)
+
+/** Largest compressed page size */
+#define UNIV_ZIP_SIZE_MAX (1 << UNIV_ZIP_SIZE_SHIFT_MAX)
+
+/** Number of supported page sizes (The convention 'ssize' is used
+for 'log2 minus 9' or the number of shifts starting with 512.)
+This number varies depending on UNIV_PAGE_SIZE. */
+#define UNIV_PAGE_SSIZE_MAX \
+ (UNIV_PAGE_SIZE_SHIFT - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
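+
+/* Worked example (editor's note): with the default 16k page size,
+UNIV_PAGE_SIZE_SHIFT == 14 and UNIV_PAGE_SSIZE_MAX == 14 - 10 + 1 == 5,
+i.e. the five supported compressed sizes 1k, 2k, 4k, 8k and 16k. */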
+
+/** Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/** This is the "mbmaxlen" for my_charset_filename (defined in
+strings/ctype-utf8.c), which is used to encode File and Database names. */
+#define FILENAME_CHARSET_MAXNAMLEN 5
+
+/** The maximum length of an encoded table name in bytes. The max
+table and database names are NAME_CHAR_LEN (64) characters. After the
+encoding, the max length would be NAME_CHAR_LEN (64) *
+FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a
+terminating '\0'. InnoDB can handle longer names internally. */
+#define MAX_TABLE_NAME_LEN 320
+
+/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN, this is
+MySQL's NAME_LEN; see check_and_convert_db_name(). */
+#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN
+
+/** MAX_FULL_NAME_LEN defines the length of the full name path, including
+the database name and table name. In addition, 14 bytes are added for:
+ 2 for surrounding quotes around table name
+ 1 for the separating dot (.)
+ 9 for the #mysql50# prefix */
+#define MAX_FULL_NAME_LEN \
+ (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14)
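+/* Worked example (editor's note): 320 + 320 + 14 == 654 bytes. */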
+
+/** The maximum length in bytes that a database name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_DB_UTF8_LEN (NAME_LEN + 1)
+
+/** The maximum length in bytes that a table name can occupy when stored in
+UTF8, including the terminating '\0', see dict_fs2utf8(). You must include
+mysql_com.h if you are to use this macro. */
+#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix))
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte unsigned char
+
+/* Another basic type we use is the unsigned long integer, which should be
+equal to the word size of the machine: 32 bits on a 32-bit platform and
+64 bits on a 64-bit platform. We also give the printf format for the type
+as a macro ULINTPF. */
+
+
+#ifdef __WIN__
+/* Use the integer types and formatting strings defined in Visual Studio. */
+# define UINT32PF "%I32u"
+# define INT64PF "%I64d"
+# define UINT64PF "%I64u"
+# define UINT64PFx "%016I64x"
+# define DBUG_LSN_PF "%llu"
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+typedef unsigned __int32 ib_uint32_t;
+#else
+/* Use the integer types and formatting strings defined in the C99 standard. */
+# define UINT32PF "%" PRIu32
+# define INT64PF "%" PRId64
+# define UINT64PF "%" PRIu64
+# define UINT64PFx "%016" PRIx64
+# define DBUG_LSN_PF UINT64PF
+typedef int64_t ib_int64_t;
+typedef uint64_t ib_uint64_t;
+typedef uint32_t ib_uint32_t;
+# endif /* __WIN__ */
+
+# define IB_ID_FMT UINT64PF
+
+#ifdef _WIN64
+typedef unsigned __int64 ulint;
+typedef __int64 lint;
+# define ULINTPF UINT64PF
+#else
+typedef unsigned long int ulint;
+typedef long int lint;
+# define ULINTPF "%lu"
+#endif /* _WIN64 */
+
+#ifndef UNIV_HOTBACKUP
+typedef unsigned long long int ullint;
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef __WIN__
+#if SIZEOF_LONG != SIZEOF_VOIDP
+#error "Error: InnoDB's ulint must be of the same size as void*"
+#endif
+#endif
+
+/** The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+#define ULONG_UNDEFINED ((ulong)(-1))
+
+/** The 'undefined' value for a ib_uint64_t */
+#define UINT64_UNDEFINED ((ib_uint64_t)(-1))
+
+/** The bitmask of 32-bit unsigned integer */
+#define ULINT32_MASK 0xFFFFFFFF
+/** The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED ULINT32_MASK
+
+/** Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/** Maximum value for ib_uint64_t */
+#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL))
+
+/** The generic InnoDB system object identifier data type */
+typedef ib_uint64_t ib_id_t;
+#define IB_ID_MAX IB_UINT64_MAX
+
+/** The 'undefined' value for a ullint */
+#define ULLINT_UNDEFINED ((ullint)(-1))
+
+/** This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+#define UNIV_NOTHROW
+
+/** The following number as the length of a logical field means that the field
+has the SQL NULL as its value. NOTE that because we assume that the length
+of a field is a 32-bit integer when we store it, for example, to an undo log
+on disk, this number must also fit in 32 bits, even on 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/** Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX)
+
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+#define HAVE_GCC_GT_2
+/* Tell the compiler that a variable/function is unused. */
+# define UNIV_UNUSED __attribute__ ((unused))
+#else
+# define UNIV_UNUSED
+#endif /* CHECK FOR GCC VER_GT_2 */
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(INNODB_COMPILER_HINTS) && defined(HAVE_GCC_GT_2)
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \
+ || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590)
+
+# include <sun_prefetch.h>
+
+#if __SUNPRO_C >= 0x550
+# undef UNIV_INTERN
+# define UNIV_INTERN __hidden
+#endif /* __SUNPRO_C >= 0x550 */
+
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+
+# if defined(INNODB_COMPILER_HINTS)
+//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+# else
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif /* INNODB_COMPILER_HINTS */
+
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
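+
+/* Illustrative usage (editor's sketch): annotating a rare error path so
+the compiler favors the common case, assuming ptr is usually non-NULL:
+
+	if (UNIV_UNLIKELY(ptr == NULL)) {
+		return(DB_OUT_OF_MEMORY);
+	}
+*/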
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/* The return type from a thread's start function differs between Unix and
+Windows, so define a typedef for it and a macro to use at the end of such
+functions. */
+
+#ifdef __WIN__
+typedef ulint os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(0)
+#else
+typedef void* os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(NULL)
+#endif
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+#ifdef UNIV_DEBUG_VALGRIND
+# include <valgrind/memcheck.h>
+# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size)
+# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
+# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_DESC(addr, size) VALGRIND_CREATE_BLOCK(addr, size, #addr)
+# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
+# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) { \
+ fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ if (should_abort) { \
+ ut_error; \
+ } \
+ } \
+} while (0)
+# define UNIV_MEM_ASSERT_RW(addr, size) \
+ UNIV_MEM_ASSERT_RW_LOW(addr, size, false)
+# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) \
+ UNIV_MEM_ASSERT_RW_LOW(addr, size, true)
+# define UNIV_MEM_ASSERT_W(addr, size) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) \
+ fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ } while (0)
+# define UNIV_MEM_TRASH(addr, c, size) do { \
+ ut_d(memset(addr, c, size)); \
+ UNIV_MEM_INVALID(addr, size); \
+ } while (0)
+#else
+# define UNIV_MEM_VALID(addr, size) do {} while(0)
+# define UNIV_MEM_INVALID(addr, size) do {} while(0)
+# define UNIV_MEM_FREE(addr, size) do {} while(0)
+# define UNIV_MEM_ALLOC(addr, size) do {} while(0)
+# define UNIV_MEM_DESC(addr, size) do {} while(0)
+# define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW_LOW(addr, size, should_abort) do {} while(0)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_RW_ABORT(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+# define UNIV_MEM_TRASH(addr, c, size) do {} while(0)
+#endif
+#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_FREE(addr, size); \
+} while (0)
+#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_ALLOC(addr, size); \
+} while (0)
+
+extern ulong srv_page_size_shift;
+extern ulong srv_page_size;
+
+#endif
diff --git a/storage/innobase/include/usr0sess.h b/storage/innobase/include/usr0sess.h
new file mode 100644
index 00000000000..b5c80b97b43
--- /dev/null
+++ b/storage/innobase/include/usr0sess.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.h
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0sess_h
+#define usr0sess_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0rec.h"
+
+/*********************************************************************//**
+Opens a session.
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void);
+/*============*/
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+ sess_t* sess); /* in, own: session object */
+
+/* The session handle. This data structure is only used by purge and is
+not really necessary. We should get rid of it. */
+struct sess_t{
+ ulint state; /*!< state of the session */
+ trx_t* trx; /*!< transaction object permanently
+ assigned for the session: the
+ transaction instance designated by the
+ trx id changes, but the memory
+ structure is preserved */
+ UT_LIST_BASE_NODE_T(que_t)
+ graphs; /*!< query graphs belonging to this
+ session */
+};
+
+/* Session states */
+#define SESS_ACTIVE 1
+#define SESS_ERROR 2 /* session contains an error message
+ which has not yet been communicated
+ to the client */
+#ifndef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/usr0sess.ic b/storage/innobase/include/usr0sess.ic
new file mode 100644
index 00000000000..284e59537fe
--- /dev/null
+++ b/storage/innobase/include/usr0sess.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.ic
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/innobase/include/usr0types.h b/storage/innobase/include/usr0types.h
new file mode 100644
index 00000000000..6ba937cacc8
--- /dev/null
+++ b/storage/innobase/include/usr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0types.h
+Users and sessions global types
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+struct sess_t;
+
+#endif
diff --git a/storage/innobase/include/ut0bh.h b/storage/innobase/include/ut0bh.h
new file mode 100644
index 00000000000..1085736c7ab
--- /dev/null
+++ b/storage/innobase/include/ut0bh.h
@@ -0,0 +1,152 @@
+/***************************************************************************//**
+
+Copyright (c) 2011, 2013, Oracle Corpn. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0bh.h
+Binary min-heap interface.
+
+Created 2010-05-28 by Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0BH_H
+#define INNOBASE_UT0BH_H
+
+#include "univ.i"
+
+/** Comparison function for objects in the binary heap. */
+typedef int (*ib_bh_cmp_t)(const void* p1, const void* p2);
+
+struct ib_bh_t;
+
+/**********************************************************************//**
+Get the number of elements in the binary heap.
+@return number of elements */
+UNIV_INLINE
+ulint
+ib_bh_size(
+/*=======*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Test if binary heap is empty.
+@return TRUE if empty. */
+UNIV_INLINE
+ibool
+ib_bh_is_empty(
+/*===========*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Test if binary heap is full.
+@return TRUE if full. */
+UNIV_INLINE
+ibool
+ib_bh_is_full(
+/*===========*/
+ const ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Get a pointer to the element.
+@return pointer to element */
+UNIV_INLINE
+void*
+ib_bh_get(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in: instance */
+ ulint i); /*!< in: index */
+
+/**********************************************************************//**
+Copy an element to the binary heap.
+@return pointer to copied element */
+UNIV_INLINE
+void*
+ib_bh_set(
+/*======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ ulint i, /*!< in: index */
+ const void* elem); /*!< in: element to add */
+
+/**********************************************************************//**
+Return the first element from the binary heap.
+@return pointer to first element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_first(
+/*========*/
+ ib_bh_t* ib_bh); /*!< in: instance */
+
+/**********************************************************************//**
+Return the last element from the binary heap.
+@return pointer to last element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_last(
+/*========*/
+ ib_bh_t* ib_bh); /*!< in/out: instance */
+
+/**********************************************************************//**
+Create a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+ib_bh_t*
+ib_bh_create(
+/*=========*/
+ ib_bh_cmp_t compare, /*!< in: comparator */
+ ulint sizeof_elem, /*!< in: size of one element */
+ ulint max_elems); /*!< in: max elements allowed */
+
+/**********************************************************************//**
+Free a binary heap. */
+UNIV_INTERN
+void
+ib_bh_free(
+/*=======*/
+ ib_bh_t* ib_bh); /*!< in,own: instance */
+
+/**********************************************************************//**
+Add an element to the binary heap. Note: The element is copied.
+@return pointer to added element or NULL if full. */
+UNIV_INTERN
+void*
+ib_bh_push(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ const void* elem); /*!< in: element to add */
+
+/**********************************************************************//**
+Remove the first element from the binary heap. */
+UNIV_INTERN
+void
+ib_bh_pop(
+/*======*/
+ ib_bh_t* ib_bh); /*!< in/out: instance */
+
+/** Binary heap data structure */
+struct ib_bh_t {
+ ulint max_elems; /*!< max elements allowed */
+ ulint n_elems; /*!< current size */
+ ulint sizeof_elem; /*!< sizeof element */
+ ib_bh_cmp_t compare; /*!< comparator */
+};
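+
+/* Illustrative usage (editor's sketch, hypothetical comparator): a
+min-heap of ints; ib_bh_push() copies the element into the heap and
+ib_bh_first() returns a pointer to the smallest one:
+
+	static int
+	int_cmp(const void* p1, const void* p2)
+	{
+		return(*(const int*) p1 - *(const int*) p2);
+	}
+
+	ib_bh_t*	ib_bh = ib_bh_create(int_cmp, sizeof(int), 16);
+	int		v = 42;
+
+	ib_bh_push(ib_bh, &v);
+	ut_ad(*(int*) ib_bh_first(ib_bh) == 42);
+	ib_bh_pop(ib_bh);
+	ib_bh_free(ib_bh);
+*/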
+
+#ifndef UNIV_NONINL
+#include "ut0bh.ic"
+#endif
+
+#endif /* INNOBASE_UT0BH_H */
diff --git a/storage/innobase/include/ut0bh.ic b/storage/innobase/include/ut0bh.ic
new file mode 100644
index 00000000000..b11de5b8b3e
--- /dev/null
+++ b/storage/innobase/include/ut0bh.ic
@@ -0,0 +1,125 @@
+/***************************************************************************//**
+
+Copyright (c) 2011, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0bh.ic
+Binary min-heap implementation.
+
+Created 2011-01-15 by Sunny Bains
+*******************************************************/
+
+#include "ut0bh.h"
+#include "ut0mem.h" /* For ut_memcpy() */
+
+/**********************************************************************//**
+Get the number of elements in the binary heap.
+@return number of elements */
+UNIV_INLINE
+ulint
+ib_bh_size(
+/*=======*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh->n_elems);
+}
+
+/**********************************************************************//**
+Test if binary heap is empty.
+@return TRUE if empty. */
+UNIV_INLINE
+ibool
+ib_bh_is_empty(
+/*===========*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_size(ib_bh) == 0);
+}
+
+/**********************************************************************//**
+Test if binary heap is full.
+@return TRUE if full. */
+UNIV_INLINE
+ibool
+ib_bh_is_full(
+/*===========*/
+ const ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_size(ib_bh) >= ib_bh->max_elems);
+}
+
+/**********************************************************************//**
+Get a pointer to the element.
+@return pointer to element */
+UNIV_INLINE
+void*
+ib_bh_get(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in: instance */
+ ulint i) /*!< in: index */
+{
+ byte* ptr = (byte*) (ib_bh + 1);
+
+ ut_a(i < ib_bh_size(ib_bh));
+
+ return(ptr + (ib_bh->sizeof_elem * i));
+}
+
+/**********************************************************************//**
+Copy an element to the binary heap.
+@return pointer to copied element */
+UNIV_INLINE
+void*
+ib_bh_set(
+/*======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ ulint i, /*!< in: index */
+ const void* elem) /*!< in: element to add */
+{
+ void* ptr = ib_bh_get(ib_bh, i);
+
+ ut_memcpy(ptr, elem, ib_bh->sizeof_elem);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Return the first element from the binary heap.
+@return pointer to first element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_first(
+/*========*/
+ ib_bh_t* ib_bh) /*!< in: instance */
+{
+ return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0));
+}
+
+/**********************************************************************//**
+Return the last element from the binary heap.
+@return pointer to last element or NULL if empty. */
+UNIV_INLINE
+void*
+ib_bh_last(
+/*========*/
+ ib_bh_t* ib_bh) /*!< in/out: instance */
+{
+ return(ib_bh_is_empty(ib_bh)
+ ? NULL
+ : ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1));
+}
+
diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h
new file mode 100644
index 00000000000..5bdd553ca80
--- /dev/null
+++ b/storage/innobase/include/ut0byte.h
@@ -0,0 +1,119 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+
+#include "univ.i"
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+ __attribute__((const));
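+/* Worked example (editor's note): ut_ull_create(1, 0) == 1ULL << 32 and
+ut_ull_create(0, 5) == 5. */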
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no); /*!< in: align by this number */
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n); /*!< in: nth bit requested */
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val); /*!< in: value for the bit to set */
+
+#ifndef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0byte.ic b/storage/innobase/include/ut0byte.ic
new file mode 100644
index 00000000000..873d98c727e
--- /dev/null
+++ b/storage/innobase/include/ut0byte.ic
@@ -0,0 +1,173 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Creates a 64-bit integer out of two 32-bit integers.
+@return created integer */
+UNIV_INLINE
+ib_uint64_t
+ut_ull_create(
+/*==========*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+{
+ ut_ad(high <= ULINT32_MASK);
+ ut_ad(low <= ULINT32_MASK);
+ return(((ib_uint64_t) high) << 32 | low);
+}
+
+/********************************************************//**
+Rounds a 64-bit integer downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ib_uint64_t align_1 = (ib_uint64_t) align_no - 1;
+
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return((n + align_1) & ~align_1);
+}
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint) ptr) + align_no - 1) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint) ptr)) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return(((ulint) ptr) & (align_no - 1));
+}
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n) /*!< in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ return(1 & (a >> n));
+}
+
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val) /*!< in: value for the bit to set */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ if (val) {
+ return(((ulint) 1 << n) | a);
+ } else {
+ return(~((ulint) 1 << n) & a);
+ }
+}
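+
+/* Usage sketch (illustration only; "rec" is a hypothetical pointer
+into a 16 KiB page):
+
+	byte*	frame = (byte*) ut_align_down(rec, 16384);	// page start
+	ulint	offs = ut_align_offset(rec, 16384);		// rec - frame
+
+	ulint	flags = 0;
+	flags = ut_bit_set_nth(flags, 3, TRUE);	// flags == 8
+	ut_a(ut_bit_get_nth(flags, 3));
+*/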
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
new file mode 100644
index 00000000000..fe0f36dfff2
--- /dev/null
+++ b/storage/innobase/include/ut0counter.h
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0counter.h
+
+Counter utility class
+
+Created 2012/04/12 by Sunny Bains
+*******************************************************/
+
+#ifndef UT0COUNTER_H
+#define UT0COUNTER_H
+
+#include "univ.i"
+#include <string.h>
+#include "os0thread.h"
+
+/** CPU cache line size */
+#define CACHE_LINE_SIZE 64
+
+/** Default number of slots to use in ib_counter_t */
+#define IB_N_SLOTS 64
+
+/** Get the offset into the counter array. */
+template <typename Type, int N>
+struct generic_indexer_t {
+ /** Default constructor/destructor should be OK. */
+
+ /** @return offset within m_counter */
+ size_t offset(size_t index) const UNIV_NOTHROW {
+ return(((index % N) + 1) * (CACHE_LINE_SIZE / sizeof(Type)));
+ }
+};
+
+#ifdef HAVE_SCHED_GETCPU
+#include <utmpx.h>
+/** Use the cpu id to index into the counter array. If it fails then
+use the thread id. */
+template <typename Type, int N>
+struct get_sched_indexer_t : public generic_indexer_t<Type, N> {
+ /** Default constructor/destructor should be OK. */
+
+	/** @return result from sched_getcpu(), or the thread id if it fails. */
+ size_t get_rnd_index() const UNIV_NOTHROW {
+
+ size_t cpu = sched_getcpu();
+ if (cpu == -1) {
+ cpu = (lint) os_thread_get_curr_id();
+ }
+
+ return(cpu);
+ }
+};
+#endif /* HAVE_SCHED_GETCPU */
+
+/** Use the thread id to index into the counter array. */
+template <typename Type, int N>
+struct thread_id_indexer_t : public generic_indexer_t<Type, N> {
+	/** Default constructor/destructor should be OK. */
+
+	/** @return a random number; currently we use the thread id. Where
+	the thread id is represented as a pointer, it may not distribute as
+	effectively. */
+ size_t get_rnd_index() const UNIV_NOTHROW {
+ return((lint) os_thread_get_curr_id());
+ }
+};
+
+/** For counters where N=1 */
+template <typename Type, int N=1>
+struct single_indexer_t {
+	/** Default constructor/destructor should be OK. */
+
+ /** @return offset within m_counter */
+ size_t offset(size_t index) const UNIV_NOTHROW {
+ ut_ad(N == 1);
+ return((CACHE_LINE_SIZE / sizeof(Type)));
+ }
+
+	/** @return 1 */
+ size_t get_rnd_index() const UNIV_NOTHROW {
+ ut_ad(N == 1);
+ return(1);
+ }
+};
+
+/** Class for using fuzzy counters. The counter is not protected by any
+mutex and the results are not guaranteed to be 100% accurate but close
+enough. Creates an array of counters and separates each element by the
+CACHE_LINE_SIZE bytes */
+template <
+ typename Type,
+ int N = IB_N_SLOTS,
+ template<typename, int> class Indexer = thread_id_indexer_t>
+class ib_counter_t {
+public:
+ ib_counter_t() { memset(m_counter, 0x0, sizeof(m_counter)); }
+
+ ~ib_counter_t()
+ {
+ ut_ad(validate());
+ }
+
+ bool validate() UNIV_NOTHROW {
+#ifdef UNIV_DEBUG
+ size_t n = (CACHE_LINE_SIZE / sizeof(Type));
+
+ /* Check that we aren't writing outside our defined bounds. */
+ for (size_t i = 0; i < UT_ARR_SIZE(m_counter); i += n) {
+ for (size_t j = 1; j < n - 1; ++j) {
+ ut_ad(m_counter[i + j] == 0);
+ }
+ }
+#endif /* UNIV_DEBUG */
+ return(true);
+ }
+
+	/** Increment by 1 when no good slot index is available. */
+ void inc() UNIV_NOTHROW { add(1); }
+
+	/** Add to the counter when no good slot index is available.
+	@param n	amount to add */
+ void add(Type n) UNIV_NOTHROW {
+ size_t i = m_policy.offset(m_policy.get_rnd_index());
+
+ ut_ad(i < UT_ARR_SIZE(m_counter));
+
+ m_counter[i] += n;
+ }
+
+	/** Use this if you can use a unique identifier; saves a
+	call to get_rnd_index().
+	@param index	index into a slot
+	@param n	amount to add */
+ void add(size_t index, Type n) UNIV_NOTHROW {
+ size_t i = m_policy.offset(index);
+
+ ut_ad(i < UT_ARR_SIZE(m_counter));
+
+ m_counter[i] += n;
+ }
+
+	/** Decrement by 1 when no good slot index is available. */
+ void dec() UNIV_NOTHROW { sub(1); }
+
+	/** Subtract from the counter when no good slot index is available.
+	@param n	amount to subtract */
+ void sub(Type n) UNIV_NOTHROW {
+ size_t i = m_policy.offset(m_policy.get_rnd_index());
+
+ ut_ad(i < UT_ARR_SIZE(m_counter));
+
+ m_counter[i] -= n;
+ }
+
+	/** Use this if you can use a unique identifier; saves a
+	call to get_rnd_index().
+	@param index	index into a slot
+	@param n	amount to subtract */
+ void sub(size_t index, Type n) UNIV_NOTHROW {
+ size_t i = m_policy.offset(index);
+
+ ut_ad(i < UT_ARR_SIZE(m_counter));
+
+ m_counter[i] -= n;
+ }
+
+	/** @return total value; not 100% accurate, since the read is not atomic. */
+ operator Type() const UNIV_NOTHROW {
+ Type total = 0;
+
+ for (size_t i = 0; i < N; ++i) {
+ total += m_counter[m_policy.offset(i)];
+ }
+
+ return(total);
+ }
+
+private:
+ /** Indexer into the array */
+	Indexer<Type, N>	m_policy;
+
+ /** Slot 0 is unused. */
+ Type m_counter[(N + 1) * (CACHE_LINE_SIZE / sizeof(Type))];
+};
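+
+/* Usage sketch (illustration only): each slot sits on its own cache
+line, so concurrent threads rarely contend on the same line.
+
+	ib_counter_t<ulint, IB_N_SLOTS>	n_rows_read;
+
+	n_rows_read.inc();		// slot picked by thread id
+	n_rows_read.add(10);		// same slot selection, adding 10
+	ulint	total = n_rows_read;	// sums all slots; approximate
+*/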
+
+#endif /* UT0COUNTER_H */
diff --git a/storage/innobase/include/ut0crc32.h b/storage/innobase/include/ut0crc32.h
new file mode 100644
index 00000000000..86217692764
--- /dev/null
+++ b/storage/innobase/include/ut0crc32.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ut0crc32.h
+CRC32 implementation
+
+Created Aug 10, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef ut0crc32_h
+#define ut0crc32_h
+
+#include "univ.i"
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not allocate any
+memory; calling it twice is harmless but pointless. */
+UNIV_INTERN
+void
+ut_crc32_init();
+/*===========*/
+
+/********************************************************************//**
+Calculates CRC32.
+@param ptr - data over which to calculate CRC32.
+@param len - data length in bytes.
+@return CRC32 (CRC-32C, using the GF(2) primitive polynomial 0x11EDC6F41,
+or 0x1EDC6F41 without the high-order bit) */
+typedef ib_uint32_t (*ib_ut_crc32_t)(const byte* ptr, ulint len);
+
+extern ib_ut_crc32_t ut_crc32;
+
+extern bool ut_crc32_sse2_enabled;
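+
+/* Usage sketch (illustration only; "buf" and "len" are hypothetical):
+
+	ut_crc32_init();			// once, at startup
+	ib_uint32_t	c = ut_crc32(buf, len);	// hardware CRC-32C when
+						// ut_crc32_sse2_enabled
+*/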
+
+#endif /* ut0crc32_h */
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h
new file mode 100644
index 00000000000..6a4afe99597
--- /dev/null
+++ b/storage/innobase/include/ut0dbg.h
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#ifdef UNIV_INNOCHECKSUM
+#define ut_a assert
+#define ut_ad assert
+#define ut_error assert(0)
+#else /* !UNIV_INNOCHECKSUM */
+
+#include "univ.i"
+#include <stdlib.h>
+#include "os0thread.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR)))
+#else
+/** This is used to eliminate compiler warnings */
+extern ulint ut_dbg_zero;
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero)
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion */
+ const char* file, /*!< in: source file containing the assertion */
+ ulint line) /*!< in: line number of the assertion */
+ UNIV_COLD __attribute__((nonnull(2)));
+
+/** Abort the execution. */
+# define UT_DBG_PANIC abort()
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do { \
+ if (UT_DBG_FAIL(EXPR)) { \
+ ut_dbg_assertion_failed(#EXPR, \
+ __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+ } \
+} while (0)
+
+/** Abort execution. */
+#define ut_error do { \
+ ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+} while (0)
+
+#ifdef UNIV_DEBUG
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR) ut_a(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR) do {EXPR;} while (0)
+#else
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
+
+/** Silence warnings about an unused variable by doing a null assignment.
+@param A the unused variable */
+#define UT_NOT_USED(A) A = A
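+
+/* Usage sketch (illustration only; "n", "size" and "checks" are
+hypothetical):
+
+	ut_a(n < size);		// checked in all builds; aborts on failure
+	ut_ad(n < size);	// checked only when UNIV_DEBUG is defined
+	ut_d(checks++);		// statement compiled only under UNIV_DEBUG
+*/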
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** structure used for recording usage statistics */
+struct speedo_t {
+ struct rusage ru; /*!< getrusage() result */
+ struct timeval tv; /*!< gettimeofday() result */
+};
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+ speedo_t* speedo); /*!< out: speedo */
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+ const speedo_t* speedo); /*!< in: speedo */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h
new file mode 100644
index 00000000000..29fc8669ce4
--- /dev/null
+++ b/storage/innobase/include/ut0list.h
@@ -0,0 +1,180 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list and you
+can't embed the list pointers in the data, when a data item needs to be
+stored in multiple lists, and so on.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+struct ib_list_t;
+struct ib_list_node_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node); /*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list); /*!< in: list */
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+	/* out: TRUE if empty, else FALSE */
+ const ib_list_t* list); /* in: list */
+
+/* List. */
+struct ib_list_t {
+ ib_list_node_t* first; /*!< first node */
+ ib_list_node_t* last; /*!< last node */
+ ibool is_heap_list; /*!< TRUE if this list was
+ allocated through a heap */
+};
+
+/* A list node. */
+struct ib_list_node_t {
+ ib_list_node_t* prev; /*!< previous node */
+ ib_list_node_t* next; /*!< next node */
+ void* data; /*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_t {
+ mem_heap_t* heap; /*!< memory heap */
+ void* data; /*!< user data */
+};
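+
+/* Usage sketch (illustration only; "item" is a hypothetical pointer,
+typically allocated from "heap" as described in the note above):
+
+	mem_heap_t*	heap = mem_heap_create(256);
+	ib_list_t*	list = ib_list_create();
+
+	ib_list_add_last(list, item, heap);
+
+	ib_list_free(list);	// the list nodes themselves live in "heap"
+	mem_heap_free(heap);
+*/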
+
+#ifndef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic
new file mode 100644
index 00000000000..d9dcb2eac99
--- /dev/null
+++ b/storage/innobase/include/ut0list.ic
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->first);
+}
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->last);
+}
+
+/********************************************************************
+Check if list is empty. */
+UNIV_INLINE
+ibool
+ib_list_is_empty(
+/*=============*/
+ /* out: TRUE if empty else FALSE */
+ const ib_list_t* list) /* in: list */
+{
+ return(!(list->first || list->last));
+}
diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h
new file mode 100644
index 00000000000..b53e7ade4c1
--- /dev/null
+++ b/storage/innobase/include/ut0lst.h
@@ -0,0 +1,408 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/*******************************************************************//**
+Return offset of F in POD T.
+@param T - POD pointer
+@param F - Field in T */
+#define IB_OFFSETOF(T, F) \
+ (reinterpret_cast<byte*>(&(T)->F) - reinterpret_cast<byte*>(T))
+
+/* This module implements the two-way linear list which should be used
+if a list is used in the database. Note that a single struct may belong
+to two or more lists, provided that the lists are given different names.
+An example of the usage of the lists can be found in fil0fil.cc. */
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count).
+@param TYPE the name of the list node data type */
+template <typename TYPE>
+struct ut_list_base {
+ typedef TYPE elem_type;
+
+ ulint count; /*!< count of nodes in list */
+ TYPE* start; /*!< pointer to list start, NULL if empty */
+ TYPE* end; /*!< pointer to list end, NULL if empty */
+};
+
+#define UT_LIST_BASE_NODE_T(TYPE) ut_list_base<TYPE>
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list; the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list.
+@param TYPE the list node type name */
+/* Example:
+struct LRU_node_t {
+ UT_LIST_NODE_T(LRU_node_t) LRU_list;
+ ...
+}
+The example implements an LRU list named LRU_list. Its nodes are of type
+LRU_node_t. */
+
+template <typename TYPE>
+struct ut_list_node {
+ TYPE* prev; /*!< pointer to the previous node,
+ NULL if start of list */
+ TYPE* next; /*!< pointer to next node, NULL if end of list */
+};
+
+#define UT_LIST_NODE_T(TYPE) ut_list_node<TYPE>
+
+/*******************************************************************//**
+Get the list node at offset.
+@param elem - list element
+@param offset - offset within element.
+@return reference to list node. */
+template <typename Type>
+ut_list_node<Type>&
+ut_elem_get_node(Type& elem, size_t offset)
+{
+ ut_a(offset < sizeof(elem));
+
+ return(*reinterpret_cast<ut_list_node<Type>*>(
+ reinterpret_cast<byte*>(&elem) + offset));
+}
+
+/*******************************************************************//**
+Initializes the base node of a two-way list.
+@param BASE the list base node
+*/
+#define UT_LIST_INIT(BASE)\
+{\
+ (BASE).count = 0;\
+ (BASE).start = NULL;\
+ (BASE).end = NULL;\
+}\
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem the element to add
+@param offset offset of list node in elem. */
+template <typename List, typename Type>
+void
+ut_list_prepend(
+ List& list,
+ Type& elem,
+ size_t offset)
+{
+ ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset);
+
+ elem_node.prev = 0;
+ elem_node.next = list.start;
+
+ if (list.start != 0) {
+ ut_list_node<Type>& base_node =
+ ut_elem_get_node(*list.start, offset);
+
+ ut_ad(list.start != &elem);
+
+ base_node.prev = &elem;
+ }
+
+ list.start = &elem;
+
+ if (list.end == 0) {
+ list.end = &elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param NAME list name
+@param LIST the base node (not a pointer to it)
+@param ELEM the element to add */
+#define UT_LIST_ADD_FIRST(NAME, LIST, ELEM) \
+ ut_list_prepend(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param list list
+@param elem the element to add
+@param offset offset of list node in elem */
+template <typename List, typename Type>
+void
+ut_list_append(
+ List& list,
+ Type& elem,
+ size_t offset)
+{
+ ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset);
+
+ elem_node.next = 0;
+ elem_node.prev = list.end;
+
+ if (list.end != 0) {
+ ut_list_node<Type>& base_node =
+ ut_elem_get_node(*list.end, offset);
+
+ ut_ad(list.end != &elem);
+
+ base_node.next = &elem;
+ }
+
+ list.end = &elem;
+
+ if (list.start == 0) {
+ list.start = &elem;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param NAME list name
+@param LIST list
+@param ELEM the element to add */
+#define UT_LIST_ADD_LAST(NAME, LIST, ELEM)\
+ ut_list_append(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
+
+/*******************************************************************//**
+Inserts elem2 after elem1 in a list.
+@param list the base node
+@param elem1 node after which elem2 is inserted
+@param elem2 node being inserted after elem1
+@param offset offset of list node in elem1 and elem2 */
+template <typename List, typename Type>
+void
+ut_list_insert(
+ List& list,
+ Type& elem1,
+ Type& elem2,
+ size_t offset)
+{
+ ut_ad(&elem1 != &elem2);
+
+ ut_list_node<Type>& elem1_node = ut_elem_get_node(elem1, offset);
+ ut_list_node<Type>& elem2_node = ut_elem_get_node(elem2, offset);
+
+ elem2_node.prev = &elem1;
+ elem2_node.next = elem1_node.next;
+
+ if (elem1_node.next != NULL) {
+ ut_list_node<Type>& next_node =
+ ut_elem_get_node(*elem1_node.next, offset);
+
+ next_node.prev = &elem2;
+ }
+
+ elem1_node.next = &elem2;
+
+ if (list.end == &elem1) {
+ list.end = &elem2;
+ }
+
+ ++list.count;
+}
+
+/*******************************************************************//**
+Inserts ELEM2 after ELEM1 in a list.
+@param NAME list name
+@param LIST the base node
+@param ELEM1 node after which ELEM2 is inserted
+@param ELEM2 node being inserted after ELEM1 */
+#define UT_LIST_INSERT_AFTER(NAME, LIST, ELEM1, ELEM2)\
+ ut_list_insert(LIST, *ELEM1, *ELEM2, IB_OFFSETOF(ELEM1, NAME))
+
+#ifdef UNIV_LIST_DEBUG
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(N) \
+ (N).next = (Type*) -1; \
+ (N).prev = (N).next
+#else
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(N)
+#endif /* UNIV_LIST_DEBUG */
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem node to be removed from the list
+@param offset offset of list node within elem */
+template <typename List, typename Type>
+void
+ut_list_remove(
+ List& list,
+ Type& elem,
+ size_t offset)
+{
+ ut_list_node<Type>& elem_node = ut_elem_get_node(elem, offset);
+
+ ut_a(list.count > 0);
+
+ if (elem_node.next != NULL) {
+ ut_list_node<Type>& next_node =
+ ut_elem_get_node(*elem_node.next, offset);
+
+ next_node.prev = elem_node.prev;
+ } else {
+ list.end = elem_node.prev;
+ }
+
+ if (elem_node.prev != NULL) {
+ ut_list_node<Type>& prev_node =
+ ut_elem_get_node(*elem_node.prev, offset);
+
+ prev_node.next = elem_node.next;
+ } else {
+ list.start = elem_node.next;
+ }
+
+ UT_LIST_REMOVE_CLEAR(elem_node);
+
+ --list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param NAME list name
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(NAME, LIST, ELEM) \
+ ut_list_remove(LIST, *ELEM, IB_OFFSETOF(ELEM, NAME))
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)\
+ (((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)\
+ (((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)\
+ (BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)\
+ (BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)\
+ (BASE).end
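+
+/* Usage sketch (illustration only; "node_t", its embedded "chain"
+list node and the "chain_list" base node are hypothetical):
+
+	for (node_t* n = UT_LIST_GET_FIRST(chain_list);
+	     n != NULL;
+	     n = UT_LIST_GET_NEXT(chain, n)) {
+		process(n);	// hypothetical per-node work
+	}
+*/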
+
+struct NullValidate { void operator()(const void* elem) { } };
+
+/********************************************************************//**
+Iterate over all the elements and call the functor for each element.
+@param list base node (not a pointer to it)
+@param node pointer to member node within list element
+@param functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+void
+ut_list_map(
+ List& list,
+ ut_list_node<typename List::elem_type>
+ List::elem_type::*node,
+ Functor functor)
+{
+ ulint count = 0;
+
+ for (typename List::elem_type* elem = list.start;
+ elem != 0;
+ elem = (elem->*node).next, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param list base node (not a pointer to it)
+@param node pointer to member node within list element
+@param functor Functor that is called for each element in the list */
+template <typename List, class Functor>
+void
+ut_list_validate(
+ List& list,
+ ut_list_node<typename List::elem_type>
+ List::elem_type::*node,
+ Functor functor = NullValidate())
+{
+ ut_list_map(list, node, functor);
+
+ ulint count = 0;
+
+ for (typename List::elem_type* elem = list.end;
+ elem != 0;
+ elem = (elem->*node).prev, ++count) {
+
+ functor(elem);
+ }
+
+ ut_a(count == list.count);
+}
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param NAME the name of the list
+@param TYPE node type
+@param LIST base node (not a pointer to it)
+@param FUNCTOR called for each list element */
+#define UT_LIST_VALIDATE(NAME, TYPE, LIST, FUNCTOR) \
+ ut_list_validate(LIST, &TYPE::NAME, FUNCTOR)
+
+#define UT_LIST_CHECK(NAME, TYPE, LIST) \
+ ut_list_validate(LIST, &TYPE::NAME, NullValidate())
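+
+/* Usage sketch (illustration only; "node_t", "chain" and "chain_list"
+are hypothetical, as in the iteration sketch above):
+
+	struct CheckNonNull {
+		void operator()(const void* elem) { ut_a(elem != NULL); }
+	};
+
+	UT_LIST_VALIDATE(chain, node_t, chain_list, CheckNonNull());
+*/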
+
+#endif /* ut0lst_h */
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000000..af7eb4e9b1d
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,261 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+#include <string.h>
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+extern ulint ut_total_allocated_memory;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+extern os_fast_mutex_t ut_list_mutex;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n);
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void);
+/*=============*/
+
+/**********************************************************************//**
+Allocates memory.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+ ulint n, /*!< in: number of bytes to allocate */
+ ibool assert_on_error) /*!< in: if TRUE, we crash mysqld if
+ the memory cannot be allocated */
+ __attribute__((malloc));
+/**********************************************************************//**
+Allocates memory. */
+#define ut_malloc(n) ut_malloc_low(n, TRUE)
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is
+a nop. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+ void* ptr); /*!< in, own: memory block, can be NULL */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.cc. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+ realloc() changes the size of the memory block pointed to
+ by ptr to size bytes. The contents will be unchanged to
+ the minimum of the old and new sizes; newly allocated mem­
+ ory will be uninitialized. If ptr is NULL, the call is
+ equivalent to malloc(size); if size is equal to zero, the
+ call is equivalent to free(ptr). Unless ptr is NULL, it
+ must have been returned by an earlier call to malloc(),
+ calloc() or realloc().
+
+RETURN VALUE
+ realloc() returns a pointer to the newly allocated memory,
+ which is suitably aligned for any kind of variable and may
+ be different from ptr, or NULL if the request fails. If
+ size was equal to 0, either NULL or a pointer suitable to
+ be passed to free() is returned. If realloc() fails the
+ original block is left untouched - it is not freed or
+ moved.
+@return own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+ void* ptr, /*!< in: pointer to old block or NULL */
+ ulint size); /*!< in: desired size */
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void);
+/*=================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour);
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str);
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2);
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+ const char* s1, /*!< in: string to search in */
+ const char* s2); /*!< in: string to search for */
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+ const char* str, /*!< in: string to operate on */
+ const char* s1, /*!< in: string to replace */
+ const char* s2); /*!< in: string to replace s1 with */
+
+/********************************************************************
+Concatenate 3 strings. */
+
+char*
+ut_str3cat(
+/*=======*/
+ /* out, own: concatenated string, must be
+ freed with mem_free() */
+ const char* s1, /* in: string 1 */
+ const char* s2, /* in: string 2 */
+ const char* s3); /* in: string 3 */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size); /*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0mem.ic b/storage/innobase/include/ut0mem.ic
new file mode 100644
index 00000000000..5c9071d52cc
--- /dev/null
+++ b/storage/innobase/include/ut0mem.ic
@@ -0,0 +1,317 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n)
+{
+ return(memcpy(dest, sour, n));
+}
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n)
+{
+ return(memmove(dest, sour, n));
+}
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n)
+{
+ return(memcmp(str1, str2, n));
+}
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour)
+{
+ return(strcpy(dest, sour));
+}
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str)
+{
+ return(strlen(str));
+}
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2)
+{
+ return(strcmp(str1, str2));
+}
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size) /*!< in: "hex" size in bytes */
+{
+
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8))
+#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF))
+#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+
+#define MK_ALL_UINT16_WITH_A(a) \
+ MK_UINT16(a, '0'), \
+ MK_UINT16(a, '1'), \
+ MK_UINT16(a, '2'), \
+ MK_UINT16(a, '3'), \
+ MK_UINT16(a, '4'), \
+ MK_UINT16(a, '5'), \
+ MK_UINT16(a, '6'), \
+ MK_UINT16(a, '7'), \
+ MK_UINT16(a, '8'), \
+ MK_UINT16(a, '9'), \
+ MK_UINT16(a, 'A'), \
+ MK_UINT16(a, 'B'), \
+ MK_UINT16(a, 'C'), \
+ MK_UINT16(a, 'D'), \
+ MK_UINT16(a, 'E'), \
+ MK_UINT16(a, 'F')
+
+ static const uint16 hex_map[256] = {
+ MK_ALL_UINT16_WITH_A('0'),
+ MK_ALL_UINT16_WITH_A('1'),
+ MK_ALL_UINT16_WITH_A('2'),
+ MK_ALL_UINT16_WITH_A('3'),
+ MK_ALL_UINT16_WITH_A('4'),
+ MK_ALL_UINT16_WITH_A('5'),
+ MK_ALL_UINT16_WITH_A('6'),
+ MK_ALL_UINT16_WITH_A('7'),
+ MK_ALL_UINT16_WITH_A('8'),
+ MK_ALL_UINT16_WITH_A('9'),
+ MK_ALL_UINT16_WITH_A('A'),
+ MK_ALL_UINT16_WITH_A('B'),
+ MK_ALL_UINT16_WITH_A('C'),
+ MK_ALL_UINT16_WITH_A('D'),
+ MK_ALL_UINT16_WITH_A('E'),
+ MK_ALL_UINT16_WITH_A('F')
+ };
+ const unsigned char* rawc;
+ ulint read_bytes;
+ ulint write_bytes;
+ ulint i;
+
+ rawc = (const unsigned char*) raw;
+
+ if (hex_size == 0) {
+
+ return(0);
+ }
+
+ if (hex_size <= 2 * raw_size) {
+
+ read_bytes = hex_size / 2;
+ write_bytes = hex_size;
+ } else {
+
+ read_bytes = raw_size;
+ write_bytes = 2 * raw_size + 1;
+ }
+
+#define LOOP_READ_BYTES(ASSIGN) \
+ for (i = 0; i < read_bytes; i++) { \
+ ASSIGN; \
+ hex += 2; \
+ rawc++; \
+ }
+
+ if (ut_align_offset(hex, 2) == 0) {
+
+ LOOP_READ_BYTES(
+ *(uint16*) hex = hex_map[*rawc]
+ );
+ } else {
+
+ LOOP_READ_BYTES(
+ *hex = UINT16_GET_A(hex_map[*rawc]);
+ *(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+ );
+ }
+
+ if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+ hex--;
+ }
+
+ *hex = '\0';
+
+ return(write_bytes);
+}
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint str_i;
+ ulint buf_i;
+
+ buf_i = 0;
+
+ switch (buf_size) {
+ case 3:
+
+ if (str_len == 0) {
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\'';
+ buf_i++;
+ }
+ /* FALLTHROUGH */
+ case 2:
+ case 1:
+
+ buf[buf_i] = '\0';
+ buf_i++;
+ /* FALLTHROUGH */
+ case 0:
+
+ return(buf_i);
+ }
+
+ /* buf_size >= 4 */
+
+ buf[0] = '\'';
+ buf_i = 1;
+
+ for (str_i = 0; str_i < str_len; str_i++) {
+
+ char ch;
+
+ if (buf_size - buf_i == 2) {
+
+ break;
+ }
+
+ ch = str[str_i];
+
+ switch (ch) {
+ case '\0':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = '\\';
+ buf_i++;
+ buf[buf_i] = '0';
+ buf_i++;
+ break;
+ case '\'':
+ case '\\':
+
+ if (buf_size - buf_i < 4) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = ch;
+ buf_i++;
+ /* FALLTHROUGH */
+ default:
+
+ buf[buf_i] = ch;
+ buf_i++;
+ }
+ }
+
+func_exit:
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\0';
+ buf_i++;
+
+ return(buf_i);
+}
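+
+/* Usage sketch (illustrative only):
+
+	char	buf[32];
+	ulint	n;
+
+	n = ut_str_sql_format("abc'd", 5, buf, sizeof(buf));
+	now buf contains 'abc''d' (quotes included) and n == 9
+*/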
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000000..e0593e99bde
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,324 @@
+/***************************************************************************//**
+
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Red black tree implementation
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "univ.i"
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+ ib_rbt_color_t color; /* color of this node */
+
+	ib_rbt_node_t*	left;			/* pointer to left child */
+	ib_rbt_node_t*	right;			/* pointer to right child */
+	ib_rbt_node_t*	parent;			/* pointer to parent node */
+
+ char value[1]; /* Data value */
+};
+
+/** Red black tree instance.*/
+struct ib_rbt_t {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ib_rbt_arg_compare
+ compare_with_arg; /* Fn. to use for comparison
+ with argument */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+ void* cmp_arg; /* Compare func argument */
+};
+
+/** The result of searching for a key in the tree; this is useful for
+a speedy lookup and insert if the key doesn't exist.*/
+struct ib_rbt_bound_t {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
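+
+/* Usage sketch (illustrative only): reading the value stored in a node
+of a tree that was created with sizeof_value == sizeof(ulint):
+
+	ulint	v = *rbt_value(ulint, node);
+*/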
+
+/**********************************************************************//**
+Free an instance of a red black tree */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/**********************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_arg_compare
+ compare, /*!< in: comparator */
+ void* cmp_arg); /*!< in: compare fn arg */
+/**********************************************************************//**
+Delete a node from the red black tree, identified by key
+@return TRUE on success */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+	ib_rbt_t*	tree,	/*!< in: rb tree */
+	const void*	key);	/*!< in: key to delete */
+/**********************************************************************//**
+Remove a node from the red black tree. NOTE: This function will not free
+the node instance; that is the caller's responsibility.
+@return the removed node, with the constness cast away so that the
+caller can free it */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/**********************************************************************//**
+Return a node from the red black tree, identified by
+key, NULL if not found
+@return node if found else return NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree to search */
+ const void* key); /*!< in: key to lookup */
+/**********************************************************************//**
+Add data to the red black tree, identified by key (duplicates are not
+supported yet)
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/**********************************************************************//**
+Return the leftmost data node in the tree
+@return leftmost node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the rightmost data node in the tree
+@return rightmost node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Return the next node from current.
+@return successor of the node that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+	const ib_rbt_t*		tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*	current);	/*!< in: current node */
+/**********************************************************************//**
+Return the previous node from current.
+@return predecessor of the node that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+	const ib_rbt_t*		tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*	current);	/*!< in: current node */
+/**********************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/**********************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node that satisfies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/**********************************************************************//**
+Search for the key. A node will be returned in parent.last whether it
+was found or not. If not found, parent.last will contain the parent
+node for the possibly new key; otherwise it is the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
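+
+/* Usage sketch (illustrative only): a speedy lookup-or-insert with a
+single traversal, using rbt_search() and rbt_add_node(), assuming the
+tree was created with rbt_create(sizeof(ulint), cmp):
+
+	ib_rbt_bound_t	parent;
+	ulint		key = 42;
+
+	if (rbt_search(tree, &parent, &key) != 0) {
+		rbt_add_node(tree, &parent, &key);
+	}
+*/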
+/**********************************************************************//**
+Search for the key. A node will be returned in parent.last whether it
+was found or not. If not found, parent.last will contain the parent
+node for the possibly new key; otherwise it is the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare, /*!< in: comparator */
+ ib_rbt_arg_compare
+ arg_compare); /*!< in: fn to compare items
+ with argument */
+/**********************************************************************//**
+Clear the tree: deletes and frees all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree); /*!< in: rb tree */
+/**********************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+/**********************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+Delete the nodes from src after copying each node to dst. As a side effect
+the duplicates will be left untouched in src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar, the function doesn't
+check for this condition (yet).
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src); /*!< in: src rb tree */
+/**********************************************************************//**
+Verify the integrity of the RB tree. For debugging.
+@return TRUE if OK, FALSE if the tree is invalid */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+/**********************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print); /*!< in: print function */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
new file mode 100644
index 00000000000..53b769849a5
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.h
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0byte.h"
+
+/** The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD 257
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed); /*!< in: seed */
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd); /*!< in: the previous random number value */
+/*********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer is always
+repeated after N calls to the generator.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void);
+/*==================*/
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+	ulint	low,	/*!< in: low limit; can generate also this value */
+	ulint	high);	/*!< in: high limit; returned only if low == high,
+			otherwise the result is strictly below it */
+/*********************************************************//**
+Generates a random ibool value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void);
+/*=================*/
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size); /*!< in: hash table size */
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+ __attribute__((const));
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+ __attribute__((pure));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+ __attribute__((const));
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+ __attribute__((const));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ __attribute__((pure));
+
+
+#ifndef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+#endif
diff --git a/storage/innobase/include/ut0rnd.ic b/storage/innobase/include/ut0rnd.ic
new file mode 100644
index 00000000000..024c59e553b
--- /dev/null
+++ b/storage/innobase/include/ut0rnd.ic
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+
+#ifndef UNIV_INNOCHECKSUM
+
+#define UT_RND1 151117737
+#define UT_RND2 119785373
+#define UT_RND3 85689495
+#define UT_RND4 76595339
+#define UT_SUM_RND2 98781234
+#define UT_SUM_RND3 126792457
+#define UT_SUM_RND4 63498502
+#define UT_XOR_RND1 187678878
+#define UT_XOR_RND2 143537923
+
+/** Seed value of ut_rnd_gen_ulint() */
+extern ulint ut_rnd_ulint_counter;
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed) /*!< in: seed */
+{
+ ut_rnd_ulint_counter = seed;
+}
+
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd) /*!< in: the previous random number value */
+{
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ rnd = UT_RND2 * rnd + UT_SUM_RND3;
+ rnd = UT_XOR_RND1 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND3 * rnd + UT_SUM_RND4;
+ rnd = UT_XOR_RND2 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND1 * rnd + UT_SUM_RND2;
+
+ return(rnd);
+}
+
+/********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space of ulint integers in a pseudo random
+fashion. Note that the same integer is always repeated after
+2^32 calls to the generator (if ulint is 32-bit).
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void)
+/*==================*/
+{
+ ulint rnd;
+
+ ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2;
+
+ rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter);
+
+ return(rnd);
+}
+
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+	ulint	low,	/*!< in: low limit; can generate also this value */
+	ulint	high)	/*!< in: high limit; returned only if low == high,
+			otherwise the result is strictly below it */
+{
+ ulint rnd;
+
+ ut_ad(high >= low);
+
+ if (low == high) {
+
+ return(low);
+ }
+
+ rnd = ut_rnd_gen_ulint();
+
+ return(low + (rnd % (high - low)));
+}
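+
+/* Usage sketch (illustrative only): pick a pseudo-random delay in
+[50, 100):
+
+	ulint	delay = ut_rnd_interval(50, 100);
+*/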
+
+/*********************************************************//**
+Generates a random ibool value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void)
+/*=================*/
+{
+ ulint x;
+
+ x = ut_rnd_gen_ulint();
+
+ if (((x >> 20) + (x >> 15)) & 1) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size) /*!< in: hash table size */
+{
+ ut_ad(table_size);
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a 64-bit integer.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ull(
+/*========*/
+ ib_uint64_t d) /*!< in: 64-bit integer */
+{
+ return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
+ (ulint) (d >> 32)));
+}
+
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+{
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ while (*str != '\0') {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+ str++;
+ }
+
+ return(fold);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
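+
+/* Usage sketch (illustrative only): folding a (space id, page number)
+pair into a single hash value, then mapping it to a cell of a hash
+table of size table_size:
+
+	ulint	fold = ut_fold_ulint_pair(space_id, page_no);
+	ulint	cell = ut_hash_ulint(fold, table_size);
+*/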
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ ulint fold = 0;
+ const byte* str_end = str + (len & 0xFFFFFFF8);
+
+ ut_ad(str || !len);
+
+ while (str < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
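+	/* Fold the remaining (len & 0x7) trailing bytes; each case below
+	intentionally falls through to the next one. */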
+ switch (len & 0x7) {
+ case 7:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 6:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 5:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 4:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 3:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 2:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ case 1:
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str++));
+ }
+
+ return(fold);
+}
diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h
new file mode 100644
index 00000000000..75648b5c317
--- /dev/null
+++ b/storage/innobase/include/ut0sort.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, which has O(n log n)
+worst-case running time.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1 if the first is bigger,
+0 if they are equal, and -1 if the second is bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+ ((HIGH) - (LOW)) * sizeof *(ARR));\
+}
+
+
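+/* Usage sketch (illustrative only): instantiating the macro to define
+a sort function for an array of ulints, with ut_ulint_cmp() from ut0ut.h
+as the comparison function:
+
+static void
+example_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
+{
+	UT_SORT_FUNCTION_BODY(example_ulint_sort, arr, aux_arr, low, high,
+			      ut_ulint_cmp);
+}
+*/
+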
+#endif
+
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
new file mode 100644
index 00000000000..0caf379d8fa
--- /dev/null
+++ b/storage/innobase/include/ut0ut.h
@@ -0,0 +1,497 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+#include "univ.i"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "db0err.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#endif /* UNIV_HOTBACKUP */
+
+#include <time.h>
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif
+
+#include <stdarg.h> /* for va_list */
+
+/** Index name prefix in fast index creation */
+#define TEMP_INDEX_PREFIX '\377'
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR "\377"
+
+/** Time stamp */
+typedef time_t ib_time_t;
+
+/* In order to call a piece of code when a function returns or when the
+scope ends, use this utility class. It will invoke the given function
+object in its destructor. */
+template<typename F>
+struct ut_when_dtor {
+ ut_when_dtor(F& p) : f(p) {}
+ ~ut_when_dtor() {
+ f();
+ }
+private:
+ F& f;
+};
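+
+/* Usage sketch (illustrative only), with a function pointer as the
+callable:
+
+	extern void unlock_all();
+	void (*f)() = unlock_all;
+	ut_when_dtor<void (*)()> guard(f);
+	unlock_all() then runs when guard goes out of scope
+*/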
+
+#ifndef UNIV_HOTBACKUP
+# if defined(HAVE_PAUSE_INSTRUCTION)
+ /* According to the gcc info page, asm volatile means that the
+ instruction has important side-effects and must not be removed.
+ Also asm volatile may trigger a memory barrier (spilling all registers
+ to memory). */
+# ifdef __SUNPRO_CC
+# define UT_RELAX_CPU() asm ("pause" )
+# else
+# define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+# endif /* __SUNPRO_CC */
+
+# elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
+# define UT_RELAX_CPU() __asm__ __volatile__ ("rep; nop")
+# elif defined(HAVE_ATOMIC_BUILTINS)
+# define UT_RELAX_CPU() do { \
+ volatile lint volatile_var; \
+ os_compare_and_swap_lint(&volatile_var, 0, 1); \
+ } while (0)
+# elif defined(HAVE_WINDOWS_ATOMICS)
+	/* In the Win32 API, the x86 PAUSE instruction is executed by calling
+	the YieldProcessor macro defined in WinNT.h. Using YieldProcessor is
+	a CPU architecture-independent way to issue it. */
+# define UT_RELAX_CPU() YieldProcessor()
+# else
+# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
+# endif
+
+/*********************************************************************//**
+Delays execution for at most max_wait_us microseconds or returns earlier
+if cond becomes true.
+@param cond in: condition to wait for; evaluated every 2 ms
+@param max_wait_us in: maximum delay to wait, in microseconds */
+#define UT_WAIT_FOR(cond, max_wait_us) \
+do { \
+ ullint start_us; \
+ start_us = ut_time_us(NULL); \
+ while (!(cond) \
+ && ut_time_us(NULL) - start_us < (max_wait_us)) {\
+ \
+ os_thread_sleep(2000 /* 2 ms */); \
+ } \
+} while (0)
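+
+/* Usage sketch (illustrative only): wait at most one second for a flag
+that another thread sets:
+
+	UT_WAIT_FOR(flag_set == TRUE, 1000000);
+*/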
+#endif /* !UNIV_HOTBACKUP */
+
+template <class T> T ut_min(T a, T b) { return(a < b ? a : b); }
+template <class T> T ut_max(T a, T b) { return(a > b ? a : b); }
+
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b); /*!< in: ulint */
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/*************************************************************//**
+Determines if a number is zero or a power of two.
+@param n in: number
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1)))
+/*************************************************************//**
+Quickly calculates the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+#define ut_2pow_remainder(n, m) ((n) & ((m) - 1))
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two. In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_2pow_round(n, m) ((n) & ~((m) - 1))
+/** Align a number down to a multiple of a power of two.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_calc_align_down(n, m) ut_2pow_round(n, m)
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two. In other words, rounds n up to m * k.
+@param n in: number to round up
+@param m in: alignment, must be a power of two
+@return n rounded up to the smallest possible integer multiple of m */
+#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1))
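+/* Examples (illustrative only): ut_2pow_remainder(1000, 256) == 232,
+ut_2pow_round(1000, 256) == 768, ut_calc_align(1000, 256) == 1024. */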
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded upward to
+an integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+	ulint	n);	/*!< in: number != 0 */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Quickly rounds a number up to the nearest power of 2.
+@return first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+ ulint n) /*!< in: number != 0 */
+ __attribute__((const));
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b in: bits
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void);
+/*=========*/
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Returns the current time via the output parameters.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return 0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+ ulint* sec, /*!< out: seconds since the Epoch */
+ ulint* ms); /*!< out: microseconds since the Epoch+*sec */
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+ ullint* tloc); /*!< out: us since epoch, if non-NULL */
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+ ib_time_t time2, /*!< in: time */
+ ib_time_t time1); /*!< in: time */
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+ UNIV_COLD __attribute__((nonnull));
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf); /*!< in: buffer where to sprintf */
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf); /*!< in: buffer where to sprintf */
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+ ulint* year, /*!< out: current year */
+ ulint* month, /*!< out: month */
+ ulint* day); /*!< out: day */
+#else /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++.
+@return dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+ ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */
+#endif /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ASCII. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len); /*!< in: length of the buffer */
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const char* name); /*!< in: name to print */
+
+#ifndef UNIV_HOTBACKUP
+/* Forward declaration of transaction handle */
+struct trx_t;
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name); /*!< in: name to print */
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction (NULL=no quotes) */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name, /*!< in: name to print */
+ ulint namelen);/*!< in: length of name */
+
+/**********************************************************************//**
+Formats a table or index name, quoted as an SQL identifier. If the name
+contains a slash '/', the result will contain two identifiers separated by
+a period (.), as in SQL database_name.identifier.
+@return pointer to 'formatted' */
+UNIV_INTERN
+char*
+ut_format_name(
+/*===========*/
+ const char* name, /*!< in: table or index name, must be
+ '\0'-terminated */
+ ibool is_table, /*!< in: if TRUE then 'name' is a table
+ name */
+ char* formatted, /*!< out: formatted result, will be
+ '\0'-terminated */
+	ulint		formatted_size);/*!< in: no more than this number of
+ bytes will be written to 'formatted' */
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src); /*!< in: input file to be appended to output */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+/**********************************************************************//**
+A substitute for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ va_list ap); /*!< in: format values */
+
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ ...); /*!< in: format values */
+#else
+/**********************************************************************//**
+A wrapper for vsnprintf(3), formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer was unlimited because
+VC's _vsnprintf() returns -1 in this case and we would need to call
+_vscprintf() in addition to estimate that but we would need another copy
+of "ap" for that and VC does not provide va_copy(). */
+# define ut_vsnprintf(buf, size, fmt, ap) \
+ ((void) vsnprintf(buf, size, fmt, ap))
+/**********************************************************************//**
+A wrapper for snprintf(3), formatted output conversion into
+a limited buffer. */
+# define ut_snprintf snprintf
+#endif /* __WIN__ */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+UNIV_INTERN
+const char*
+ut_strerr(
+/*======*/
+ dberr_t num); /*!< in: error number */
+
+/****************************************************************
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+ ulint* arr, /*!< in/out: array to sort */
+ ulint* aux_arr, /*!< in/out: aux array to use in sort */
+ ulint low, /*!< in: lower bound */
+ ulint high) /*!< in: upper bound */
+ __attribute__((nonnull));
+
+#ifndef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#endif
+
diff --git a/storage/innobase/include/ut0ut.ic b/storage/innobase/include/ut0ut.ic
new file mode 100644
index 00000000000..4e0f76e1957
--- /dev/null
+++ b/storage/innobase/include/ut0ut.ic
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n1 : n2);
+}
+
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n2 : n1);
+}
+
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 == a2) {
+ *a = a1;
+ *b = ut_min(b1, b2);
+ } else if (a1 < a2) {
+ *a = a1;
+ *b = b1;
+ } else {
+ *a = a2;
+ *b = b2;
+ }
+}
+
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b) /*!< in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 > b1) {
+ return(1);
+ } else if (a1 < b1) {
+ return(-1);
+ } else if (a2 > b2) {
+ return(1);
+ } else if (a2 < b2) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded upward to
+an integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
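+
+/* Examples (illustrative only): ut_2_log(5) == 3, since 2^3 = 8 is the
+smallest power of two >= 5; note that ut_2_log(1) evaluates to 1, not 0,
+because of the final "res + 1" above. */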
+
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n) /*!< in: number */
+{
+ return((ulint) 1 << n);
+}
diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h
new file mode 100644
index 00000000000..432fb348a09
--- /dev/null
+++ b/storage/innobase/include/ut0vec.h
@@ -0,0 +1,337 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "univ.i"
+#include "mem0mem.h"
+
+struct ib_alloc_t;
+struct ib_vector_t;
+
+typedef void* (*ib_mem_alloc_t)(
+ /* out: Pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ ulint size); /* in: Number of bytes to allocate */
+
+typedef void (*ib_mem_free_t)(
+ ib_alloc_t* allocator, /* in: Pointer to allocator instance */
+ void* ptr); /* in: Memory to free */
+
+typedef void* (*ib_mem_resize_t)(
+ /* out: Pointer to resized memory */
+ ib_alloc_t* allocator, /* in: Pointer to allocator */
+ void* ptr, /* in: Memory to resize */
+ ulint old_size, /* in: Old memory size in bytes */
+ ulint new_size); /* in: New size in bytes */
+
+typedef int (*ib_compare_t)(const void*, const void*);
+
+/* An automatically resizing vector datatype with the following properties:
+
+ - All memory allocation is done through an allocator, which is responsible
+ for freeing the memory when the vector is no longer needed.
+*/
+
+/* This is useful shorthand for elements of type void* */
+#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n))
+#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n))
+
+#define ib_vector_allocator(v) (v->allocator)
+
+/********************************************************************
+Create a new vector with the given initial size. */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ /* out: vector */
+ ib_alloc_t* alloc, /* in: Allocator */
+ /* in: size of the data item */
+ ulint sizeof_value,
+ ulint size); /* in: initial size */
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Push a new element to the vector, increasing its size if necessary.
+If elem is not NULL then elem is copied into the vector. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+	/* out: pointer to the "new" element */
+ ib_vector_t* vec, /* in/out: vector */
+ const void* elem); /* in: data element */
+
+/********************************************************************
+Pop the last element from the vector.*/
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/*******************************************************************//**
+Remove an element from the vector
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem); /*!< in: value to remove */
+
+/********************************************************************
+Get the number of elements in the vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Increase the size of the vector. */
+UNIV_INTERN
+void
+ib_vector_resize(
+/*=============*/
+	ib_vector_t*	vec);	/* in/out: vector */
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n); /*!< in: element index to get */
+
+/********************************************************************
+Const version of ib_vector_get().
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n); /* in: element index to get */
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec); /*!< in: vector */
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem); /*!< in: data element */
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: pointer to last element */
+ ib_vector_t* vec); /* in/out: vector */
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: pointer to last element */
+ const ib_vector_t* vec); /* in: vector */
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ ib_vector_t* vec, /* in/out: vector */
+ ib_compare_t compare); /* in: the comparator to use for sort */
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator, /* in: allocator */
+	void*		ptr);		/* in: memory to free */
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ /* out: pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size); /* in: size in bytes */
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ /* out: pointer to reallocated
+ memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size); /* in: new size in bytes */
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ /* out: heap allocator instance */
+ mem_heap_t* heap); /* in: heap to use */
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+ ib_alloc_t* ib_ut_alloc); /* in: alloc instance to free */
+
+/********************************************************************
+Wrapper for ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+ ib_alloc_t* allocator, /* in: allocator */
+ void* ptr); /* in: pointer to memory */
+
+/********************************************************************
+Wrapper for ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+ /* out: pointer to allocated memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size); /* in: size in bytes */
+
+/********************************************************************
+Wrapper for ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+ /* out: pointer to reallocated
+ memory */
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size); /* in: new size in bytes */
+
+/********************************************************************
+Create a ut allocator that uses ut_malloc() and ut_free(). */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void);
+/*=========================*/
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+ ib_alloc_t* ib_ut_alloc); /* in: alloc instance to free */
+
+/* Allocator used by ib_vector_t. */
+struct ib_alloc_t {
+ ib_mem_alloc_t mem_malloc; /* For allocating memory */
+ ib_mem_free_t mem_release; /* For freeing memory */
+ ib_mem_resize_t mem_resize; /* For resizing memory */
+ void* arg; /* Currently if not NULL then it
+ points to the heap instance */
+};
+
+/* See comment at beginning of file. */
+struct ib_vector_t {
+ ib_alloc_t* allocator; /* Allocator, because one size
+ doesn't fit all */
+ void* data; /* data elements */
+ ulint used; /* number of elements currently used */
+ ulint total; /* number of elements allocated */
+ /* Size of a data item */
+ ulint sizeof_value;
+};
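+
+/* A minimal usage sketch (assuming the ib_vector_create() declared
+earlier in this header, which takes an allocator, the element size
+and an initial capacity; "item" is a placeholder). Elements are
+stored by value, so a vector of pointers stores the pointer values
+themselves:
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	ib_alloc_t*	heap_alloc = ib_heap_allocator_create(heap);
+	ib_vector_t*	vec = ib_vector_create(heap_alloc,
+					       sizeof(void*), 4);
+
+	ib_vector_push(vec, &item);
+	ut_a(ib_vector_size(vec) == 1);
+	item = *(void**) ib_vector_get(vec, 0);
+
+	ib_vector_free(vec);	(releases the whole backing heap)
+*/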
+
+#ifndef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+
+#endif /* IB_VECTOR_H */
diff --git a/storage/innobase/include/ut0vec.ic b/storage/innobase/include/ut0vec.ic
new file mode 100644
index 00000000000..f41a85e1d1d
--- /dev/null
+++ b/storage/innobase/include/ut0vec.ic
@@ -0,0 +1,425 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#define IB_VEC_OFFSET(v, i) ((v)->sizeof_value * (i))
+
+/********************************************************************
+The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_malloc(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ ulint size) /* in: size in bytes */
+{
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ return(mem_heap_alloc(heap, size));
+}
+
+/********************************************************************
+The default ib_vector_t heap free. Does nothing. */
+UNIV_INLINE
+void
+ib_heap_free(
+/*=========*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+ void* ptr UNIV_UNUSED) /* in: pointer to memory */
+{
+ /* We can't free individual elements. */
+}
+
+/********************************************************************
+The default ib_vector_t heap resize. Since we can't resize the heap
+we have to copy the elements from the old ptr to the new ptr.
+Uses mem_heap_alloc(). */
+UNIV_INLINE
+void*
+ib_heap_resize(
+/*===========*/
+ ib_alloc_t* allocator, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size, /* in: old size in bytes */
+ ulint new_size) /* in: new size in bytes */
+{
+ void* new_ptr;
+ mem_heap_t* heap = (mem_heap_t*) allocator->arg;
+
+ new_ptr = mem_heap_alloc(heap, new_size);
+ memcpy(new_ptr, old_ptr, old_size);
+
+ return(new_ptr);
+}
+
+/********************************************************************
+Create a heap allocator that uses the passed in heap. */
+UNIV_INLINE
+ib_alloc_t*
+ib_heap_allocator_create(
+/*=====================*/
+ mem_heap_t* heap) /* in: heap to use */
+{
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc));
+
+ heap_alloc->arg = heap;
+ heap_alloc->mem_release = ib_heap_free;
+ heap_alloc->mem_malloc = ib_heap_malloc;
+ heap_alloc->mem_resize = ib_heap_resize;
+
+ return(heap_alloc);
+}
+
+/********************************************************************
+Free a heap allocator. */
+UNIV_INLINE
+void
+ib_heap_allocator_free(
+/*===================*/
+ ib_alloc_t* ib_ut_alloc) /* in: alloc instance to free */
+{
+ mem_heap_free((mem_heap_t*) ib_ut_alloc->arg);
+}
+
+/********************************************************************
+Wrapper around ut_malloc(). */
+UNIV_INLINE
+void*
+ib_ut_malloc(
+/*=========*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+ ulint size) /* in: size in bytes */
+{
+ return(ut_malloc(size));
+}
+
+/********************************************************************
+Wrapper around ut_free(). */
+UNIV_INLINE
+void
+ib_ut_free(
+/*=======*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+ void* ptr) /* in: pointer to memory */
+{
+ ut_free(ptr);
+}
+
+/********************************************************************
+Wrapper around ut_realloc(). */
+UNIV_INLINE
+void*
+ib_ut_resize(
+/*=========*/
+ ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */
+ void* old_ptr, /* in: pointer to memory */
+ ulint old_size UNIV_UNUSED,/* in: old size in bytes */
+ ulint new_size) /* in: new size in bytes */
+{
+ return(ut_realloc(old_ptr, new_size));
+}
+
+/********************************************************************
+Create a ut allocator. */
+UNIV_INLINE
+ib_alloc_t*
+ib_ut_allocator_create(void)
+/*========================*/
+{
+ ib_alloc_t* ib_ut_alloc;
+
+ ib_ut_alloc = (ib_alloc_t*) ut_malloc(sizeof(*ib_ut_alloc));
+
+ ib_ut_alloc->arg = NULL;
+ ib_ut_alloc->mem_release = ib_ut_free;
+ ib_ut_alloc->mem_malloc = ib_ut_malloc;
+ ib_ut_alloc->mem_resize = ib_ut_resize;
+
+ return(ib_ut_alloc);
+}
+
+/********************************************************************
+Free a ut allocator. */
+UNIV_INLINE
+void
+ib_ut_allocator_free(
+/*=================*/
+ ib_alloc_t* ib_ut_alloc) /* in: alloc instance to free */
+{
+ ut_free(ib_ut_alloc);
+}
+
+/********************************************************************
+Get number of elements in vector. */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ /* out: number of elements in vector */
+ const ib_vector_t* vec) /* in: vector */
+{
+ return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element. */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n) /*!< in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+
+/********************************************************************
+Get the n'th element (const version).
+@return n'th element */
+UNIV_INLINE
+const void*
+ib_vector_get_const(
+/*================*/
+ const ib_vector_t* vec, /* in: vector */
+ ulint n) /* in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+}
+/****************************************************************//**
+Get last element. The vector must not be empty.
+@return last element */
+UNIV_INLINE
+void*
+ib_vector_get_last(
+/*===============*/
+ ib_vector_t* vec) /*!< in: vector */
+{
+ ut_a(vec->used > 0);
+
+ return((byte*) ib_vector_get(vec, vec->used - 1));
+}
+
+/****************************************************************//**
+Set the n'th element. */
+UNIV_INLINE
+void
+ib_vector_set(
+/*==========*/
+ ib_vector_t* vec, /*!< in/out: vector */
+ ulint n, /*!< in: element index to set */
+ void* elem) /*!< in: data element */
+{
+ void* slot;
+
+ ut_a(n < vec->used);
+
+ slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n));
+ memcpy(slot, elem, vec->sizeof_value);
+}
+
+/********************************************************************
+Reset the vector size to 0 elements. */
+UNIV_INLINE
+void
+ib_vector_reset(
+/*============*/
+ ib_vector_t* vec) /* in/out: vector */
+{
+ vec->used = 0;
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+void*
+ib_vector_last(
+/*===========*/
+ /* out: pointer to last element */
+ ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get(vec, ib_vector_size(vec) - 1));
+}
+
+/********************************************************************
+Get the last element of the vector. */
+UNIV_INLINE
+const void*
+ib_vector_last_const(
+/*=================*/
+ /* out: pointer to last element */
+ const ib_vector_t* vec) /* in: vector */
+{
+ ut_a(ib_vector_size(vec) > 0);
+
+ return(ib_vector_get_const(vec, ib_vector_size(vec) - 1));
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ /* out: pointer to element */
+ ib_vector_t* vec) /* in: vector */
+{
+ void* elem;
+
+ ut_a(vec->used > 0);
+
+ elem = ib_vector_last(vec);
+ --vec->used;
+
+ return(elem);
+}
+
+/********************************************************************
+Append an element to the vector. If elem != NULL then copy the data
+from elem. */
+UNIV_INLINE
+void*
+ib_vector_push(
+/*===========*/
+ /* out: pointer to the "new" element */
+ ib_vector_t* vec, /* in: vector */
+ const void* elem) /* in: element to add (can be NULL) */
+{
+ void* last;
+
+ if (vec->used >= vec->total) {
+ ib_vector_resize(vec);
+ }
+
+ last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used);
+
+#ifdef UNIV_DEBUG
+ memset(last, 0, vec->sizeof_value);
+#endif
+
+ if (elem) {
+ memcpy(last, elem, vec->sizeof_value);
+ }
+
+ ++vec->used;
+
+ return(last);
+}
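+
+/* Note on lifetime (a sketch; "vec" and "elem" are placeholders):
+the pointer returned by ib_vector_push(), like the one returned by
+ib_vector_get(), points into the vector's own storage and is only
+valid until the next push that triggers ib_vector_resize(). A common
+pattern is to push NULL and fill the returned slot in place:
+
+	void*	slot = ib_vector_push(vec, NULL);
+	memcpy(slot, &elem, vec->sizeof_value);
+
+A later ib_vector_push() may move the data and invalidate "slot". */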
+
+/*******************************************************************//**
+Remove an element from the vector.
+@return pointer to the "removed" element */
+UNIV_INLINE
+void*
+ib_vector_remove(
+/*=============*/
+ ib_vector_t* vec, /*!< in: vector */
+ const void* elem) /*!< in: value to remove */
+{
+ void* current = NULL;
+ void* next;
+ ulint i;
+ ulint old_used_count = vec->used;
+
+ for (i = 0; i < vec->used; i++) {
+ current = ib_vector_get(vec, i);
+
+ if (*(void**) current == elem) {
+ if (i == vec->used - 1) {
+ return(ib_vector_pop(vec));
+ }
+
+ next = ib_vector_get(vec, i + 1);
+ memmove(current, next, vec->sizeof_value
+ * (vec->used - i - 1));
+ --vec->used;
+ break;
+ }
+ }
+
+ return((old_used_count != vec->used) ? current : NULL);
+}
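+
+/* Note: the comparison above reads each element as a pointer,
+*(void**) current, so ib_vector_remove() only makes sense for vectors
+whose elements are pointers (sizeof_value == sizeof(void*)). A sketch,
+with "item" a placeholder:
+
+	ib_vector_push(vec, &item);	(store the pointer by value)
+	ib_vector_remove(vec, item);	(remove by pointer value)
+
+For a larger element type the comparison would inspect only the first
+pointer-sized bytes of each element. */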
+
+/********************************************************************
+Sort the vector elements. */
+UNIV_INLINE
+void
+ib_vector_sort(
+/*===========*/
+ ib_vector_t* vec, /* in/out: vector */
+ ib_compare_t compare) /* in: the comparator to use for sort */
+{
+ qsort(vec->data, vec->used, vec->sizeof_value, compare);
+}
+
+/********************************************************************
+Destroy the vector. Make sure the vector owns the allocator, e.g.,
+the heap in the heap allocator. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec) /* in, own: vector */
+{
+ /* Currently we only support two types of allocators, heap
+ and ut_malloc(), when the heap is freed all the elements are
+ freed too. With ut allocator, we need to free the elements,
+ the vector instance and the allocator separately. */
+
+ /* Only the heap allocator uses the arg field. */
+ if (vec->allocator->arg) {
+ mem_heap_free((mem_heap_t*) vec->allocator->arg);
+ } else {
+ ib_alloc_t* allocator;
+
+ allocator = vec->allocator;
+
+ allocator->mem_release(allocator, vec->data);
+ allocator->mem_release(allocator, vec);
+
+ ib_ut_allocator_free(allocator);
+ }
+}
+
+/********************************************************************
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(ib_vector_size(vec) == 0);
+}
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
new file mode 100644
index 00000000000..33385ddf2d4
--- /dev/null
+++ b/storage/innobase/include/ut0wqueue.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#ifndef IB_WORK_QUEUE_H
+#define IB_WORK_QUEUE_H
+
+#include "ut0list.h"
+#include "mem0mem.h"
+#include "os0sync.h"
+#include "sync0types.h"
+
+struct ib_wqueue_t;
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void);
+/*===================*/
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+ ib_wqueue_t* wq, /*!< in: work queue */
+ void* item, /*!< in: work item */
+ mem_heap_t* heap); /*!< in: memory heap to use for allocating the
+ list node */
+
+/********************************************************************
+Check if queue is empty. */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+ /* out: TRUE if queue empty
+ else FALSE */
+ const ib_wqueue_t* wq); /* in: work queue */
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/********************************************************************
+Wait for a work item to appear in the queue for a specified time. */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+ /* out: work item or NULL on timeout*/
+ ib_wqueue_t* wq, /* in: work queue */
+ ib_time_t wait_in_usecs); /* in: wait time in microseconds */
+
+/* Work queue. */
+struct ib_wqueue_t {
+ ib_mutex_t mutex; /*!< mutex protecting everything */
+ ib_list_t* items; /*!< work item list */
+ os_event_t event; /*!< event we use to signal additions to list */
+};
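+
+/* A minimal producer/consumer sketch ("my_item" and "my_heap" are
+placeholders):
+
+	ib_wqueue_t*	wq = ib_wqueue_create();
+
+	producer:	ib_wqueue_add(wq, my_item, my_heap);
+	consumer:	void*	work = ib_wqueue_wait(wq);
+
+ib_wqueue_wait() blocks on wq->event until an item is available,
+while ib_wqueue_timedwait() returns NULL if no item appears within
+the given time. */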
+
+#endif
diff --git a/storage/innobase/lock/lock0iter.cc b/storage/innobase/lock/lock0iter.cc
new file mode 100644
index 00000000000..b424d2fc757
--- /dev/null
+++ b/storage/innobase/lock/lock0iter.cc
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0iter.cc
+Lock queue iterator. Can iterate over table and record
+lock queues.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "univ.i"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no) /*!< in: record number in the
+ heap */
+{
+ ut_ad(lock_mutex_own());
+
+ iter->current_lock = lock;
+
+ if (bit_no != ULINT_UNDEFINED) {
+
+ iter->bit_no = bit_no;
+ } else {
+
+ switch (lock_get_type_low(lock)) {
+ case LOCK_TABLE:
+ iter->bit_no = ULINT_UNDEFINED;
+ break;
+ case LOCK_REC:
+ iter->bit_no = lock_rec_find_set_bit(lock);
+ ut_a(iter->bit_no != ULINT_UNDEFINED);
+ break;
+ default:
+ ut_error;
+ }
+ }
+}
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+receded (if not-NULL is returned).
+@return previous lock or NULL */
+UNIV_INTERN
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter) /*!< in/out: iterator */
+{
+ const lock_t* prev_lock;
+
+ ut_ad(lock_mutex_own());
+
+ switch (lock_get_type_low(iter->current_lock)) {
+ case LOCK_REC:
+ prev_lock = lock_rec_get_prev(
+ iter->current_lock, iter->bit_no);
+ break;
+ case LOCK_TABLE:
+ prev_lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, iter->current_lock);
+ break;
+ default:
+ ut_error;
+ }
+
+ if (prev_lock != NULL) {
+
+ iter->current_lock = prev_lock;
+ }
+
+ return(prev_lock);
+}
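+
+/* A typical use of this iterator (a sketch, to be run while holding
+lock_sys->mutex; "lock" is e.g. a table lock, so that bit_no may be
+passed as ULINT_UNDEFINED):
+
+	lock_queue_iterator_t	iter;
+	const lock_t*		prev;
+
+	lock_queue_iterator_reset(&iter, lock, ULINT_UNDEFINED);
+
+	while ((prev = lock_queue_iterator_get_prev(&iter)) != NULL) {
+		(inspect the locks ahead of "lock" in the queue)
+	}
+*/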
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
new file mode 100644
index 00000000000..bf7ca1607d1
--- /dev/null
+++ b/storage/innobase/lock/lock0lock.cc
@@ -0,0 +1,7104 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0lock.cc
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+
+#ifdef UNIV_NONINL
+#include "lock0lock.ic"
+#include "lock0priv.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "usr0sess.h"
+#include "trx0purge.h"
+#include "dict0mem.h"
+#include "dict0boot.h"
+#include "trx0sys.h"
+#include "pars0pars.h" /* pars_complete_graph_for_exec() */
+#include "que0que.h" /* que_node_get_parent() */
+#include "row0mysql.h" /* row_mysql_handle_errors() */
+#include "row0sel.h" /* sel_node_create(), sel_node_t */
+#include "row0types.h" /* sel_node_t */
+#include "srv0mon.h"
+#include "ut0vec.h"
+#include "btr0btr.h"
+#include "dict0boot.h"
+#include <set>
+
+/* Restricts the length of search we will do in the waits-for
+graph of transactions */
+#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
+
+/* Restricts the search depth we will do in the waits-for graph of
+transactions */
+#define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200
+
+/* When releasing transaction locks, this specifies how often we release
+the lock mutex for a moment to give also others access to it */
+
+#define LOCK_RELEASE_INTERVAL 1000
+
+/* Safety margin when creating a new record lock: this many extra records
+can be inserted to the page without need to create a lock with a bigger
+bitmap */
+
+#define LOCK_PAGE_BITMAP_MARGIN 64
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user in terms of
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record, and thus may, or may
+not, bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without the need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieve the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or newly inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must
+check the lock on the record or on the next record. When a read cursor
+starts reading, we will set a record level s-lock on each record it
+passes, except on the initial record on which the cursor is positioned
+before we start to fetch records. Our index tree search has the
+convention that the B-tree cursor is positioned BEFORE the first
+possibly matching record in the search. Optimizations are possible
+here: if the record is searched on an equality condition to a unique
+key, we could actually set a special lock on the record, a lock which
+would not prevent any insert before this record. In next-key locking,
+an x-lock set on a record also prevents inserts just before that
+record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. Such a record cannot
+be updated, but the lock prevents the insert of a user record at the
+end of the page.
+ Next-key locks will prevent the phantom problem, where new rows
+could appear in SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock on the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should also see the newly inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the newly inserted record. This move backward may be too cumbersome to
+implement. If in this situation we just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+ We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+#define LOCK_STACK_SIZE OS_THREAD_MAX_N
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE},
+ /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE},
+ /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
+};
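+
+/* Reading the matrices, for example:
+lock_compatibility_matrix[LOCK_IX][LOCK_S] is FALSE, so a requested
+IX table lock conflicts with another transaction's granted S lock,
+while lock_strength_matrix[LOCK_X][LOCK_S] is TRUE, so a transaction
+that already holds X never needs to additionally acquire S on the
+same object (see lock_mode_compatible() and
+lock_mode_stronger_or_eq() below). */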
+
+/** Deadlock check context. */
+struct lock_deadlock_ctx_t {
+ const trx_t* start; /*!< Joining transaction that is
+ requesting a lock in an incompatible
+ mode */
+
+ const lock_t* wait_lock; /*!< Lock that trx wants */
+
+ ib_uint64_t mark_start; /*!< Value of lock_mark_count at
+ the start of the deadlock check. */
+
+ ulint depth; /*!< Stack depth */
+
+ ulint cost; /*!< Calculation steps thus far */
+
+ ibool too_deep; /*!< TRUE if search was too deep and
+ was aborted */
+};
+
+/** DFS visited node information used during deadlock checking. */
+struct lock_stack_t {
+ const lock_t* lock; /*!< Current lock */
+ const lock_t* wait_lock; /*!< Waiting for lock */
+ ulint heap_no; /*!< heap number if rec lock */
+};
+
+/** Stack to use during DFS search. Currently only a single stack is required
+because there is no parallel deadlock check. This stack is protected by
+the lock_sys_t::mutex. */
+static lock_stack_t* lock_stack;
+
+/** The count of the types of locks. */
+static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t lock_sys_mutex_key;
+/* Key to register mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t lock_sys_wait_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool lock_print_waits = FALSE;
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+bool
+lock_validate();
+/*============*/
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ const buf_block_t* block) /*!< in: buffer block */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+
+/* The lock system */
+UNIV_INTERN lock_sys_t* lock_sys = NULL;
+
+/** We store info on the latest deadlock error to this buffer. InnoDB
+Monitor will then fetch it and print */
+UNIV_INTERN ibool lock_deadlock_found = FALSE;
+/** Only created if !srv_read_only_mode */
+static FILE* lock_latest_err_file;
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found, this function will resolve the deadlock by choosing a victim
+transaction and rolling it back. It will attempt to resolve all deadlocks.
+The returned transaction id is the joining transaction id, or 0 if some
+other transaction was chosen as a victim and rolled back, or if no
+deadlock was found.
+
+@return id of transaction chosen as victim or 0 */
+static
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*===========================*/
+ const lock_t* lock, /*!< in: lock the transaction is requesting */
+ const trx_t* trx); /*!< in: transaction */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if not, also when i == ULINT_UNDEFINED */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ const byte* b;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ b = ((const byte*) &lock[1]) + (i / 8);
+
+ return(1 & *b >> (i % 8));
+}
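+
+/* For example, for heap number i = 10 the bit lives in the byte at
+offset 10 / 8 = 1 after the lock_t struct, under the mask
+1 << (10 % 8) = 4. lock_rec_set_nth_bit() and lock_rec_reset_nth_bit()
+below use the same byte/bit addressing. */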
+
+/*********************************************************************//**
+Reports that a transaction id is not sensible, i.e., it is in the future. */
+UNIV_INTERN
+void
+lock_report_trx_id_insanity(
+/*========================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ trx_id_t max_trx_id) /*!< in: trx_sys_get_max_trx_id() */
+{
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: transaction id associated with record\n",
+ stderr);
+ rec_print_new(stderr, rec, offsets);
+ fputs("InnoDB: in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fprintf(stderr, "\n"
+ "InnoDB: is " TRX_ID_FMT " which is higher than the"
+ " global trx id counter " TRX_ID_FMT "!\n"
+ "InnoDB: The table is corrupt. You have to do"
+ " dump + drop + reimport.\n",
+ trx_id, max_trx_id);
+}
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+#ifdef UNIV_DEBUG
+UNIV_INTERN
+#else
+static __attribute__((nonnull, warn_unused_result))
+#endif
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */
+{
+ bool is_ok;
+ trx_id_t max_trx_id;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ max_trx_id = trx_sys_get_max_trx_id();
+ is_ok = trx_id < max_trx_id;
+
+ if (UNIV_UNLIKELY(!is_ok)) {
+ lock_report_trx_id_insanity(trx_id,
+ rec, index, offsets, max_trx_id);
+ }
+
+ return(is_ok);
+}
+
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return true if the record is seen, or false if an earlier version of
+the record should be retrieved */
+UNIV_INTERN
+bool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ read_view_t* view) /*!< in: consistent read view */
+{
+ trx_id_t trx_id;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* NOTE that we call this function while holding the search
+ system latch. */
+
+ trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ return(read_view_sees_trx_id(view, trx_id));
+}
+
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even in the case of a false return value, the
+present version of rec may be the right one, but we must check this
+from the clustered index record.
+
+@return true if certainly sees, or false if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+bool
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const read_view_t* view) /*!< in: consistent read view */
+{
+ trx_id_t max_trx_id;
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ /* NOTE that we might call this function while holding the search
+ system latch. */
+
+ if (recv_recovery_is_on()) {
+
+ return(false);
+ }
+
+ max_trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(max_trx_id);
+
+ return(max_trx_id < view->up_limit_id);
+}
+
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells) /*!< in: number of slots in lock hash table */
+{
+ ulint lock_sys_sz;
+
+ lock_sys_sz = sizeof(*lock_sys)
+ + OS_THREAD_MAX_N * sizeof(srv_slot_t);
+
+ lock_sys = static_cast<lock_sys_t*>(mem_zalloc(lock_sys_sz));
+
+ lock_stack = static_cast<lock_stack_t*>(
+ mem_zalloc(sizeof(*lock_stack) * LOCK_STACK_SIZE));
+
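+	/* The srv_slot_t array for the waiting threads is laid out in
+	the same allocation, directly after the lock_sys_t struct
+	itself (see lock_sys_sz above); &lock_sys[1] is the first byte
+	past the struct. */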
+ void* ptr = &lock_sys[1];
+
+ lock_sys->waiting_threads = static_cast<srv_slot_t*>(ptr);
+
+ lock_sys->last_slot = lock_sys->waiting_threads;
+
+ mutex_create(lock_sys_mutex_key, &lock_sys->mutex, SYNC_LOCK_SYS);
+
+ mutex_create(lock_sys_wait_mutex_key,
+ &lock_sys->wait_mutex, SYNC_LOCK_WAIT_SYS);
+
+ lock_sys->timeout_event = os_event_create();
+
+ lock_sys->rec_hash = hash_create(n_cells);
+
+ if (!srv_read_only_mode) {
+ lock_latest_err_file = os_file_create_tmpfile();
+ ut_a(lock_latest_err_file);
+ }
+}
+
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void)
+/*================*/
+{
+ if (lock_latest_err_file != NULL) {
+ fclose(lock_latest_err_file);
+ lock_latest_err_file = NULL;
+ }
+
+ hash_table_free(lock_sys->rec_hash);
+
+ mutex_free(&lock_sys->mutex);
+ mutex_free(&lock_sys->wait_mutex);
+
+ mem_free(lock_stack);
+ mem_free(lock_sys);
+
+ lock_sys = NULL;
+ lock_stack = NULL;
+}
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void)
+/*===============*/
+{
+ return((ulint) sizeof(lock_t));
+}
+
+/*********************************************************************//**
+Gets the mode of a lock.
+@return mode */
+UNIV_INLINE
+enum lock_mode
+lock_get_mode(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
+}
+
+/*********************************************************************//**
+Gets the wait flag of a lock.
+@return LOCK_WAIT if waiting, 0 if not */
+UNIV_INLINE
+ulint
+lock_get_wait(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction. The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* dest, /*!< in: destination of ALTER TABLE */
+ enum lock_mode* mode) /*!< out: lock mode of the source table */
+{
+ dict_table_t* src;
+ lock_t* lock;
+
+ ut_ad(!lock_mutex_own());
+
+ src = NULL;
+ *mode = LOCK_NONE;
+
+ /* The trx mutex protects the trx_locks for our purposes.
+ Other transactions could want to convert one of our implicit
+ record locks to an explicit one. For that, they would need our
+ trx mutex. Waiting locks can be removed while only holding
+ lock_sys->mutex, but this is a running transaction and cannot
+ thus be holding any waiting locks. */
+ trx_mutex_enter(trx);
+
+ for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+ lock_table_t* tab_lock;
+ enum lock_mode lock_mode;
+ if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+ /* We are only interested in table locks. */
+ continue;
+ }
+ tab_lock = &lock->un_member.tab_lock;
+ if (dest == tab_lock->table) {
+ /* We are not interested in the destination table. */
+ continue;
+ } else if (!src) {
+ /* This presumably is the source table. */
+ src = tab_lock->table;
+ if (UT_LIST_GET_LEN(src->locks) != 1
+ || UT_LIST_GET_FIRST(src->locks) != lock) {
+ /* We only support the case when
+ there is only one lock on this table. */
+ src = NULL;
+ goto func_exit;
+ }
+ } else if (src != tab_lock->table) {
+ /* The transaction is locking more than
+ two tables (src and dest): abort */
+ src = NULL;
+ goto func_exit;
+ }
+
+ /* Check that the source table is locked by
+ LOCK_IX or LOCK_IS. */
+ lock_mode = lock_get_mode(lock);
+ if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) {
+ if (*mode != LOCK_NONE && *mode != lock_mode) {
+ /* There are multiple locks on src. */
+ src = NULL;
+ goto func_exit;
+ }
+ *mode = lock_mode;
+ }
+ }
+
+ if (!src) {
+ /* No source table lock found: flag the situation to caller */
+ src = dest;
+ }
+
+func_exit:
+ trx_mutex_exit(trx);
+ return(src);
+}
+
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., the transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ const trx_t* trx) /*!< in: transaction */
+{
+ const lock_t* lock;
+ ibool ok = FALSE;
+
+ ut_ad(table);
+ ut_ad(trx);
+
+ lock_mutex_enter();
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) {
+ if (lock->trx != trx) {
+ /* A lock on the table is held
+ by some other transaction. */
+ goto not_ok;
+ }
+
+ if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+ /* We are interested in table locks only. */
+ continue;
+ }
+
+ switch (lock_get_mode(lock)) {
+ case LOCK_IX:
+ ok = TRUE;
+ break;
+ case LOCK_AUTO_INC:
+ /* It is allowed for trx to hold an
+ auto_increment lock. */
+ break;
+ default:
+not_ok:
+ /* Other table locks than LOCK_IX are not allowed. */
+ ok = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ lock_mutex_exit();
+
+ return(ok);
+}
+
+/*********************************************************************//**
+Sets the wait flag of a lock and the back pointer in trx to lock. */
+UNIV_INLINE
+void
+lock_set_lock_and_trx_wait(
+/*=======================*/
+ lock_t* lock, /*!< in: lock */
+ trx_t* trx) /*!< in/out: trx */
+{
+ ut_ad(lock);
+ ut_ad(lock->trx == trx);
+ ut_ad(trx->lock.wait_lock == NULL);
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ trx->lock.wait_lock = lock;
+ lock->type_mode |= LOCK_WAIT;
+}
+
+/**********************************************************************//**
+The back pointer to a waiting lock request in the transaction is set to NULL
+and the wait bit in lock type_mode is reset. */
+UNIV_INLINE
+void
+lock_reset_lock_and_trx_wait(
+/*=========================*/
+ lock_t* lock) /*!< in/out: record lock */
+{
+ ut_ad(lock->trx->lock.wait_lock == lock);
+ ut_ad(lock_get_wait(lock));
+ ut_ad(lock_mutex_own());
+
+ lock->trx->lock.wait_lock = NULL;
+ lock->type_mode &= ~LOCK_WAIT;
+}
+
+/*********************************************************************//**
+Gets the gap flag of a record lock.
+@return LOCK_GAP or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_gap(
+/*=============*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_GAP);
+}
+
+/*********************************************************************//**
+Gets the LOCK_REC_NOT_GAP flag of a record lock.
+@return LOCK_REC_NOT_GAP or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_rec_not_gap(
+/*=====================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_REC_NOT_GAP);
+}
+
+/*********************************************************************//**
+Gets the waiting insert flag of a record lock.
+@return LOCK_INSERT_INTENTION or 0 */
+UNIV_INLINE
+ulint
+lock_rec_get_insert_intention(
+/*==========================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->type_mode & LOCK_INSERT_INTENTION);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger than or equal to lock mode 2.
+@return nonzero if mode1 is stronger than or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_strength_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Checks if a lock request for a new lock has to wait for request lock2.
+@return TRUE if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+ibool
+lock_rec_has_to_wait(
+/*=================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ ulint type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
+ const lock_t* lock2, /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+ ibool lock_is_on_supremum) /*!< in: TRUE if we are setting the
+ lock on the 'supremum' record of an
+ index page: we know then that the lock
+ request is really for a 'gap' type lock */
+{
+ ut_ad(trx && lock2);
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+ if (trx != lock2->trx
+ && !lock_mode_compatible(static_cast<enum lock_mode>(
+ LOCK_MODE_MASK & type_mode),
+ lock_get_mode(lock2))) {
+
+ /* We have somewhat complex rules when gap type record locks
+ cause waits */
+
+ if ((lock_is_on_supremum || (type_mode & LOCK_GAP))
+ && !(type_mode & LOCK_INSERT_INTENTION)) {
+
+ /* Gap type locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on gaps. */
+
+ return(FALSE);
+ }
+
+ if (!(type_mode & LOCK_INSERT_INTENTION)
+ && lock_rec_get_gap(lock2)) {
+
+ /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP)
+ does not need to wait for a gap type lock */
+
+ return(FALSE);
+ }
+
+ if ((type_mode & LOCK_GAP)
+ && lock_rec_get_rec_not_gap(lock2)) {
+
+ /* Lock on gap does not need to wait for
+ a LOCK_REC_NOT_GAP type lock */
+
+ return(FALSE);
+ }
+
+ if (lock_rec_get_insert_intention(lock2)) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This is ok since our
+ rules allow conflicting locks on gaps. This eliminates
+ a spurious deadlock caused by a next-key lock waiting
+ for an insert intention lock; when the insert
+ intention lock was granted, the insert deadlocked on
+ the waiting next-key lock.
+
+ Also, insert intention locks do not disturb each
+ other. */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
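+
+/* Example (illustrative only): suppose lock2 is another transaction's
+granted LOCK_X | LOCK_GAP. A new LOCK_X | LOCK_GAP request does not
+wait, because conflicting gap type locks may coexist:
+
+	lock_rec_has_to_wait(trx, LOCK_X | LOCK_GAP, lock2, FALSE)
+		returns FALSE
+
+but an insert into that gap must wait:
+
+	lock_rec_has_to_wait(trx, LOCK_X | LOCK_GAP
+			     | LOCK_INSERT_INTENTION, lock2, FALSE)
+		returns TRUE
+*/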
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2) /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+{
+ ut_ad(lock1 && lock2);
+
+ if (lock1->trx != lock2->trx
+ && !lock_mode_compatible(lock_get_mode(lock1),
+ lock_get_mode(lock2))) {
+ if (lock_get_type_low(lock1) == LOCK_REC) {
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+ /* If this lock request is for a supremum record
+ then the second bit on the lock bitmap is set */
+
+ return(lock_rec_has_to_wait(lock1->trx,
+ lock1->type_mode, lock2,
+ lock_rec_get_nth_bit(
+ lock1, 1)));
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*============== RECORD LOCK BASIC FUNCTIONS ============================*/
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+UNIV_INLINE
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+ ((byte*) &lock[1])[byte_index] |= 1 << bit_index;
+}
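+
+/* Example (illustrative only): the bitmap resides immediately after
+the lock_t struct, one bit per heap number. For heap_no 10 the call
+touches byte 10 / 8 == 1 and bit 10 % 8 == 2:
+
+	((byte*) &lock[1])[1] |= 1 << 2;
+*/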
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock) /*!< in: record lock with at least one bit set */
+{
+ ulint i;
+
+ for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (lock_rec_get_nth_bit(lock, i)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Resets the nth bit of a record lock. */
+UNIV_INLINE
+void
+lock_rec_reset_nth_bit(
+/*===================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit; the bit is
+ assumed to be set when this
+ function is called */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+ ((byte*) &lock[1])[byte_index] &= ~(1 << bit_index);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock) /*!< in: a record lock */
+{
+ ulint space;
+ ulint page_no;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ space = lock->un_member.rec_lock.space;
+ page_no = lock->un_member.rec_lock.page_no;
+
+ for (;;) {
+ lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock));
+
+ if (!lock) {
+
+ break;
+ }
+
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)) {
+
+ break;
+ }
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ return((lock_t*) lock_rec_get_next_on_page_const(lock));
+}
+
+/*********************************************************************//**
+Gets the first record lock on a page, where the page is identified by its
+file address.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page_addr(
+/*============================*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = static_cast<lock_t*>(
+ HASH_GET_FIRST(lock_sys->rec_hash,
+ lock_rec_hash(space, page_no)));
+ lock != NULL;
+ lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+ if (lock->un_member.rec_lock.space == space
+ && lock->un_member.rec_lock.page_no == page_no) {
+
+ break;
+ }
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Determines if there are explicit record locks on a page.
+@return an explicit record lock on the page, or NULL if there are none */
+UNIV_INTERN
+lock_t*
+lock_rec_expl_exist_on_page(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ lock_t* lock;
+
+ lock_mutex_enter();
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+ lock_mutex_exit();
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the first record lock on a page, where the page is identified by a
+pointer to it.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page(
+/*=======================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint hash;
+ lock_t* lock;
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+
+ ut_ad(lock_mutex_own());
+
+ hash = buf_block_get_lock_hash_val(block);
+
+ for (lock = static_cast<lock_t*>(
+ HASH_GET_FIRST(lock_sys->rec_hash, hash));
+ lock != NULL;
+ lock = static_cast<lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)) {
+
+ break;
+ }
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock_mutex_own());
+
+ do {
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock_rec_get_next(heap_no, (lock_t*) lock));
+}
+
+/*********************************************************************//**
+Gets the first explicit lock request on a record.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first(
+/*===============*/
+ const buf_block_t* block, /*!< in: block containing the record */
+ ulint heap_no)/*!< in: heap number of the record */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ break;
+ }
+ }
+
+ return(lock);
+}
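+
+/* Example (illustrative only): the usual idiom for walking the lock
+queue of one record, used throughout this file, is:
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+		...
+	}
+*/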
+
+/*********************************************************************//**
+Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock
+pointer in the transaction! This function is used in lock object creation
+and resetting. */
+static
+void
+lock_rec_bitmap_reset(
+/*==================*/
+ lock_t* lock) /*!< in: record lock */
+{
+ ulint n_bytes;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset to zero the bitmap which resides immediately after the lock
+ struct */
+
+ n_bytes = lock_rec_get_n_bits(lock) / 8;
+
+ ut_ad((lock_rec_get_n_bits(lock) % 8) == 0);
+
+ memset(&lock[1], 0, n_bytes);
+}
+
+/*********************************************************************//**
+Copies a record lock to heap.
+@return copy of lock */
+static
+lock_t*
+lock_rec_copy(
+/*==========*/
+ const lock_t* lock, /*!< in: record lock */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint size;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
+
+ return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size)));
+}
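+
+/* Example (illustrative only): a lock whose bitmap holds 136 bits is
+duplicated by mem_heap_dup() as a single block of
+sizeof(lock_t) + 136 / 8 == sizeof(lock_t) + 17 bytes, struct and
+bitmap together. */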
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no)/*!< in: heap number of the record */
+{
+ lock_t* lock;
+ ulint space;
+ ulint page_no;
+ lock_t* found_lock = NULL;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+ /* No op */;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ ut_ad(lock);
+
+ if (lock == in_lock) {
+
+ return(found_lock);
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+
+ found_lock = lock;
+ }
+ }
+}
+
+/*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode) /*!< in: lock mode */
+{
+ lint i;
+
+ if (ib_vector_is_empty(trx->lock.table_locks)) {
+ return(NULL);
+ }
+
+ /* Look for equal or stronger locks the same trx already has on
+ the table */
+
+ for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+ const lock_t* lock;
+ enum lock_mode lock_mode;
+
+ lock = *static_cast<const lock_t**>(
+ ib_vector_get(trx->lock.table_locks, i));
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ lock_mode = lock_get_mode(lock);
+
+ ut_ad(trx == lock->trx);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(lock->un_member.tab_lock.table != NULL);
+
+ if (table == lock->un_member.tab_lock.table
+ && lock_mode_stronger_or_eq(lock_mode, mode)) {
+
+ ut_ad(!lock_get_wait(lock));
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
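+
+/* Example (illustrative only, using the lock_strength_matrix behind
+lock_mode_stronger_or_eq()): LOCK_X is stronger than or equal to every
+mode, so a transaction holding LOCK_X on a table satisfies
+lock_table_has(trx, table, LOCK_IS), whereas LOCK_IS is stronger than
+or equal to LOCK_IS only. */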
+
+/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED explicit lock on rec stronger
+than or equal to precise_mode.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_has_expl(
+/*==============*/
+ ulint precise_mode,/*!< in: LOCK_S or LOCK_X
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP; for a
+ supremum record we always
+ regard this as a gap type
+ request */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock->trx == trx
+ && !lock_rec_get_insert_intention(lock)
+ && lock_mode_stronger_or_eq(
+ lock_get_mode(lock),
+ static_cast<enum lock_mode>(
+ precise_mode & LOCK_MODE_MASK))
+ && !lock_get_wait(lock)
+ && (!lock_rec_get_rec_not_gap(lock)
+ || (precise_mode & LOCK_REC_NOT_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)
+ && (!lock_rec_get_gap(lock)
+ || (precise_mode & LOCK_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
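+
+/* Example (illustrative only): a granted next-key LOCK_X (neither
+LOCK_GAP nor LOCK_REC_NOT_GAP set) satisfies any request here, since
+it is stronger than or equal to LOCK_S and covers both the record and
+the gap. A granted LOCK_X | LOCK_REC_NOT_GAP does not satisfy a
+request for LOCK_S | LOCK_GAP on a user record, because it does not
+cover the gap. */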
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks if some other transaction has a lock request in the queue.
+@return lock or NULL */
+static
+const lock_t*
+lock_rec_other_has_expl_req(
+/*========================*/
+ enum lock_mode mode, /*!< in: LOCK_S or LOCK_X */
+ ulint gap, /*!< in: LOCK_GAP if also gap
+ locks are taken into account,
+ or 0 if not */
+ ulint wait, /*!< in: LOCK_WAIT if also
+ waiting locks are taken into
+ account, or 0 if not */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: transaction, or NULL if
+ requests by all transactions
+ are taken into account */
+{
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+ ut_ad(gap == 0 || gap == LOCK_GAP);
+ ut_ad(wait == 0 || wait == LOCK_WAIT);
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+
+ if (lock->trx != trx
+ && (gap
+ || !(lock_rec_get_gap(lock)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM))
+ && (wait || !lock_get_wait(lock))
+ && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting explicit lock request
+in the queue, so that we have to wait.
+@return lock or NULL */
+static
+const lock_t*
+lock_rec_other_has_conflicting(
+/*===========================*/
+ enum lock_mode mode, /*!< in: LOCK_S or LOCK_X,
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP, or
+ LOCK_INSERT_INTENTION */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: our transaction */
+{
+ const lock_t* lock;
+ ibool is_supremum;
+
+ ut_ad(lock_mutex_own());
+
+ is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+
+ if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Looks for a suitable type record lock struct by the same trx on the same page.
+This can be used to save space when a new record lock should be set on a page:
+no new struct is needed if a suitable old one is found.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_find_similar_on_page(
+/*==========================*/
+ ulint type_mode, /*!< in: lock type_mode field */
+ ulint heap_no, /*!< in: heap number of the record */
+ lock_t* lock, /*!< in: lock_rec_get_first_on_page() */
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(lock_mutex_own());
+
+ for (/* No op */;
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock->trx == trx
+ && lock->type_mode == type_mode
+ && lock_rec_get_n_bits(lock) > heap_no) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a secondary
+index.
+@return transaction id of the transaction which has the x-lock, or 0;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active(). */
+static
+trx_id_t
+lock_sec_rec_some_has_impl(
+/*=======================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_id_t trx_id;
+ trx_id_t max_trx_id;
+ const page_t* page = page_align(rec);
+
+ ut_ad(!lock_mutex_own());
+ ut_ad(!mutex_own(&trx_sys->mutex));
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ max_trx_id = page_get_max_trx_id(page);
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list, or
+ database recovery is running. Changes to the max trx id of a page
+ are not written to the log, and therefore during recovery this
+ value for a page may be stale. */
+
+ if (max_trx_id < trx_rw_min_trx_id() && !recv_recovery_is_on()) {
+
+ trx_id = 0;
+
+ } else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
+
+ buf_page_print(page, 0, 0);
+
+ /* The page is corrupt: try to avoid a crash by returning 0 */
+ trx_id = 0;
+
+ } else {
+ /* In this case it is possible that some transaction
+ has an implicit x-lock. We have to look in the
+ clustered index. */
+
+ trx_id = row_vers_impl_x_locked(rec, index, offsets);
+ }
+
+ return(trx_id);
+}
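+
+/* Example (illustrative only): if the max trx id stored on the page
+is 100 while trx_rw_min_trx_id() returns 150 and no recovery is in
+progress, then no active transaction can have modified the page, so
+the function returns 0 without consulting the clustered index. */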
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks if some transaction, other than the one with the given trx_id,
+has an explicit lock on the given rec in the given precise_mode.
+@return a transaction whose id differs from trx_id and which holds an
+explicit lock on the given rec in the given precise_mode, or NULL */
+static
+trx_t*
+lock_rec_other_trx_holds_expl(
+/*==========================*/
+ ulint precise_mode, /*!< in: LOCK_S or LOCK_X
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP. */
+ trx_id_t trx_id, /*!< in: trx holding implicit
+ lock on rec */
+ const rec_t* rec, /*!< in: user record */
+ const buf_block_t* block) /*!< in: buffer block
+ containing the record */
+{
+ trx_t* holds = NULL;
+
+ lock_mutex_enter();
+
+ if (trx_t *impl_trx = trx_rw_is_active(trx_id, NULL)) {
+ ulint heap_no = page_rec_get_heap_no(rec);
+ mutex_enter(&trx_sys->mutex);
+
+ for (trx_t* t = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ t != NULL;
+ t = UT_LIST_GET_NEXT(trx_list, t)) {
+
+ lock_t *expl_lock = lock_rec_has_expl(
+ precise_mode, block, heap_no, t);
+
+ if (expl_lock && expl_lock->trx != impl_trx) {
+ /* An explicit lock is held by trx other than
+ the trx holding the implicit lock. */
+ holds = expl_lock->trx;
+ break;
+ }
+ }
+
+ mutex_exit(&trx_sys->mutex);
+ }
+
+ lock_mutex_exit();
+
+ return(holds);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise.
+The caller must be holding lock_sys->mutex. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ const trx_lock_t* trx_lock) /*!< in: transaction locks */
+{
+ const lock_t* lock;
+ ulint n_records = 0;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ ulint n_bit;
+ ulint n_bits = lock_rec_get_n_bits(lock);
+
+ for (n_bit = 0; n_bit < n_bits; n_bit++) {
+ if (lock_rec_get_nth_bit(lock, n_bit)) {
+ n_records++;
+ }
+ }
+ }
+ }
+
+ return(n_records);
+}
+
+/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+static
+lock_t*
+lock_rec_create(
+/*============*/
+ ulint type_mode,/*!< in: lock mode and wait
+ flag, type is ignored and
+ replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in/out: transaction */
+ ibool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns
+ trx mutex */
+{
+ lock_t* lock;
+ ulint page_no;
+ ulint space;
+ ulint n_bits;
+ ulint n_bytes;
+ const page_t* page;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ /* Non-locking autocommit read-only transactions should not set
+ any locks. */
+ assert_trx_in_list(trx);
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ page = block->frame;
+
+ btr_assert_not_corrupted(block, index);
+
+ /* If rec is the supremum record, then we reset the gap and
+ LOCK_REC_NOT_GAP bits, as all locks on the supremum are
+ automatically of the gap type */
+
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ /* Make lock bitmap bigger by a safety margin */
+ n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
+ n_bytes = 1 + n_bits / 8;
+
+ lock = static_cast<lock_t*>(
+ mem_heap_alloc(trx->lock.lock_heap, sizeof(lock_t) + n_bytes));
+
+ lock->trx = trx;
+
+ lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC;
+ lock->index = index;
+
+ lock->un_member.rec_lock.space = space;
+ lock->un_member.rec_lock.page_no = page_no;
+ lock->un_member.rec_lock.n_bits = n_bytes * 8;
+
+ /* Reset to zero the bitmap which resides immediately after the
+ lock struct */
+
+ lock_rec_bitmap_reset(lock);
+
+ /* Set the bit corresponding to rec */
+ lock_rec_set_nth_bit(lock, heap_no);
+
+ index->table->n_rec_locks++;
+
+ ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted);
+
+ HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), lock);
+
+ if (!caller_owns_trx_mutex) {
+ trx_mutex_enter(trx);
+ }
+ ut_ad(trx_mutex_own(trx));
+
+ if (type_mode & LOCK_WAIT) {
+
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+
+ UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+
+ if (!caller_owns_trx_mutex) {
+ trx_mutex_exit(trx);
+ }
+
+ MONITOR_INC(MONITOR_RECLOCK_CREATED);
+ MONITOR_INC(MONITOR_NUM_RECLOCK);
+
+ return(lock);
+}
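+
+/* Example (illustrative only, assuming LOCK_PAGE_BITMAP_MARGIN is
+64): for a page with page_dir_get_n_heap() == 64 the lock is created
+with n_bits = 64 + 64 = 128 and n_bytes = 1 + 128 / 8 = 17, so
+sizeof(lock_t) + 17 bytes are taken from trx->lock.lock_heap; the
+margin leaves bitmap room for records inserted after the lock was
+created. */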
+
+/*********************************************************************//**
+Enqueues a waiting request for a lock which cannot be granted immediately.
+Checks for deadlocks.
+@return DB_LOCK_WAIT, DB_DEADLOCK, DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that
+there was a deadlock, but another transaction was chosen as the victim,
+and we got the lock immediately: no need to wait then */
+static
+dberr_t
+lock_rec_enqueue_waiting(
+/*=====================*/
+ ulint type_mode,/*!< in: lock mode this
+ transaction is requesting:
+ LOCK_S or LOCK_X, possibly
+ ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ lock_t* lock;
+ trx_id_t victim_trx_id;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ trx = thr_get_trx(thr);
+
+ ut_ad(trx_mutex_own(trx));
+
+ /* Test if there already is some other reason to suspend thread:
+ we do not enqueue a lock request if the query thread should be
+ stopped anyway */
+
+ if (que_thr_stop(thr)) {
+ ut_error;
+
+ return(DB_QUE_THR_SUSPENDED);
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: a record lock wait happens"
+ " in a dictionary operation!\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs(".\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ stderr);
+ ut_ad(0);
+ }
+
+ /* Enqueue the lock request that will wait to be granted; note
+ that we already own the trx mutex. */
+ lock = lock_rec_create(
+ type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE);
+
+ /* Release the mutex to obey the latching order.
+ This is safe, because lock_deadlock_check_and_resolve()
+ is invoked when a lock wait is enqueued for the currently
+ running transaction. Because trx is a running transaction
+ (it is not currently suspended because of a lock wait),
+ its state can only be changed by this thread, which is
+ currently associated with the transaction. */
+
+ trx_mutex_exit(trx);
+
+ victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+ trx_mutex_enter(trx);
+
+ if (victim_trx_id != 0) {
+
+ ut_ad(victim_trx_id == trx->id);
+
+ lock_reset_lock_and_trx_wait(lock);
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ return(DB_DEADLOCK);
+
+ } else if (trx->lock.wait_lock == NULL) {
+
+ /* If there was a deadlock but we chose another
+ transaction as a victim, it is possible that we
+ already have the lock now granted! */
+
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+ trx->lock.was_chosen_as_deadlock_victim = FALSE;
+ trx->lock.wait_started = ut_time();
+
+ ut_a(que_thr_stop(thr));
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ",
+ trx->id);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ }
+#endif /* UNIV_DEBUG */
+
+ MONITOR_INC(MONITOR_LOCKREC_WAIT);
+
+ return(DB_LOCK_WAIT);
+}
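+
+/* Example (illustrative only): a caller typically dispatches on the
+result: DB_LOCK_WAIT suspends the query thread until the lock is
+granted or the wait times out, DB_DEADLOCK makes this transaction
+roll back as the deadlock victim, and DB_SUCCESS_LOCKED_REC means the
+lock was granted after all because some other transaction was chosen
+as the victim. */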
+
+/*********************************************************************//**
+Adds a record lock request in the record queue. The request is normally
+added as the last in the queue, but if there are no waiting lock requests
+on the record, and the request to be added is not a waiting request, we
+can reuse a suitable record lock object already existing on the same page,
+just setting the appropriate bit in its bitmap. This is a low-level function
+which does NOT check for deadlocks or lock compatibility!
+@return lock where the bit was set */
+static
+lock_t*
+lock_rec_add_to_queue(
+/*==================*/
+ ulint type_mode,/*!< in: lock mode, wait, gap
+ etc. flags; type is ignored
+ and replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in/out: transaction */
+ ibool caller_owns_trx_mutex)
+ /*!< in: TRUE if caller owns the
+ transaction mutex */
+{
+ lock_t* lock;
+ lock_t* first_lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
+ ut_ad(dict_index_is_clust(index)
+ || dict_index_get_online_status(index) != ONLINE_INDEX_CREATION);
+#ifdef UNIV_DEBUG
+ switch (type_mode & LOCK_MODE_MASK) {
+ case LOCK_X:
+ case LOCK_S:
+ break;
+ default:
+ ut_error;
+ }
+
+ if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) {
+ enum lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
+ ? LOCK_X
+ : LOCK_S;
+ const lock_t* other_lock
+ = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
+ block, heap_no, trx);
+ ut_a(!other_lock);
+ }
+#endif /* UNIV_DEBUG */
+
+ type_mode |= LOCK_REC;
+
+ /* If rec is the supremum record, then we can reset the gap bit, as
+ all locks on the supremum are automatically of the gap type, and we
+ try to avoid unnecessary memory consumption of a new record lock
+ struct for a gap type lock */
+
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+ /* There should never be LOCK_REC_NOT_GAP on a supremum
+ record, but let us play safe */
+
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ /* Look for a waiting lock request on the same record or on a gap */
+
+ for (first_lock = lock = lock_rec_get_first_on_page(block);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock_get_wait(lock)
+ && lock_rec_get_nth_bit(lock, heap_no)) {
+
+ goto somebody_waits;
+ }
+ }
+
+ if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) {
+
+ /* Look for a similar record lock on the same page:
+ if one is found and there are no waiting lock requests,
+ we can just set the bit */
+
+ lock = lock_rec_find_similar_on_page(
+ type_mode, heap_no, first_lock, trx);
+
+ if (lock) {
+
+ lock_rec_set_nth_bit(lock, heap_no);
+
+ return(lock);
+ }
+ }
+
+somebody_waits:
+ return(lock_rec_create(
+ type_mode, block, heap_no, index, trx,
+ caller_owns_trx_mutex));
+}
+
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+/*********************************************************************//**
+This is a fast routine for locking a record in the most common cases:
+there are no explicit locks on the page, or there is just one lock, owned
+by this transaction, and of the right type_mode. This is a low-level function
+which does NOT look at implicit locks! Checks lock compatibility within
+explicit locks. This function sets a normal next-key lock, or in the case of
+a page supremum record, a gap type lock.
+@return whether the locking succeeded */
+UNIV_INLINE
+enum lock_rec_req_status
+lock_rec_lock_fast(
+/*===============*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ lock_t* lock;
+ trx_t* trx;
+ enum lock_rec_req_status status = LOCK_REC_SUCCESS;
+
+ ut_ad(lock_mutex_own());
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ DBUG_EXECUTE_IF("innodb_report_deadlock", return(LOCK_REC_FAIL););
+
+ lock = lock_rec_get_first_on_page(block);
+
+ trx = thr_get_trx(thr);
+
+ if (lock == NULL) {
+ if (!impl) {
+ /* Note that we don't own the trx mutex. */
+ lock = lock_rec_create(
+ mode, block, heap_no, index, trx, FALSE);
+
+ }
+ status = LOCK_REC_SUCCESS_CREATED;
+ } else {
+ trx_mutex_enter(trx);
+
+ if (lock_rec_get_next_on_page(lock)
+ || lock->trx != trx
+ || lock->type_mode != (mode | LOCK_REC)
+ || lock_rec_get_n_bits(lock) <= heap_no) {
+
+ status = LOCK_REC_FAIL;
+ } else if (!impl) {
+ /* If the nth bit of the record lock is already set
+ then we do not set a new lock bit, otherwise we
+ set it */
+ if (!lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_set_nth_bit(lock, heap_no);
+ status = LOCK_REC_SUCCESS_CREATED;
+ }
+ }
+
+ trx_mutex_exit(trx);
+ }
+
+ return(status);
+}
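+
+/* Example (illustrative only): the fast path succeeds when the page
+carries exactly one lock, owned by the requesting transaction, with
+exactly the requested type_mode and a bitmap wide enough for heap_no;
+locking one more record is then a single bit set. Any other queue
+shape returns LOCK_REC_FAIL and the caller falls back to
+lock_rec_lock_slow(). */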
+
+/*********************************************************************//**
+This is the general, and slower, routine for locking a record. This is a
+low-level function which does NOT look at implicit locks! Checks lock
+compatibility within explicit locks. This function sets a normal next-key
+lock, or in the case of a page supremum record, a gap type lock.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+dberr_t
+lock_rec_lock_slow(
+/*===============*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(lock_mutex_own());
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ DBUG_EXECUTE_IF("innodb_report_deadlock", return(DB_DEADLOCK););
+
+ trx = thr_get_trx(thr);
+ trx_mutex_enter(trx);
+
+ if (lock_rec_has_expl(mode, block, heap_no, trx)) {
+
+ /* The trx already has a strong enough lock on rec: do
+ nothing */
+
+ } else if (lock_rec_other_has_conflicting(
+ static_cast<enum lock_mode>(mode),
+ block, heap_no, trx)) {
+
+ /* If another transaction has a non-gap conflicting
+ request in the queue, and this transaction does not
+ already have a sufficiently strong lock granted on
+ the record, we have to wait. */
+
+ err = lock_rec_enqueue_waiting(
+ mode, block, heap_no, index, thr);
+
+ } else if (!impl) {
+ /* Set the requested lock on the record, note that
+ we already own the transaction mutex. */
+
+ lock_rec_add_to_queue(
+ LOCK_REC | mode, block, heap_no, index, trx, TRUE);
+
+ err = DB_SUCCESS_LOCKED_REC;
+ }
+
+ trx_mutex_exit(trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Tries to lock the specified record in the mode requested. If not immediately
+possible, enqueues a waiting lock request. This is a low-level function
+which does NOT look at implicit locks! Checks lock compatibility within
+explicit locks. This function sets a normal next-key lock, or in the case
+of a page supremum record, a gap type lock.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+dberr_t
+lock_rec_lock(
+/*==========*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0);
+ ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
+
+ /* We try a simplified and faster subroutine for the most
+ common cases */
+ switch (lock_rec_lock_fast(impl, mode, block, heap_no, index, thr)) {
+ case LOCK_REC_SUCCESS:
+ return(DB_SUCCESS);
+ case LOCK_REC_SUCCESS_CREATED:
+ return(DB_SUCCESS_LOCKED_REC);
+ case LOCK_REC_FAIL:
+ return(lock_rec_lock_slow(impl, mode, block,
+ heap_no, index, thr));
+ }
+
+ ut_error;
+ return(DB_ERROR);
+}
+
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in the queue.
+@return lock that is causing the wait */
+static
+const lock_t*
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+ const lock_t* wait_lock) /*!< in: waiting record lock */
+{
+ const lock_t* lock;
+ ulint space;
+ ulint page_no;
+ ulint heap_no;
+ ulint bit_mask;
+ ulint bit_offset;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_wait(wait_lock));
+ ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
+
+ space = wait_lock->un_member.rec_lock.space;
+ page_no = wait_lock->un_member.rec_lock.page_no;
+ heap_no = lock_rec_find_set_bit(wait_lock);
+
+ bit_offset = heap_no / 8;
+ bit_mask = static_cast<ulint>(1 << (heap_no % 8));
+
+ for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+ lock != wait_lock;
+ lock = lock_rec_get_next_on_page_const(lock)) {
+
+ const byte* p = (const byte*) &lock[1];
+
+ if (heap_no < lock_rec_get_n_bits(lock)
+ && (p[bit_offset] & bit_mask)
+ && lock_has_to_wait(wait_lock, lock)) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
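+
+/* Example (illustrative only): for heap_no 9 the precomputed values
+are bit_offset == 9 / 8 == 1 and bit_mask == 1 << (9 % 8) == 0x02, so
+the test (p[1] & 0x02) examines the same bit that
+lock_rec_set_nth_bit() would set for heap_no 9. */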
+
+/*************************************************************//**
+Grants a lock to a waiting lock request and releases the waiting transaction.
+The caller must hold lock_sys->mutex but not lock->trx->mutex. */
+static
+void
+lock_grant(
+/*=======*/
+ lock_t* lock) /*!< in/out: waiting lock request */
+{
+ ut_ad(lock_mutex_own());
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ trx_mutex_enter(lock->trx);
+
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ dict_table_t* table = lock->un_member.tab_lock.table;
+
+ if (UNIV_UNLIKELY(table->autoinc_trx == lock->trx)) {
+ fprintf(stderr,
+ "InnoDB: Error: trx already had"
+ " an AUTO-INC lock!\n");
+ } else {
+ table->autoinc_trx = lock->trx;
+
+ ib_vector_push(lock->trx->autoinc_locks, &lock);
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " ends\n",
+ lock->trx->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* If we are resolving a deadlock by choosing another transaction
+ as a victim, then our original transaction may not be in the
+ TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
+ for it */
+
+ if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ que_thr_t* thr;
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+ }
+
+ trx_mutex_exit(lock->trx);
+}
+
+/*************************************************************//**
+Cancels a waiting record lock request and releases the waiting transaction
+that requested it. NOTE: does NOT check if waiting lock requests behind this
+one can now be granted! */
+static
+void
+lock_rec_cancel(
+/*============*/
+ lock_t* lock) /*!< in: waiting record lock request */
+{
+ que_thr_t* thr;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset the bit (there can be only one set bit) in the lock bitmap */
+ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock));
+
+ /* Reset the wait flag and the back pointer to lock in trx */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait */
+
+ trx_mutex_enter(lock->trx);
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+
+ trx_mutex_exit(lock->trx);
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue and
+grants locks to other transactions in the queue if they are now entitled
+to a lock. NOTE: all record locks contained in in_lock are removed. */
+static
+void
+lock_rec_dequeue_from_page(
+/*=======================*/
+ lock_t* in_lock) /*!< in: record lock object: all
+ record locks which are contained in
+ this lock object are removed;
+ transactions waiting behind will
+ get their lock requests granted,
+ if they are now qualified to it */
+{
+ ulint space;
+ ulint page_no;
+ lock_t* lock;
+ trx_lock_t* trx_lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+ /* We may or may not be holding in_lock->trx->mutex here. */
+
+ trx_lock = &in_lock->trx->lock;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ in_lock->index->table->n_rec_locks--;
+
+ HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), in_lock);
+
+ UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+ MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_RECLOCK);
+
+ /* Check if waiting locks in the queue can now be granted: grant
+ locks if there are no conflicting locks ahead. Stop at the first
+ X lock that is waiting or has been granted. */
+
+ for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+ lock_grant(lock);
+ }
+ }
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue. */
+static
+void
+lock_rec_discard(
+/*=============*/
+ lock_t* in_lock) /*!< in: record lock object: all
+ record locks which are contained
+ in this lock object are removed */
+{
+ ulint space;
+ ulint page_no;
+ trx_lock_t* trx_lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ trx_lock = &in_lock->trx->lock;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ in_lock->index->table->n_rec_locks--;
+
+ HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), in_lock);
+
+ UT_LIST_REMOVE(trx_locks, trx_lock->trx_locks, in_lock);
+
+ MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_RECLOCK);
+}
+
+/*************************************************************//**
+Removes record lock objects set on an index page which is discarded. This
+function does not move locks or check for waiting locks; therefore the
+lock bitmaps must already be reset when this function is called. */
+static
+void
+lock_rec_free_all_from_discard_page(
+/*================================*/
+ const buf_block_t* block) /*!< in: page to be discarded */
+{
+ ulint space;
+ ulint page_no;
+ lock_t* lock;
+ lock_t* next_lock;
+
+ ut_ad(lock_mutex_own());
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ while (lock != NULL) {
+ ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ ut_ad(!lock_get_wait(lock));
+
+ next_lock = lock_rec_get_next_on_page(lock);
+
+ lock_rec_discard(lock);
+
+ lock = next_lock;
+ }
+}
+
+/*============= RECORD LOCK MOVING AND INHERITING ===================*/
+
+/*************************************************************//**
+Resets the lock bits for a single record. Releases transactions waiting for
+lock requests here. */
+static
+void
+lock_rec_reset_and_release_wait(
+/*============================*/
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no)/*!< in: heap number of record */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (lock_get_wait(lock)) {
+ lock_rec_cancel(lock);
+ } else {
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
+ }
+}
+
+/*************************************************************//**
+Makes a record inherit the locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of
+the other record. Also waiting lock requests on rec are inherited as
+GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap(
+/*====================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ /* If srv_locks_unsafe_for_binlog is TRUE or the session is using
+ the READ COMMITTED isolation level, we do not want locks set
+ by an UPDATE or a DELETE to be inherited as gap type locks. But we
+ DO want S-locks set by a consistency constraint to be inherited
+ even then. */
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (!lock_rec_get_insert_intention(lock)
+ && !((srv_locks_unsafe_for_binlog
+ || lock->trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && lock_get_mode(lock) == LOCK_X)) {
+
+ lock_rec_add_to_queue(
+ LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+ heir_block, heir_heap_no, lock->index,
+ lock->trx, FALSE);
+ }
+ }
+}
+
+/*************************************************************//**
+Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of the
+other record. Also waiting lock requests are inherited as GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap_if_gap_lock(
+/*================================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ ulint heir_heap_no, /*!< in: heap_no of
+ record which inherits */
+ ulint heap_no) /*!< in: heap_no of record
+ from which inherited;
+ does NOT reset the locks
+ on this record */
+{
+ lock_t* lock;
+
+ lock_mutex_enter();
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+
+ if (!lock_rec_get_insert_intention(lock)
+ && (heap_no == PAGE_HEAP_NO_SUPREMUM
+ || !lock_rec_get_rec_not_gap(lock))) {
+
+ lock_rec_add_to_queue(
+ LOCK_REC | LOCK_GAP | lock_get_mode(lock),
+ block, heir_heap_no, lock->index,
+ lock->trx, FALSE);
+ }
+ }
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+static
+void
+lock_rec_move(
+/*==========*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator, /*!< in: buffer block containing
+ the donating record */
+ ulint receiver_heap_no,/*!< in: heap_no of the record
+ which gets the locks; there
+ must be no lock requests
+ on it! */
+ ulint donator_heap_no)/*!< in: heap_no of the record
+ which gives the locks */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL);
+
+ for (lock = lock_rec_get_first(donator, donator_heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next(donator_heap_no, lock)) {
+
+ const ulint type_mode = lock->type_mode;
+
+ lock_rec_reset_nth_bit(lock, donator_heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ /* Note that we FIRST reset the bit, and then set the lock:
+ the function works also if donator == receiver */
+
+ lock_rec_add_to_queue(
+ type_mode, receiver, receiver_heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+
+ ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL);
+}
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we also
+copy the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock) /*!< in: copy of the old, not
+ reorganized page */
+{
+ lock_t* lock;
+ UT_LIST_BASE_NODE_T(lock_t) old_locks;
+ mem_heap_t* heap = NULL;
+ ulint comp;
+
+ lock_mutex_enter();
+
+ lock = lock_rec_get_first_on_page(block);
+
+ if (lock == NULL) {
+ lock_mutex_exit();
+
+ return;
+ }
+
+ heap = mem_heap_create(256);
+
+ /* Copy first all the locks on the page to heap and reset the
+ bitmaps in the original locks; chain the copies of the locks
+ using the trx_locks field in them. */
+
+ UT_LIST_INIT(old_locks);
+
+ do {
+ /* Make a copy of the lock */
+ lock_t* old_lock = lock_rec_copy(lock, heap);
+
+ UT_LIST_ADD_LAST(trx_locks, old_locks, old_lock);
+
+ /* Reset bitmap of lock */
+ lock_rec_bitmap_reset(lock);
+
+ if (lock_get_wait(lock)) {
+
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL);
+
+ comp = page_is_comp(block->frame);
+ ut_ad(comp == page_is_comp(oblock->frame));
+
+ for (lock = UT_LIST_GET_FIRST(old_locks); lock;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+ /* NOTE: we also copy the locks set on the infimum and
+ supremum of the page; the infimum may carry locks if an
+ update of a record is occurring on the page, and its locks
+ were temporarily stored on the infimum */
+ page_cur_t cur1;
+ page_cur_t cur2;
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_set_before_first(oblock, &cur2);
+
+ /* Set locks according to old locks */
+ for (;;) {
+ ulint old_heap_no;
+ ulint new_heap_no;
+
+ ut_ad(comp || !memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(
+ &cur2))));
+ if (UNIV_LIKELY(comp)) {
+ old_heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ new_heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ old_heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ new_heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ }
+
+ if (lock_rec_get_nth_bit(lock, old_heap_no)) {
+
+ /* Clear the bit in old_lock. */
+ ut_d(lock_rec_reset_nth_bit(lock,
+ old_heap_no));
+
+ /* NOTE that the old lock bitmap could be too
+ small for the new heap number! */
+
+ lock_rec_add_to_queue(
+ lock->type_mode, block, new_heap_no,
+ lock->index, lock->trx, FALSE);
+
+ /* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM
+ && lock_get_wait(lock)) {
+ fprintf(stderr,
+ "---\n--\n!!!Lock reorg: supr type %lu\n",
+ lock->type_mode);
+ } */
+ }
+
+ if (UNIV_UNLIKELY
+ (new_heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+ ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint i = lock_rec_find_set_bit(lock);
+
+ /* Check that all locks were moved. */
+ if (UNIV_UNLIKELY(i != ULINT_UNDEFINED)) {
+ fprintf(stderr,
+ "lock_move_reorganize_page():"
+ " %lu not moved in %p\n",
+ (ulong) i, (void*) lock);
+ ut_error;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ lock_mutex_exit();
+
+ mem_heap_free(heap);
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec) /*!< in: record on page: this
+ is the first record moved */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ lock_mutex_enter();
+
+ /* Note: when we move locks from record to record, waiting locks
+ and possible granted gap type locks behind them are enqueued in
+ the original order, because new elements are inserted into the
+ hash table at the end of the hash chain, and lock_rec_add_to_queue
+ does not reuse locks if there are waiters in the queue. */
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ page_cur_t cur1;
+ page_cur_t cur2;
+ const ulint type_mode = lock->type_mode;
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+ page_cur_move_to_next(&cur1);
+ }
+
+ page_cur_set_before_first(new_block, &cur2);
+ page_cur_move_to_next(&cur2);
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ ulint heap_no;
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ ut_ad(!memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(&cur2))));
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ }
+
+ lock_rec_add_to_queue(
+ type_mode, new_block, heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+ ut_ad(lock_rec_validate_page(new_block));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end) /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(new_block->frame == page_align(old_end));
+
+ lock_mutex_enter();
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ page_cur_t cur1;
+ page_cur_t cur2;
+ const ulint type_mode = lock->type_mode;
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ page_cur_position(old_end, new_block, &cur2);
+ page_cur_move_to_next(&cur2);
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ ulint heap_no;
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ ut_ad(!memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(
+ &cur2))));
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ }
+
+ lock_rec_add_to_queue(
+ type_mode, new_block, heap_no,
+ lock->index, lock->trx, FALSE);
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+
+#ifdef UNIV_DEBUG
+ if (page_rec_is_supremum(rec)) {
+ ulint i;
+
+ for (i = PAGE_HEAP_NO_USER_LOW;
+ i < lock_rec_get_n_bits(lock); i++) {
+ if (UNIV_UNLIKELY
+ (lock_rec_get_nth_bit(lock, i))) {
+
+ fprintf(stderr,
+ "lock_move_rec_list_start():"
+ " %lu not moved in %p\n",
+ (ulong) i, (void*) lock);
+ ut_error;
+ }
+ }
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(block));
+#endif
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the left page to the supremum
+ of the right page */
+
+ lock_rec_move(right_block, left_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of left page from the successor
+ of the infimum on right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
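+/* For illustration: a right split moves the upper records of the
+left page to a new right page, e.g.
+
+	before:	left [.. R1 R2 R3 supremum]
+	after:	left [.. R1 supremum]	right [R2 R3 supremum]
+
+Locks that stood on the supremum of the left page guarded the gap
+before R2, so they are moved to the supremum of the right page, and
+the supremum of the left page inherits gap locks from R2, the record
+with the minimum heap number on the right page. */
+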
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block) /*!< in: merged index
+ page which will be
+ discarded */
+{
+ lock_mutex_enter();
+
+ /* Inherit the locks from the supremum of the left page to the
+ original successor of infimum on the right page, to which the left
+ page was merged */
+
+ lock_rec_inherit_to_gap(right_block, left_block,
+ page_rec_get_heap_no(orig_succ),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page, releasing
+ waiting transactions */
+
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ lock_rec_free_all_from_discard_page(left_block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root) /*!< in: root page */
+{
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the root to the supremum
+ of block */
+
+ lock_rec_move(block, root,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if the page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block) /*!< in: index page;
+ NOT the root! */
+{
+ lock_mutex_enter();
+
+ /* Move the locks on the supremum of the old page to the supremum
+ of new_page */
+
+ lock_rec_move(new_block, block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter();
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block) /*!< in: merged index page
+ which will be discarded */
+{
+ const rec_t* left_next_rec;
+
+ ut_ad(left_block->frame == page_align(orig_pred));
+
+ lock_mutex_enter();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ if (!page_rec_is_supremum(left_next_rec)) {
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+
+ lock_rec_inherit_to_gap(left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ /* Move the locks from the supremum of right page to the supremum
+ of the left page */
+
+ lock_rec_move(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+ lock_rec_free_all_from_discard_page(right_block);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ lock_mutex_enter();
+
+ lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
+
+ lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block) /*!< in: index page
+ which will be discarded */
+{
+ const page_t* page = block->frame;
+ const rec_t* rec;
+ ulint heap_no;
+
+ lock_mutex_enter();
+
+ if (!lock_rec_get_first_on_page(block)) {
+ /* No locks exist on page, nothing to do */
+
+ lock_mutex_exit();
+
+ return;
+ }
+
+ /* Inherit all the locks on the page to the record and reset all
+ the locks on the page */
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_new(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_old(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ lock_rec_free_all_from_discard_page(block);
+
+ lock_mutex_exit();
+}
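+
+/* Example: when a leaf page becomes empty and is discarded, every
+record on it donates its locks, in gap mode, to the heir record on a
+surviving page, so that gap coverage is preserved after the page
+leaves the tree. */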
+
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the inserted record */
+{
+ ulint receiver_heap_no;
+ ulint donator_heap_no;
+
+ ut_ad(block->frame == page_align(rec));
+
+ /* Inherit the gap-locking locks for rec, in gap mode, from the next
+ record */
+
+ if (page_rec_is_comp(rec)) {
+ receiver_heap_no = rec_get_heap_no_new(rec);
+ donator_heap_no = rec_get_heap_no_new(
+ page_rec_get_next_low(rec, TRUE));
+ } else {
+ receiver_heap_no = rec_get_heap_no_old(rec);
+ donator_heap_no = rec_get_heap_no_old(
+ page_rec_get_next_low(rec, FALSE));
+ }
+
+ lock_rec_inherit_to_gap_if_gap_lock(
+ block, receiver_heap_no, donator_heap_no);
+}
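+
+/* Example: if a transaction holds a gap lock on the gap before
+record R and a new record N is inserted immediately before R, then N
+inherits that lock from R, so the gap before N stays protected. Only
+gap-type locks are inherited here; see
+lock_rec_inherit_to_gap_if_gap_lock(). */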
+
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the record to be removed */
+{
+ const page_t* page = block->frame;
+ ulint heap_no;
+ ulint next_heap_no;
+
+ ut_ad(page == page_align(rec));
+
+ if (page_is_comp(page)) {
+ heap_no = rec_get_heap_no_new(rec);
+ next_heap_no = rec_get_heap_no_new(page
+ + rec_get_next_offs(rec,
+ TRUE));
+ } else {
+ heap_no = rec_get_heap_no_old(rec);
+ next_heap_no = rec_get_heap_no_old(page
+ + rec_get_next_offs(rec,
+ FALSE));
+ }
+
+ lock_mutex_enter();
+
+ /* Let the next record inherit the locks from rec, in gap mode */
+
+ lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no);
+
+ /* Reset the lock bits on rec and release waiting transactions */
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is moved in such an update, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ ut_ad(block->frame == page_align(rec));
+
+ lock_mutex_enter();
+
+ lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator)/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter();
+
+ lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
+
+ lock_mutex_exit();
+}
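+
+/* A sketch of the typical store/restore pair during a pessimistic
+update that moves the record (the names rec and new_rec below are
+illustrative):
+
+	lock_rec_store_on_page_infimum(block, rec);
+	... delete rec and re-insert the updated version,
+	possibly on another page ...
+	lock_rec_restore_from_page_infimum(new_block, new_rec, block);
+
+The infimum never leaves its page, which makes it a safe parking
+spot for the lock bits while the user record itself is in flight. */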
+
+/*=========== DEADLOCK CHECKING ======================================*/
+
+/*********************************************************************//**
+rewind(3) the file used for storing the latest detected deadlock and
+print a heading message to stderr if printing of all deadlocks to stderr
+is enabled. */
+UNIV_INLINE
+void
+lock_deadlock_start_print()
+/*=======================*/
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
+
+ if (srv_print_all_deadlocks) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: transactions deadlock detected, "
+ "dumping detailed information.\n");
+ ut_print_timestamp(stderr);
+ }
+}
+
+/*********************************************************************//**
+Print a message to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_fputs(
+/*================*/
+ const char* msg) /*!< in: message to print */
+{
+ if (!srv_read_only_mode) {
+ fputs(msg, lock_latest_err_file);
+
+ if (srv_print_all_deadlocks) {
+ fputs(msg, stderr);
+ }
+ }
+}
+
+/*********************************************************************//**
+Print transaction data to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_trx_print(
+/*====================*/
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ ulint heap_size = mem_heap_get_size(trx->lock.lock_heap);
+
+ mutex_enter(&trx_sys->mutex);
+
+ trx_print_low(lock_latest_err_file, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+
+ if (srv_print_all_deadlocks) {
+ trx_print_low(stderr, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+ }
+
+ mutex_exit(&trx_sys->mutex);
+}
+
+/*********************************************************************//**
+Print lock data to the deadlock file and possibly to stderr. */
+UNIV_INLINE
+void
+lock_deadlock_lock_print(
+/*=====================*/
+ const lock_t* lock) /*!< in: record or table type lock */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ lock_rec_print(lock_latest_err_file, lock);
+
+ if (srv_print_all_deadlocks) {
+ lock_rec_print(stderr, lock);
+ }
+ } else {
+ lock_table_print(lock_latest_err_file, lock);
+
+ if (srv_print_all_deadlocks) {
+ lock_table_print(stderr, lock);
+ }
+ }
+}
+
+/** Used in deadlock tracking. Protected by lock_sys->mutex. */
+static ib_uint64_t lock_mark_counter = 0;
+
+/** Check if the search is too deep. */
+#define lock_deadlock_too_deep(c) \
+ (c->depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK \
+ || c->cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK)
+
+/********************************************************************//**
+Get the next lock in the queue that is owned by a transaction whose
+sub-tree has not already been searched.
+@return next lock or NULL if at end of queue */
+static
+const lock_t*
+lock_get_next_lock(
+/*===============*/
+ const lock_deadlock_ctx_t*
+ ctx, /*!< in: deadlock context */
+ const lock_t* lock, /*!< in: lock in the queue */
+ ulint heap_no)/*!< in: heap no if rec lock else
+ ULINT_UNDEFINED */
+{
+ ut_ad(lock_mutex_own());
+
+ do {
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ ut_ad(heap_no != ULINT_UNDEFINED);
+ lock = lock_rec_get_next_const(heap_no, lock);
+ } else {
+ ut_ad(heap_no == ULINT_UNDEFINED);
+ ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+ }
+ } while (lock != NULL
+ && lock->trx->lock.deadlock_mark > ctx->mark_start);
+
+ ut_ad(lock == NULL
+ || lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
+
+ return(lock);
+}
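+
+/* Note on the direction of traversal above: a record lock queue is
+the chain of locks on the same heap number, walked forwards with
+lock_rec_get_next_const(); a table lock queue is walked backwards
+from the waiting lock with UT_LIST_GET_PREV(), towards the requests
+that were enqueued earlier and therefore stand ahead in the queue. */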
+
+/********************************************************************//**
+Get the first lock to search. The search starts from the current
+wait_lock. What we are really interested in is an edge from the
+current wait_lock's owning transaction to another transaction that has
+a lock ahead in the queue. We skip locks where the owning transaction's
+sub-tree has already been searched.
+@return first lock or NULL */
+static
+const lock_t*
+lock_get_first_lock(
+/*================*/
+ const lock_deadlock_ctx_t*
+ ctx, /*!< in: deadlock context */
+ ulint* heap_no)/*!< out: heap no if rec lock,
+ else ULINT_UNDEFINED */
+{
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ lock = ctx->wait_lock;
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ *heap_no = lock_rec_find_set_bit(lock);
+ ut_ad(*heap_no != ULINT_UNDEFINED);
+
+ lock = lock_rec_get_first_on_page_addr(
+ lock->un_member.rec_lock.space,
+ lock->un_member.rec_lock.page_no);
+
+ /* Position on the first lock on the physical record. */
+ if (!lock_rec_get_nth_bit(lock, *heap_no)) {
+ lock = lock_rec_get_next_const(*heap_no, lock);
+ }
+
+ } else {
+ *heap_no = ULINT_UNDEFINED;
+ ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+ }
+
+ ut_a(lock != NULL);
+ ut_a(lock != ctx->wait_lock);
+ ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
+
+ return(lock);
+}
+
+/********************************************************************//**
+Notify that a deadlock has been detected and print the conflicting
+transaction info. */
+static
+void
+lock_deadlock_notify(
+/*=================*/
+ const lock_deadlock_ctx_t* ctx, /*!< in: deadlock context */
+ const lock_t* lock) /*!< in: lock causing
+ deadlock */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ lock_deadlock_start_print();
+
+ lock_deadlock_fputs("\n*** (1) TRANSACTION:\n");
+
+ lock_deadlock_trx_print(ctx->wait_lock->trx, 3000);
+
+ lock_deadlock_fputs("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ lock_deadlock_lock_print(ctx->wait_lock);
+
+ lock_deadlock_fputs("*** (2) TRANSACTION:\n");
+
+ lock_deadlock_trx_print(lock->trx, 3000);
+
+ lock_deadlock_fputs("*** (2) HOLDS THE LOCK(S):\n");
+
+ lock_deadlock_lock_print(lock);
+
+ /* It is possible that the joining transaction was granted its
+ lock when we rolled back some other waiting transaction. */
+
+ if (ctx->start->lock.wait_lock != 0) {
+ lock_deadlock_fputs(
+ "*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ lock_deadlock_lock_print(ctx->start->lock.wait_lock);
+ }
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fputs("Deadlock detected\n", stderr);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Select the victim transaction that should be rolled back.
+@return victim transaction */
+static
+const trx_t*
+lock_deadlock_select_victim(
+/*========================*/
+ const lock_deadlock_ctx_t* ctx) /*!< in: deadlock context */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(ctx->start->lock.wait_lock != 0);
+ ut_ad(ctx->wait_lock->trx != ctx->start);
+
+ if (trx_weight_ge(ctx->wait_lock->trx, ctx->start)) {
+ /* The joining transaction is 'smaller',
+ choose it as the victim and roll it back. */
+
+ return(ctx->start);
+ }
+
+ return(ctx->wait_lock->trx);
+}
+
+/********************************************************************//**
+Pop the deadlock search state from the stack.
+@return stack slot instance that was on top of the stack. */
+static
+const lock_stack_t*
+lock_deadlock_pop(
+/*==============*/
+ lock_deadlock_ctx_t* ctx) /*!< in/out: context */
+{
+ ut_ad(lock_mutex_own());
+
+ ut_ad(ctx->depth > 0);
+
+ return(&lock_stack[--ctx->depth]);
+}
+
+/********************************************************************//**
+Push the deadlock search state onto the stack.
+@return slot that was used in the stack */
+static
+lock_stack_t*
+lock_deadlock_push(
+/*===============*/
+ lock_deadlock_ctx_t* ctx, /*!< in/out: context */
+ const lock_t* lock, /*!< in: current lock */
+ ulint heap_no) /*!< in: heap number */
+{
+ ut_ad(lock_mutex_own());
+
+ /* Save current search state. */
+
+ if (LOCK_STACK_SIZE > ctx->depth) {
+ lock_stack_t* stack;
+
+ stack = &lock_stack[ctx->depth++];
+
+ stack->lock = lock;
+ stack->heap_no = heap_no;
+ stack->wait_lock = ctx->wait_lock;
+
+ return(stack);
+ }
+
+ return(NULL);
+}
+
+/********************************************************************//**
+Looks iteratively for a deadlock. Note: the joining transaction may
+have been granted its lock by the deadlock checks.
+@return 0 if no deadlock else the victim transaction id.*/
+static
+trx_id_t
+lock_deadlock_search(
+/*=================*/
+ lock_deadlock_ctx_t* ctx) /*!< in/out: deadlock context */
+{
+ const lock_t* lock;
+ ulint heap_no;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(!trx_mutex_own(ctx->start));
+
+ ut_ad(ctx->start != NULL);
+ ut_ad(ctx->wait_lock != NULL);
+ assert_trx_in_list(ctx->wait_lock->trx);
+ ut_ad(ctx->mark_start <= lock_mark_counter);
+
+ /* Look at the locks ahead of wait_lock in the lock queue. */
+ lock = lock_get_first_lock(ctx, &heap_no);
+
+ for (;;) {
+
+ /* We should never visit the same sub-tree more than once. */
+ ut_ad(lock == NULL
+ || lock->trx->lock.deadlock_mark <= ctx->mark_start);
+
+ while (ctx->depth > 0 && lock == NULL) {
+ const lock_stack_t* stack;
+
+ /* Restore previous search state. */
+
+ stack = lock_deadlock_pop(ctx);
+
+ lock = stack->lock;
+ heap_no = stack->heap_no;
+ ctx->wait_lock = stack->wait_lock;
+
+ lock = lock_get_next_lock(ctx, lock, heap_no);
+ }
+
+ if (lock == NULL) {
+ break;
+ } else if (lock == ctx->wait_lock) {
+
+ /* We can mark this subtree as searched */
+ ut_ad(lock->trx->lock.deadlock_mark <= ctx->mark_start);
+
+ lock->trx->lock.deadlock_mark = ++lock_mark_counter;
+
+ /* We are not prepared for an overflow. This 64-bit
+ counter should never wrap around. At 10^9 increments
+ per second, it would take 10^3 years of uptime. */
+
+ ut_ad(lock_mark_counter > 0);
+
+ lock = NULL;
+
+ } else if (!lock_has_to_wait(ctx->wait_lock, lock)) {
+
+ /* No conflict, next lock */
+ lock = lock_get_next_lock(ctx, lock, heap_no);
+
+ } else if (lock->trx == ctx->start) {
+
+ /* Found a cycle. */
+
+ lock_deadlock_notify(ctx, lock);
+
+ return(lock_deadlock_select_victim(ctx)->id);
+
+ } else if (lock_deadlock_too_deep(ctx)) {
+
+ /* Search too deep to continue. */
+
+ ctx->too_deep = TRUE;
+
+ /* Select the joining transaction as the victim. */
+ return(ctx->start->id);
+
+ } else if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ /* Another trx ahead has requested a lock in an
+ incompatible mode, and is itself waiting for a lock. */
+
+ ++ctx->cost;
+
+ /* Save current search state. */
+ if (!lock_deadlock_push(ctx, lock, heap_no)) {
+
+ /* Unable to save current search state, stack
+ size not big enough. */
+
+ ctx->too_deep = TRUE;
+
+ return(ctx->start->id);
+ }
+
+ ctx->wait_lock = lock->trx->lock.wait_lock;
+ lock = lock_get_first_lock(ctx, &heap_no);
+
+ if (lock->trx->lock.deadlock_mark > ctx->mark_start) {
+ lock = lock_get_next_lock(ctx, lock, heap_no);
+ }
+
+ } else {
+ lock = lock_get_next_lock(ctx, lock, heap_no);
+ }
+ }
+
+ ut_a(lock == NULL && ctx->depth == 0);
+
+ /* No deadlock found. */
+ return(0);
+}
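+
+/* Worked example of the search above: transaction A waits for a
+record lock held by B, and B waits for a table lock held by A.
+Starting from A's wait_lock we reach B's conflicting lock, see that
+B is itself waiting (TRX_QUE_LOCK_WAIT), push the current state and
+follow B's wait_lock. In B's queue we then find a lock whose owner
+is A (lock->trx == ctx->start): a cycle, so lock_deadlock_notify()
+is called and a victim is chosen by lock_deadlock_select_victim(). */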
+
+/********************************************************************//**
+Print info about transaction that was rolled back. */
+static
+void
+lock_deadlock_joining_trx_print(
+/*============================*/
+ const trx_t* trx, /*!< in: transaction rolled back */
+ const lock_t* lock) /*!< in: lock trx wants */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ /* If the lock search exceeds the max step
+ or the max depth, the current trx will be
+ the victim. Print its information. */
+ lock_deadlock_start_print();
+
+ lock_deadlock_fputs(
+ "TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+ " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+ " FOLLOWING TRANSACTION \n\n"
+ "*** TRANSACTION:\n");
+
+ lock_deadlock_trx_print(trx, 3000);
+
+ lock_deadlock_fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+
+ lock_deadlock_lock_print(lock);
+}
+
+/********************************************************************//**
+Rollback transaction selected as the victim. */
+static
+void
+lock_deadlock_trx_rollback(
+/*=======================*/
+ lock_deadlock_ctx_t* ctx) /*!< in: deadlock context */
+{
+ trx_t* trx;
+
+ ut_ad(lock_mutex_own());
+
+ trx = ctx->wait_lock->trx;
+
+ lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (1)\n");
+
+ trx_mutex_enter(trx);
+
+ trx->lock.was_chosen_as_deadlock_victim = TRUE;
+
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+
+ trx_mutex_exit(trx);
+}
+
+/********************************************************************//**
+Checks if a joining lock request results in a deadlock. If a deadlock is
+found, this function will resolve the deadlock by choosing a victim
+transaction and rolling it back. It will attempt to resolve all
+deadlocks. The returned transaction id is the joining transaction's id,
+or 0 if some other transaction was chosen as a victim and rolled back,
+or if no deadlock was found.
+
+@return id of transaction chosen as victim or 0 */
+static
+trx_id_t
+lock_deadlock_check_and_resolve(
+/*============================*/
+ const lock_t* lock, /*!< in: lock the transaction is requesting */
+ const trx_t* trx) /*!< in: transaction */
+{
+ trx_id_t victim_trx_id;
+
+ ut_ad(trx != NULL);
+ ut_ad(lock != NULL);
+ ut_ad(lock_mutex_own());
+ assert_trx_in_list(trx);
+
+ /* Try and resolve as many deadlocks as possible. */
+ do {
+ lock_deadlock_ctx_t ctx;
+
+ /* Reset the context. */
+ ctx.cost = 0;
+ ctx.depth = 0;
+ ctx.start = trx;
+ ctx.too_deep = FALSE;
+ ctx.wait_lock = lock;
+ ctx.mark_start = lock_mark_counter;
+
+ victim_trx_id = lock_deadlock_search(&ctx);
+
+ /* Search too deep, we rollback the joining transaction. */
+ if (ctx.too_deep) {
+
+ ut_a(trx == ctx.start);
+ ut_a(victim_trx_id == trx->id);
+
+ if (!srv_read_only_mode) {
+ lock_deadlock_joining_trx_print(trx, lock);
+ }
+
+ MONITOR_INC(MONITOR_DEADLOCK);
+
+ } else if (victim_trx_id != 0 && victim_trx_id != trx->id) {
+
+ ut_ad(victim_trx_id == ctx.wait_lock->trx->id);
+ lock_deadlock_trx_rollback(&ctx);
+
+ lock_deadlock_found = TRUE;
+
+ MONITOR_INC(MONITOR_DEADLOCK);
+ }
+
+ } while (victim_trx_id != 0 && victim_trx_id != trx->id);
+
+ /* If the joining transaction was selected as the victim. */
+ if (victim_trx_id != 0) {
+ ut_a(victim_trx_id == trx->id);
+
+ lock_deadlock_fputs("*** WE ROLL BACK TRANSACTION (2)\n");
+
+ lock_deadlock_found = TRUE;
+ }
+
+ return(victim_trx_id);
+}
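+
+/* Note: the do-while loop above resolves cycles one victim at a
+time; it terminates when no deadlock remains (victim_trx_id == 0) or
+when the joining transaction itself is picked as the victim, in
+which case the caller is responsible for rolling it back. */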
+
+/*========================= TABLE LOCKS ==============================*/
+
+/*********************************************************************//**
+Creates a table lock object and adds it as the last in the lock queue
+of the table. Does NOT check for deadlocks or lock compatibility.
+@return own: new lock object */
+UNIV_INLINE
+lock_t*
+lock_table_create(
+/*==============*/
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ ulint type_mode,/*!< in: lock mode possibly ORed with
+ LOCK_WAIT */
+ trx_t* trx) /*!< in: trx */
+{
+ lock_t* lock;
+
+ ut_ad(table && trx);
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ /* Non-locking autocommit read-only transactions should not set
+ any locks. */
+ assert_trx_in_list(trx);
+
+ if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+ ++table->n_waiting_or_granted_auto_inc_locks;
+ }
+
+ /* For AUTOINC locking we reuse the lock instance only if
+ there is no wait involved else we allocate the waiting lock
+ from the transaction lock heap. */
+ if (type_mode == LOCK_AUTO_INC) {
+
+ lock = table->autoinc_lock;
+
+ table->autoinc_trx = trx;
+
+ ib_vector_push(trx->autoinc_locks, &lock);
+ } else {
+ lock = static_cast<lock_t*>(
+ mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock)));
+ }
+
+ lock->type_mode = type_mode | LOCK_TABLE;
+ lock->trx = trx;
+
+ lock->un_member.tab_lock.table = table;
+
+ ut_ad(table->n_ref_count > 0 || !table->can_be_evicted);
+
+ UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock);
+ UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+
+ ib_vector_push(lock->trx->lock.table_locks, &lock);
+
+ MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+ MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
+ return(lock);
+}
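+
+/* Example of type_mode composition: a waiting exclusive table lock
+request arrives here as type_mode == (LOCK_X | LOCK_WAIT), and this
+function ORs in LOCK_TABLE before storing the result in
+lock->type_mode. */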
+
+/*************************************************************//**
+Pops autoinc lock requests from the transaction's autoinc_locks. We
+handle the case where there are gaps (NULL entries) in the vector
+that need to be popped off the top. */
+UNIV_INLINE
+void
+lock_table_pop_autoinc_locks(
+/*=========================*/
+ trx_t* trx) /*!< in/out: transaction that owns the AUTOINC locks */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+ /* Skip any gaps, gaps are NULL lock entries in the
+ trx->autoinc_locks vector. */
+
+ do {
+ ib_vector_pop(trx->autoinc_locks);
+
+ if (ib_vector_is_empty(trx->autoinc_locks)) {
+ return;
+ }
+
+ } while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
+}
+
+/*************************************************************//**
+Removes an autoinc lock request from the transaction's autoinc_locks. */
+UNIV_INLINE
+void
+lock_table_remove_autoinc_lock(
+/*===========================*/
+ lock_t* lock, /*!< in: table lock */
+ trx_t* trx) /*!< in/out: transaction that owns the lock */
+{
+ lock_t* autoinc_lock;
+ lint i = ib_vector_size(trx->autoinc_locks) - 1;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+ /* With stored functions and procedures the user may drop
+ a table within the same "statement". This special case has
+ to be handled by deleting only those AUTOINC locks that were
+ held by the table being dropped. */
+
+ autoinc_lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->autoinc_locks, i));
+
+ /* This is the default fast case. */
+
+ if (autoinc_lock == lock) {
+ lock_table_pop_autoinc_locks(trx);
+ } else {
+ /* The last element should never be NULL */
+ ut_a(autoinc_lock != NULL);
+
+ /* Handle freeing the locks from within the stack. */
+
+ while (--i >= 0) {
+ autoinc_lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->autoinc_locks, i));
+
+ if (UNIV_LIKELY(autoinc_lock == lock)) {
+ void* null_var = NULL;
+ ib_vector_set(trx->autoinc_locks, i, &null_var);
+ return;
+ }
+ }
+
+ /* Must find the autoinc lock. */
+ ut_error;
+ }
+}
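+
+/* Example: with trx->autoinc_locks == [L1, L2, L3], releasing L2
+out of order replaces it with NULL in place, giving [L1, NULL, L3];
+releasing L3 afterwards pops both L3 and the NULL gap in
+lock_table_pop_autoinc_locks(), leaving [L1]. */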
+
+/*************************************************************//**
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+void
+lock_table_remove_low(
+/*==================*/
+ lock_t* lock) /*!< in/out: table lock */
+{
+ trx_t* trx;
+ dict_table_t* table;
+
+ ut_ad(lock_mutex_own());
+
+ trx = lock->trx;
+ table = lock->un_member.tab_lock.table;
+
+ /* Remove the table from the transaction's AUTOINC vector, if
+ the lock that is being released is an AUTOINC lock. */
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+
+ /* The table's AUTOINC lock can get transferred to
+ another transaction before we get here. */
+ if (table->autoinc_trx == trx) {
+ table->autoinc_trx = NULL;
+ }
+
+ /* The locks must be freed in the reverse order from
+ the one in which they were acquired. This is to avoid
+ traversing the AUTOINC lock vector unnecessarily.
+
+ We only store locks that were granted in the
+ trx->autoinc_locks vector (see lock_table_create()
+ and lock_grant()). Therefore it can be empty and we
+ need to check for that. */
+
+ if (!lock_get_wait(lock)
+ && !ib_vector_is_empty(trx->autoinc_locks)) {
+
+ lock_table_remove_autoinc_lock(lock, trx);
+ }
+
+ ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
+ table->n_waiting_or_granted_auto_inc_locks--;
+ }
+
+ UT_LIST_REMOVE(trx_locks, trx->lock.trx_locks, lock);
+ UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock);
+
+ MONITOR_INC(MONITOR_TABLELOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_TABLELOCK);
+}
+
+/*********************************************************************//**
+Enqueues a waiting request for a table lock which cannot be granted
+immediately. Checks for deadlocks.
+@return DB_LOCK_WAIT, DB_DEADLOCK, DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another
+transaction was chosen as a victim, and we got the lock immediately:
+no need to wait then */
+static
+dberr_t
+lock_table_enqueue_waiting(
+/*=======================*/
+ ulint mode, /*!< in: lock mode this transaction is
+ requesting */
+ dict_table_t* table, /*!< in/out: table */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ lock_t* lock;
+ trx_id_t victim_trx_id;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(!srv_read_only_mode);
+
+ trx = thr_get_trx(thr);
+ ut_ad(trx_mutex_own(trx));
+
+	/* Test if there already is some other reason to suspend the
+	thread: we do not enqueue a lock request if the query thread
+	should be stopped anyway */
+
+ if (que_thr_stop(thr)) {
+ ut_error;
+
+ return(DB_QUE_THR_SUSPENDED);
+ }
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: a table lock wait happens"
+ " in a dictionary operation!\n"
+ "InnoDB: Table name ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(".\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ stderr);
+ ut_ad(0);
+ }
+
+ /* Enqueue the lock request that will wait to be granted */
+
+ lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+
+ /* Release the mutex to obey the latching order.
+ This is safe, because lock_deadlock_check_and_resolve()
+ is invoked when a lock wait is enqueued for the currently
+ running transaction. Because trx is a running transaction
+ (it is not currently suspended because of a lock wait),
+ its state can only be changed by this thread, which is
+ currently associated with the transaction. */
+
+ trx_mutex_exit(trx);
+
+ victim_trx_id = lock_deadlock_check_and_resolve(lock, trx);
+
+ trx_mutex_enter(trx);
+
+ if (victim_trx_id != 0) {
+ ut_ad(victim_trx_id == trx->id);
+
+ /* The order here is important, we don't want to
+ lose the state of the lock before calling remove. */
+ lock_table_remove_low(lock);
+ lock_reset_lock_and_trx_wait(lock);
+
+ return(DB_DEADLOCK);
+ } else if (trx->lock.wait_lock == NULL) {
+ /* Deadlock resolution chose another transaction as a victim,
+ and we accidentally got our lock granted! */
+
+ return(DB_SUCCESS);
+ }
+
+ trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+
+ trx->lock.wait_started = ut_time();
+ trx->lock.was_chosen_as_deadlock_victim = FALSE;
+
+ ut_a(que_thr_stop(thr));
+
+ MONITOR_INC(MONITOR_TABLELOCK_WAIT);
+
+ return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Checks if other transactions have an incompatible mode lock request in
+the lock queue.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+ const trx_t* trx, /*!< in: transaction, or NULL if all
+ transactions should be included */
+ ulint wait, /*!< in: LOCK_WAIT if also
+ waiting locks are taken into
+ account, or 0 if not */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode) /*!< in: lock mode */
+{
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_LAST(table->locks);
+ lock != NULL;
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+ if (lock->trx != trx
+ && !lock_mode_compatible(lock_get_mode(lock), mode)
+ && (wait || !lock_get_wait(lock))) {
+
+ return(lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_table(
+/*=======*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in/out: database table
+ in dictionary cache */
+ enum lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ dberr_t err;
+ const lock_t* wait_for;
+
+ ut_ad(table && thr);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_a(flags == 0);
+
+ trx = thr_get_trx(thr);
+
+ /* Look for equal or stronger locks the same trx already
+ has on the table. No need to acquire the lock mutex here
+	because only this transaction can add/access table locks
+ to/from trx_t::table_locks. */
+
+ if (lock_table_has(trx, table, mode)) {
+
+ return(DB_SUCCESS);
+ }
+
+ lock_mutex_enter();
+
+ /* We have to check if the new lock is compatible with any locks
+ other transactions have in the table lock queue. */
+
+ wait_for = lock_table_other_has_incompatible(
+ trx, LOCK_WAIT, table, mode);
+
+ trx_mutex_enter(trx);
+
+ /* Another trx has a request on the table in an incompatible
+ mode: this trx may have to wait */
+
+ if (wait_for != NULL) {
+ err = lock_table_enqueue_waiting(mode | flags, table, thr);
+ } else {
+ lock_table_create(table, mode | flags, trx);
+
+ ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+
+ trx_mutex_exit(trx);
+
+ return(err);
+}
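+
+/* A minimal usage sketch (the caller context is illustrative): a
+query thread that is about to modify rows would first acquire an
+intention lock on the table, e.g.
+
+	dberr_t	err = lock_table(0, table, LOCK_IX, thr);
+
+	if (err == DB_LOCK_WAIT) {
+		... suspend the query thread until the lock is
+		granted, or a deadlock/timeout error is reported ...
+	}
+
+Here flags == 0; with BTR_NO_LOCKING_FLAG set the call is a no-op. */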
+
+/*********************************************************************//**
+Creates a table IX lock object for a resurrected transaction. */
+UNIV_INTERN
+void
+lock_table_ix_resurrect(
+/*====================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(trx->is_recovered);
+
+ if (lock_table_has(trx, table, LOCK_IX)) {
+ return;
+ }
+
+ lock_mutex_enter();
+
+ /* We have to check if the new lock is compatible with any locks
+ other transactions have in the table lock queue. */
+
+ ut_ad(!lock_table_other_has_incompatible(
+ trx, LOCK_WAIT, table, LOCK_IX));
+
+ trx_mutex_enter(trx);
+ lock_table_create(table, LOCK_IX, trx);
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+}
+
+/*********************************************************************//**
+Checks if a waiting table lock request still has to wait in a queue.
+@return TRUE if still has to wait */
+static
+ibool
+lock_table_has_to_wait_in_queue(
+/*============================*/
+ const lock_t* wait_lock) /*!< in: waiting table lock */
+{
+ const dict_table_t* table;
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(lock_get_wait(wait_lock));
+
+ table = wait_lock->un_member.tab_lock.table;
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != wait_lock;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ if (lock_has_to_wait(wait_lock, lock)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Removes a table lock request, waiting or granted, from the queue and grants
+locks to other transactions in the queue, if they now are entitled to a
+lock. */
+static
+void
+lock_table_dequeue(
+/*===============*/
+ lock_t* in_lock)/*!< in/out: table lock object; transactions waiting
+ behind will get their lock requests granted, if
+ they are now qualified to it */
+{
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
+
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+ lock_table_remove_low(in_lock);
+
+ /* Check if waiting locks in the queue can now be granted: grant
+ locks if there are no conflicting locks ahead. */
+
+ for (/* No op */;
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ if (lock_get_wait(lock)
+ && !lock_table_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(in_lock->trx != lock->trx);
+ lock_grant(lock);
+ }
+ }
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in/out: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ enum lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */
+{
+ lock_t* first_lock;
+ lock_t* lock;
+ ulint heap_no;
+ const char* stmt;
+ size_t stmt_len;
+
+ ut_ad(trx);
+ ut_ad(rec);
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(!trx->lock.wait_lock);
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter();
+ trx_mutex_enter(trx);
+
+ first_lock = lock_rec_get_first(block, heap_no);
+
+ /* Find the last lock with the same lock_mode and transaction
+ on the record. */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
+ goto released;
+ }
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+
+ stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: unlock row could not"
+ " find a %lu mode lock on the record\n",
+ (ulong) lock_mode);
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: current statement: %.*s\n",
+ (int) stmt_len, stmt);
+
+ return;
+
+released:
+ ut_a(!lock_get_wait(lock));
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ /* Check if we can now grant waiting lock requests */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(trx != lock->trx);
+ lock_grant(lock);
+ }
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+}
+
+/*********************************************************************//**
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+static
+void
+lock_release(
+/*=========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ lock_t* lock;
+ ulint count = 0;
+ trx_id_t max_trx_id;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(!trx_mutex_own(trx));
+
+ max_trx_id = trx_sys_get_max_trx_id();
+
+ for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+#ifdef UNIV_DEBUG
+			/* Check if the transaction locked a record
+ in a system table in X mode. It should have set
+ the dict_op code correctly if it did. */
+ if (lock->index->table->id < DICT_HDR_FIRST_ID
+ && lock_get_mode(lock) == LOCK_X) {
+
+ ut_ad(lock_get_mode(lock) != LOCK_IX);
+ ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+ }
+#endif /* UNIV_DEBUG */
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ dict_table_t* table;
+
+ table = lock->un_member.tab_lock.table;
+#ifdef UNIV_DEBUG
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+			/* Check if the transaction locked a system table
+ in IX mode. It should have set the dict_op code
+ correctly if it did. */
+ if (table->id < DICT_HDR_FIRST_ID
+ && (lock_get_mode(lock) == LOCK_X
+ || lock_get_mode(lock) == LOCK_IX)) {
+
+ ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (lock_get_mode(lock) != LOCK_IS
+ && trx->undo_no != 0) {
+
+ /* The trx may have modified the table. We
+ block the use of the MySQL query cache for
+ all currently active transactions. */
+
+ table->query_cache_inv_trx_id = max_trx_id;
+ }
+
+ lock_table_dequeue(lock);
+ }
+
+ if (count == LOCK_RELEASE_INTERVAL) {
+ /* Release the mutex for a while, so that we
+ do not monopolize it */
+
+ lock_mutex_exit();
+
+ lock_mutex_enter();
+
+ count = 0;
+ }
+
+ ++count;
+ }
+
+ /* We don't remove the locks one by one from the vector for
+ efficiency reasons. We simply reset it because we would have
+ released all the locks anyway. */
+
+ ib_vector_reset(trx->lock.table_locks);
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+ mem_heap_empty(trx->lock.lock_heap);
+}
+
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+ (lock_get_mode(lock) == LOCK_S \
+ || lock_get_mode(lock) == LOCK_X)
+
+/*********************************************************************//**
+Removes table locks of the transaction on a table to be dropped. */
+static
+void
+lock_trx_table_locks_remove(
+/*========================*/
+ const lock_t* lock_to_remove) /*!< in: lock to remove */
+{
+ lint i;
+ trx_t* trx = lock_to_remove->trx;
+
+ ut_ad(lock_mutex_own());
+
+ /* It is safe to read this because we are holding the lock mutex */
+ if (!trx->lock.cancel) {
+ trx_mutex_enter(trx);
+ } else {
+ ut_ad(trx_mutex_own(trx));
+ }
+
+ for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+ const lock_t* lock;
+
+ lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->lock.table_locks, i));
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ ut_a(trx == lock->trx);
+ ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_a(lock->un_member.tab_lock.table != NULL);
+
+ if (lock == lock_to_remove) {
+ void* null_var = NULL;
+ ib_vector_set(trx->lock.table_locks, i, &null_var);
+
+ if (!trx->lock.cancel) {
+ trx_mutex_exit(trx);
+ }
+
+ return;
+ }
+ }
+
+ if (!trx->lock.cancel) {
+ trx_mutex_exit(trx);
+ }
+
+ /* Lock must exist in the vector. */
+ ut_error;
+}
+
+/*********************************************************************//**
+Removes locks of a transaction on a table to be dropped.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+static
+void
+lock_remove_all_on_table_for_trx(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to be dropped */
+ trx_t* trx, /*!< in: a transaction */
+ ibool remove_also_table_sx_locks)/*!< in: also removes
+ table S and X locks */
+{
+ lock_t* lock;
+ lock_t* prev_lock;
+
+ ut_ad(lock_mutex_own());
+
+ for (lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = prev_lock) {
+
+ prev_lock = UT_LIST_GET_PREV(trx_locks, lock);
+
+ if (lock_get_type_low(lock) == LOCK_REC
+ && lock->index->table == table) {
+ ut_a(!lock_get_wait(lock));
+
+ lock_rec_discard(lock);
+ } else if (lock_get_type_low(lock) & LOCK_TABLE
+ && lock->un_member.tab_lock.table == table
+ && (remove_also_table_sx_locks
+ || !IS_LOCK_S_OR_X(lock))) {
+
+ ut_a(!lock_get_wait(lock));
+
+ lock_trx_table_locks_remove(lock);
+ lock_table_remove_low(lock);
+ }
+ }
+}
+
+/*******************************************************************//**
+Remove any explicit record locks held by recovering transactions on
+the table.
+@return number of recovered transactions examined */
+static
+ulint
+lock_remove_recovered_trx_record_locks(
+/*===================================*/
+ dict_table_t* table) /*!< in: check if there are any locks
+ held on records in this table or on the
+ table itself */
+{
+ trx_t* trx;
+ ulint n_recovered_trx = 0;
+
+ ut_a(table != NULL);
+ ut_ad(lock_mutex_own());
+
+ mutex_enter(&trx_sys->mutex);
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ lock_t* lock;
+ lock_t* next_lock;
+
+ assert_trx_in_rw_list(trx);
+
+ if (!trx->is_recovered) {
+ continue;
+ }
+
+ /* Because we are holding the lock_sys->mutex,
+ implicit locks cannot be converted to explicit ones
+ while we are scanning the explicit locks. */
+
+ for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = next_lock) {
+
+ ut_a(lock->trx == trx);
+
+ /* Recovered transactions can't wait on a lock. */
+
+ ut_a(!lock_get_wait(lock));
+
+ next_lock = UT_LIST_GET_NEXT(trx_locks, lock);
+
+ switch (lock_get_type_low(lock)) {
+ default:
+ ut_error;
+ case LOCK_TABLE:
+ if (lock->un_member.tab_lock.table == table) {
+ lock_trx_table_locks_remove(lock);
+ lock_table_remove_low(lock);
+ }
+ break;
+ case LOCK_REC:
+ if (lock->index->table == table) {
+ lock_rec_discard(lock);
+ }
+ }
+ }
+
+ ++n_recovered_trx;
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(n_recovered_trx);
+}
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+ dict_table_t* table, /*!< in: table to be dropped
+ or truncated */
+ ibool remove_also_table_sx_locks)/*!< in: also removes
+ table S and X locks */
+{
+ lock_t* lock;
+
+ lock_mutex_enter();
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != NULL;
+ /* No op */) {
+
+ lock_t* prev_lock;
+
+ prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+
+ /* If we should remove all locks (remove_also_table_sx_locks
+ is TRUE), or if the lock is not table-level S or X lock,
+ then check we are not going to remove a wait lock. */
+ if (remove_also_table_sx_locks
+ || !(lock_get_type(lock) == LOCK_TABLE
+ && IS_LOCK_S_OR_X(lock))) {
+
+ ut_a(!lock_get_wait(lock));
+ }
+
+ lock_remove_all_on_table_for_trx(
+ table, lock->trx, remove_also_table_sx_locks);
+
+ if (prev_lock == NULL) {
+ if (lock == UT_LIST_GET_FIRST(table->locks)) {
+ /* lock was not removed, pick its successor */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, lock);
+ } else {
+ /* lock was removed, pick the first one */
+ lock = UT_LIST_GET_FIRST(table->locks);
+ }
+ } else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks,
+ prev_lock) != lock) {
+ /* If lock was removed by
+ lock_remove_all_on_table_for_trx() then pick the
+ successor of prev_lock ... */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, prev_lock);
+ } else {
+ /* ... otherwise pick the successor of lock. */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, lock);
+ }
+ }
+
+ /* Note: Recovered transactions don't have table level IX or IS locks
+ but can have implicit record locks that have been converted to explicit
+ record locks. Such record locks cannot be freed by traversing the
+ transaction lock list in dict_table_t (as above). */
+
+ if (!lock_sys->rollback_complete
+ && lock_remove_recovered_trx_record_locks(table) == 0) {
+
+ lock_sys->rollback_complete = TRUE;
+ }
+
+ lock_mutex_exit();
+}
+
+/*===================== VALIDATION AND DEBUGGING ====================*/
+
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock) /*!< in: table type lock */
+{
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(lock) == LOCK_TABLE);
+
+ fputs("TABLE LOCK table ", file);
+ ut_print_name(file, lock->trx, TRUE,
+ lock->un_member.tab_lock.table->name);
+ fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+ fputs(" lock mode X", file);
+ } else if (lock_get_mode(lock) == LOCK_IS) {
+ fputs(" lock mode IS", file);
+ } else if (lock_get_mode(lock) == LOCK_IX) {
+ fputs(" lock mode IX", file);
+ } else if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ fputs(" lock mode AUTO-INC", file);
+ } else {
+ fprintf(file, " unknown lock mode %lu",
+ (ulong) lock_get_mode(lock));
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ putc('\n', file);
+}
+
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock) /*!< in: record type lock */
+{
+ const buf_block_t* block;
+ ulint space;
+ ulint page_no;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(lock_mutex_own());
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ space = lock->un_member.rec_lock.space;
+ page_no = lock->un_member.rec_lock.page_no;
+
+ fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ",
+ (ulong) space, (ulong) page_no,
+ (ulong) lock_rec_get_n_bits(lock));
+ dict_index_name_print(file, lock->trx, lock->index);
+ fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+		fputs(" lock mode X", file);
+ } else {
+ ut_error;
+ }
+
+ if (lock_rec_get_gap(lock)) {
+ fputs(" locks gap before rec", file);
+ }
+
+ if (lock_rec_get_rec_not_gap(lock)) {
+ fputs(" locks rec but not gap", file);
+ }
+
+ if (lock_rec_get_insert_intention(lock)) {
+ fputs(" insert intention", file);
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ mtr_start(&mtr);
+
+ putc('\n', file);
+
+ block = buf_page_try_get(space, page_no, &mtr);
+
+ for (i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (!lock_rec_get_nth_bit(lock, i)) {
+ continue;
+ }
+
+ fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+ if (block) {
+ const rec_t* rec;
+
+ rec = page_find_rec_with_heap_no(
+ buf_block_get_frame(block), i);
+
+ offsets = rec_get_offsets(
+ rec, lock->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ putc(' ', file);
+ rec_print_new(file, rec, offsets);
+ }
+
+ putc('\n', file);
+ }
+
+ mtr_commit(&mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+static
+ulint
+lock_get_n_rec_locks(void)
+/*======================*/
+{
+ ulint n_locks = 0;
+ ulint i;
+
+ ut_ad(lock_mutex_own());
+
+ for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+ const lock_t* lock;
+
+ for (lock = static_cast<const lock_t*>(
+ HASH_GET_FIRST(lock_sys->rec_hash, i));
+ lock != 0;
+ lock = static_cast<const lock_t*>(
+ HASH_GET_NEXT(hash, lock))) {
+
+ n_locks++;
+ }
+ }
+
+ return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain lock mutex
+and exits without printing info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+ ibool nowait) /*!< in: whether to wait for the lock mutex */
+{
+ /* if nowait is FALSE, wait on the lock mutex,
+ otherwise return immediately if fail to obtain the
+ mutex. */
+ if (!nowait) {
+ lock_mutex_enter();
+ } else if (lock_mutex_enter_nowait()) {
+ fputs("FAIL TO OBTAIN LOCK MUTEX, "
+ "SKIP LOCK INFO PRINTING\n", file);
+ return(FALSE);
+ }
+
+ if (lock_deadlock_found) {
+ fputs("------------------------\n"
+ "LATEST DETECTED DEADLOCK\n"
+ "------------------------\n", file);
+
+ if (!srv_read_only_mode) {
+ ut_copy_file(file, lock_latest_err_file);
+ }
+ }
+
+ fputs("------------\n"
+ "TRANSACTIONS\n"
+ "------------\n", file);
+
+ fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
+ trx_sys_get_max_trx_id());
+
+ fprintf(file,
+ "Purge done for trx's n:o < " TRX_ID_FMT
+ " undo n:o < " TRX_ID_FMT " state: ",
+ purge_sys->iter.trx_no,
+ purge_sys->iter.undo_no);
+
+	/* Note: We are reading the state without the latch: first,
+	because taking the latch here would violate the latching
+	order, and second, because we are merely querying the state
+	of the variable for display. */
+
+ switch (purge_sys->state){
+ case PURGE_STATE_INIT:
+ /* Should never be in this state while the system is running. */
+ ut_error;
+
+ case PURGE_STATE_EXIT:
+ fprintf(file, "exited");
+ break;
+
+ case PURGE_STATE_DISABLED:
+ fprintf(file, "disabled");
+ break;
+
+ case PURGE_STATE_RUN:
+ fprintf(file, "running");
+ /* Check if it is waiting for more data to arrive. */
+ if (!purge_sys->running) {
+ fprintf(file, " but idle");
+ }
+ break;
+
+ case PURGE_STATE_STOP:
+ fprintf(file, "stopped");
+ break;
+ }
+
+ fprintf(file, "\n");
+
+ fprintf(file,
+ "History list length %lu\n",
+ (ulong) trx_sys->rseg_history_len);
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+ fprintf(file,
+ "Total number of lock structs in row lock hash table %lu\n",
+ (ulong) lock_get_n_rec_locks());
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Prints info of locks for each transaction. This function assumes that the
+caller holds the lock mutex and, more importantly, it will release the lock
+mutex on behalf of the caller. (This should be fixed in the future). */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file) /*!< in: file where to print */
+{
+ const lock_t* lock;
+ ibool load_page_first = TRUE;
+ ulint nth_trx = 0;
+ ulint nth_lock = 0;
+ ulint i;
+ mtr_t mtr;
+ const trx_t* trx;
+ trx_list_t* trx_list = &trx_sys->rw_trx_list;
+
+ fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+ ut_ad(lock_mutex_own());
+
+ mutex_enter(&trx_sys->mutex);
+
+ /* First print info on non-active transactions */
+
+ /* NOTE: information of auto-commit non-locking read-only
+ transactions will be omitted here. The information will be
+ available from INFORMATION_SCHEMA.INNODB_TRX. */
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx)) {
+
+ ut_ad(trx->in_mysql_trx_list);
+
+ /* See state transitions and locking rules in trx0trx.h */
+
+ if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+ fputs("---", file);
+ trx_print_latched(file, trx, 600);
+ }
+ }
+
+loop:
+	/* Since we temporarily release lock_sys->mutex and
+	trx_sys->mutex when reading a database page below, the
+	variable trx may be obsolete by now, and we must loop
+	through the trx list to find what is probably the same
+	trx, or some other trx. */
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list), i = 0;
+ trx && (i < nth_trx);
+ trx = UT_LIST_GET_NEXT(trx_list, trx), i++) {
+
+ assert_trx_in_list(trx);
+ ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+ }
+
+ ut_ad(trx == NULL
+ || trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+ if (trx == NULL) {
+ /* Check the read-only transaction list next. */
+ if (trx_list == &trx_sys->rw_trx_list) {
+ trx_list = &trx_sys->ro_trx_list;
+ nth_trx = 0;
+ nth_lock = 0;
+ goto loop;
+ }
+
+ lock_mutex_exit();
+ mutex_exit(&trx_sys->mutex);
+
+ ut_ad(lock_validate());
+
+ return;
+ }
+
+ assert_trx_in_list(trx);
+
+ if (nth_lock == 0) {
+ fputs("---", file);
+
+ trx_print_latched(file, trx, 600);
+
+ if (trx->read_view) {
+ fprintf(file,
+ "Trx read view will not see trx with"
+ " id >= " TRX_ID_FMT
+ ", sees < " TRX_ID_FMT "\n",
+ trx->read_view->low_limit_id,
+ trx->read_view->up_limit_id);
+ }
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ fprintf(file,
+ "------- TRX HAS BEEN WAITING %lu SEC"
+ " FOR THIS LOCK TO BE GRANTED:\n",
+ (ulong) difftime(ut_time(),
+ trx->lock.wait_started));
+
+ if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+ lock_rec_print(file, trx->lock.wait_lock);
+ } else {
+ lock_table_print(file, trx->lock.wait_lock);
+ }
+
+ fputs("------------------\n", file);
+ }
+ }
+
+ if (!srv_print_innodb_lock_monitor) {
+ nth_trx++;
+ goto loop;
+ }
+
+ i = 0;
+
+	/* See the note above about the trx loop for why we loop
+	here: lock may be an obsolete pointer by now. */
+
+ lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+
+ while (lock && (i < nth_lock)) {
+ lock = UT_LIST_GET_NEXT(trx_locks, lock);
+ i++;
+ }
+
+ if (lock == NULL) {
+ nth_trx++;
+ nth_lock = 0;
+
+ goto loop;
+ }
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ if (load_page_first) {
+ ulint space = lock->un_member.rec_lock.space;
+			ulint zip_size = fil_space_get_zip_size(space);
+ ulint page_no = lock->un_member.rec_lock.page_no;
+ ibool tablespace_being_deleted = FALSE;
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+ /* It is a single table tablespace and
+ the .ibd file is missing (TRUNCATE
+ TABLE probably stole the locks): just
+ print the lock without attempting to
+ load the page in the buffer pool. */
+
+ fprintf(file, "RECORD LOCKS on"
+ " non-existing space %lu\n",
+ (ulong) space);
+ goto print_rec;
+ }
+
+ lock_mutex_exit();
+ mutex_exit(&trx_sys->mutex);
+
+ DEBUG_SYNC_C("innodb_monitor_before_lock_page_read");
+
+			/* Check whether the space exists. Only when
+			the space is valid do we try to get the page. */
+ tablespace_being_deleted = fil_inc_pending_ops(space, false);
+
+ if (!tablespace_being_deleted) {
+ mtr_start(&mtr);
+
+ buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr);
+
+ mtr_commit(&mtr);
+
+ fil_decr_pending_ops(space);
+ } else {
+ fprintf(file, "RECORD LOCKS on"
+ " non-existing space %lu\n",
+ (ulong) space);
+ }
+
+ load_page_first = FALSE;
+
+ lock_mutex_enter();
+
+ mutex_enter(&trx_sys->mutex);
+
+ goto loop;
+ }
+
+print_rec:
+ lock_rec_print(file, lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ lock_table_print(file, lock);
+ }
+
+ load_page_first = TRUE;
+
+ nth_lock++;
+
+ if (nth_lock >= 10) {
+ fputs("10 LOCKS PRINTED FOR THIS TRX:"
+ " SUPPRESSING FURTHER PRINTS\n",
+ file);
+
+ nth_trx++;
+ nth_lock = 0;
+ }
+
+ goto loop;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return TRUE if found */
+static
+ibool
+lock_trx_table_locks_find(
+/*======================*/
+ trx_t* trx, /*!< in: trx to validate */
+ const lock_t* find_lock) /*!< in: lock to find */
+{
+ lint i;
+ ibool found = FALSE;
+
+ trx_mutex_enter(trx);
+
+ for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+ const lock_t* lock;
+
+ lock = *static_cast<const lock_t**>(
+ ib_vector_get(trx->lock.table_locks, i));
+
+ if (lock == NULL) {
+ continue;
+ } else if (lock == find_lock) {
+ /* Can't be duplicates. */
+ ut_a(!found);
+ found = TRUE;
+ }
+
+ ut_a(trx == lock->trx);
+ ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_a(lock->un_member.tab_lock.table != NULL);
+ }
+
+ trx_mutex_exit(trx);
+
+ return(found);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ const lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+ /* lock->trx->state cannot change from or to NOT_STARTED
+ while we are holding the trx_sys->mutex. It may change
+ from ACTIVE to PREPARED, but it may not change to
+ COMMITTED, because we are holding the lock_sys->mutex. */
+ ut_ad(trx_assert_started(lock->trx));
+
+ if (!lock_get_wait(lock)) {
+
+ ut_a(!lock_table_other_has_incompatible(
+ lock->trx, 0, table,
+ lock_get_mode(lock)));
+ } else {
+
+ ut_a(lock_table_has_to_wait_in_queue(lock));
+ }
+
+ ut_a(lock_trx_table_locks_find(lock->trx, lock));
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+@return TRUE if ok */
+static
+ibool
+lock_rec_queue_validate(
+/*====================*/
+	ibool			locked_lock_trx_sys,
+					/*!< in: TRUE if the caller holds
+					both the lock mutex and
+					trx_sys_t::mutex */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record to look at */
+ const dict_index_t* index, /*!< in: index, or NULL if not known */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ const trx_t* impl_trx;
+ const lock_t* lock;
+ ulint heap_no;
+
+ ut_a(rec);
+ ut_a(block->frame == page_align(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ ut_ad(lock_mutex_own() == locked_lock_trx_sys);
+ ut_ad(!index || dict_index_is_clust(index)
+ || !dict_index_is_online_ddl(index));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ if (!locked_lock_trx_sys) {
+ lock_mutex_enter();
+ mutex_enter(&trx_sys->mutex);
+ }
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+
+ ut_a(trx_in_trx_list(lock->trx));
+
+ if (lock_get_wait(lock)) {
+ ut_a(lock_rec_has_to_wait_in_queue(lock));
+ }
+
+ if (index) {
+ ut_a(lock->index == index);
+ }
+ }
+
+ goto func_exit;
+ }
+
+	if (index != NULL && dict_index_is_clust(index)) {
+ trx_id_t trx_id;
+
+ /* Unlike the non-debug code, this invariant can only succeed
+ if the check and assertion are covered by the lock mutex. */
+
+ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+ impl_trx = trx_rw_is_active_low(trx_id, NULL);
+
+ ut_ad(lock_mutex_own());
+ /* impl_trx cannot be committed until lock_mutex_exit()
+ because lock_trx_release_locks() acquires lock_sys->mutex */
+
+ if (impl_trx != NULL
+ && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
+ block, heap_no, impl_trx)) {
+
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, impl_trx));
+ }
+ }
+
+ for (lock = lock_rec_get_first(block, heap_no);
+ lock != NULL;
+ lock = lock_rec_get_next_const(heap_no, lock)) {
+
+ ut_a(trx_in_trx_list(lock->trx));
+
+ if (index) {
+ ut_a(lock->index == index);
+ }
+
+ if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
+
+ enum lock_mode mode;
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ mode = LOCK_X;
+ } else {
+ mode = LOCK_S;
+ }
+ ut_a(!lock_rec_other_has_expl_req(
+ mode, 0, 0, block, heap_no, lock->trx));
+
+ } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
+
+ ut_a(lock_rec_has_to_wait_in_queue(lock));
+ }
+ }
+
+func_exit:
+ if (!locked_lock_trx_sys) {
+ lock_mutex_exit();
+ mutex_exit(&trx_sys->mutex);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const lock_t* lock;
+ const rec_t* rec;
+ ulint nth_lock = 0;
+ ulint nth_bit = 0;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!lock_mutex_own());
+
+ lock_mutex_enter();
+ mutex_enter(&trx_sys->mutex);
+loop:
+ lock = lock_rec_get_first_on_page_addr(buf_block_get_space(block),
+ buf_block_get_page_no(block));
+
+ if (!lock) {
+ goto function_exit;
+ }
+
+#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
+ ut_a(!block->page.file_page_was_freed);
+#endif
+
+ for (i = 0; i < nth_lock; i++) {
+
+ lock = lock_rec_get_next_on_page_const(lock);
+
+ if (!lock) {
+ goto function_exit;
+ }
+ }
+
+ ut_a(trx_in_trx_list(lock->trx));
+
+# ifdef UNIV_SYNC_DEBUG
+ /* Only validate the record queues when this thread is not
+ holding a space->latch. Deadlocks are possible due to
+ latching order violation when UNIV_DEBUG is defined while
+ UNIV_SYNC_DEBUG is not. */
+ if (!sync_thread_levels_contains(SYNC_FSP))
+# endif /* UNIV_SYNC_DEBUG */
+ for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
+
+ rec = page_find_rec_with_heap_no(block->frame, i);
+ ut_a(rec);
+ offsets = rec_get_offsets(rec, lock->index, offsets,
+ ULINT_UNDEFINED, &heap);
+#if 0
+ fprintf(stderr,
+ "Validating %u %u\n",
+ block->page.space, block->page.offset);
+#endif
+ /* If this thread is holding the file space
+ latch (fil_space_t::latch), the following
+ check WILL break the latching order and may
+ cause a deadlock of threads. */
+
+ lock_rec_queue_validate(
+ TRUE, block, rec, lock->index, offsets);
+
+ nth_bit = i + 1;
+
+ goto loop;
+ }
+ }
+
+ nth_bit = 0;
+ nth_lock++;
+
+ goto loop;
+
+function_exit:
+ lock_mutex_exit();
+ mutex_exit(&trx_sys->mutex);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the table locks.
+@return TRUE if ok */
+static
+ibool
+lock_validate_table_locks(
+/*======================*/
+ const trx_list_t* trx_list) /*!< in: trx list */
+{
+ const trx_t* trx;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_ad(trx_list == &trx_sys->rw_trx_list
+ || trx_list == &trx_sys->ro_trx_list);
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ const lock_t* lock;
+
+ assert_trx_in_list(trx);
+ ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+ for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ if (lock_get_type_low(lock) & LOCK_TABLE) {
+
+ lock_table_queue_validate(
+ lock->un_member.tab_lock.table);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validate record locks up to a limit.
+@return lock at limit or NULL if no more locks in the hash bucket */
+static __attribute__((nonnull, warn_unused_result))
+const lock_t*
+lock_rec_validate(
+/*==============*/
+ ulint start, /*!< in: lock_sys->rec_hash
+ bucket */
+ ib_uint64_t* limit) /*!< in/out: upper limit of
+ (space, page_no) */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ for (const lock_t* lock = static_cast<const lock_t*>(
+ HASH_GET_FIRST(lock_sys->rec_hash, start));
+ lock != NULL;
+ lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+ ib_uint64_t current;
+
+ ut_a(trx_in_trx_list(lock->trx));
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ current = ut_ull_create(
+ lock->un_member.rec_lock.space,
+ lock->un_member.rec_lock.page_no);
+
+ if (current > *limit) {
+ *limit = current + 1;
+ return(lock);
+ }
+ }
+
+ return(0);
+}
+
+/*********************************************************************//**
+Validate a record lock's block */
+static
+void
+lock_rec_block_validate(
+/*====================*/
+ ulint space,
+ ulint page_no)
+{
+ /* The lock and the block that it is referring to may be freed at
+ this point. We pass BUF_GET_POSSIBLY_FREED to skip a debug check.
+ If the lock exists in lock_rec_validate_page() we assert
+ !block->page.file_page_was_freed. */
+
+ buf_block_t* block;
+ mtr_t mtr;
+
+ /* Make sure that the tablespace is not deleted while we are
+ trying to access the page. */
+ if (!fil_inc_pending_ops(space, true)) {
+ mtr_start(&mtr);
+ block = buf_page_get_gen(
+ space, fil_space_get_zip_size(space),
+ page_no, RW_X_LATCH, NULL,
+ BUF_GET_POSSIBLY_FREED,
+ __FILE__, __LINE__, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ ut_ad(lock_rec_validate_page(block));
+ mtr_commit(&mtr);
+
+ fil_decr_pending_ops(space);
+ }
+}
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+bool
+lock_validate()
+/*===========*/
+{
+ typedef std::pair<ulint, ulint> page_addr_t;
+ typedef std::set<page_addr_t> page_addr_set;
+ page_addr_set pages;
+
+ lock_mutex_enter();
+ mutex_enter(&trx_sys->mutex);
+
+ ut_a(lock_validate_table_locks(&trx_sys->rw_trx_list));
+ ut_a(lock_validate_table_locks(&trx_sys->ro_trx_list));
+
+ /* Iterate over all the record locks and validate the locks. We
+ don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex.
+ Release both mutexes during the validation check. */
+
+ for (ulint i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+ const lock_t* lock;
+ ib_uint64_t limit = 0;
+
+ while ((lock = lock_rec_validate(i, &limit)) != 0) {
+
+ ulint space = lock->un_member.rec_lock.space;
+ ulint page_no = lock->un_member.rec_lock.page_no;
+
+ pages.insert(std::make_pair(space, page_no));
+ }
+ }
+
+ mutex_exit(&trx_sys->mutex);
+ lock_mutex_exit();
+
+ for (page_addr_set::const_iterator it = pages.begin();
+ it != pages.end();
+ ++it) {
+ lock_rec_block_validate((*it).first, (*it).second);
+ }
+
+ return(true);
+}
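+
+/* Note on the shape of lock_validate(): it is a two-phase walk. Phase
+one snapshots the (space, page_no) addresses of all record locks while
+holding both mutexes; phase two revisits each page via
+lock_rec_block_validate() with the mutexes released, so validation does
+not hog lock_sys->mutex while pages are read into the buffer pool. */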
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ibool* inherit)/*!< out: set to TRUE if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+{
+ const rec_t* next_rec;
+ trx_t* trx;
+ lock_t* lock;
+ dberr_t err;
+ ulint next_rec_heap_no;
+ ibool inherit_in = *inherit;
+
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+ next_rec = page_rec_get_next_const(rec);
+ next_rec_heap_no = page_rec_get_heap_no(next_rec);
+
+ lock_mutex_enter();
+ /* Because this code is invoked for a running transaction by
+ the thread that is serving the transaction, it is not necessary
+ to hold trx->mutex here. */
+
+ /* When inserting a record into an index, the table must be at
+ least IX-locked. When we are building an index, we would pass
+ BTR_NO_LOCKING_FLAG and skip the locking altogether. */
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX));
+
+ lock = lock_rec_get_first(block, next_rec_heap_no);
+
+ if (UNIV_LIKELY(lock == NULL)) {
+ /* We optimize CPU time usage in the simplest case */
+
+ lock_mutex_exit();
+
+ if (inherit_in && !dict_index_is_clust(index)) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+ }
+
+ *inherit = FALSE;
+
+ return(DB_SUCCESS);
+ }
+
+ *inherit = TRUE;
+
+ /* If another transaction has an explicit lock request which locks
+ the gap, waiting or granted, on the successor, the insert has to wait.
+
+	An exception is the case where the lock by the other transaction
+	is a gap type lock which it placed to wait for its turn to insert. We
+	do not consider that kind of lock conflicting with our insert. This
+	eliminates an unnecessary deadlock which used to arise when two
+	transactions had to wait for their inserts: both had waiting gap type
+	lock requests on the successor, which produced the deadlock. */
+
+ if (lock_rec_other_has_conflicting(
+ static_cast<enum lock_mode>(
+ LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION),
+ block, next_rec_heap_no, trx)) {
+
+ /* Note that we may get DB_SUCCESS also here! */
+ trx_mutex_enter(trx);
+
+ err = lock_rec_enqueue_waiting(
+ LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+ block, next_rec_heap_no, index, thr);
+
+ trx_mutex_exit(trx);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (!inherit_in || dict_index_is_clust(index)) {
+ break;
+ }
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+		/* fall through */
+	default:
+ /* We only care about the two return values. */
+ break;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(next_rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ FALSE, block, next_rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ return(err);
+}
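+
+/* An illustrative caller sketch (hedged; names and context assumed
+from a typical b-tree insert path, not taken from this file): the
+caller checks the gap on the successor record before the physical
+insert and acts on the returned inherit flag afterwards:
+
+	ibool	inherit = FALSE;
+	dberr_t	err = lock_rec_insert_check_and_lock(
+		0, rec, block, index, thr, mtr, &inherit);
+	(on DB_SUCCESS with inherit == TRUE, the new record should
+	inherit LOCK_GAP type locks from the successor)
+*/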
+
+/*********************************************************************//**
+If a transaction has an implicit x-lock on a record, but no explicit x-lock
+set on the record, sets one for it. */
+static
+void
+lock_rec_convert_impl_to_expl(
+/*==========================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record on page */
+ dict_index_t* index, /*!< in: index of record */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_id_t trx_id;
+
+ ut_ad(!lock_mutex_own());
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+
+ if (dict_index_is_clust(index)) {
+ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+ /* The clustered index record was last modified by
+ this transaction. The transaction may have been
+ committed a long time ago. */
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index));
+ trx_id = lock_sec_rec_some_has_impl(rec, index, offsets);
+ /* The transaction can be committed before the
+ trx_is_active(trx_id, NULL) check below, because we are not
+ holding lock_mutex. */
+
+ ut_ad(!lock_rec_other_trx_holds_expl(LOCK_S | LOCK_REC_NOT_GAP,
+ trx_id, rec, block));
+ }
+
+ if (trx_id != 0) {
+ trx_t* impl_trx;
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter();
+
+ /* If the transaction is still active and has no
+ explicit x-lock set on the record, set one for it */
+
+ impl_trx = trx_rw_is_active(trx_id, NULL);
+
+ /* impl_trx cannot be committed until lock_mutex_exit()
+ because lock_trx_release_locks() acquires lock_sys->mutex */
+
+ if (impl_trx != NULL
+ && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
+ heap_no, impl_trx)) {
+ ulint type_mode = (LOCK_REC | LOCK_X
+ | LOCK_REC_NOT_GAP);
+
+ lock_rec_add_to_queue(
+ type_mode, block, heap_no, index,
+ impl_trx, FALSE);
+ }
+
+ lock_mutex_exit();
+ }
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = rec_offs_comp(offsets)
+ ? rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec);
+
+ /* If a transaction has no explicit x-lock set on the record, set one
+ for it */
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+
+ lock_mutex_enter();
+
+ ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+ lock_mutex_exit();
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (delete
+mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record. */
+
+ lock_mutex_enter();
+
+ ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+ lock_mutex_exit();
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ FALSE, block, rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ /* Update the page max trx id field */
+ /* It might not be necessary to do this if
+ err == DB_SUCCESS (no new lock created),
+ but it should not cost too much performance. */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ thr_get_trx(thr)->id, mtr);
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list or a
+ database recovery is running. */
+
+ if ((page_get_max_trx_id(block->frame) >= trx_rw_min_trx_id()
+ || recv_recovery_is_on())
+ && !page_rec_is_supremum(rec)) {
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+ }
+
+ lock_mutex_enter();
+
+ ut_ad(mode != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad(mode != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+ err = lock_rec_lock(FALSE, mode | gap_mode,
+ block, heap_no, index, thr);
+
+ MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+ lock_mutex_exit();
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+ }
+
+ lock_mutex_enter();
+
+ ut_ad(mode != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad(mode != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+ err = lock_rec_lock(FALSE, mode | gap_mode,
+ block, heap_no, index, thr);
+
+ MONITOR_INC(MONITOR_NUM_RECLOCK_REQ);
+
+ lock_mutex_exit();
+
+ ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ dberr_t err;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+ err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
+ offsets, mode, gap_mode, thr);
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
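+
+/* Hedged usage sketch: this _alt variant spares callers that do not
+already have an offsets array from calling rec_get_offsets()
+themselves, at the cost of computing the offsets here. A caller that
+has just positioned a cursor (context assumed) could do:
+
+	err = lock_clust_rec_read_check_and_lock_alt(
+		0, block, rec, index, LOCK_X, LOCK_REC_NOT_GAP, thr);
+*/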
+
+/*******************************************************************//**
+Release the last lock from the transaction's autoinc locks. */
+UNIV_INLINE
+void
+lock_release_autoinc_last_lock(
+/*===========================*/
+ ib_vector_t* autoinc_locks) /*!< in/out: vector of AUTOINC locks */
+{
+ ulint last;
+ lock_t* lock;
+
+ ut_ad(lock_mutex_own());
+ ut_a(!ib_vector_is_empty(autoinc_locks));
+
+	/* The lock to be released must be the last lock acquired. */
+ last = ib_vector_size(autoinc_locks) - 1;
+ lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last));
+
+ /* Should have only AUTOINC locks in the vector. */
+ ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
+ ut_a(lock_get_type(lock) == LOCK_TABLE);
+
+ ut_a(lock->un_member.tab_lock.table != NULL);
+
+ /* This will remove the lock from the trx autoinc_locks too. */
+ lock_table_dequeue(lock);
+
+ /* Remove from the table vector too. */
+ lock_trx_table_locks_remove(lock);
+}
+
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+static
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->autoinc_locks != NULL);
+
+ return(!ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+static
+void
+lock_release_autoinc_locks(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_mutex_own());
+ /* If this is invoked for a running transaction by the thread
+ that is serving the transaction, then it is not necessary to
+ hold trx->mutex here. */
+
+ ut_a(trx->autoinc_locks != NULL);
+
+	/* We release the locks in reverse order. This avoids
+	searching the vector for the element to delete at
+	the lower level. See lock_table_remove_low() for details. */
+ while (!ib_vector_is_empty(trx->autoinc_locks)) {
+
+ /* lock_table_remove_low() will also remove the lock from
+ the transaction's autoinc_locks vector. */
+ lock_release_autoinc_last_lock(trx->autoinc_locks);
+ }
+
+ /* Should release all locks. */
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock_get_type_low(lock));
+}
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+UNIV_INTERN
+trx_id_t
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock->trx->id);
+}
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ibool is_gap_lock;
+
+ is_gap_lock = lock_get_type_low(lock) == LOCK_REC
+ && lock_rec_get_gap(lock);
+
+ switch (lock_get_mode(lock)) {
+ case LOCK_S:
+ if (is_gap_lock) {
+ return("S,GAP");
+ } else {
+ return("S");
+ }
+ case LOCK_X:
+ if (is_gap_lock) {
+ return("X,GAP");
+ } else {
+ return("X");
+ }
+ case LOCK_IS:
+ if (is_gap_lock) {
+ return("IS,GAP");
+ } else {
+ return("IS");
+ }
+ case LOCK_IX:
+ if (is_gap_lock) {
+ return("IX,GAP");
+ } else {
+ return("IX");
+ }
+ case LOCK_AUTO_INC:
+ return("AUTO_INC");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ switch (lock_get_type_low(lock)) {
+ case LOCK_REC:
+ return("RECORD");
+ case LOCK_TABLE:
+ return("TABLE");
+ default:
+ return("UNKNOWN");
+ }
+}
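+
+/* These string accessors exist for consumers outside the lock module,
+such as the INFORMATION_SCHEMA population code. A hedged sketch, with
+the row and field names assumed:
+
+	row->lock_mode = lock_get_mode_str(lock);
+	row->lock_type = lock_get_type_str(lock);
+
+The returned strings are static and must not be freed or modified. */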
+
+/*******************************************************************//**
+Gets the table on which the lock is.
+@return table */
+UNIV_INLINE
+dict_table_t*
+lock_get_table(
+/*===========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ switch (lock_get_type_low(lock)) {
+ case LOCK_REC:
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+ return(lock->index->table);
+ case LOCK_TABLE:
+ return(lock->un_member.tab_lock.table);
+ default:
+ ut_error;
+ return(NULL);
+ }
+}
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+UNIV_INTERN
+table_id_t
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ dict_table_t* table;
+
+ table = lock_get_table(lock);
+
+ return(table->id);
+}
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ dict_table_t* table;
+
+ table = lock_get_table(lock);
+
+ return(table->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+
+ return(lock->index);
+}
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(dict_index_is_clust(lock->index)
+ || !dict_index_is_online_ddl(lock->index));
+
+ return(lock->index->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->un_member.rec_lock.space);
+}
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->un_member.rec_lock.page_no);
+}
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock) /*!< in/out: waiting lock request */
+{
+ que_thr_t* thr;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(lock->trx));
+
+ lock->trx->lock.cancel = TRUE;
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ if (lock->trx->autoinc_locks != NULL) {
+ /* Release the transaction's AUTOINC locks. */
+ lock_release_autoinc_locks(lock->trx);
+ }
+
+ lock_table_dequeue(lock);
+ }
+
+ /* Reset the wait flag and the back pointer to lock in trx. */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait. */
+
+ thr = que_thr_end_lock_wait(lock->trx);
+
+ if (thr != NULL) {
+ lock_wait_release_thread_if_suspended(thr);
+ }
+
+ lock->trx->lock.cancel = FALSE;
+}
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(!lock_mutex_own());
+ ut_ad(!trx_mutex_own(trx));
+ ut_ad(!trx->lock.wait_lock);
+ /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+ but not COMMITTED transactions. */
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+ || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+ /* This function is invoked for a running transaction by the
+ thread that is serving the transaction. Therefore it is not
+ necessary to hold trx->mutex here. */
+
+ if (lock_trx_holds_autoinc_locks(trx)) {
+ lock_mutex_enter();
+
+ lock_release_autoinc_locks(trx);
+
+ lock_mutex_exit();
+ }
+}
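+
+/* Hedged caller sketch (context assumed, not taken from this file):
+the connection thread that owns trx->mysql_thd invokes this once at
+the end of each SQL statement, typically via a thin row-layer wrapper:
+
+	lock_unlock_table_autoinc(trx);
+*/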
+
+/*********************************************************************//**
+Releases a transaction's locks, and releases possible other transactions
+waiting because of these locks. Changes the state of the transaction to
+TRX_STATE_COMMITTED_IN_MEMORY. */
+UNIV_INTERN
+void
+lock_trx_release_locks(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ assert_trx_in_list(trx);
+
+ if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
+ mutex_enter(&trx_sys->mutex);
+ ut_a(trx_sys->n_prepared_trx > 0);
+ trx_sys->n_prepared_trx--;
+ if (trx->is_recovered) {
+ ut_a(trx_sys->n_prepared_recovered_trx > 0);
+ trx_sys->n_prepared_recovered_trx--;
+ }
+ mutex_exit(&trx_sys->mutex);
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ }
+
+ /* The transition of trx->state to TRX_STATE_COMMITTED_IN_MEMORY
+ is protected by both the lock_sys->mutex and the trx->mutex. */
+ lock_mutex_enter();
+ trx_mutex_enter(trx);
+
+ /* The following assignment makes the transaction committed in memory
+ and makes its changes to data visible to other transactions.
+ NOTE that there is a small discrepancy from the strict formal
+ visibility rules here: a human user of the database can see
+ modifications made by another transaction T even before the necessary
+ log segment has been flushed to the disk. If the database happens to
+ crash before the flush, the user has seen modifications from T which
+ will never be a committed transaction. However, any transaction T2
+ which sees the modifications of the committing transaction T, and
+ which also itself makes modifications to the database, will get an lsn
+ larger than the committing transaction T. In the case where the log
+ flush fails, and T never gets committed, also T2 will never get
+ committed. */
+
+ /*--------------------------------------*/
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+ /*--------------------------------------*/
+
+	/* If the background thread trx_rollback_or_clean_recovered()
+	is still active then there is a chance that the rollback
+	thread may see this trx as COMMITTED_IN_MEMORY and go ahead
+	to clean it up by calling trx_cleanup_at_db_startup(). This can
+	happen when we are committing a trx here that was left in
+	PREPARED state at the time of the crash. Note that the commit or
+	rollback of a PREPARED trx happens in the recovery thread,
+	while the rollback of other transactions happens in the
+	background thread. To avoid this race we unconditionally unset
+	the is_recovered flag. */
+
+ trx->is_recovered = FALSE;
+
+ trx_mutex_exit(trx);
+
+ lock_release(trx);
+
+ lock_mutex_exit();
+}
+
+/*********************************************************************//**
+Check whether the transaction has already been rolled back because it
+was selected as a deadlock victim, or if it has to wait then cancel
+the wait lock.
+@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+lock_trx_handle_wait(
+/*=================*/
+ trx_t* trx) /*!< in/out: trx lock state */
+{
+ dberr_t err;
+
+ lock_mutex_enter();
+
+ trx_mutex_enter(trx);
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+ err = DB_DEADLOCK;
+ } else if (trx->lock.wait_lock != NULL) {
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+ err = DB_LOCK_WAIT;
+ } else {
+ /* The lock was probably granted before we got here. */
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit();
+ trx_mutex_exit(trx);
+
+ return(err);
+}
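+
+/* Hedged sketch of how a caller would handle the three possible
+return values (caller context assumed, not taken from this file):
+
+	switch (lock_trx_handle_wait(trx)) {
+	case DB_DEADLOCK:	(we were chosen as the victim: roll back)
+	case DB_LOCK_WAIT:	(the wait lock was cancelled)
+	case DB_SUCCESS:	(the lock had already been granted)
+	}
+*/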
+
+/*********************************************************************//**
+Get the number of locks on a table.
+@return number of locks */
+UNIV_INTERN
+ulint
+lock_table_get_n_locks(
+/*===================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ulint n_table_locks;
+
+ lock_mutex_enter();
+
+ n_table_locks = UT_LIST_GET_LEN(table->locks);
+
+ lock_mutex_exit();
+
+ return(n_table_locks);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Do an exhaustive check for any locks (table or rec) against the table.
+@return lock if found */
+static
+const lock_t*
+lock_table_locks_lookup(
+/*====================*/
+ const dict_table_t* table, /*!< in: check if there are
+ any locks held on records in
+ this table or on the table
+ itself */
+ const trx_list_t* trx_list) /*!< in: trx list to check */
+{
+ trx_t* trx;
+
+ ut_a(table != NULL);
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_ad(trx_list == &trx_sys->rw_trx_list
+ || trx_list == &trx_sys->ro_trx_list);
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ const lock_t* lock;
+
+ assert_trx_in_list(trx);
+ ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+ for (lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
+ lock != NULL;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+
+ ut_a(lock->trx == trx);
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ ut_ad(!dict_index_is_online_ddl(lock->index)
+ || dict_index_is_clust(lock->index));
+ if (lock->index->table == table) {
+ return(lock);
+ }
+ } else if (lock->un_member.tab_lock.table == table) {
+ return(lock);
+ }
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Check if there are any locks (table or rec) against table.
+@return TRUE if table has either table or record locks. */
+UNIV_INTERN
+ibool
+lock_table_has_locks(
+/*=================*/
+ const dict_table_t* table) /*!< in: check if there are any locks
+ held on records in this table or on the
+ table itself */
+{
+ ibool has_locks;
+
+ lock_mutex_enter();
+
+ has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0;
+
+#ifdef UNIV_DEBUG
+ if (!has_locks) {
+ mutex_enter(&trx_sys->mutex);
+
+ ut_ad(!lock_table_locks_lookup(table, &trx_sys->rw_trx_list));
+ ut_ad(!lock_table_locks_lookup(table, &trx_sys->ro_trx_list));
+
+ mutex_exit(&trx_sys->mutex);
+ }
+#endif /* UNIV_DEBUG */
+
+ lock_mutex_exit();
+
+ return(has_locks);
+}
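+
+/* Hedged usage sketch: a background task that wants to drop or evict
+a table (context assumed) can use this as a pre-check before doing
+the more expensive work:
+
+	if (!lock_table_has_locks(table)) {
+		(safe to proceed: no table or record locks exist)
+	}
+*/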
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+UNIV_INTERN
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction to check */
+{
+ lint i;
+ const lock_t* strongest_lock = 0;
+ lock_mode strongest = LOCK_NONE;
+
+ lock_mutex_enter();
+
+ /* Find a valid mode. Note: ib_vector_size() can be 0. */
+ for (i = ib_vector_size(trx->lock.table_locks) - 1; i >= 0; --i) {
+ const lock_t* lock;
+
+ lock = *static_cast<const lock_t**>(
+ ib_vector_get(trx->lock.table_locks, i));
+
+ if (lock != NULL
+ && dict_is_sys_table(lock->un_member.tab_lock.table->id)) {
+
+ strongest = lock_get_mode(lock);
+ ut_ad(strongest != LOCK_NONE);
+ strongest_lock = lock;
+ break;
+ }
+ }
+
+ if (strongest == LOCK_NONE) {
+ lock_mutex_exit();
+ return(NULL);
+ }
+
+ for (/* No op */; i >= 0; --i) {
+ const lock_t* lock;
+
+ lock = *static_cast<const lock_t**>(
+ ib_vector_get(trx->lock.table_locks, i));
+
+ if (lock == NULL) {
+ continue;
+ }
+
+ ut_ad(trx == lock->trx);
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(lock->un_member.tab_lock.table != NULL);
+
+ lock_mode mode = lock_get_mode(lock);
+
+ if (dict_is_sys_table(lock->un_member.tab_lock.table->id)
+ && lock_mode_stronger_or_eq(mode, strongest)) {
+
+ strongest = mode;
+ strongest_lock = lock;
+ }
+ }
+
+ lock_mutex_exit();
+
+ return(strongest_lock);
+}
+
+/*******************************************************************//**
+Check if the transaction holds an exclusive lock on a record.
+@return whether the locks are held */
+UNIV_INTERN
+bool
+lock_trx_has_rec_x_lock(
+/*====================*/
+ const trx_t* trx, /*!< in: transaction to check */
+ const dict_table_t* table, /*!< in: table to check */
+ const buf_block_t* block, /*!< in: buffer block of the record */
+ ulint heap_no)/*!< in: record heap number */
+{
+ ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM);
+
+ lock_mutex_enter();
+ ut_a(lock_table_has(trx, table, LOCK_IX));
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, trx));
+ lock_mutex_exit();
+ return(true);
+}
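+
+/* This helper only exists under UNIV_DEBUG; a hedged sketch of the
+intended use is a debug assertion at a point where the caller believes
+it holds the record x-lock (context assumed):
+
+	ut_ad(lock_trx_has_rec_x_lock(trx, index->table, block, heap_no));
+*/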
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
new file mode 100644
index 00000000000..a1c35e20ead
--- /dev/null
+++ b/storage/innobase/lock/lock0wait.cc
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0wait.cc
+The transaction lock system
+
+Created 25/5/2010 Sunny Bains
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "srv0mon.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "ha_prototypes.h"
+#include "lock0priv.h"
+
+/*********************************************************************//**
+Print the contents of the lock_sys_t::waiting_threads array. */
+static
+void
+lock_wait_table_print(void)
+/*=======================*/
+{
+ ulint i;
+ const srv_slot_t* slot;
+
+ ut_ad(lock_wait_mutex_own());
+
+ slot = lock_sys->waiting_threads;
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++, ++slot) {
+
+ fprintf(stderr,
+ "Slot %lu: thread type %lu,"
+ " in use %lu, susp %lu, timeout %lu, time %lu\n",
+ (ulong) i,
+ (ulong) slot->type,
+ (ulong) slot->in_use,
+ (ulong) slot->suspended,
+ slot->wait_timeout,
+ (ulong) difftime(ut_time(), slot->suspend_time));
+ }
+}
+
+/*********************************************************************//**
+Release a slot in lock_sys_t::waiting_threads. Adjust the last-slot
+pointer if there are empty slots towards the end of the array. */
+static
+void
+lock_wait_table_release_slot(
+/*=========================*/
+ srv_slot_t* slot) /*!< in: slot to release */
+{
+#ifdef UNIV_DEBUG
+ srv_slot_t* upper = lock_sys->waiting_threads + OS_THREAD_MAX_N;
+#endif /* UNIV_DEBUG */
+
+ lock_wait_mutex_enter();
+
+ ut_ad(slot->in_use);
+ ut_ad(slot->thr != NULL);
+ ut_ad(slot->thr->slot != NULL);
+ ut_ad(slot->thr->slot == slot);
+
+ /* Must be within the array boundaries. */
+ ut_ad(slot >= lock_sys->waiting_threads);
+ ut_ad(slot < upper);
+
+ /* Note: When we reserve the slot we use the trx_t::mutex to update
+ the slot values to change the state to reserved. Here we are using the
+ lock mutex to change the state of the slot to free. This is by design,
+	because when we query the slot state we always hold both the lock
+	mutex and trx_t::mutex. To reduce contention on the lock mutex, we
+	avoid acquiring it when reserving the slot. */
+
+ lock_mutex_enter();
+
+ slot->thr->slot = NULL;
+ slot->thr = NULL;
+ slot->in_use = FALSE;
+
+ lock_mutex_exit();
+
+ /* Scan backwards and adjust the last free slot pointer. */
+ for (slot = lock_sys->last_slot;
+ slot > lock_sys->waiting_threads && !slot->in_use;
+ --slot) {
+ /* No op */
+ }
+
+ /* Either the array is empty or the last scanned slot is in use. */
+ ut_ad(slot->in_use || slot == lock_sys->waiting_threads);
+
+ lock_sys->last_slot = slot + 1;
+
+	/* The last slot either lies outside of the array boundary or
+	points to an empty slot. */
+ ut_ad(lock_sys->last_slot == upper || !lock_sys->last_slot->in_use);
+
+ ut_ad(lock_sys->last_slot >= lock_sys->waiting_threads);
+ ut_ad(lock_sys->last_slot <= upper);
+
+ lock_wait_mutex_exit();
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current user OS thread.
+@return reserved slot */
+static
+srv_slot_t*
+lock_wait_table_reserve_slot(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread associated
+ with the user OS thread */
+ ulong wait_timeout) /*!< in: lock wait timeout value */
+{
+ ulint i;
+ srv_slot_t* slot;
+
+ ut_ad(lock_wait_mutex_own());
+ ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+ slot = lock_sys->waiting_threads;
+
+ for (i = OS_THREAD_MAX_N; i--; ++slot) {
+ if (!slot->in_use) {
+ slot->in_use = TRUE;
+ slot->thr = thr;
+ slot->thr->slot = slot;
+
+ if (slot->event == NULL) {
+ slot->event = os_event_create();
+ ut_a(slot->event);
+ }
+
+ os_event_reset(slot->event);
+ slot->suspended = TRUE;
+ slot->suspend_time = ut_time();
+ slot->wait_timeout = wait_timeout;
+
+ if (slot == lock_sys->last_slot) {
+ ++lock_sys->last_slot;
+ }
+
+ ut_ad(lock_sys->last_slot
+ <= lock_sys->waiting_threads + OS_THREAD_MAX_N);
+
+ return(slot);
+ }
+ }
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: There appear to be %lu user"
+ " threads currently waiting\n"
+ "InnoDB: inside InnoDB, which is the"
+ " upper limit. Cannot continue operation.\n"
+ "InnoDB: As a last thing, we print"
+ " a list of waiting threads.\n", (ulong) OS_THREAD_MAX_N);
+
+ lock_wait_table_print();
+
+ ut_error;
+ return(NULL);
+}
+
+/***************************************************************//**
+Puts a user OS thread to wait for a lock to be released. If an error
+occurs during the wait, trx->error_state associated with thr will be
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+lock_wait_suspend_thread(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread associated with the
+ user OS thread */
+{
+ srv_slot_t* slot;
+ double wait_time;
+ trx_t* trx;
+ ulint had_dict_lock;
+ ibool was_declared_inside_innodb;
+ ib_int64_t start_time = 0;
+ ib_int64_t finish_time;
+ ulint sec;
+ ulint ms;
+ ulong lock_wait_timeout;
+
+ trx = thr_get_trx(thr);
+
+ if (trx->mysql_thd != 0) {
+ DEBUG_SYNC_C("lock_wait_suspend_thread_enter");
+ }
+
+ /* InnoDB system transactions (such as the purge, and
+ incomplete transactions that are being rolled back after crash
+ recovery) will use the global value of
+ innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+ lock_wait_timeout = trx_lock_wait_timeout_get(trx);
+
+ lock_wait_mutex_enter();
+
+ trx_mutex_enter(trx);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ ut_ad(thr->is_active);
+
+ /* The lock has already been released or this transaction
+ was chosen as a deadlock victim: no need to suspend */
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->lock.was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ lock_wait_mutex_exit();
+ trx_mutex_exit(trx);
+ return;
+ }
+
+ ut_ad(!thr->is_active);
+
+ slot = lock_wait_table_reserve_slot(thr, lock_wait_timeout);
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ srv_stats.n_lock_wait_count.inc();
+ srv_stats.n_lock_wait_current_count.inc();
+
+ if (ut_usectime(&sec, &ms) == -1) {
+ start_time = -1;
+ } else {
+ start_time = (ib_int64_t) sec * 1000000 + ms;
+ }
+ }
+
+ /* Wake the lock timeout monitor thread, if it is suspended */
+
+ os_event_set(lock_sys->timeout_event);
+
+ lock_wait_mutex_exit();
+ trx_mutex_exit(trx);
+
+ ulint lock_type = ULINT_UNDEFINED;
+
+ lock_mutex_enter();
+
+ if (const lock_t* wait_lock = trx->lock.wait_lock) {
+ lock_type = lock_get_type_low(wait_lock);
+ }
+
+ lock_mutex_exit();
+
+ had_dict_lock = trx->dict_operation_lock_mode;
+
+ switch (had_dict_lock) {
+ case 0:
+ break;
+ case RW_S_LATCH:
+ /* Release foreign key check latch */
+ row_mysql_unfreeze_data_dictionary(trx);
+
+ DEBUG_SYNC_C("lock_wait_release_s_latch_before_sleep");
+ break;
+ default:
+ /* There should never be a lock wait when the
+ dictionary latch is reserved in X mode. Dictionary
+ transactions should only acquire locks on dictionary
+ tables, not other tables. All access to dictionary
+ tables should be covered by dictionary
+ transactions. */
+ ut_error;
+ }
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ /* Suspend this thread and wait for the event. */
+
+ was_declared_inside_innodb = trx->declared_to_be_inside_innodb;
+
+ if (was_declared_inside_innodb) {
+ /* We must declare this OS thread to exit InnoDB, since a
+ possible other thread holding a lock which this thread waits
+ for must be allowed to enter, sooner or later */
+
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ /* Unknown is also treated like a record lock */
+ if (lock_type == ULINT_UNDEFINED || lock_type == LOCK_REC) {
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
+ } else {
+ ut_ad(lock_type == LOCK_TABLE);
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_TABLE_LOCK);
+ }
+
+ os_event_wait(slot->event);
+
+ thd_wait_end(trx->mysql_thd);
+
+ /* After resuming, reacquire the data dictionary latch if
+ necessary. */
+
+ if (was_declared_inside_innodb) {
+
+ /* Return back inside InnoDB */
+
+ srv_conc_force_enter_innodb(trx);
+ }
+
+ if (had_dict_lock) {
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ /* Release the slot for others to use */
+
+ lock_wait_table_release_slot(slot);
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ ulint diff_time;
+
+ if (ut_usectime(&sec, &ms) == -1) {
+ finish_time = -1;
+ } else {
+ finish_time = (ib_int64_t) sec * 1000000 + ms;
+ }
+
+ diff_time = (finish_time > start_time) ?
+ (ulint) (finish_time - start_time) : 0;
+
+ srv_stats.n_lock_wait_current_count.dec();
+ srv_stats.n_lock_wait_time.add(diff_time);
+
+ /* Only update the variable if we successfully
+ retrieved the start and finish times. See Bug#36819. */
+ if (diff_time > lock_sys->n_lock_max_wait_time
+ && start_time != -1
+ && finish_time != -1) {
+
+ lock_sys->n_lock_max_wait_time = diff_time;
+ }
+
+ /* Record the lock wait time for this thread */
+ thd_set_lock_wait_time(trx->mysql_thd, diff_time);
+
+ }
+
+ if (lock_wait_timeout < 100000000
+ && wait_time > (double) lock_wait_timeout) {
+
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+
+ MONITOR_INC(MONITOR_TIMEOUT);
+ }
+
+ if (trx_is_interrupted(trx)) {
+
+ trx->error_state = DB_INTERRUPTED;
+ }
+}
+
+/********************************************************************//**
+Releases a user OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+lock_wait_release_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr) /*!< in: query thread associated with the
+ user OS thread */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(thr_get_trx(thr)));
+
+ /* We own both the lock mutex and the trx_t::mutex but not the
+ lock wait mutex. This is OK because other threads will see the state
+ of this slot as being in use and no other thread can change the state
+ of the slot to free unless that thread also owns the lock mutex. */
+
+ if (thr->slot != NULL && thr->slot->in_use && thr->slot->thr == thr) {
+ trx_t* trx = thr_get_trx(thr);
+
+ if (trx->lock.was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->lock.was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ os_event_set(thr->slot->event);
+ }
+}
+
+/*********************************************************************//**
+Check if the thread lock wait has timed out. Release its locks if the
+wait has actually timed out. */
+static
+void
+lock_wait_check_and_cancel(
+/*=======================*/
+ const srv_slot_t* slot) /*!< in: slot reserved by a user
+ thread when the wait started */
+{
+ trx_t* trx;
+ double wait_time;
+ ib_time_t suspend_time = slot->suspend_time;
+
+ ut_ad(lock_wait_mutex_own());
+
+ ut_ad(slot->in_use);
+
+ ut_ad(slot->suspended);
+
+ wait_time = ut_difftime(ut_time(), suspend_time);
+
+ trx = thr_get_trx(slot->thr);
+
+ if (trx_is_interrupted(trx)
+ || (slot->wait_timeout < 100000000
+ && (wait_time > (double) slot->wait_timeout
+ || wait_time < 0))) {
+
+		/* The timeout was exceeded, or the system time counter
+		wrapped around: cancel the lock request queued by the
+		transaction, and release any other transactions waiting
+		behind it. It is possible that the lock has already
+		been granted: in that case do nothing. */
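+		/* Note that a wait_timeout of 100000000 or more acts
+		as "no timeout" here; lock_wait_suspend_thread() above
+		checks the same sentinel value. */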
+
+ lock_mutex_enter();
+
+ trx_mutex_enter(trx);
+
+ if (trx->lock.wait_lock) {
+
+ ut_a(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+
+ lock_cancel_waiting_and_release(trx->lock.wait_lock);
+ }
+
+ lock_mutex_exit();
+
+ trx_mutex_exit(trx);
+ }
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(lock_wait_timeout_thread)(
+/*=====================================*/
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ ib_int64_t sig_count = 0;
+ os_event_t event = lock_sys->timeout_event;
+
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_lock_timeout_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ lock_sys->timeout_thread_active = true;
+
+ do {
+ srv_slot_t* slot;
+
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_event_wait_time_low(event, 1000000, sig_count);
+ sig_count = os_event_reset(event);
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ break;
+ }
+
+ lock_wait_mutex_enter();
+
+		/* Check all slots for user threads that are waiting
+		on locks, and check whether they have exceeded the
+		time limit. */
+
+ for (slot = lock_sys->waiting_threads;
+ slot < lock_sys->last_slot;
+ ++slot) {
+
+ /* We are doing a read without the lock mutex
+ and/or the trx mutex. This is OK because a slot
+ can't be freed or reserved without the lock wait
+ mutex. */
+
+ if (slot->in_use) {
+ lock_wait_check_and_cancel(slot);
+ }
+ }
+
+ sig_count = os_event_reset(event);
+
+ lock_wait_mutex_exit();
+
+ } while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+
+ lock_sys->timeout_thread_active = false;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
new file mode 100644
index 00000000000..d0e0453849e
--- /dev/null
+++ b/storage/innobase/log/log0log.cc
@@ -0,0 +1,3739 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.cc
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "log0log.h"
+
+#ifdef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "srv0srv.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0boot.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0mon.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+1) Every change to the contents of a data page must be made
+through an mtr (mini-transaction), which in mtr_commit() writes
+log records to the InnoDB redo log.
+
+2) Normally these changes are performed using a mlog_write_ulint()
+or similar function.
+
+3) In some page-level operations, only a code number identifying
+a C function, together with its parameters, is written to the log,
+to reduce the size of the log.
+
+	3a) You should not add parameters to these kinds of functions
+	(e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()).
+
+	3b) You should not add functionality that changes the behavior
+	compared with the old one, or that depends on data outside of
+	the page. These functions should implement a self-contained
+	page transformation, and they should remain unchanged unless
+	you have a very essential reason to change the log semantics
+	or format.
+
+*/
+
+/* Global log system variable */
+UNIV_INTERN log_t* log_sys = NULL;
+
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t checkpoint_lock_key;
+# ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN mysql_pfs_key_t archive_lock_key;
+# endif
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t log_sys_mutex_key;
+UNIV_INTERN mysql_pfs_key_t log_flush_order_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+
+/* These control how often we print warnings if the last checkpoint is too
+old */
+UNIV_INTERN ibool log_has_printed_chkp_warning = FALSE;
+UNIV_INTERN time_t log_last_warning_time;
+
+#ifdef UNIV_LOG_ARCHIVE
+/* Pointer to this variable is used as the i/o-message when we do i/o to an
+archive */
+UNIV_INTERN byte log_archive_io;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* A margin for free space in the log buffer before a log entry is catenated */
+#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO 2
+#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)
+
+/* Margin for the free space in the smallest log group, before a new query
+step which modifies the database, is started */
+
+#define LOG_CHECKPOINT_FREE_PER_THREAD (4 * UNIV_PAGE_SIZE)
+#define LOG_CHECKPOINT_EXTRA_FREE (8 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous making of a new checkpoint; the value
+should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
+
+#define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32
+
+/* This parameter controls synchronous preflushing of modified buffer pages */
+#define LOG_POOL_PREFLUSH_RATIO_SYNC 16
+
+/* The same ratio for asynchronous preflushing; this value should be less than
+the previous */
+#define LOG_POOL_PREFLUSH_RATIO_ASYNC 8
+
+/* Extra margin, in addition to one log file, used in archiving */
+#define LOG_ARCHIVE_EXTRA_MARGIN (4 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous writing to the archive */
+#define LOG_ARCHIVE_RATIO_ASYNC 16
+
+/* Codes used in unlocking flush latches */
+#define LOG_UNLOCK_NONE_FLUSHED_LOCK 1
+#define LOG_UNLOCK_FLUSH_LOCK 2
+
+/* States of an archiving operation */
+#define LOG_ARCHIVE_READ 1
+#define LOG_ARCHIVE_WRITE 2
+
+/******************************************************//**
+Completes a checkpoint write i/o to a log file. */
+static
+void
+log_io_complete_checkpoint(void);
+/*============================*/
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_io_complete_archive(void);
+/*=========================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/****************************************************************//**
+Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
+exists.
+@return LSN of oldest modification */
+static
+lsn_t
+log_buf_pool_get_oldest_modification(void)
+/*======================================*/
+{
+ lsn_t lsn;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ lsn = buf_pool_get_oldest_modification();
+
+ if (!lsn) {
+
+ lsn = log_sys->lsn;
+ }
+
+ return(lsn);
+}
+
+/** Extends the log buffer.
+@param[in] len requested minimum size in bytes */
+static
+void
+log_buffer_extend(
+ ulint len)
+{
+ ulint move_start;
+ ulint move_end;
+ byte tmp_buf[OS_FILE_LOG_BLOCK_SIZE];
+
+ mutex_enter(&(log_sys->mutex));
+
+ while (log_sys->is_extending) {
+		/* Another thread is already extending the buffer.
+		Wait for it to finish. */
+ mutex_exit(&(log_sys->mutex));
+
+ log_buffer_flush_to_disk();
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (srv_log_buffer_size > len / UNIV_PAGE_SIZE) {
+ /* Already extended enough by the others */
+ mutex_exit(&(log_sys->mutex));
+ return;
+ }
+ }
+
+ log_sys->is_extending = true;
+
+ while (log_sys->n_pending_writes != 0
+ || ut_calc_align_down(log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE)
+ != ut_calc_align_down(log_sys->buf_next_to_write,
+ OS_FILE_LOG_BLOCK_SIZE)) {
+		/* The buffer may still contain more than one block
+		to write. */
+ mutex_exit(&(log_sys->mutex));
+
+ log_buffer_flush_to_disk();
+
+ mutex_enter(&(log_sys->mutex));
+ }
+
+ move_start = ut_calc_align_down(
+ log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE);
+ move_end = log_sys->buf_free;
+
+ /* store the last log block in buffer */
+ ut_memcpy(tmp_buf, log_sys->buf + move_start,
+ move_end - move_start);
+
+ log_sys->buf_free -= move_start;
+ log_sys->buf_next_to_write -= move_start;
+
+ /* reallocate log buffer */
+ srv_log_buffer_size = len / UNIV_PAGE_SIZE + 1;
+ mem_free(log_sys->buf_ptr);
+ log_sys->buf_ptr = static_cast<byte*>(
+ mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+ log_sys->buf = static_cast<byte*>(
+ ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+ log_sys->buf_size = LOG_BUFFER_SIZE;
+ log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
+ - LOG_BUF_FLUSH_MARGIN;
+
+ /* restore the last log block */
+ ut_memcpy(log_sys->buf, tmp_buf, move_end - move_start);
+
+ ut_ad(log_sys->is_extending);
+ log_sys->is_extending = false;
+
+ mutex_exit(&(log_sys->mutex));
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "innodb_log_buffer_size was extended to %lu.",
+ LOG_BUFFER_SIZE);
+}
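+
+/* Note that srv_log_buffer_size above is measured in pages. As an
+example, assuming 16 KiB pages (UNIV_PAGE_SIZE), a request of
+len = 100000 bytes yields 100000 / 16384 + 1 = 7 pages, i.e. a
+114688-byte buffer, which covers the requested minimum. */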
+
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return start lsn of the log record */
+UNIV_INTERN
+lsn_t
+log_reserve_and_open(
+/*=================*/
+ ulint len) /*!< in: length of data to be catenated */
+{
+ log_t* log = log_sys;
+ ulint len_upper_limit;
+#ifdef UNIV_LOG_ARCHIVE
+ ulint archived_lsn_age;
+ ulint dummy;
+#endif /* UNIV_LOG_ARCHIVE */
+#ifdef UNIV_DEBUG
+ ulint count = 0;
+#endif /* UNIV_DEBUG */
+
+ if (len >= log->buf_size / 2) {
+ DBUG_EXECUTE_IF("ib_log_buffer_is_short_crash",
+ DBUG_SUICIDE(););
+
+		/* The log buffer is too small. Try to extend it
+		instead of crashing. */
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "The transaction log size is too large"
+ " for innodb_log_buffer_size (%lu >= %lu / 2). "
+ "Trying to extend it.",
+ len, LOG_BUFFER_SIZE);
+
+ log_buffer_extend((len + 1) * 2);
+ }
+loop:
+ mutex_enter(&(log->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (log->is_extending) {
+
+ mutex_exit(&(log->mutex));
+
+		/* The log buffer is being extended. Writes that would
+		reach into the next block must wait until the extension
+		has finished. */
+
+ os_thread_sleep(100000);
+
+ ut_ad(++count < 50);
+
+ goto loop;
+ }
+
+ /* Calculate an upper limit for the space the string may take in the
+ log buffer */
+
+ len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4;
+
+ if (log->buf_free + len_upper_limit > log->buf_size) {
+
+ mutex_exit(&(log->mutex));
+
+		/* Not enough free space: do a synchronous flush of the log
+ buffer */
+
+ log_buffer_flush_to_disk();
+
+ srv_stats.log_waits.inc();
+
+ ut_ad(++count < 50);
+
+ goto loop;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (log->archiving_state != LOG_ARCH_OFF) {
+
+ archived_lsn_age = log->lsn - log->archived_lsn;
+ if (archived_lsn_age + len_upper_limit
+ > log->max_archived_lsn_age) {
+ /* Not enough free archived space in log groups: do a
+ synchronous archive write batch: */
+
+ mutex_exit(&(log->mutex));
+
+ ut_ad(len_upper_limit <= log->max_archived_lsn_age);
+
+ log_archive_do(TRUE, &dummy);
+
+ ut_ad(++count < 50);
+
+ goto loop;
+ }
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifdef UNIV_LOG_DEBUG
+ log->old_buf_free = log->buf_free;
+ log->old_lsn = log->lsn;
+#endif
+ return(log->lsn);
+}
+
+/************************************************************//**
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+ byte* str, /*!< in: string */
+ ulint str_len) /*!< in: string length */
+{
+ log_t* log = log_sys;
+ ulint len;
+ ulint data_len;
+ byte* log_block;
+
+ ut_ad(mutex_own(&(log->mutex)));
+part_loop:
+ ut_ad(!recv_no_log_write);
+ /* Calculate a part length */
+
+ data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
+
+ if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+ /* The string fits within the current log block */
+
+ len = str_len;
+ } else {
+ data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+
+ len = OS_FILE_LOG_BLOCK_SIZE
+ - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_TRL_SIZE;
+ }
+
+ ut_memcpy(log->buf + log->buf_free, str, len);
+
+ str_len -= len;
+ str = str + len;
+
+ log_block = static_cast<byte*>(
+ ut_align_down(
+ log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
+ log_block_set_data_len(log_block, data_len);
+
+ if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+ /* This block became full */
+ log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_checkpoint_no(log_block,
+ log_sys->next_checkpoint_no);
+ len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
+
+ log->lsn += len;
+
+ /* Initialize the next block header */
+ log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
+ } else {
+ log->lsn += len;
+ }
+
+ log->buf_free += len;
+
+ ut_ad(log->buf_free <= log->buf_size);
+
+ if (str_len > 0) {
+ goto part_loop;
+ }
+
+ srv_stats.log_write_requests.inc();
+}
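+
+/* A worked example of the block splitting above, assuming the usual
+512-byte log blocks with a 12-byte header and a 4-byte trailer
+(OS_FILE_LOG_BLOCK_SIZE, LOG_BLOCK_HDR_SIZE, LOG_BLOCK_TRL_SIZE):
+if buf_free % 512 == 300 and str_len == 400, the first iteration
+copies len = 512 - 300 - 4 = 208 bytes, marks the block full, and
+advances both lsn and buf_free by 208 + 12 + 4 = 224, i.e. to byte 12
+of the next block; the remaining 192 bytes are copied there on the
+second pass through part_loop. */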
+
+/************************************************************//**
+Closes the log.
+@return lsn */
+UNIV_INTERN
+lsn_t
+log_close(void)
+/*===========*/
+{
+ byte* log_block;
+ ulint first_rec_group;
+ lsn_t oldest_lsn;
+ lsn_t lsn;
+ log_t* log = log_sys;
+ lsn_t checkpoint_age;
+
+ ut_ad(mutex_own(&(log->mutex)));
+ ut_ad(!recv_no_log_write);
+
+ lsn = log->lsn;
+
+ log_block = static_cast<byte*>(
+ ut_align_down(
+ log->buf + log->buf_free, OS_FILE_LOG_BLOCK_SIZE));
+
+ first_rec_group = log_block_get_first_rec_group(log_block);
+
+ if (first_rec_group == 0) {
+ /* We initialized a new log block which was not written
+ full by the current mtr: the next mtr log record group
+ will start within this block at the offset data_len */
+
+ log_block_set_first_rec_group(
+ log_block, log_block_get_data_len(log_block));
+ }
+
+ if (log->buf_free > log->max_buf_free) {
+
+ log->check_flush_or_checkpoint = TRUE;
+ }
+
+ checkpoint_age = lsn - log->last_checkpoint_lsn;
+
+ if (checkpoint_age >= log->log_group_capacity) {
+ /* TODO: split btr_store_big_rec_extern_fields() into small
+ steps so that we can release all latches in the middle, and
+ call log_free_check() to ensure we never write over log written
+ after the latest checkpoint. In principle, we should split all
+ big_rec operations, but other operations are smaller. */
+
+ if (!log_has_printed_chkp_warning
+ || difftime(time(NULL), log_last_warning_time) > 15) {
+
+ log_has_printed_chkp_warning = TRUE;
+ log_last_warning_time = time(NULL);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: the age of the last"
+ " checkpoint is " LSN_PF ",\n"
+ "InnoDB: which exceeds the log group"
+ " capacity " LSN_PF ".\n"
+ "InnoDB: If you are using big"
+ " BLOB or TEXT rows, you must set the\n"
+ "InnoDB: combined size of log files"
+ " at least 10 times bigger than the\n"
+ "InnoDB: largest such row.\n",
+ checkpoint_age,
+ log->log_group_capacity);
+ }
+ }
+
+ if (checkpoint_age <= log->max_modified_age_sync) {
+
+ goto function_exit;
+ }
+
+ oldest_lsn = buf_pool_get_oldest_modification();
+
+ if (!oldest_lsn
+ || lsn - oldest_lsn > log->max_modified_age_sync
+ || checkpoint_age > log->max_checkpoint_age_async) {
+
+ log->check_flush_or_checkpoint = TRUE;
+ }
+function_exit:
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log->buf + log->old_buf_free,
+ log->buf_free - log->old_buf_free, log->old_lsn);
+#endif
+
+ return(lsn);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Pads the current log block full with dummy log records. Used in producing
+consistent archived log files. */
+static
+void
+log_pad_current_log_block(void)
+/*===========================*/
+{
+ byte b = MLOG_DUMMY_RECORD;
+ ulint pad_length;
+ ulint i;
+ ib_uint64_t lsn;
+
+ /* We retrieve lsn only because otherwise gcc crashed on HP-UX */
+ lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
+
+ pad_length = OS_FILE_LOG_BLOCK_SIZE
+ - (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_TRL_SIZE;
+
+ for (i = 0; i < pad_length; i++) {
+ log_write_low(&b, 1);
+ }
+
+ lsn = log_sys->lsn;
+
+ log_close();
+ log_release();
+
+ ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return capacity in bytes */
+UNIV_INTERN
+lsn_t
+log_group_get_capacity(
+/*===================*/
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
+}
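+
+/* For example, a group of two 48 MiB log files with the usual
+2048-byte file header (LOG_FILE_HDR_SIZE) has a data capacity of
+(48 * 1024 * 1024 - 2048) * 2 = 100659200 bytes. */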
+
+/******************************************************//**
+Calculates the offset within a log group, when the log file headers are not
+included.
+@return size offset (<= offset) */
+UNIV_INLINE
+lsn_t
+log_group_calc_size_offset(
+/*=======================*/
+ lsn_t offset, /*!< in: real offset within the
+ log group */
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
+}
+
+/******************************************************//**
+Calculates the offset within a log group, when the log file headers are
+included.
+@return real offset (>= offset) */
+UNIV_INLINE
+lsn_t
+log_group_calc_real_offset(
+/*=======================*/
+ lsn_t offset, /*!< in: size offset within the
+ log group */
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return(offset + LOG_FILE_HDR_SIZE
+ * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
+}
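+
+/* The two conversions above are inverses of each other. A worked
+example, with the same assumed two 48 MiB (50331648-byte) files and
+2048-byte headers: the real offset 50338696 lies 5000 bytes into the
+data area of the second file, so its size offset is
+50338696 - 2048 * 2 = 50334600; converting back adds the two skipped
+headers again: 50334600 + 2048 * 2 = 50338696. */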
+
+/******************************************************//**
+Calculates the offset of an lsn within a log group.
+@return offset within the log group */
+static
+lsn_t
+log_group_calc_lsn_offset(
+/*======================*/
+ lsn_t lsn, /*!< in: lsn */
+ const log_group_t* group) /*!< in: log group */
+{
+ lsn_t gr_lsn;
+ lsn_t gr_lsn_size_offset;
+ lsn_t difference;
+ lsn_t group_size;
+ lsn_t offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ gr_lsn = group->lsn;
+
+	gr_lsn_size_offset = log_group_calc_size_offset(
+		group->lsn_offset, group);
+
+ group_size = log_group_get_capacity(group);
+
+ if (lsn >= gr_lsn) {
+
+ difference = lsn - gr_lsn;
+ } else {
+ difference = gr_lsn - lsn;
+
+ difference = difference % group_size;
+
+ difference = group_size - difference;
+ }
+
+ offset = (gr_lsn_size_offset + difference) % group_size;
+
+ /* fprintf(stderr,
+ "Offset is " LSN_PF " gr_lsn_offset is " LSN_PF
+ " difference is " LSN_PF "\n",
+ offset, gr_lsn_size_offset, difference);
+ */
+
+ return(log_group_calc_real_offset(offset, group));
+}
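+
+/* Example of the wrap-around arithmetic above, with the same assumed
+two-file 48 MiB group (data capacity 100659200 bytes): if the size
+offset at gr_lsn is 50334600 and the requested lsn lies 60000000
+bytes ahead, then (50334600 + 60000000) % 100659200 = 9675400, i.e.
+the lsn wraps back into the data area of the first file, and the
+real offset adds that file's header: 9675400 + 2048 = 9677448. */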
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool log_debug_writes = FALSE;
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+ ib_int64_t* log_file_offset, /*!< out: offset in that file
+ (including the header) */
+ ib_uint64_t first_header_lsn, /*!< in: first log file start
+ lsn */
+ ib_uint64_t lsn, /*!< in: lsn whose position to
+ determine */
+ ulint n_log_files, /*!< in: total number of log
+ files */
+ ib_int64_t log_file_size) /*!< in: log file size
+ (including the header) */
+{
+ ib_int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE;
+ ulint file_no;
+ ib_int64_t add_this_many;
+
+ if (lsn < first_header_lsn) {
+ add_this_many = 1 + (first_header_lsn - lsn)
+ / (capacity * (ib_int64_t) n_log_files);
+ lsn += add_this_many
+ * capacity * (ib_int64_t) n_log_files;
+ }
+
+ ut_a(lsn >= first_header_lsn);
+
+ file_no = ((ulint)((lsn - first_header_lsn) / capacity))
+ % n_log_files;
+ *log_file_offset = (lsn - first_header_lsn) % capacity;
+
+ *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
+
+ return(file_no);
+}
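+
+/* Worked example, assuming two 48 MiB log files (50329600 data bytes
+each after the 2048-byte header) and first_header_lsn = 8192: for
+lsn = 8192 + 50329600 + 1000 = 50338792, the function returns
+file_no = 1 and *log_file_offset = 1000 + 2048 = 3048. */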
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /*!< in/out: group */
+ lsn_t lsn) /*!< in: lsn for which the values should be
+ set */
+{
+ group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
+ group->lsn = lsn;
+}
+
+/*****************************************************************//**
+Calculates the recommended highest values for lsn - last_checkpoint_lsn,
+lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age.
+@return error value FALSE if the smallest log group is too small to
+accommodate the number of OS threads in the database server */
+static
+ibool
+log_calc_max_ages(void)
+/*===================*/
+{
+ log_group_t* group;
+ lsn_t margin;
+ ulint free;
+ ibool success = TRUE;
+ lsn_t smallest_capacity;
+ lsn_t archive_margin;
+ lsn_t smallest_archive_margin;
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ ut_ad(group);
+
+ smallest_capacity = LSN_MAX;
+ smallest_archive_margin = LSN_MAX;
+
+ while (group) {
+ if (log_group_get_capacity(group) < smallest_capacity) {
+
+ smallest_capacity = log_group_get_capacity(group);
+ }
+
+ archive_margin = log_group_get_capacity(group)
+ - (group->file_size - LOG_FILE_HDR_SIZE)
+ - LOG_ARCHIVE_EXTRA_MARGIN;
+
+ if (archive_margin < smallest_archive_margin) {
+
+ smallest_archive_margin = archive_margin;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ /* Add extra safety */
+ smallest_capacity = smallest_capacity - smallest_capacity / 10;
+
+ /* For each OS thread we must reserve so much free space in the
+ smallest log group that it can accommodate the log entries produced
+ by single query steps: running out of free log space is a serious
+ system error which requires rebooting the database. */
+
+ free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
+ + LOG_CHECKPOINT_EXTRA_FREE;
+ if (free >= smallest_capacity / 2) {
+ success = FALSE;
+
+ goto failure;
+ } else {
+ margin = smallest_capacity - free;
+ }
+
+ margin = margin - margin / 10; /* Add still some extra safety */
+
+ log_sys->log_group_capacity = smallest_capacity;
+
+ log_sys->max_modified_age_async = margin
+ - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
+ log_sys->max_modified_age_sync = margin
+ - margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
+
+ log_sys->max_checkpoint_age_async = margin - margin
+ / LOG_POOL_CHECKPOINT_RATIO_ASYNC;
+ log_sys->max_checkpoint_age = margin;
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->max_archived_lsn_age = smallest_archive_margin;
+
+ log_sys->max_archived_lsn_age_async = smallest_archive_margin
+ - smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC;
+#endif /* UNIV_LOG_ARCHIVE */
+failure:
+ mutex_exit(&(log_sys->mutex));
+
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: ib_logfiles are too small"
+ " for innodb_thread_concurrency %lu.\n"
+ "InnoDB: The combined size of ib_logfiles"
+ " should be bigger than\n"
+ "InnoDB: 200 kB * innodb_thread_concurrency.\n"
+ "InnoDB: To get mysqld to start up, set"
+ " innodb_thread_concurrency in my.cnf\n"
+ "InnoDB: to a lower value, for example, to 8."
+ " After an ERROR-FREE shutdown\n"
+ "InnoDB: of mysqld you can adjust the size of"
+ " ib_logfiles, as explained in\n"
+ "InnoDB: " REFMAN "adding-and-removing.html\n"
+ "InnoDB: Cannot continue operation."
+ " Calling exit(1).\n",
+ (ulong) srv_thread_concurrency);
+
+ exit(1);
+ }
+
+ return(success);
+}
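+
+/* A worked example of the sizing check above, assuming 16 KiB pages
+and innodb_thread_concurrency = 16: free = 4 * 16384 * (10 + 16)
++ 8 * 16384 = 1835008 bytes (about 1.75 MiB), so the smallest log
+group must provide more than twice that, about 3.5 MiB, after the
+10% safety deduction, or startup fails with the error above. */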
+
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void)
+/*==========*/
+{
+ log_sys = static_cast<log_t*>(mem_alloc(sizeof(log_t)));
+
+ mutex_create(log_sys_mutex_key, &log_sys->mutex, SYNC_LOG);
+
+ mutex_create(log_flush_order_mutex_key,
+ &log_sys->log_flush_order_mutex,
+ SYNC_LOG_FLUSH_ORDER);
+
+ mutex_enter(&(log_sys->mutex));
+
+	/* Start the lsn one log block above zero: this way every
+	log record has a start lsn != 0, a fact which we will use */
+
+ log_sys->lsn = LOG_START_LSN;
+
+ ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
+ ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
+
+ log_sys->buf_ptr = static_cast<byte*>(
+ mem_zalloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+
+ log_sys->buf = static_cast<byte*>(
+ ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
+ log_sys->buf_size = LOG_BUFFER_SIZE;
+ log_sys->is_extending = false;
+
+ log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
+ - LOG_BUF_FLUSH_MARGIN;
+ log_sys->check_flush_or_checkpoint = TRUE;
+ UT_LIST_INIT(log_sys->log_groups);
+
+ log_sys->n_log_ios = 0;
+
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = time(NULL);
+ /*----------------------------*/
+
+ log_sys->buf_next_to_write = 0;
+
+ log_sys->write_lsn = 0;
+ log_sys->current_flush_lsn = 0;
+ log_sys->flushed_to_disk_lsn = 0;
+
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->n_pending_writes = 0;
+
+ log_sys->no_flush_event = os_event_create();
+
+ os_event_set(log_sys->no_flush_event);
+
+ log_sys->one_flushed_event = os_event_create();
+
+ os_event_set(log_sys->one_flushed_event);
+
+ /*----------------------------*/
+
+ log_sys->next_checkpoint_no = 0;
+ log_sys->last_checkpoint_lsn = log_sys->lsn;
+ log_sys->n_pending_checkpoint_writes = 0;
+
+
+ rw_lock_create(checkpoint_lock_key, &log_sys->checkpoint_lock,
+ SYNC_NO_ORDER_CHECK);
+
+ log_sys->checkpoint_buf_ptr = static_cast<byte*>(
+ mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+ log_sys->checkpoint_buf = static_cast<byte*>(
+ ut_align(log_sys->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
+ /*----------------------------*/
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Under MySQL, log archiving is always off */
+ log_sys->archiving_state = LOG_ARCH_OFF;
+ log_sys->archived_lsn = log_sys->lsn;
+ log_sys->next_archived_lsn = 0;
+
+ log_sys->n_pending_archive_ios = 0;
+
+ rw_lock_create(archive_lock_key, &log_sys->archive_lock,
+ SYNC_NO_ORDER_CHECK);
+
+ log_sys->archive_buf = NULL;
+
+ /* ut_align(
+ ut_malloc(LOG_ARCHIVE_BUF_SIZE
+ + OS_FILE_LOG_BLOCK_SIZE),
+ OS_FILE_LOG_BLOCK_SIZE); */
+ log_sys->archive_buf_size = 0;
+
+ /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
+
+ log_sys->archiving_on = os_event_create();
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /*----------------------------*/
+
+ log_block_init(log_sys->buf, log_sys->lsn);
+ log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+ log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+ log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
+
+ MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+ log_sys->lsn - log_sys->last_checkpoint_lsn);
+
+ mutex_exit(&(log_sys->mutex));
+
+#ifdef UNIV_LOG_DEBUG
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+
+ recv_sys->parse_start_lsn = log_sys->lsn;
+ recv_sys->scanned_lsn = log_sys->lsn;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = log_sys->lsn;
+ recv_sys->limit_lsn = LSN_MAX;
+#endif
+}
+
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+ ulint id, /*!< in: group id */
+ ulint n_files, /*!< in: number of log files */
+ lsn_t file_size, /*!< in: log file size in bytes */
+ ulint space_id, /*!< in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id __attribute__((unused)))
+ /*!< in: space id of the file space
+ which contains some archived log
+ files for this group; currently, only
+ for the first log group this is
+ used */
+{
+ ulint i;
+
+ log_group_t* group;
+
+ group = static_cast<log_group_t*>(mem_alloc(sizeof(log_group_t)));
+
+ group->id = id;
+ group->n_files = n_files;
+ group->file_size = file_size;
+ group->space_id = space_id;
+ group->state = LOG_GROUP_OK;
+ group->lsn = LOG_START_LSN;
+ group->lsn_offset = LOG_FILE_HDR_SIZE;
+ group->n_pending_writes = 0;
+
+ group->file_header_bufs_ptr = static_cast<byte**>(
+ mem_zalloc(sizeof(byte*) * n_files));
+
+	group->file_header_bufs = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+
+#ifdef UNIV_LOG_ARCHIVE
+	group->archive_file_header_bufs_ptr = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+
+	group->archive_file_header_bufs = static_cast<byte**>(
+		mem_zalloc(sizeof(byte*) * n_files));
+#endif /* UNIV_LOG_ARCHIVE */
+
+ for (i = 0; i < n_files; i++) {
+ group->file_header_bufs_ptr[i] = static_cast<byte*>(
+ mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+
+ group->file_header_bufs[i] = static_cast<byte*>(
+ ut_align(group->file_header_bufs_ptr[i],
+ OS_FILE_LOG_BLOCK_SIZE));
+
+#ifdef UNIV_LOG_ARCHIVE
+ group->archive_file_header_bufs_ptr[i] = static_cast<byte*>(
+ mem_zalloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
+
+ group->archive_file_header_bufs[i] = static_cast<byte*>(
+ ut_align(group->archive_file_header_bufs_ptr[i],
+ OS_FILE_LOG_BLOCK_SIZE));
+#endif /* UNIV_LOG_ARCHIVE */
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ group->archive_space_id = archive_space_id;
+
+ group->archived_file_no = 0;
+ group->archived_offset = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ group->checkpoint_buf_ptr = static_cast<byte*>(
+ mem_zalloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+ group->checkpoint_buf = static_cast<byte*>(
+		ut_align(group->checkpoint_buf_ptr, OS_FILE_LOG_BLOCK_SIZE));
+
+ UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);
+
+ ut_a(log_calc_max_ages());
+}
+
+/******************************************************************//**
+Does the unlockings needed in flush i/o completion. */
+UNIV_INLINE
+void
+log_flush_do_unlocks(
+/*=================*/
+ ulint code) /*!< in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK
+ and LOG_UNLOCK_NONE_FLUSHED_LOCK */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ /* NOTE that we must own the log mutex when doing the setting of the
+ events: this is because transactions will wait for these events to
+ be set, and at that moment the log flush they were waiting for must
+ have ended. If the log mutex were not reserved here, the i/o-thread
+ calling this function might be preempted for a while, and when it
+ resumed execution, it might be that a new flush had been started, and
+ this function would erroneously signal the NEW flush as completed.
+ Thus, the changes in the state of these events are performed
+ atomically in conjunction with the changes in the state of
+ log_sys->n_pending_writes etc. */
+
+ if (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) {
+ os_event_set(log_sys->one_flushed_event);
+ }
+
+ if (code & LOG_UNLOCK_FLUSH_LOCK) {
+ os_event_set(log_sys->no_flush_event);
+ }
+}
+
+/******************************************************************//**
+Checks if a flush is completed for a log group and does the completion
+routine if yes.
+@return LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */
+UNIV_INLINE
+ulint
+log_group_check_flush_completion(
+/*=============================*/
+ log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (!log_sys->one_flushed && group->n_pending_writes == 0) {
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Log flushed first to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ log_sys->written_to_some_lsn = log_sys->write_lsn;
+ log_sys->one_flushed = TRUE;
+
+ return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes && (group->n_pending_writes == 0)) {
+
+ fprintf(stderr, "Log flushed to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ return(0);
+}
+
+/******************************************************//**
+Checks if a flush is completed and does the completion routine if yes.
+@return LOG_UNLOCK_FLUSH_LOCK or 0 */
+static
+ulint
+log_sys_check_flush_completion(void)
+/*================================*/
+{
+ ulint move_start;
+ ulint move_end;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->n_pending_writes == 0) {
+
+ log_sys->written_to_all_lsn = log_sys->write_lsn;
+ log_sys->buf_next_to_write = log_sys->write_end_offset;
+
+ if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
+ /* Move the log buffer content to the start of the
+ buffer */
+
+ move_start = ut_calc_align_down(
+ log_sys->write_end_offset,
+ OS_FILE_LOG_BLOCK_SIZE);
+ move_end = ut_calc_align(log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_memmove(log_sys->buf, log_sys->buf + move_start,
+ move_end - move_start);
+ log_sys->buf_free -= move_start;
+
+ log_sys->buf_next_to_write -= move_start;
+ }
+
+ return(LOG_UNLOCK_FLUSH_LOCK);
+ }
+
+ return(0);
+}
+
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group) /*!< in: log group or a dummy pointer */
+{
+ ulint unlock;
+
+#ifdef UNIV_LOG_ARCHIVE
+ if ((byte*) group == &log_archive_io) {
+ /* It was an archive write */
+
+ log_io_complete_archive();
+
+ return;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if ((ulint) group & 0x1UL) {
+ /* It was a checkpoint write */
+ group = (log_group_t*)((ulint) group - 1);
+
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
+ fil_flush(group->space_id);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Checkpoint info written to group %lu\n",
+ group->id);
+ }
+#endif /* UNIV_DEBUG */
+ log_io_complete_checkpoint();
+
+ return;
+ }
+
+ ut_error; /*!< We currently use synchronous writing of the
+ logs and cannot end up here! */
+
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && srv_flush_log_at_trx_commit != 2) {
+
+ fil_flush(group->space_id);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ ut_a(group->n_pending_writes > 0);
+ ut_a(log_sys->n_pending_writes > 0);
+
+ group->n_pending_writes--;
+ log_sys->n_pending_writes--;
+ MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
+
+ unlock = log_group_check_flush_completion(group);
+ unlock = unlock | log_sys_check_flush_completion();
+
+ log_flush_do_unlocks(unlock);
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/******************************************************//**
+Writes a log file header to a log file space. */
+static
+void
+log_group_file_header_flush(
+/*========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ log file space */
+ lsn_t start_lsn) /*!< in: log file data starts at this
+ lsn */
+{
+ byte* buf;
+ lsn_t dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(!recv_no_log_write);
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_GROUP_ID, group->id);
+ mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn);
+
+ /* Wipe over possible label of mysqlbackup --restore */
+ memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, " ", 4);
+
+ dest_offset = nth_file * group->file_size;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Writing log file header to group %lu file %lu\n",
+ (ulong) group->id, (ulong) nth_file);
+ }
+#endif /* UNIV_DEBUG */
+ if (log_do_write) {
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ srv_stats.os_log_pending_writes.inc();
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
+ (ulint) (dest_offset / UNIV_PAGE_SIZE),
+ (ulint) (dest_offset % UNIV_PAGE_SIZE),
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf, group);
+
+ srv_stats.os_log_pending_writes.dec();
+ }
+}
+
+/******************************************************//**
+Stores a 4-byte checksum to the trailer checksum field of a log block
+before writing it to a log file. This checksum is used in recovery to
+check the consistency of a log block. */
+static
+void
+log_block_store_checksum(
+/*=====================*/
+ byte* block) /*!< in/out: pointer to a log block */
+{
+ log_block_set_checksum(block, log_block_calc_checksum(block));
+}
+
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ lsn_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset)/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+{
+ ulint write_len;
+ ibool write_header;
+ lsn_t next_offset;
+ ulint i;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(!recv_no_log_write);
+ ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ if (new_data_offset == 0) {
+ write_header = TRUE;
+ } else {
+ write_header = FALSE;
+ }
+loop:
+ if (len == 0) {
+
+ return;
+ }
+
+ next_offset = log_group_calc_lsn_offset(start_lsn, group);
+
+ if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE)
+ && write_header) {
+ /* We start to write a new log file instance in the group */
+
+ ut_a(next_offset / group->file_size <= ULINT_MAX);
+
+ log_group_file_header_flush(group, (ulint)
+ (next_offset / group->file_size),
+ start_lsn);
+ srv_stats.os_log_written.add(OS_FILE_LOG_BLOCK_SIZE);
+
+ srv_stats.log_writes.inc();
+ }
+
+ if ((next_offset % group->file_size) + len > group->file_size) {
+
+		/* If the above condition holds, then the expression
+		below is < len, which is a ulint, so the typecast
+		is OK */
+ write_len = (ulint)
+ (group->file_size - (next_offset % group->file_size));
+ } else {
+ write_len = len;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+
+ fprintf(stderr,
+ "Writing log file segment to group %lu"
+ " offset " LSN_PF " len %lu\n"
+ "start lsn " LSN_PF "\n"
+ "First block n:o %lu last block n:o %lu\n",
+ (ulong) group->id, next_offset,
+ write_len,
+ start_lsn,
+ (ulong) log_block_get_hdr_no(buf),
+ (ulong) log_block_get_hdr_no(
+ buf + write_len - OS_FILE_LOG_BLOCK_SIZE));
+ ut_a(log_block_get_hdr_no(buf)
+ == log_block_convert_lsn_to_no(start_lsn));
+
+ for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+
+ ut_a(log_block_get_hdr_no(buf) + i
+ == log_block_get_hdr_no(
+ buf + i * OS_FILE_LOG_BLOCK_SIZE));
+ }
+ }
+#endif /* UNIV_DEBUG */
+ /* Calculate the checksums for each log block and write them to
+ the trailer fields of the log blocks */
+
+ for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+ log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ if (log_do_write) {
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ srv_stats.os_log_pending_writes.inc();
+
+ ut_a(next_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
+ (ulint) (next_offset / UNIV_PAGE_SIZE),
+ (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
+ group);
+
+ srv_stats.os_log_pending_writes.dec();
+
+ srv_stats.os_log_written.add(write_len);
+ srv_stats.log_writes.inc();
+ }
+
+ if (write_len < len) {
+ start_lsn += write_len;
+ len -= write_len;
+ buf += write_len;
+
+ write_header = TRUE;
+
+ goto loop;
+ }
+}
+
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, it starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ lsn_t lsn, /*!< in: log sequence number up to which
+ the log should be written,
+ LSN_MAX if not specified */
+ ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk)
+ /*!< in: TRUE if we want the written log
+ also to be flushed to disk */
+{
+ log_group_t* group;
+ ulint start_offset;
+ ulint end_offset;
+ ulint area_start;
+ ulint area_end;
+#ifdef UNIV_DEBUG
+ ulint loop_count = 0;
+#endif /* UNIV_DEBUG */
+ ulint unlock;
+
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_no_ibuf_operations) {
+ /* Recovery is running and no operations on the log files are
+ allowed yet (the variable name .._no_ibuf_.. is misleading) */
+
+ return;
+ }
+
+loop:
+#ifdef UNIV_DEBUG
+ loop_count++;
+
+ ut_ad(loop_count < 5);
+
+# if 0
+ if (loop_count > 2) {
+ fprintf(stderr, "Log loop count %lu\n", loop_count);
+ }
+# endif
+#endif
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (flush_to_disk
+ && log_sys->flushed_to_disk_lsn >= lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ if (!flush_to_disk
+ && (log_sys->written_to_all_lsn >= lsn
+ || (log_sys->written_to_some_lsn >= lsn
+ && wait != LOG_WAIT_ALL_GROUPS))) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ if (log_sys->n_pending_writes > 0) {
+ /* A write (+ possibly flush to disk) is running */
+
+ if (flush_to_disk
+ && log_sys->current_flush_lsn >= lsn) {
+ /* The write + flush will write enough: wait for it to
+ complete */
+
+ goto do_waits;
+ }
+
+ if (!flush_to_disk
+ && log_sys->write_lsn >= lsn) {
+ /* The write will write enough: wait for it to
+ complete */
+
+ goto do_waits;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for the write to complete and try to start a new
+ write */
+
+ os_event_wait(log_sys->no_flush_event);
+
+ goto loop;
+ }
+
+ if (!flush_to_disk
+ && log_sys->buf_free == log_sys->buf_next_to_write) {
+ /* Nothing to write and no flush to disk requested */
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Writing log from " LSN_PF " up to lsn " LSN_PF "\n",
+ log_sys->written_to_all_lsn,
+ log_sys->lsn);
+ }
+#endif /* UNIV_DEBUG */
+ log_sys->n_pending_writes++;
+ MONITOR_INC(MONITOR_PENDING_LOG_WRITE);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+ group->n_pending_writes++; /*!< We assume here that we have only
+ one log group! */
+
+ os_event_reset(log_sys->no_flush_event);
+ os_event_reset(log_sys->one_flushed_event);
+
+ start_offset = log_sys->buf_next_to_write;
+ end_offset = log_sys->buf_free;
+
+ area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
+ area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_ad(area_end - area_start > 0);
+
+ log_sys->write_lsn = log_sys->lsn;
+
+ if (flush_to_disk) {
+ log_sys->current_flush_lsn = log_sys->lsn;
+ }
+
+ log_sys->one_flushed = FALSE;
+
+ log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
+ log_block_set_checkpoint_no(
+ log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ log_sys->next_checkpoint_no);
+
+ /* Copy the last, incompletely written, log block a log block length
+ up, so that when the flush operation writes from the log buffer, the
+ segment to write will not be changed by writers to the log */
+
+ ut_memcpy(log_sys->buf + area_end,
+ log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
+ log_sys->write_end_offset = log_sys->buf_free;
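+
+	/* For example, if buf_free == 1000 with 512-byte blocks, the
+	partially filled block [512, 1024) is copied to [1024, 1536)
+	and buf_free becomes 1000 + 512 = 1512: new log records are
+	appended to the copy while the original block is written out. */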
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ /* Do the write to the log files */
+
+ while (group) {
+ log_group_write_buf(
+ group, log_sys->buf + area_start,
+ area_end - area_start,
+ ut_uint64_align_down(log_sys->written_to_all_lsn,
+ OS_FILE_LOG_BLOCK_SIZE),
+ start_offset - area_start);
+
+ log_group_set_fields(group, log_sys->write_lsn);
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+ /* O_DSYNC means the OS did not buffer the log file at all:
+ so we have also flushed to disk what we have written */
+
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+
+ } else if (flush_to_disk) {
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ fil_flush(group->space_id);
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ ut_a(group->n_pending_writes == 1);
+ ut_a(log_sys->n_pending_writes == 1);
+
+ group->n_pending_writes--;
+ log_sys->n_pending_writes--;
+ MONITOR_DEC(MONITOR_PENDING_LOG_WRITE);
+
+ unlock = log_group_check_flush_completion(group);
+ unlock = unlock | log_sys_check_flush_completion();
+
+ log_flush_do_unlocks(unlock);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+
+do_waits:
+ mutex_exit(&(log_sys->mutex));
+
+ switch (wait) {
+ case LOG_WAIT_ONE_GROUP:
+ os_event_wait(log_sys->one_flushed_event);
+ break;
+ case LOG_WAIT_ALL_GROUPS:
+ os_event_wait(log_sys->no_flush_event);
+ break;
+#ifdef UNIV_DEBUG
+ case LOG_NO_WAIT:
+ break;
+ default:
+ ut_error;
+#endif /* UNIV_DEBUG */
+ }
+}
+
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void)
+/*==========================*/
+{
+ lsn_t lsn;
+
+ ut_ad(!srv_read_only_mode);
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+}
+
+/****************************************************************//**
+This function writes the log buffer to the log file and, if 'flush'
+is set, forces a flush of the log file as well. It is meant to be
+called from the background master thread only, as it does not wait
+for the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+ ibool flush) /*!< in: flush the logs to disk */
+{
+ lsn_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(lsn, LOG_NO_WAIT, flush);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush. */
+static
+void
+log_flush_margin(void)
+/*==================*/
+{
+ log_t* log = log_sys;
+ lsn_t lsn = 0;
+
+ mutex_enter(&(log->mutex));
+
+ if (log->buf_free > log->max_buf_free) {
+
+ if (log->n_pending_writes > 0) {
+ /* A flush is running: hope that it will provide enough
+ free space */
+ } else {
+ lsn = log->lsn;
+ }
+ }
+
+ mutex_exit(&(log->mutex));
+
+ if (lsn) {
+ log_write_up_to(lsn, LOG_NO_WAIT, FALSE);
+ }
+}
+
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool. NOTE: this function may only be called if the calling thread owns
+no synchronization objects!
+@return false if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+static
+bool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ lsn_t new_oldest) /*!< in: try to advance oldest_modified_lsn
+ at least to this lsn */
+{
+ bool success;
+ ulint n_pages;
+
+ if (recv_recovery_on) {
+ /* If the recovery is running, we must first apply all
+ log records to their respective file pages to get the
+ right modify lsn values to these pages: otherwise, there
+ might be pages on disk which are not yet recovered to the
+ current lsn, and even after calling this function, we could
+ not know how up-to-date the disk version of the database is,
+ and we could not make a new checkpoint on the basis of the
+ info on the buffer pool only. */
+
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+ success = buf_flush_list(ULINT_MAX, new_oldest, &n_pages);
+
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ if (!success) {
+ MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+ }
+
+ MONITOR_INC_VALUE_CUMULATIVE(
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_PAGES,
+ n_pages);
+
+ return(success);
+}
+
+/******************************************************//**
+Completes a checkpoint. */
+static
+void
+log_complete_checkpoint(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(log_sys->n_pending_checkpoint_writes == 0);
+
+ log_sys->next_checkpoint_no++;
+
+ log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
+ MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+ log_sys->lsn - log_sys->last_checkpoint_lsn);
+
+ rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
+}
+
+/******************************************************//**
+Completes an asynchronous checkpoint info write i/o to a log file. */
+static
+void
+log_io_complete_checkpoint(void)
+/*============================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(log_sys->n_pending_checkpoint_writes > 0);
+
+ log_sys->n_pending_checkpoint_writes--;
+ MONITOR_DEC(MONITOR_PENDING_CHECKPOINT_WRITE);
+
+ if (log_sys->n_pending_checkpoint_writes == 0) {
+ log_complete_checkpoint();
+ }
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/*******************************************************************//**
+Writes info to a checkpoint about a log group. */
+static
+void
+log_checkpoint_set_nth_group_info(
+/*==============================*/
+ byte* buf, /*!< in: buffer for checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint file_no,/*!< in: archived file number */
+ ulint offset) /*!< in: archived file offset */
+{
+ ut_ad(n < LOG_MAX_N_GROUPS);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no);
+ mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset);
+}
+
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ const byte* buf, /*!< in: buffer containing checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint* file_no,/*!< out: archived file number */
+ ulint* offset) /*!< out: archived file offset */
+{
+ ut_ad(n < LOG_MAX_N_GROUPS);
+
+ *file_no = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
+ *offset = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET);
+}
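+
+/* Editorial sketch (not part of the original source): each group slot in
+the checkpoint header occupies 8 bytes, 4 for the archived file number and
+4 for the archived offset, so slot n starts at
+LOG_CHECKPOINT_GROUP_ARRAY + 8 * n. The two functions above are exact
+inverses, so a write followed by a read of the same slot round-trips
+(values here are arbitrary):
+
+ byte buf[OS_FILE_LOG_BLOCK_SIZE];
+ ulint file_no;
+ ulint offset;
+
+ log_checkpoint_set_nth_group_info(buf, 3, 17, 2048);
+ log_checkpoint_get_nth_group_info(buf, 3, &file_no, &offset);
+ ut_a(file_no == 17 && offset == 2048);
+*/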
+
+/******************************************************//**
+Writes the checkpoint info to a log group header. */
+static
+void
+log_group_checkpoint(
+/*=================*/
+ log_group_t* group) /*!< in: log group */
+{
+ log_group_t* group2;
+#ifdef UNIV_LOG_ARCHIVE
+ ib_uint64_t archived_lsn;
+ ib_uint64_t next_archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t lsn_offset;
+ ulint write_offset;
+ ulint fold;
+ byte* buf;
+ ulint i;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&(log_sys->mutex)));
+#if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
+# error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
+#endif
+
+ buf = group->checkpoint_buf;
+
+ mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
+ mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
+
+ lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn,
+ group);
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
+ lsn_offset & 0xFFFFFFFFUL);
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32,
+ lsn_offset >> 32);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ archived_lsn = LSN_MAX;
+ } else {
+ archived_lsn = log_sys->archived_lsn;
+
+ if (archived_lsn != log_sys->next_archived_lsn) {
+ next_archived_lsn = log_sys->next_archived_lsn;
+ /* For debugging only */
+ }
+ }
+
+ mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn);
+#else /* UNIV_LOG_ARCHIVE */
+ mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ for (i = 0; i < LOG_MAX_N_GROUPS; i++) {
+ log_checkpoint_set_nth_group_info(buf, i, 0, 0);
+ }
+
+ group2 = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group2) {
+ log_checkpoint_set_nth_group_info(buf, group2->id,
+#ifdef UNIV_LOG_ARCHIVE
+ group2->archived_file_no,
+ group2->archived_offset
+#else /* UNIV_LOG_ARCHIVE */
+ 0, 0
+#endif /* UNIV_LOG_ARCHIVE */
+ );
+
+ group2 = UT_LIST_GET_NEXT(log_groups, group2);
+ }
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
+
+ /* We alternate the physical place of the checkpoint info in the first
+ log file */
+
+ if ((log_sys->next_checkpoint_no & 1) == 0) {
+ write_offset = LOG_CHECKPOINT_1;
+ } else {
+ write_offset = LOG_CHECKPOINT_2;
+ }
+
+ if (log_do_write) {
+ if (log_sys->n_pending_checkpoint_writes == 0) {
+
+ rw_lock_x_lock_gen(&(log_sys->checkpoint_lock),
+ LOG_CHECKPOINT);
+ }
+
+ log_sys->n_pending_checkpoint_writes++;
+ MONITOR_INC(MONITOR_PENDING_CHECKPOINT_WRITE);
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ /* We send, as the last parameter, the group's machine address
+ plus 1, as we want to distinguish between a normal log
+ file write and a checkpoint field write */
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->space_id, 0,
+ write_offset / UNIV_PAGE_SIZE,
+ write_offset % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf, ((byte*) group + 1));
+
+ ut_ad(((ulint) group & 0x1UL) == 0);
+ }
+}
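+
+/* Editorial sketch (not part of the original source): because the write
+offset alternates on the parity of next_checkpoint_no, two consecutive
+checkpoints never overwrite each other, so a crash in the middle of one
+checkpoint write leaves the previous one intact. A reader can then recover
+the latest valid checkpoint along these lines (checksum validation
+omitted for brevity):
+
+ ib_uint64_t no1;
+ ib_uint64_t no2;
+
+ log_group_read_checkpoint_info(group, LOG_CHECKPOINT_1);
+ no1 = mach_read_from_8(log_sys->checkpoint_buf + LOG_CHECKPOINT_NO);
+ log_group_read_checkpoint_info(group, LOG_CHECKPOINT_2);
+ no2 = mach_read_from_8(log_sys->checkpoint_buf + LOG_CHECKPOINT_NO);
+ // use the slot with the larger checkpoint number that also passes
+ // the LOG_CHECKPOINT_CHECKSUM_1 and LOG_CHECKPOINT_CHECKSUM_2 checks
+*/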
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/*!< in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start) /*!< in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+{
+ ulint fold;
+ byte* buf;
+ ib_uint64_t lsn;
+
+ mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0);
+ mach_write_to_8(hdr_buf + LOG_FILE_START_LSN, start);
+
+ lsn = start + LOG_BLOCK_HDR_SIZE;
+
+ /* Write the label of mysqlbackup --restore */
+ strcpy((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ "ibbackup ");
+ ut_sprintf_timestamp((char*) hdr_buf
+ + (LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ + (sizeof "ibbackup ") - 1));
+ buf = hdr_buf + LOG_CHECKPOINT_1;
+
+ mach_write_to_8(buf + LOG_CHECKPOINT_NO, 0);
+ mach_write_to_8(buf + LOG_CHECKPOINT_LSN, lsn);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_LOW32,
+ LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET_HIGH32, 0);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
+
+ mach_write_to_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN, LSN_MAX);
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
+
+ /* Starting from InnoDB-3.23.50, we should also write info on the
+ allocated size in the tablespace, but unfortunately we do not
+ know it here */
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint field) /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0,
+ field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
+}
+
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void)
+/*==================================*/
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (!srv_read_only_mode) {
+ for (group = UT_LIST_GET_FIRST(log_sys->log_groups);
+ group;
+ group = UT_LIST_GET_NEXT(log_groups, group)) {
+
+ log_group_checkpoint(group);
+ }
+ }
+}
+
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what the lsn of the oldest
+modification in the pool is, and writes information about that lsn to the
+log files. Use log_make_checkpoint_at() to also flush the pool.
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is
+ desired */
+ ibool write_always) /*!< in: the function normally checks if
+ the new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+{
+ lsn_t oldest_lsn;
+
+ ut_ad(!srv_read_only_mode);
+
+ if (recv_recovery_is_on()) {
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+ if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(!recv_no_log_write);
+ oldest_lsn = log_buf_pool_get_oldest_modification();
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Because the log also contains headers and dummy log records,
+ if the buffer pool contains no dirty buffers, oldest_lsn
+ gets the value log_sys->lsn from the function above,
+ and we must make sure that the log is flushed up to that
+ lsn. If there are dirty buffers in the buffer pool, then our
+ write-ahead-logging algorithm ensures that the log has been flushed
+ up to oldest_lsn. */
+
+ log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (!write_always
+ && log_sys->last_checkpoint_lsn >= oldest_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(TRUE);
+ }
+
+ ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
+
+ if (log_sys->n_pending_checkpoint_writes > 0) {
+ /* A checkpoint write is running */
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+ }
+
+ return(FALSE);
+ }
+
+ log_sys->next_checkpoint_lsn = oldest_lsn;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr, "Making checkpoint no "
+ LSN_PF " at lsn " LSN_PF "\n",
+ log_sys->next_checkpoint_no,
+ oldest_lsn);
+ }
+#endif /* UNIV_DEBUG */
+
+ log_groups_write_checkpoint_info();
+
+ MONITOR_INC(MONITOR_NUM_CHECKPOINT);
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+ }
+
+ return(TRUE);
+}
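+
+/* Editorial note (not part of the original source): the s-lock followed
+by an immediate s-unlock above is a wait, not a data access: the
+checkpoint writer holds checkpoint_lock in X mode (taken in
+log_group_checkpoint()) until log_complete_checkpoint() releases it, so an
+S acquisition can only succeed once the pending checkpoint write has
+finished. */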
+
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ lsn_t lsn, /*!< in: make a checkpoint at this or a
+ later lsn, if LSN_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always) /*!< in: the function normally checks if
+ the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
+{
+ /* Preflush pages synchronously */
+
+ while (!log_preflush_pool_modified_pages(lsn)) {
+ /* Flush as much as we can */
+ }
+
+ while (!log_checkpoint(TRUE, write_always)) {
+ /* Force a checkpoint */
+ }
+}
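+
+/* Editorial usage note (not part of the original source): a caller that
+must guarantee that everything up to the current lsn is checkpointed,
+e.g. at a clean shutdown, passes LSN_MAX and write_always = TRUE:
+
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+
+Since both loops above retry until they succeed, a checkpoint at or above
+the requested lsn is on disk when the call returns. */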
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. NOTE: this function may only be called if the calling thread
+owns no synchronization objects! */
+static
+void
+log_checkpoint_margin(void)
+/*=======================*/
+{
+ log_t* log = log_sys;
+ lsn_t age;
+ lsn_t checkpoint_age;
+ ib_uint64_t advance;
+ lsn_t oldest_lsn;
+ ibool checkpoint_sync;
+ ibool do_checkpoint;
+ bool success;
+loop:
+ checkpoint_sync = FALSE;
+ do_checkpoint = FALSE;
+ advance = 0;
+
+ mutex_enter(&(log->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (log->check_flush_or_checkpoint == FALSE) {
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ oldest_lsn = log_buf_pool_get_oldest_modification();
+
+ age = log->lsn - oldest_lsn;
+
+ if (age > log->max_modified_age_sync) {
+
+ /* A flush is urgent: we have to do a synchronous preflush */
+ advance = 2 * (age - log->max_modified_age_sync);
+ }
+
+ checkpoint_age = log->lsn - log->last_checkpoint_lsn;
+
+ if (checkpoint_age > log->max_checkpoint_age) {
+ /* A checkpoint is urgent: we do it synchronously */
+
+ checkpoint_sync = TRUE;
+
+ do_checkpoint = TRUE;
+
+ } else if (checkpoint_age > log->max_checkpoint_age_async) {
+ /* A checkpoint is not urgent: do it asynchronously */
+
+ do_checkpoint = TRUE;
+
+ log->check_flush_or_checkpoint = FALSE;
+ } else {
+ log->check_flush_or_checkpoint = FALSE;
+ }
+
+ mutex_exit(&(log->mutex));
+
+ if (advance) {
+ lsn_t new_oldest = oldest_lsn + advance;
+
+ success = log_preflush_pool_modified_pages(new_oldest);
+
+ /* If the flush succeeded, this thread has done its part
+ and can proceed. If it did not succeed, there was another
+ thread doing a flush at the same time. */
+ if (!success) {
+ mutex_enter(&(log->mutex));
+
+ log->check_flush_or_checkpoint = TRUE;
+
+ mutex_exit(&(log->mutex));
+ goto loop;
+ }
+ }
+
+ if (do_checkpoint) {
+ log_checkpoint(checkpoint_sync, FALSE);
+
+ if (checkpoint_sync) {
+
+ goto loop;
+ }
+ }
+}
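+
+/* Editorial worked example (illustrative numbers, not from the original
+source): suppose log->lsn = 1000000, oldest_lsn = 300000 and
+log->max_modified_age_sync = 600000. Then age = 700000 exceeds the sync
+limit, and
+
+ advance = 2 * (age - log->max_modified_age_sync); // = 200000
+ new_oldest = oldest_lsn + advance; // = 500000
+
+so after a successful preflush the age drops to 500000, comfortably below
+the limit, instead of to a value just under it that would re-trigger the
+synchronous path almost immediately. */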
+
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /*!< in/out: buffer to which the log segment is read */
+ log_group_t* group, /*!< in: log group */
+ lsn_t start_lsn, /*!< in: read area start */
+ lsn_t end_lsn) /*!< in: read area end */
+{
+ ulint len;
+ lsn_t source_offset;
+ bool sync;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ sync = (type == LOG_RECOVER);
+loop:
+ source_offset = log_group_calc_lsn_offset(start_lsn, group);
+
+ ut_a(end_lsn - start_lsn <= ULINT_MAX);
+ len = (ulint) (end_lsn - start_lsn);
+
+ ut_ad(len != 0);
+
+ if ((source_offset % group->file_size) + len > group->file_size) {
+
+ /* If the above condition is true then len (which is ulint)
+ is > the expression below, so the typecast is ok */
+ len = (ulint) (group->file_size -
+ (source_offset % group->file_size));
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (type == LOG_ARCHIVE) {
+
+ log_sys->n_pending_archive_ios++;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ ut_a(source_offset / UNIV_PAGE_SIZE <= ULINT_MAX);
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
+ (ulint) (source_offset / UNIV_PAGE_SIZE),
+ (ulint) (source_offset % UNIV_PAGE_SIZE),
+ len, buf, NULL);
+
+ start_lsn += len;
+ buf += len;
+
+ if (start_lsn != end_lsn) {
+
+ goto loop;
+ }
+}
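+
+/* Editorial worked example (illustrative numbers, not from the original
+source): the clamping above splits reads that would cross a log file
+boundary. With group->file_size = 1048576 and a 3000-byte request whose
+source_offset % file_size = 1047000:
+
+ len = 1048576 - 1047000; // = 1576 bytes, up to the end of the file
+
+The first fil_io() therefore stops at the file boundary; the loop then
+recomputes source_offset from the advanced start_lsn and reads the
+remaining 1424 bytes from the next file. */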
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /*!< in/out: buffer to which the file name is written */
+ ulint id __attribute__((unused)),
+ /*!< in: group id;
+ currently we only archive the first group */
+ ulint file_no)/*!< in: file number */
+{
+ sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no);
+}
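+
+/* Editorial example (not part of the original source): with
+srv_arch_dir = "/var/lib/mysql/" and file_no = 42, the format string above
+yields "/var/lib/mysql/ib_arch_log_0000000042". */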
+
+/******************************************************//**
+Writes a log file header to a log file space. */
+static
+void
+log_group_archive_file_header_write(
+/*================================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ archive log file space */
+ ulint file_no, /*!< in: archived file number */
+ ib_uint64_t start_lsn) /*!< in: log file data starts at this
+ lsn */
+{
+ byte* buf;
+ ulint dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->archive_file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_GROUP_ID, group->id);
+ mach_write_to_8(buf + LOG_FILE_START_LSN, start_lsn);
+ mach_write_to_4(buf + LOG_FILE_NO, file_no);
+
+ mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE);
+
+ dest_offset = nth_file * group->file_size;
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id, 0,
+ dest_offset / UNIV_PAGE_SIZE,
+ dest_offset % UNIV_PAGE_SIZE,
+ 2 * OS_FILE_LOG_BLOCK_SIZE,
+ buf, &log_archive_io);
+}
+
+/******************************************************//**
+Writes a log file header to a completed archived log file. */
+static
+void
+log_group_archive_completed_header_write(
+/*=====================================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ archive log file space */
+ ib_uint64_t end_lsn) /*!< in: end lsn of the file */
+{
+ byte* buf;
+ ulint dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->archive_file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE);
+ mach_write_to_8(buf + LOG_FILE_END_LSN, end_lsn);
+
+ dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED;
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->archive_space_id, 0,
+ dest_offset / UNIV_PAGE_SIZE,
+ dest_offset % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf + LOG_FILE_ARCH_COMPLETED,
+ &log_archive_io);
+}
+
+/******************************************************//**
+Does the archive writes for a single log group. */
+static
+void
+log_group_archive(
+/*==============*/
+ log_group_t* group) /*!< in: log group */
+{
+ os_file_t file_handle;
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+ char name[1024];
+ byte* buf;
+ ulint len;
+ ibool ret;
+ lsn_t next_offset;
+ ulint n_files;
+ ulint open_mode;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ start_lsn = log_sys->archived_lsn;
+
+ ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ end_lsn = log_sys->next_archived_lsn;
+
+ ut_a(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ buf = log_sys->archive_buf;
+
+ n_files = 0;
+
+ next_offset = group->archived_offset;
+loop:
+ if ((next_offset % group->file_size == 0)
+ || (fil_space_get_size(group->archive_space_id) == 0)) {
+
+ /* Add the file to the archive file space; create or open the
+ file */
+
+ if (next_offset % group->file_size == 0) {
+ open_mode = OS_FILE_CREATE;
+ } else {
+ open_mode = OS_FILE_OPEN;
+ }
+
+ log_archived_file_name_gen(name, group->id,
+ group->archived_file_no + n_files);
+
+ file_handle = os_file_create(innodb_file_log_key,
+ name, open_mode,
+ OS_FILE_AIO,
+ OS_DATA_FILE, &ret);
+
+ if (!ret && (open_mode == OS_FILE_CREATE)) {
+ file_handle = os_file_create(
+ innodb_file_log_key, name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ }
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Cannot create or open"
+ " archive log file %s.\n"
+ "InnoDB: Cannot continue operation.\n"
+ "InnoDB: Check that the log archive"
+ " directory exists,\n"
+ "InnoDB: you have access rights to it, and\n"
+ "InnoDB: there is space available.\n", name);
+ exit(1);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr, "Created archive file %s\n", name);
+ }
+#endif /* UNIV_DEBUG */
+
+ ret = os_file_close(file_handle);
+
+ ut_a(ret);
+
+ /* Add the archive file as a node to the space */
+
+ fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
+ group->archive_space_id, FALSE);
+
+ if (next_offset % group->file_size == 0) {
+ log_group_archive_file_header_write(
+ group, n_files,
+ group->archived_file_no + n_files,
+ start_lsn);
+
+ next_offset += LOG_FILE_HDR_SIZE;
+ }
+ }
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ if (group->file_size < (next_offset % group->file_size) + len) {
+
+ len = group->file_size - (next_offset % group->file_size);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Archiving starting at lsn " LSN_PF ", len %lu"
+ " to group %lu\n",
+ start_lsn,
+ (ulong) len, (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ log_sys->n_pending_archive_ios++;
+
+ log_sys->n_log_ios++;
+
+ MONITOR_INC(MONITOR_LOG_IO);
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, false, group->archive_space_id, 0,
+ (ulint) (next_offset / UNIV_PAGE_SIZE),
+ (ulint) (next_offset % UNIV_PAGE_SIZE),
+ ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
+ &log_archive_io);
+
+ start_lsn += len;
+ next_offset += len;
+ buf += len;
+
+ if (next_offset % group->file_size == 0) {
+ n_files++;
+ }
+
+ if (end_lsn != start_lsn) {
+
+ goto loop;
+ }
+
+ group->next_archived_file_no = group->archived_file_no + n_files;
+ group->next_archived_offset = next_offset % group->file_size;
+
+ ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+}
+
+/*****************************************************//**
+Writes to the archive of each log group. Currently, only the first
+group is archived. */
+static
+void
+log_archive_groups(void)
+/*====================*/
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ log_group_archive(group);
+}
+
+/*****************************************************//**
+Completes the archiving write phase for each log group; currently,
+only the first log group is handled. */
+static
+void
+log_archive_write_complete_groups(void)
+/*===================================*/
+{
+ log_group_t* group;
+ ulint end_offset;
+ ulint trunc_files;
+ ulint n_files;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ulint i;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ group->archived_file_no = group->next_archived_file_no;
+ group->archived_offset = group->next_archived_offset;
+
+ /* Truncate from the archive file space all but the last
+ file, or if it has been written full, all files */
+
+ n_files = (UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id))
+ / group->file_size;
+ ut_ad(n_files > 0);
+
+ end_offset = group->archived_offset;
+
+ if (end_offset % group->file_size == 0) {
+
+ trunc_files = n_files;
+ } else {
+ trunc_files = n_files - 1;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes && trunc_files) {
+ fprintf(stderr,
+ "Complete file(s) archived to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Calculate the archive file space start lsn */
+ start_lsn = log_sys->next_archived_lsn
+ - (end_offset - LOG_FILE_HDR_SIZE + trunc_files
+ * (group->file_size - LOG_FILE_HDR_SIZE));
+ end_lsn = start_lsn;
+
+ for (i = 0; i < trunc_files; i++) {
+
+ end_lsn += group->file_size - LOG_FILE_HDR_SIZE;
+
+ /* Write a notice to the headers of archived log
+ files that the file write has been completed */
+
+ log_group_archive_completed_header_write(group, i, end_lsn);
+ }
+
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_files * group->file_size);
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fputs("Archiving writes completed\n", stderr);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_archive_check_completion_low(void)
+/*==================================*/
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->n_pending_archive_ios == 0
+ && log_sys->archiving_phase == LOG_ARCHIVE_READ) {
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fputs("Archiving read completed\n", stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Archive buffer has now been read in: start archive writes */
+
+ log_sys->archiving_phase = LOG_ARCHIVE_WRITE;
+
+ log_archive_groups();
+ }
+
+ if (log_sys->n_pending_archive_ios == 0
+ && log_sys->archiving_phase == LOG_ARCHIVE_WRITE) {
+
+ log_archive_write_complete_groups();
+
+ log_sys->archived_lsn = log_sys->next_archived_lsn;
+
+ rw_lock_x_unlock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
+ }
+}
+
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_io_complete_archive(void)
+/*=========================*/
+{
+ log_group_t* group;
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ mutex_exit(&(log_sys->mutex));
+
+ fil_flush(group->archive_space_id);
+
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(log_sys->n_pending_archive_ios > 0);
+
+ log_sys->n_pending_archive_ios--;
+
+ log_archive_check_completion_low();
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if succeeded, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is desired */
+ ulint* n_bytes)/*!< out: archive log buffer size, 0 if nothing to
+ archive */
+{
+ ibool calc_new_limit;
+ ib_uint64_t start_lsn;
+ ib_uint64_t limit_lsn;
+
+ calc_new_limit = TRUE;
+loop:
+ mutex_enter(&(log_sys->mutex));
+
+ switch (log_sys->archiving_state) {
+ case LOG_ARCH_OFF:
+arch_none:
+ mutex_exit(&(log_sys->mutex));
+
+ *n_bytes = 0;
+
+ return(TRUE);
+ case LOG_ARCH_STOPPED:
+ case LOG_ARCH_STOPPING2:
+ mutex_exit(&(log_sys->mutex));
+
+ os_event_wait(log_sys->archiving_on);
+
+ goto loop;
+ }
+
+ start_lsn = log_sys->archived_lsn;
+
+ if (calc_new_limit) {
+ ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
+ limit_lsn = start_lsn + log_sys->archive_buf_size;
+
+ *n_bytes = log_sys->archive_buf_size;
+
+ if (limit_lsn >= log_sys->lsn) {
+
+ limit_lsn = ut_uint64_align_down(
+ log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE);
+ }
+ }
+
+ if (log_sys->archived_lsn >= limit_lsn) {
+
+ goto arch_none;
+ }
+
+ if (log_sys->written_to_all_lsn < limit_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+ calc_new_limit = FALSE;
+
+ goto loop;
+ }
+
+ if (log_sys->n_pending_archive_ios > 0) {
+ /* An archiving operation is running */
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+ }
+
+ *n_bytes = log_sys->archive_buf_size;
+
+ return(FALSE);
+ }
+
+ rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
+
+ log_sys->archiving_phase = LOG_ARCHIVE_READ;
+
+ log_sys->next_archived_lsn = limit_lsn;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Archiving from lsn " LSN_PF " to lsn " LSN_PF "\n",
+ log_sys->archived_lsn, limit_lsn);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read the log segment to the archive buffer */
+
+ log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf,
+ UT_LIST_GET_FIRST(log_sys->log_groups),
+ start_lsn, limit_lsn);
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+ }
+
+ *n_bytes = log_sys->archive_buf_size;
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Writes the log contents to the archive at least up to the lsn when this
+function was called. */
+static
+void
+log_archive_all(void)
+/*=================*/
+{
+ ib_uint64_t present_lsn;
+ ulint dummy;
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ present_lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_pad_current_log_block();
+
+ for (;;) {
+ mutex_enter(&(log_sys->mutex));
+
+ if (present_lsn <= log_sys->archived_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_do(TRUE, &dummy);
+ }
+}
+
+/*****************************************************//**
+Closes the possible open archive log file of each group (currently only
+the first group) and, if it was open, increments the group file count by 2,
+if desired. */
+static
+void
+log_archive_close_groups(
+/*=====================*/
+ ibool increment_file_count) /*!< in: TRUE if we want to increment
+ the file count */
+{
+ log_group_t* group;
+ ulint trunc_len;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ return;
+ }
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ trunc_len = UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id);
+ if (trunc_len > 0) {
+ ut_a(trunc_len == group->file_size);
+
+ /* Write a notice to the headers of archived log
+ files that the file write has been completed */
+
+ log_group_archive_completed_header_write(
+ group, 0, log_sys->archived_lsn);
+
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_len);
+ if (increment_file_count) {
+ group->archived_offset = 0;
+ group->archived_file_no += 2;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Incrementing arch file no to %lu"
+ " in log group %lu\n",
+ (ulong) group->archived_file_no,
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ }
+}
+
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from 2 higher, so that the archiving will not write
+again to the archived log files which exist when this function returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void)
+/*==================*/
+{
+ ibool success;
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state != LOG_ARCH_ON) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+ }
+
+ log_sys->archiving_state = LOG_ARCH_STOPPING;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_all();
+
+ mutex_enter(&(log_sys->mutex));
+
+ log_sys->archiving_state = LOG_ARCH_STOPPING2;
+ os_event_reset(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for a possible archiving operation to end */
+
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+
+ mutex_enter(&(log_sys->mutex));
+
+ /* Close all archived log files, incrementing the file count by 2,
+ if appropriate */
+
+ log_archive_close_groups(TRUE);
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Make a checkpoint, so that if recovery is needed, the file numbers
+ of new archived log files will start from the right value */
+
+ success = FALSE;
+
+ while (!success) {
+ success = log_checkpoint(TRUE, TRUE);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ log_sys->archiving_state = LOG_ARCH_STOPPED;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void)
+/*===================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state != LOG_ARCH_STOPPED) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+ }
+
+ log_sys->archiving_state = LOG_ARCH_ON;
+
+ os_event_set(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void)
+/*==========================*/
+{
+loop:
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_STOPPED
+ || log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ log_sys->archiving_state = LOG_ARCH_OFF;
+
+ os_event_set(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_stop();
+
+ os_thread_sleep(500000);
+
+ goto loop;
+}
+
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void)
+/*========================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ log_sys->archiving_state = LOG_ARCH_ON;
+
+ log_sys->archived_lsn
+ = ut_uint64_align_down(log_sys->lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for
+archiving. */
+static
+void
+log_archive_margin(void)
+/*====================*/
+{
+ log_t* log = log_sys;
+ ulint age;
+ ibool sync;
+ ulint dummy;
+loop:
+ mutex_enter(&(log->mutex));
+
+ if (log->archiving_state == LOG_ARCH_OFF) {
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ age = log->lsn - log->archived_lsn;
+
+ if (age > log->max_archived_lsn_age) {
+
+ /* An archiving is urgent: we have to do synchronous i/o */
+
+ sync = TRUE;
+
+ } else if (age > log->max_archived_lsn_age_async) {
+
+ /* An archiving is not urgent: we do asynchronous i/o */
+
+ sync = FALSE;
+ } else {
+ /* No archiving required yet */
+
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ mutex_exit(&(log->mutex));
+
+ log_archive_do(sync, &dummy);
+
+ if (sync == TRUE) {
+ /* Check again that enough was written to the archive */
+
+ goto loop;
+ }
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void)
+/*===================*/
+{
+loop:
+ log_flush_margin();
+
+ log_checkpoint_margin();
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_archive_margin();
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ goto loop;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void)
+/*=======================================*/
+{
+ lsn_t lsn;
+ ulint arch_log_no;
+ ulint count = 0;
+ ulint total_trx;
+ ulint pending_io;
+ enum srv_thread_type active_thd;
+ const char* thread_name;
+ ibool server_busy;
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Starting shutdown...");
+
+ while (srv_fast_shutdown == 0 && trx_rollback_or_clean_is_active) {
+ /* In a slow shutdown, we must wait until the rollback of
+ incomplete transactions after recovery has ended. */
+ os_thread_sleep(100000);
+ }
+
+ /* Wait until the master thread and all other operations are idle: our
+ algorithm only works if the server is idle at shutdown */
+
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+loop:
+ os_thread_sleep(100000);
+
+ count++;
+
+ /* We need the monitor threads to stop before we proceed with
+ a shutdown. */
+
+ thread_name = srv_any_background_threads_are_active();
+
+ if (thread_name != NULL) {
+ /* Print a message every 60 seconds if we are waiting
+ for the monitor thread to exit. The check for master and
+ worker threads will be done later. */
+
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for %s to exit", thread_name);
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ /* Check that there are no longer any transactions, except for
+ PREPARED ones. We need this wait even for the 'very fast'
+ shutdown, because the InnoDB layer may have committed or
+ prepared transactions and we don't want to lose them. */
+
+ total_trx = trx_sys_any_active_transactions();
+
+ if (total_trx > 0) {
+
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for %lu active transactions to finish",
+ (ulong) total_trx);
+
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ /* Check that the background threads are suspended */
+
+ active_thd = srv_get_active_thread_type();
+
+ if (active_thd != SRV_NONE) {
+
+ if (active_thd == SRV_PURGE) {
+ srv_purge_wakeup();
+ }
+
+ /* The srv_lock_timeout_thread, srv_error_monitor_thread
+ and srv_monitor_thread should have already exited by now. The
+ only threads to be suspended are the master threads
+ and worker threads (purge threads). Print the thread
+ type if any such thread is not yet suspended. */
+ if (srv_print_verbose_log && count > 600) {
+ const char* thread_type = "<null>";
+
+ switch (active_thd) {
+ case SRV_NONE:
+ /* This shouldn't happen because we've
+ already checked for this case before
+ entering the if(). We handle it here
+ to avoid a compiler warning. */
+ ut_error;
+ case SRV_WORKER:
+ thread_type = "worker threads";
+ break;
+ case SRV_MASTER:
+ thread_type = "master thread";
+ break;
+ case SRV_PURGE:
+ thread_type = "purge thread";
+ break;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for %s to be suspended",
+ thread_type);
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ /* At this point only page_cleaner should be active. We wait
+ here to let it complete the flushing of the buffer pools
+ before proceeding further. */
+ srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
+ count = 0;
+ while (buf_page_cleaner_is_active) {
+ ++count;
+ os_thread_sleep(100000);
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for page_cleaner to "
+ "finish flushing of buffer pool");
+ count = 0;
+ }
+ }
+
+ mutex_enter(&log_sys->mutex);
+ server_busy = log_sys->n_pending_checkpoint_writes
+#ifdef UNIV_LOG_ARCHIVE
+ || log_sys->n_pending_archive_ios
+#endif /* UNIV_LOG_ARCHIVE */
+ || log_sys->n_pending_writes;
+ mutex_exit(&log_sys->mutex);
+
+ if (server_busy) {
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Pending checkpoint_writes: %lu. "
+ "Pending log flush writes: %lu",
+ (ulong) log_sys->n_pending_checkpoint_writes,
+ (ulong) log_sys->n_pending_writes);
+ count = 0;
+ }
+ goto loop;
+ }
+
+ pending_io = buf_pool_check_no_pending_io();
+
+ if (pending_io) {
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for %lu buffer page I/Os to complete",
+ (ulong) pending_io);
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_archive_all();
+#endif /* UNIV_LOG_ARCHIVE */
+ if (srv_fast_shutdown == 2) {
+ if (!srv_read_only_mode) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "MySQL has requested a very fast shutdown "
+ "without flushing the InnoDB buffer pool to "
+ "data files. At the next mysqld startup "
+ "InnoDB will do a crash recovery!");
+
+ /* In this fastest shutdown we do not flush the
+ buffer pool:
+
+ it is essentially a 'crash' of the InnoDB server.
+ Make sure that the log is all flushed to disk, so
+ that we can recover all committed transactions in
+ a crash recovery. We must not write the lsn stamps
+ to the data files, since at a startup InnoDB deduces
+ from the stamps if the previous shutdown was clean. */
+
+ log_buffer_flush_to_disk();
+
+ /* Check that the background threads stay suspended */
+ thread_name = srv_any_background_threads_are_active();
+
+ if (thread_name != NULL) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Background thread %s woke up "
+ "during shutdown", thread_name);
+ goto loop;
+ }
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+ fil_close_all_files();
+
+ thread_name = srv_any_background_threads_are_active();
+
+ ut_a(!thread_name);
+
+ return;
+ }
+
+ if (!srv_read_only_mode) {
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+ }
+
+ mutex_enter(&log_sys->mutex);
+
+ lsn = log_sys->lsn;
+
+ ut_ad(srv_force_recovery != SRV_FORCE_NO_LOG_REDO
+ || lsn == log_sys->last_checkpoint_lsn + LOG_BLOCK_HDR_SIZE);
+
+ if ((srv_force_recovery != SRV_FORCE_NO_LOG_REDO
+ && lsn != log_sys->last_checkpoint_lsn)
+#ifdef UNIV_LOG_ARCHIVE
+ || (srv_log_archive_on
+ && lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE)
+#endif /* UNIV_LOG_ARCHIVE */
+ ) {
+
+ mutex_exit(&log_sys->mutex);
+
+ goto loop;
+ }
+
+ arch_log_no = 0;
+
+#ifdef UNIV_LOG_ARCHIVE
+ arch_log_no = UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
+
+ if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
+
+ arch_log_no--;
+ }
+
+ log_archive_close_groups(TRUE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_exit(&log_sys->mutex);
+
+ /* Check that the background threads stay suspended */
+ thread_name = srv_any_background_threads_are_active();
+ if (thread_name != NULL) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Background thread %s woke up during shutdown",
+ thread_name);
+
+ goto loop;
+ }
+
+ if (!srv_read_only_mode) {
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ fil_flush_file_spaces(FIL_LOG);
+ }
+
+ /* The call fil_write_flushed_lsn_to_data_files() will bypass the buffer
+ pool: therefore it is essential that the buffer pool has been
+ completely flushed to disk! (We do not call fil_write... if the
+ 'very fast' shutdown is enabled.) */
+
+ if (!buf_all_freed()) {
+
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for dirty buffer pages to be flushed");
+ count = 0;
+ }
+
+ goto loop;
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+ /* Make some checks that the server really is quiet */
+ srv_thread_type type = srv_get_active_thread_type();
+ ut_a(type == SRV_NONE);
+
+ bool freed = buf_all_freed();
+ ut_a(freed);
+
+ ut_a(lsn == log_sys->lsn);
+
+ if (lsn < srv_start_lsn) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Log sequence number at shutdown " LSN_PF " "
+ "is lower than at startup " LSN_PF "!",
+ lsn, srv_start_lsn);
+ }
+
+ srv_shutdown_lsn = lsn;
+
+ if (!srv_read_only_mode) {
+ fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ }
+
+ fil_close_all_files();
+
+ /* Make some checks that the server really is quiet */
+ type = srv_get_active_thread_type();
+ ut_a(type == SRV_NONE);
+
+ freed = buf_all_freed();
+ ut_a(freed);
+
+ ut_a(lsn == log_sys->lsn);
+}
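+
+/* Editorial summary (not part of the original source): the function above
+is a small state machine driven by srv_shutdown_state:
+
+ SRV_SHUTDOWN_CLEANUP      wait for threads, transactions, pending i/o
+ SRV_SHUTDOWN_FLUSH_PHASE  wait for page_cleaner to flush the buffer pool
+ SRV_SHUTDOWN_LAST_PHASE   checkpoint made, lsn stamped, files closed
+
+With srv_fast_shutdown == 2 the flush and checkpoint steps are skipped:
+only the log is flushed, deliberately leaving crash recovery work for the
+next startup. */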
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+ const byte* buf, /*!< in: pointer to the start of
+ the log segment in the
+ log_sys->buf log buffer */
+ ulint len, /*!< in: segment length in bytes */
+ ib_uint64_t buf_start_lsn) /*!< in: buffer start lsn */
+{
+ ib_uint64_t contiguous_lsn;
+ ib_uint64_t scanned_lsn;
+ const byte* start;
+ const byte* end;
+ byte* buf1;
+ byte* scan_buf;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (len == 0) {
+
+ return(TRUE);
+ }
+
+ start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE);
+ end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE);
+
+ buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE);
+ scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_memcpy(scan_buf, start, end - start);
+
+ recv_scan_log_recs((buf_pool_get_n_pages()
+ - (recv_n_pool_free_frames * srv_buf_pool_instances))
+ * UNIV_PAGE_SIZE, FALSE, scan_buf, end - start,
+ ut_uint64_align_down(buf_start_lsn,
+ OS_FILE_LOG_BLOCK_SIZE),
+ &contiguous_lsn, &scanned_lsn);
+
+ ut_a(scanned_lsn == buf_start_lsn + len);
+ ut_a(recv_sys->recovered_lsn == scanned_lsn);
+
+ mem_free(buf1);
+
+ return(TRUE);
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/******************************************************//**
+Peeks the current lsn.
+@return TRUE if success, FALSE if could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+ lsn_t* lsn) /*!< out: if returns TRUE, current lsn is here */
+{
+ if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
+ *lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
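+
+/* Editorial usage sketch (not part of the original source): the
+non-blocking variant suits monitoring code that must never stall on the
+log mutex:
+
+ lsn_t lsn;
+
+ if (log_peek_lsn(&lsn)) {
+  // got a consistent snapshot of log_sys->lsn
+ } else {
+  // mutex was busy: skip this sample and try again later
+ }
+*/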
+
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+ FILE* file) /*!< in: file where to print */
+{
+ double time_elapsed;
+ time_t current_time;
+
+ mutex_enter(&(log_sys->mutex));
+
+ fprintf(file,
+ "Log sequence number " LSN_PF "\n"
+ "Log flushed up to " LSN_PF "\n"
+ "Pages flushed up to " LSN_PF "\n"
+ "Last checkpoint at " LSN_PF "\n",
+ log_sys->lsn,
+ log_sys->flushed_to_disk_lsn,
+ log_buf_pool_get_oldest_modification(),
+ log_sys->last_checkpoint_lsn);
+
+ current_time = time(NULL);
+
+ time_elapsed = difftime(current_time,
+ log_sys->last_printout_time);
+
+ if (time_elapsed <= 0) {
+ time_elapsed = 1;
+ }
+
+ fprintf(file,
+ "%lu pending log writes, %lu pending chkp writes\n"
+ "%lu log i/o's done, %.2f log i/o's/second\n",
+ (ulong) log_sys->n_pending_writes,
+ (ulong) log_sys->n_pending_checkpoint_writes,
+ (ulong) log_sys->n_log_ios,
+ ((double)(log_sys->n_log_ios - log_sys->n_log_ios_old)
+ / time_elapsed));
+
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = current_time;
+
+ mutex_exit(&(log_sys->mutex));
+}
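+
+/* Editorial example (output shape derived from the format strings above,
+values invented for illustration): assuming log_print() feeds the InnoDB
+monitor output, the result looks like
+
+ Log sequence number 8734004
+ Log flushed up to 8734004
+ Pages flushed up to 8733900
+ Last checkpoint at 8733721
+ 0 pending log writes, 0 pending chkp writes
+ 152 log i/o's done, 1.47 log i/o's/second
+*/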
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void)
+/*===================*/
+{
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = time(NULL);
+}
+
+/********************************************************//**
+Closes a log group. */
+static
+void
+log_group_close(
+/*===========*/
+ log_group_t* group) /* in,own: log group to close */
+{
+ ulint i;
+
+ for (i = 0; i < group->n_files; i++) {
+ mem_free(group->file_header_bufs_ptr[i]);
+#ifdef UNIV_LOG_ARCHIVE
+ mem_free(group->archive_file_header_bufs_ptr[i]);
+#endif /* UNIV_LOG_ARCHIVE */
+ }
+
+ mem_free(group->file_header_bufs_ptr);
+ mem_free(group->file_header_bufs);
+
+#ifdef UNIV_LOG_ARCHIVE
+ mem_free(group->archive_file_header_bufs_ptr);
+ mem_free(group->archive_file_header_bufs);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mem_free(group->checkpoint_buf_ptr);
+
+ mem_free(group);
+}
+
+/********************************************************//**
+Closes all log groups. */
+UNIV_INTERN
+void
+log_group_close_all(void)
+/*=====================*/
+{
+ log_group_t* group;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
+ log_group_t* prev_group = group;
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ UT_LIST_REMOVE(log_groups, log_sys->log_groups, prev_group);
+
+ log_group_close(prev_group);
+ }
+}
+
+/********************************************************//**
+Shuts down the log system but does not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void)
+/*==============*/
+{
+ log_group_close_all();
+
+ mem_free(log_sys->buf_ptr);
+ log_sys->buf_ptr = NULL;
+ log_sys->buf = NULL;
+ mem_free(log_sys->checkpoint_buf_ptr);
+ log_sys->checkpoint_buf_ptr = NULL;
+ log_sys->checkpoint_buf = NULL;
+
+ os_event_free(log_sys->no_flush_event);
+ os_event_free(log_sys->one_flushed_event);
+
+ rw_lock_free(&log_sys->checkpoint_lock);
+
+ mutex_free(&log_sys->mutex);
+
+#ifdef UNIV_LOG_ARCHIVE
+ rw_lock_free(&log_sys->archive_lock);
+ os_event_free(log_sys->archiving_on);
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifdef UNIV_LOG_DEBUG
+ recv_sys_debug_free();
+#endif
+
+ recv_sys_close();
+}
+
+/********************************************************//**
+Frees the log system data structures. */
+UNIV_INTERN
+void
+log_mem_free(void)
+/*==============*/
+{
+ if (log_sys != NULL) {
+ recv_sys_mem_free();
+ mem_free(log_sys);
+
+ log_sys = NULL;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
new file mode 100644
index 00000000000..aa6c81483d7
--- /dev/null
+++ b/storage/innobase/log/log0recv.cc
@@ -0,0 +1,4019 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.cc
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+// First include (the generated) my_config.h, to get correct platform defines.
+#include "my_config.h"
+#include <stdio.h> // Solaris/x86 header file bug
+
+#include <vector>
+#include "log0recv.h"
+
+#ifdef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0rea.h"
+# include "srv0srv.h"
+# include "srv0start.h"
+# include "trx0roll.h"
+# include "row0merge.h"
+# include "sync0sync.h"
+#else /* !UNIV_HOTBACKUP */
+
+
+/** This is set to FALSE if the backup was originally taken with the
+mysqlbackup --include regexp option: then we do not want to create tables in
+directories which were not included */
+UNIV_INTERN ibool recv_replay_file_ops = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log records are stored in the hash table in chunks of at most this size;
+each chunk must be less than UNIV_PAGE_SIZE, as it is stored in the buffer pool */
+#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t))
+
+/** Read-ahead area in applying log records to file pages */
+#define RECV_READ_AHEAD_AREA 32
+
+/** The recovery system */
+UNIV_INTERN recv_sys_t* recv_sys = NULL;
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise. Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+UNIV_INTERN ibool recv_recovery_on;
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+UNIV_INTERN ibool recv_recovery_from_backup_on;
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_HOTBACKUP
+/** TRUE when recv_init_crash_recovery() has been called. */
+UNIV_INTERN ibool recv_needed_recovery;
+# ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+UNIV_INTERN ibool recv_no_log_write = FALSE;
+# endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+UNIV_INTERN ibool recv_lsn_checks_on;
+
+/** There are two conditions under which we scan the logs: the first
+is normal startup, and the second is when we do a recovery from an
+archive.
+This flag is set if we are doing a scan from the last checkpoint during
+startup. If we find log entries that were written after the last checkpoint,
+we know that the server was not cleanly shut down. We must then initialize
+the crash recovery environment before attempting to store these entries in
+the log hash table. */
+static ibool recv_log_scan_is_startup_type;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must already be
+merged to file pages before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+UNIV_INTERN ibool recv_no_ibuf_operations;
+/** TRUE when the redo log is being backed up */
+# define recv_is_making_a_backup FALSE
+/** TRUE when recovering from a backed up redo log file */
+# define recv_is_from_backup FALSE
+#else /* !UNIV_HOTBACKUP */
+# define recv_needed_recovery FALSE
+/** TRUE when the redo log is being backed up */
+UNIV_INTERN ibool recv_is_making_a_backup = FALSE;
+/** TRUE when recovering from a backed up redo log file */
+UNIV_INTERN ibool recv_is_from_backup = FALSE;
+# define buf_pool_get_curr_size() (5 * 1024 * 1024)
+#endif /* !UNIV_HOTBACKUP */
+/** The following counter is used to decide when to print info on
+log scan */
+static ulint recv_scan_print_counter;
+
+/** The type of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_type;
+/** The offset of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_offset;
+/** The 'multi' flag of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_is_multi;
+
+/** Maximum page number encountered in the redo log */
+UNIV_INTERN ulint recv_max_parsed_page_no;
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database.
+This is the default value. If the actual size of the buffer pool is
+larger than 10 MB we'll set this value to 512. */
+UNIV_INTERN ulint recv_n_pool_free_frames;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt. */
+UNIV_INTERN lsn_t recv_max_page_lsn;
+
+#ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t trx_rollback_clean_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t recv_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifndef UNIV_HOTBACKUP
+# ifdef UNIV_PFS_THREAD
+UNIV_INTERN mysql_pfs_key_t recv_writer_thread_key;
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t recv_writer_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+/** Flag indicating if recv_writer thread is active. */
+UNIV_INTERN bool recv_writer_thread_active = false;
+UNIV_INTERN os_thread_t recv_writer_thread_handle = 0;
+#endif /* !UNIV_HOTBACKUP */
+
+/* prototypes */
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Initializes the crash recovery environment. May be called only when
+recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void);
+/*===========================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void)
+/*=================*/
+{
+ if (recv_sys != NULL) {
+
+ return;
+ }
+
+ recv_sys = static_cast<recv_sys_t*>(mem_zalloc(sizeof(*recv_sys)));
+
+ mutex_create(recv_sys_mutex_key, &recv_sys->mutex, SYNC_RECV);
+
+#ifndef UNIV_HOTBACKUP
+ mutex_create(recv_writer_mutex_key, &recv_sys->writer_mutex,
+ SYNC_LEVEL_VARYING);
+#endif /* !UNIV_HOTBACKUP */
+
+ recv_sys->heap = NULL;
+ recv_sys->addr_hash = NULL;
+}
+
+/********************************************************//**
+Frees the recovery system, including its mutexes. */
+UNIV_INTERN
+void
+recv_sys_close(void)
+/*================*/
+{
+ if (recv_sys != NULL) {
+ if (recv_sys->addr_hash != NULL) {
+ hash_table_free(recv_sys->addr_hash);
+ }
+
+ if (recv_sys->heap != NULL) {
+ mem_heap_free(recv_sys->heap);
+ }
+
+ if (recv_sys->buf != NULL) {
+ ut_free(recv_sys->buf);
+ }
+
+ if (recv_sys->last_block_buf_start != NULL) {
+ mem_free(recv_sys->last_block_buf_start);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ ut_ad(!recv_writer_thread_active);
+ mutex_free(&recv_sys->writer_mutex);
+#endif /* !UNIV_HOTBACKUP */
+
+ mutex_free(&recv_sys->mutex);
+
+ mem_free(recv_sys);
+ recv_sys = NULL;
+ }
+}
+
+/********************************************************//**
+Frees the recovery system memory. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void)
+/*===================*/
+{
+ if (recv_sys != NULL) {
+ if (recv_sys->addr_hash != NULL) {
+ hash_table_free(recv_sys->addr_hash);
+ }
+
+ if (recv_sys->heap != NULL) {
+ mem_heap_free(recv_sys->heap);
+ }
+
+ if (recv_sys->buf != NULL) {
+ ut_free(recv_sys->buf);
+ }
+
+ if (recv_sys->last_block_buf_start != NULL) {
+ mem_free(recv_sys->last_block_buf_start);
+ }
+
+ mem_free(recv_sys);
+ recv_sys = NULL;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************
+Reset the state of the recovery system variables. */
+UNIV_INTERN
+void
+recv_sys_var_init(void)
+/*===================*/
+{
+ recv_lsn_checks_on = FALSE;
+
+ recv_n_pool_free_frames = 256;
+
+ recv_recovery_on = FALSE;
+
+#ifdef UNIV_LOG_ARCHIVE
+ recv_recovery_from_backup_on = FALSE;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ recv_needed_recovery = FALSE;
+
+ recv_log_scan_is_startup_type = FALSE;
+
+ recv_no_ibuf_operations = FALSE;
+
+ recv_scan_print_counter = 0;
+
+ recv_previous_parsed_rec_type = 999999;
+
+ recv_previous_parsed_rec_offset = 0;
+
+ recv_previous_parsed_rec_is_multi = 0;
+
+ recv_max_parsed_page_no = 0;
+
+ recv_n_pool_free_frames = 256;
+
+ recv_max_page_lsn = 0;
+}
+
+/******************************************************************//**
+The recv_writer thread, tasked with flushing dirty pages from the
+buffer pools during recovery.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(recv_writer_thread)(
+/*===============================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(recv_writer_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "InnoDB: recv_writer thread running, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ recv_writer_thread_active = true;
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ os_thread_sleep(100000);
+
+ mutex_enter(&recv_sys->writer_mutex);
+
+ if (!recv_recovery_on) {
+ mutex_exit(&recv_sys->writer_mutex);
+ break;
+ }
+
+ /* Flush pages from end of LRU if required */
+ buf_flush_LRU_tail();
+
+ mutex_exit(&recv_sys->writer_mutex);
+ }
+
+ recv_writer_thread_active = false;
+
+ /* We count the number of threads in os_thread_exit().
+ A created thread should always use that to exit and not
+ use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
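+
+/* Illustrative only: the recv_writer thread is expected to be spawned
+during startup roughly as
+
+	recv_writer_thread_handle = os_thread_create(
+		recv_writer_thread, NULL, NULL);
+
+(a sketch assuming the os_thread_create() interface used elsewhere in
+InnoDB; the actual call site lives in the startup code). */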
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+ ulint available_memory) /*!< in: available memory in bytes */
+{
+ if (recv_sys->heap != NULL) {
+
+ return;
+ }
+
+#ifndef UNIV_HOTBACKUP
+	/* Initialize the red-black tree for fast insertions into the
+	flush_list during the recovery process.
+	As this initialization is done while holding the buffer pool
+	mutex, we perform it before acquiring recv_sys->mutex. */
+ buf_flush_init_flush_rbt();
+
+ mutex_enter(&(recv_sys->mutex));
+
+ recv_sys->heap = mem_heap_create_typed(256,
+ MEM_HEAP_FOR_RECV_SYS);
+#else /* !UNIV_HOTBACKUP */
+ recv_sys->heap = mem_heap_create(256);
+ recv_is_from_backup = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Set appropriate value of recv_n_pool_free_frames. */
+ if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+		/* Buffer pool of size at least 10 MB. */
+ recv_n_pool_free_frames = 512;
+ }
+
+ recv_sys->buf = static_cast<byte*>(ut_malloc(RECV_PARSING_BUF_SIZE));
+ recv_sys->len = 0;
+ recv_sys->recovered_offset = 0;
+
+ recv_sys->addr_hash = hash_create(available_memory / 512);
+ recv_sys->n_addrs = 0;
+
+ recv_sys->apply_log_recs = FALSE;
+ recv_sys->apply_batch_on = FALSE;
+
+ recv_sys->last_block_buf_start = static_cast<byte*>(
+ mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE));
+
+ recv_sys->last_block = static_cast<byte*>(ut_align(
+ recv_sys->last_block_buf_start, OS_FILE_LOG_BLOCK_SIZE));
+
+ recv_sys->found_corrupt_log = FALSE;
+
+ recv_max_page_lsn = 0;
+
+ /* Call the constructor for recv_sys_t::dblwr member */
+ new (&recv_sys->dblwr) recv_dblwr_t();
+
+ mutex_exit(&(recv_sys->mutex));
+}
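+
+/* Illustrative only: a minimal sketch of the intended life cycle of the
+recovery system (the actual call sites live in the startup and shutdown
+code):
+
+	recv_sys_create();				allocate struct and mutexes
+	recv_sys_init(buf_pool_get_curr_size());	heap, parse buffer, addr_hash
+	... scan, parse and apply the redo log ...
+	recv_sys_close();				release everything at shutdown
+*/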
+
+/********************************************************//**
+Empties the hash table when it has been fully processed. */
+static
+void
+recv_sys_empty_hash(void)
+/*=====================*/
+{
+ ut_ad(mutex_own(&(recv_sys->mutex)));
+
+ if (recv_sys->n_addrs != 0) {
+ fprintf(stderr,
+ "InnoDB: Error: %lu pages with log records"
+ " were left unprocessed!\n"
+ "InnoDB: Maximum page number with"
+ " log records on it %lu\n",
+ (ulong) recv_sys->n_addrs,
+ (ulong) recv_max_parsed_page_no);
+ ut_error;
+ }
+
+ hash_table_free(recv_sys->addr_hash);
+ mem_heap_empty(recv_sys->heap);
+
+ recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512);
+}
+
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+/********************************************************//**
+Frees the recovery system. */
+static
+void
+recv_sys_debug_free(void)
+/*=====================*/
+{
+ mutex_enter(&(recv_sys->mutex));
+
+ hash_table_free(recv_sys->addr_hash);
+ mem_heap_free(recv_sys->heap);
+ ut_free(recv_sys->buf);
+ mem_free(recv_sys->last_block_buf_start);
+
+ recv_sys->buf = NULL;
+ recv_sys->heap = NULL;
+ recv_sys->addr_hash = NULL;
+ recv_sys->last_block_buf_start = NULL;
+
+ mutex_exit(&(recv_sys->mutex));
+
+ /* Free up the flush_rbt. */
+ buf_flush_free_flush_rbt();
+}
+# endif /* UNIV_LOG_DEBUG */
+
+# ifdef UNIV_LOG_ARCHIVE
+/********************************************************//**
+Truncates possible corrupted or extra records from a log group. */
+static
+void
+recv_truncate_group(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ lsn_t recovered_lsn, /*!< in: recovery succeeded up to this
+ lsn */
+ lsn_t limit_lsn, /*!< in: this was the limit for
+ recovery */
+ lsn_t checkpoint_lsn, /*!< in: recovery was started from this
+ checkpoint */
+ lsn_t archived_lsn) /*!< in: the log has been archived up to
+ this lsn */
+{
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+ lsn_t finish_lsn1;
+ lsn_t finish_lsn2;
+ lsn_t finish_lsn;
+
+ if (archived_lsn == LSN_MAX) {
+ /* Checkpoint was taken in the NOARCHIVELOG mode */
+ archived_lsn = checkpoint_lsn;
+ }
+
+ finish_lsn1 = ut_uint64_align_down(archived_lsn,
+ OS_FILE_LOG_BLOCK_SIZE)
+ + log_group_get_capacity(group);
+
+ finish_lsn2 = ut_uint64_align_up(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE)
+ + recv_sys->last_log_buf_size;
+
+ if (limit_lsn != LSN_MAX) {
+ /* We do not know how far we should erase log records: erase
+ as much as possible */
+
+ finish_lsn = finish_lsn1;
+ } else {
+ /* It is enough to erase the length of the log buffer */
+ finish_lsn = finish_lsn1 < finish_lsn2
+ ? finish_lsn1 : finish_lsn2;
+ }
+
+ ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ memset(log_sys->buf, 0, RECV_SCAN_SIZE);
+
+ start_lsn = ut_uint64_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ if (start_lsn != recovered_lsn) {
+ /* Copy the last incomplete log block to the log buffer and
+ edit its data length: */
+ lsn_t diff = recovered_lsn - start_lsn;
+
+ ut_a(diff <= 0xFFFFUL);
+
+ ut_memcpy(log_sys->buf, recv_sys->last_block,
+ OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_data_len(log_sys->buf, (ulint) diff);
+ }
+
+ if (start_lsn >= finish_lsn) {
+
+ return;
+ }
+
+ for (;;) {
+ ulint len;
+
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ if (end_lsn > finish_lsn) {
+
+ end_lsn = finish_lsn;
+ }
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+ if (end_lsn >= finish_lsn) {
+
+ return;
+ }
+
+ memset(log_sys->buf, 0, RECV_SCAN_SIZE);
+
+ start_lsn = end_lsn;
+ }
+}
+
+/********************************************************//**
+Copies the log segment between group->scanned_lsn and recovered_lsn from the
+most up-to-date log group to group, so that it contains the latest log data. */
+static
+void
+recv_copy_group(
+/*============*/
+ log_group_t* up_to_date_group, /*!< in: the most up-to-date log
+ group */
+ log_group_t* group, /*!< in: copy to this log
+ group */
+ lsn_t recovered_lsn) /*!< in: recovery succeeded up
+ to this lsn */
+{
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+
+ if (group->scanned_lsn >= recovered_lsn) {
+
+ return;
+ }
+
+ ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ start_lsn = ut_uint64_align_down(group->scanned_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ for (;;) {
+ ulint len;
+
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ if (end_lsn > recovered_lsn) {
+ end_lsn = ut_uint64_align_up(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+ up_to_date_group, start_lsn, end_lsn);
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+
+ if (end_lsn >= recovered_lsn) {
+
+ return;
+ }
+
+ start_lsn = end_lsn;
+ }
+}
+# endif /* UNIV_LOG_ARCHIVE */
+
+/********************************************************//**
+Copies a log segment from the most up-to-date log group to the other log
+groups, so that they all contain the latest log data. Also writes the info
+about the latest checkpoint to the groups, and inits the fields in the group
+memory structs to up-to-date values. */
+static
+void
+recv_synchronize_groups(
+/*====================*/
+#ifdef UNIV_LOG_ARCHIVE
+ log_group_t* up_to_date_group /*!< in: the most up-to-date
+ log group */
+#endif
+ )
+{
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+ lsn_t recovered_lsn;
+
+ recovered_lsn = recv_sys->recovered_lsn;
+
+ /* Read the last recovered log block to the recovery system buffer:
+ the block is always incomplete */
+
+ start_lsn = ut_uint64_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ end_lsn = ut_uint64_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_a(start_lsn != end_lsn);
+
+ log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block,
+#ifdef UNIV_LOG_ARCHIVE
+ up_to_date_group,
+#else /* UNIV_LOG_ARCHIVE */
+ UT_LIST_GET_FIRST(log_sys->log_groups),
+#endif /* UNIV_LOG_ARCHIVE */
+ start_lsn, end_lsn);
+
+ for (log_group_t* group = UT_LIST_GET_FIRST(log_sys->log_groups);
+ group;
+ group = UT_LIST_GET_NEXT(log_groups, group)) {
+#ifdef UNIV_LOG_ARCHIVE
+ if (group != up_to_date_group) {
+
+ /* Copy log data if needed */
+
+			recv_copy_group(up_to_date_group, group,
+					recovered_lsn);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+ /* Update the fields in the group struct to correspond to
+ recovered_lsn */
+
+ log_group_set_fields(group, recovered_lsn);
+ }
+
+ /* Copy the checkpoint info to the groups; remember that we have
+ incremented checkpoint_no by one, and the info will not be written
+ over the max checkpoint info, thus making the preservation of max
+ checkpoint info on disk certain */
+
+ log_groups_write_checkpoint_info();
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+
+ mutex_enter(&(log_sys->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Checks the consistency of the checkpoint info
+@return TRUE if ok */
+static
+ibool
+recv_check_cp_is_consistent(
+/*========================*/
+ const byte* buf) /*!< in: buffer containing checkpoint info */
+{
+ ulint fold;
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+ buf + LOG_CHECKPOINT_CHECKSUM_1)) {
+ return(FALSE);
+ }
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+ buf + LOG_CHECKPOINT_CHECKSUM_2)) {
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
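+
+/* Illustrative only: the checkpoint info carries two independent folds,
+so a torn write of either half is detected:
+
+	checksum 1 covers buf[0 .. LOG_CHECKPOINT_CHECKSUM_1)
+	checksum 2 covers buf[LOG_CHECKPOINT_LSN .. LOG_CHECKPOINT_CHECKSUM_2)
+
+Both folds must match their stored 4-byte values for the checkpoint to
+be accepted. */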
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Looks for the maximum consistent checkpoint from the log groups.
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+recv_find_max_checkpoint(
+/*=====================*/
+ log_group_t** max_group, /*!< out: max group */
+ ulint* max_field) /*!< out: LOG_CHECKPOINT_1 or
+ LOG_CHECKPOINT_2 */
+{
+ log_group_t* group;
+ ib_uint64_t max_no;
+ ib_uint64_t checkpoint_no;
+ ulint field;
+ byte* buf;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ max_no = 0;
+ *max_group = NULL;
+ *max_field = 0;
+
+ buf = log_sys->checkpoint_buf;
+
+ while (group) {
+ group->state = LOG_GROUP_CORRUPTED;
+
+ for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+ field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
+
+ log_group_read_checkpoint_info(group, field);
+
+ if (!recv_check_cp_is_consistent(buf)) {
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Checkpoint in group"
+ " %lu at %lu invalid, %lu\n",
+ (ulong) group->id,
+ (ulong) field,
+ (ulong) mach_read_from_4(
+ buf
+ + LOG_CHECKPOINT_CHECKSUM_1));
+
+ }
+#endif /* UNIV_DEBUG */
+ goto not_consistent;
+ }
+
+ group->state = LOG_GROUP_OK;
+
+ group->lsn = mach_read_from_8(
+ buf + LOG_CHECKPOINT_LSN);
+ group->lsn_offset = mach_read_from_4(
+ buf + LOG_CHECKPOINT_OFFSET_LOW32);
+ group->lsn_offset |= ((lsn_t) mach_read_from_4(
+ buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
+ checkpoint_no = mach_read_from_8(
+ buf + LOG_CHECKPOINT_NO);
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Checkpoint number %lu"
+ " found in group %lu\n",
+ (ulong) checkpoint_no,
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (checkpoint_no >= max_no) {
+ *max_group = group;
+ *max_field = field;
+ max_no = checkpoint_no;
+ }
+
+not_consistent:
+ ;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ if (*max_group == NULL) {
+
+ fprintf(stderr,
+ "InnoDB: No valid checkpoint found.\n"
+ "InnoDB: If this error appears when you are"
+ " creating an InnoDB database,\n"
+ "InnoDB: the problem may be that during"
+ " an earlier attempt you managed\n"
+ "InnoDB: to create the InnoDB data files,"
+ " but log file creation failed.\n"
+ "InnoDB: If that is the case, please refer to\n"
+ "InnoDB: " REFMAN "error-creating-innodb.html\n");
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_checkpoint_info_for_backup(
+/*=================================*/
+ const byte* hdr, /*!< in: buffer containing the log group
+ header */
+ lsn_t* lsn, /*!< out: checkpoint lsn */
+ lsn_t* offset, /*!< out: checkpoint offset in the log group */
+ lsn_t* cp_no, /*!< out: checkpoint number */
+ lsn_t* first_header_lsn)
+			/*!< out: lsn of the start of the
+ first log file */
+{
+ ulint max_cp = 0;
+ ib_uint64_t max_cp_no = 0;
+ const byte* cp_buf;
+
+ cp_buf = hdr + LOG_CHECKPOINT_1;
+
+ if (recv_check_cp_is_consistent(cp_buf)) {
+ max_cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO);
+ max_cp = LOG_CHECKPOINT_1;
+ }
+
+ cp_buf = hdr + LOG_CHECKPOINT_2;
+
+ if (recv_check_cp_is_consistent(cp_buf)) {
+ if (mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO) > max_cp_no) {
+ max_cp = LOG_CHECKPOINT_2;
+ }
+ }
+
+ if (max_cp == 0) {
+ return(FALSE);
+ }
+
+ cp_buf = hdr + max_cp;
+
+ *lsn = mach_read_from_8(cp_buf + LOG_CHECKPOINT_LSN);
+ *offset = mach_read_from_4(
+ cp_buf + LOG_CHECKPOINT_OFFSET_LOW32);
+ *offset |= ((lsn_t) mach_read_from_4(
+ cp_buf + LOG_CHECKPOINT_OFFSET_HIGH32)) << 32;
+
+ *cp_no = mach_read_from_8(cp_buf + LOG_CHECKPOINT_NO);
+
+ *first_header_lsn = mach_read_from_8(hdr + LOG_FILE_START_LSN);
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Checks the 4-byte checksum in the trailer checksum field of a log
+block. We also accept a log block in the old format used before
+InnoDB-3.23.52, where the checksum field contains the log block number.
+@return TRUE if ok, or if the log block may be in the format of an InnoDB
+version predating 3.23.52 */
+static
+ibool
+log_block_checksum_is_ok_or_old_format(
+/*===================================*/
+ const byte* block) /*!< in: pointer to a log block */
+{
+#ifdef UNIV_LOG_DEBUG
+ return(TRUE);
+#endif /* UNIV_LOG_DEBUG */
+ if (log_block_calc_checksum(block) == log_block_get_checksum(block)) {
+
+ return(TRUE);
+ }
+
+ if (log_block_get_hdr_no(block) == log_block_get_checksum(block)) {
+
+ /* We assume the log block is in the format of
+ InnoDB version < 3.23.52 and the block is ok */
+#if 0
+ fprintf(stderr,
+ "InnoDB: Scanned old format < InnoDB-3.23.52"
+ " log block number %lu\n",
+ log_block_get_hdr_no(block));
+#endif
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Scans a log segment and sets n_bytes_scanned to the length of the valid
+log data found. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+ byte* buf, /*!< in: buffer containing log data */
+ ulint buf_len, /*!< in: data length in that buffer */
+ lsn_t* scanned_lsn, /*!< in/out: lsn of buffer start,
+ we return scanned lsn */
+ ulint* scanned_checkpoint_no,
+ /*!< in/out: 4 lowest bytes of the
+ highest scanned checkpoint number so
+ far */
+ ulint* n_bytes_scanned)/*!< out: how much we were able to
+ scan, smaller than buf_len if log
+ data ended here */
+{
+ ulint data_len;
+ byte* log_block;
+ ulint no;
+
+ *n_bytes_scanned = 0;
+
+ for (log_block = buf; log_block < buf + buf_len;
+ log_block += OS_FILE_LOG_BLOCK_SIZE) {
+
+ no = log_block_get_hdr_no(log_block);
+
+#if 0
+ fprintf(stderr, "Log block header no %lu\n", no);
+#endif
+
+ if (no != log_block_convert_lsn_to_no(*scanned_lsn)
+ || !log_block_checksum_is_ok_or_old_format(log_block)) {
+#if 0
+ fprintf(stderr,
+ "Log block n:o %lu, scanned lsn n:o %lu\n",
+ no, log_block_convert_lsn_to_no(*scanned_lsn));
+#endif
+ /* Garbage or an incompletely written log block */
+
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+#if 0
+ fprintf(stderr,
+ "Next log block n:o %lu\n",
+ log_block_get_hdr_no(log_block));
+#endif
+ break;
+ }
+
+ if (*scanned_checkpoint_no > 0
+ && log_block_get_checkpoint_no(log_block)
+ < *scanned_checkpoint_no
+ && *scanned_checkpoint_no
+ - log_block_get_checkpoint_no(log_block)
+ > 0x80000000UL) {
+
+ /* Garbage from a log buffer flush which was made
+ before the most recent database recovery */
+#if 0
+ fprintf(stderr,
+ "Scanned cp n:o %lu, block cp n:o %lu\n",
+ *scanned_checkpoint_no,
+ log_block_get_checkpoint_no(log_block));
+#endif
+ break;
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ *scanned_checkpoint_no
+ = log_block_get_checkpoint_no(log_block);
+ *scanned_lsn += data_len;
+
+ *n_bytes_scanned += data_len;
+
+ if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+ /* Log data ends here */
+
+#if 0
+ fprintf(stderr, "Log block data len %lu\n",
+ data_len);
+#endif
+ break;
+ }
+ }
+}
+#endif /* UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Tries to parse a single log record body and also applies it to a page if
+specified. File ops are parsed, but not applied in this function.
+@return log record end, NULL if not a complete record */
+static
+byte*
+recv_parse_or_apply_log_rec_body(
+/*=============================*/
+ byte type, /*!< in: type */
+ byte* ptr, /*!< in: pointer to a buffer */
+ byte* end_ptr,/*!< in: pointer to the buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL; if
+ not NULL, then the log record is
+ applied to the page, and the log
+ record should be complete then */
+ mtr_t* mtr, /*!< in: mtr or NULL; should be non-NULL
+ if and only if block is non-NULL */
+ ulint space_id)
+ /*!< in: tablespace id obtained by
+ parsing initial log record */
+{
+ dict_index_t* index = NULL;
+ page_t* page;
+ page_zip_des_t* page_zip;
+#ifdef UNIV_DEBUG
+ ulint page_type;
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!block == !mtr);
+
+ if (block) {
+ page = block->frame;
+ page_zip = buf_block_get_page_zip(block);
+ ut_d(page_type = fil_page_get_type(page));
+ } else {
+ page = NULL;
+ page_zip = NULL;
+ ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED);
+ }
+
+ switch (type) {
+#ifdef UNIV_LOG_LSN_DEBUG
+ case MLOG_LSN:
+ /* The LSN is checked in recv_parse_log_rec(). */
+ break;
+#endif /* UNIV_LOG_LSN_DEBUG */
+ case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES:
+#ifdef UNIV_DEBUG
+ if (page && page_type == FIL_PAGE_TYPE_ALLOCATED
+ && end_ptr >= ptr + 2) {
+ /* It is OK to set FIL_PAGE_TYPE and certain
+ list node fields on an empty page. Any other
+ write is not OK. */
+
+ /* NOTE: There may be bogus assertion failures for
+ dict_hdr_create(), trx_rseg_header_create(),
+ trx_sys_create_doublewrite_buf(), and
+ trx_sysf_create().
+ These are only called during database creation. */
+ ulint offs = mach_read_from_2(ptr);
+
+ switch (type) {
+ default:
+ ut_error;
+ case MLOG_2BYTES:
+				/* Note that this can fail when the
+				redo log has been written with something
+				older than InnoDB Plugin 1.0.4. */
+ ut_ad(offs == FIL_PAGE_TYPE
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + FIL_ADDR_SIZE
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + 0 /*FLST_PREV*/
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + FIL_ADDR_SIZE /*FLST_NEXT*/);
+ break;
+ case MLOG_4BYTES:
+				/* Note that this can fail when the
+				redo log has been written with something
+				older than InnoDB Plugin 1.0.4. */
+ ut_ad(0
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_SPACE
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER/* flst_init */
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + FIL_ADDR_SIZE
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_SPACE
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_SPACE
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + 0 /*FLST_PREV*/
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + FIL_ADDR_SIZE /*FLST_NEXT*/);
+ break;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip);
+ break;
+ case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_INSERT,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_CLUST_DELETE_MARK,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_cur_parse_del_mark_set_clust_rec(
+ ptr, end_ptr, page, page_zip, index);
+ }
+ break;
+ case MLOG_COMP_REC_SEC_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ /* This log record type is obsolete, but we process it for
+ backward compatibility with MySQL 5.0.3 and 5.0.4. */
+ ut_a(!page || page_is_comp(page));
+ ut_a(!page_zip);
+ ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index);
+ if (!ptr) {
+ break;
+ }
+ /* Fall through */
+ case MLOG_REC_SEC_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_UPDATE_IN_PLACE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page,
+ page_zip, index);
+ }
+ break;
+ case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE:
+ case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_parse_delete_rec_list(type, ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_LIST_END_COPY_CREATED,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_parse_copy_rec_list_to_created_page(
+ ptr, end_ptr, block, index, mtr);
+ }
+ break;
+ case MLOG_PAGE_REORGANIZE:
+ case MLOG_COMP_PAGE_REORGANIZE:
+ case MLOG_ZIP_PAGE_REORGANIZE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type != MLOG_PAGE_REORGANIZE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_parse_page_reorganize(
+ ptr, end_ptr, index,
+ type == MLOG_ZIP_PAGE_REORGANIZE,
+ block, mtr);
+ }
+ break;
+ case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
+ /* Allow anything in page_type when creating a page. */
+ ut_a(!page_zip);
+ ptr = page_parse_create(ptr, end_ptr,
+ type == MLOG_COMP_PAGE_CREATE,
+ block, mtr);
+ break;
+ case MLOG_UNDO_INSERT:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page);
+ break;
+ case MLOG_UNDO_ERASE_END:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_INIT:
+ /* Allow anything in page_type when creating a page. */
+ ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_HDR_DISCARD:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_HDR_CREATE:
+ case MLOG_UNDO_HDR_REUSE:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_page_header(type, ptr, end_ptr,
+ page, mtr);
+ break;
+ case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ /* On a compressed page, MLOG_COMP_REC_MIN_MARK
+ will be followed by MLOG_COMP_REC_DELETE
+ or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL)
+ in the same mini-transaction. */
+ ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip);
+ ptr = btr_parse_set_min_rec_mark(
+ ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK,
+ page, mtr);
+ break;
+ case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_DELETE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_cur_parse_delete_rec(ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_IBUF_BITMAP_INIT:
+ /* Allow anything in page_type when creating a page. */
+ ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr);
+ break;
+ case MLOG_INIT_FILE_PAGE:
+ /* Allow anything in page_type when creating a page. */
+ ptr = fsp_parse_init_file_page(ptr, end_ptr, block);
+ break;
+ case MLOG_WRITE_STRING:
+ ut_ad(!page || page_type != FIL_PAGE_TYPE_ALLOCATED);
+ ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
+ break;
+ case MLOG_FILE_RENAME:
+ /* Do not rerun file-based log entries if this is
+ IO completion from a page read. */
+ if (page == NULL) {
+ ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type,
+ space_id, 0);
+ }
+ break;
+ case MLOG_FILE_CREATE:
+ case MLOG_FILE_DELETE:
+ case MLOG_FILE_CREATE2:
+ /* Do not rerun file-based log entries if this is
+ IO completion from a page read. */
+ if (page == NULL) {
+ ptr = fil_op_log_parse_or_replay(ptr, end_ptr,
+ type, 0, 0);
+ }
+ break;
+ case MLOG_ZIP_WRITE_NODE_PTR:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_node_ptr(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_WRITE_BLOB_PTR:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_WRITE_HEADER:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_header(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_PAGE_COMPRESS:
+ /* Allow anything in page_type when creating a page. */
+ ptr = page_zip_parse_compress(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_PAGE_COMPRESS_NO_DATA:
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr, TRUE, &index))) {
+
+ ut_a(!page || ((ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table)));
+ ptr = page_zip_parse_compress_no_data(
+ ptr, end_ptr, page, page_zip, index);
+ }
+ break;
+ default:
+ ptr = NULL;
+ recv_sys->found_corrupt_log = TRUE;
+ }
+
+ if (index) {
+ dict_table_t* table = index->table;
+
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+ }
+
+ return(ptr);
+}
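+
+/* Illustrative only: this routine serves two phases. During the scan
+phase, recv_parse_log_rec() calls it with block == NULL and mtr == NULL,
+so the record body is only parsed for its length; during the apply
+phase, recv_recover_page_func() passes the latched block and its mtr,
+so the record is replayed onto the page as well. */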
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a log record in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+recv_fold(
+/*======*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a log record in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+recv_hash(
+/*======*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(hash_calc_hash(recv_fold(space, page_no), recv_sys->addr_hash));
+}
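+
+/* Illustrative only: a log record for (space, page_no) always lands in
+the cell
+
+	hash_calc_hash(ut_fold_ulint_pair(space, page_no),
+		       recv_sys->addr_hash)
+
+so recv_get_fil_addr_struct() below only has to walk a single chain. */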
+
+/*********************************************************************//**
+Gets the hashed file address struct for a page.
+@return file address struct, NULL if not found from the hash table */
+static
+recv_addr_t*
+recv_get_fil_addr_struct(
+/*=====================*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ recv_addr_t* recv_addr;
+
+ for (recv_addr = static_cast<recv_addr_t*>(
+ HASH_GET_FIRST(recv_sys->addr_hash,
+ recv_hash(space, page_no)));
+ recv_addr != 0;
+ recv_addr = static_cast<recv_addr_t*>(
+ HASH_GET_NEXT(addr_hash, recv_addr))) {
+
+ if (recv_addr->space == space
+ && recv_addr->page_no == page_no) {
+
+ return(recv_addr);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Adds a new log record to the hash table of log records. */
+static
+void
+recv_add_to_hash_table(
+/*===================*/
+ byte type, /*!< in: log record type */
+ ulint space, /*!< in: space id */
+ ulint page_no, /*!< in: page number */
+ byte* body, /*!< in: log record body */
+ byte* rec_end, /*!< in: log record end */
+ lsn_t start_lsn, /*!< in: start lsn of the mtr */
+ lsn_t end_lsn) /*!< in: end lsn of the mtr */
+{
+ recv_t* recv;
+ ulint len;
+ recv_data_t* recv_data;
+ recv_data_t** prev_field;
+ recv_addr_t* recv_addr;
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+ /* The tablespace does not exist any more: do not store the
+ log record */
+
+ return;
+ }
+
+ len = rec_end - body;
+
+ recv = static_cast<recv_t*>(
+ mem_heap_alloc(recv_sys->heap, sizeof(recv_t)));
+
+ recv->type = type;
+	recv->len = len;
+ recv->start_lsn = start_lsn;
+ recv->end_lsn = end_lsn;
+
+ recv_addr = recv_get_fil_addr_struct(space, page_no);
+
+ if (recv_addr == NULL) {
+ recv_addr = static_cast<recv_addr_t*>(
+ mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t)));
+
+ recv_addr->space = space;
+ recv_addr->page_no = page_no;
+ recv_addr->state = RECV_NOT_PROCESSED;
+
+ UT_LIST_INIT(recv_addr->rec_list);
+
+ HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash,
+ recv_fold(space, page_no), recv_addr);
+ recv_sys->n_addrs++;
+#if 0
+ fprintf(stderr, "Inserting log rec for space %lu, page %lu\n",
+ space, page_no);
+#endif
+ }
+
+ UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv);
+
+ prev_field = &(recv->data);
+
+ /* Store the log record body in chunks of less than UNIV_PAGE_SIZE:
+ recv_sys->heap grows into the buffer pool, and bigger chunks could not
+ be allocated */
+
+ while (rec_end > body) {
+
+ len = rec_end - body;
+
+ if (len > RECV_DATA_BLOCK_SIZE) {
+ len = RECV_DATA_BLOCK_SIZE;
+ }
+
+ recv_data = static_cast<recv_data_t*>(
+ mem_heap_alloc(recv_sys->heap,
+ sizeof(recv_data_t) + len));
+
+ *prev_field = recv_data;
+
+ memcpy(recv_data + 1, body, len);
+
+ prev_field = &(recv_data->next);
+
+ body += len;
+ }
+
+ *prev_field = NULL;
+}
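+
+/* Illustrative only: a body longer than RECV_DATA_BLOCK_SIZE is stored
+as a singly linked chain of chunks hanging off recv->data, each chunk a
+recv_data_t header immediately followed by its bytes:
+
+	recv->data -> [recv_data_t|bytes] -> [recv_data_t|bytes] -> NULL
+
+recv_data_copy_to_buf() below reassembles the chain into one buffer. */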
+
+/*********************************************************************//**
+Copies the log record body from recv to buf. */
+static
+void
+recv_data_copy_to_buf(
+/*==================*/
+ byte* buf, /*!< in: buffer of length at least recv->len */
+ recv_t* recv) /*!< in: log record */
+{
+ recv_data_t* recv_data;
+ ulint part_len;
+ ulint len;
+
+ len = recv->len;
+ recv_data = recv->data;
+
+ while (len > 0) {
+ if (len > RECV_DATA_BLOCK_SIZE) {
+ part_len = RECV_DATA_BLOCK_SIZE;
+ } else {
+ part_len = len;
+ }
+
+ ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t),
+ part_len);
+ buf += part_len;
+ len -= part_len;
+
+ recv_data = recv_data->next;
+ }
+}
+
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+ ibool just_read_in,
+ /*!< in: TRUE if the i/o handler calls
+ this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+ buf_block_t* block) /*!< in/out: buffer block */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ recv_addr_t* recv_addr;
+ recv_t* recv;
+ byte* buf;
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+ lsn_t page_lsn;
+ lsn_t page_newest_lsn;
+ ibool modification_to_page;
+#ifndef UNIV_HOTBACKUP
+ ibool success;
+#endif /* !UNIV_HOTBACKUP */
+ mtr_t mtr;
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_log_recs == FALSE) {
+
+ /* Log records should not be applied now */
+
+ mutex_exit(&(recv_sys->mutex));
+
+ return;
+ }
+
+ recv_addr = recv_get_fil_addr_struct(buf_block_get_space(block),
+ buf_block_get_page_no(block));
+
+ if ((recv_addr == NULL)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ return;
+ }
+
+#if 0
+ fprintf(stderr, "Recovering space %lu, page %lu\n",
+ buf_block_get_space(block), buf_block_get_page_no(block));
+#endif
+
+ recv_addr->state = RECV_BEING_PROCESSED;
+
+ mutex_exit(&(recv_sys->mutex));
+
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, MTR_LOG_NONE);
+
+ page = block->frame;
+ page_zip = buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+ if (just_read_in) {
+ /* Move the ownership of the x-latch on the page to
+ this OS thread, so that we can acquire a second
+ x-latch on it. This is needed for the operations to
+ the page to pass the debug checks. */
+
+ rw_lock_x_lock_move_ownership(&block->lock);
+ }
+
+ success = buf_page_get_known_nowait(RW_X_LATCH, block,
+ BUF_KEEP_OLD,
+ __FILE__, __LINE__,
+ &mtr);
+ ut_a(success);
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Read the newest modification lsn from the page */
+ page_lsn = mach_read_from_8(page + FIL_PAGE_LSN);
+
+#ifndef UNIV_HOTBACKUP
+ /* It may be that the page has been modified in the buffer
+ pool: read the newest modification lsn there */
+
+ page_newest_lsn = buf_page_get_newest_modification(&block->page);
+
+ if (page_newest_lsn) {
+
+ page_lsn = page_newest_lsn;
+ }
+#else /* !UNIV_HOTBACKUP */
+ /* In recovery from a backup we do not really use the buffer pool */
+ page_newest_lsn = 0;
+#endif /* !UNIV_HOTBACKUP */
+
+ modification_to_page = FALSE;
+ start_lsn = end_lsn = 0;
+
+ recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
+
+ while (recv) {
+ end_lsn = recv->end_lsn;
+
+ if (recv->len > RECV_DATA_BLOCK_SIZE) {
+ /* We have to copy the record body to a separate
+ buffer */
+
+ buf = static_cast<byte*>(mem_alloc(recv->len));
+
+ recv_data_copy_to_buf(buf, recv);
+ } else {
+ buf = ((byte*)(recv->data)) + sizeof(recv_data_t);
+ }
+
+ if (recv->type == MLOG_INIT_FILE_PAGE) {
+ page_lsn = page_newest_lsn;
+
+ memset(FIL_PAGE_LSN + page, 0, 8);
+ memset(UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + page, 0, 8);
+
+ if (page_zip) {
+ memset(FIL_PAGE_LSN + page_zip->data, 0, 8);
+ }
+ }
+
+ if (recv->start_lsn >= page_lsn) {
+
+ lsn_t end_lsn;
+
+ if (!modification_to_page) {
+
+ modification_to_page = TRUE;
+ start_lsn = recv->start_lsn;
+ }
+
+ DBUG_PRINT("ib_log",
+ ("apply " DBUG_LSN_PF ": %u len %u "
+ "page %u:%u", recv->start_lsn,
+ (unsigned) recv->type,
+ (unsigned) recv->len,
+ (unsigned) recv_addr->space,
+ (unsigned) recv_addr->page_no));
+
+ recv_parse_or_apply_log_rec_body(recv->type, buf,
+ buf + recv->len,
+ block, &mtr,
+ recv_addr->space);
+
+ end_lsn = recv->start_lsn + recv->len;
+ mach_write_to_8(FIL_PAGE_LSN + page, end_lsn);
+ mach_write_to_8(UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + page, end_lsn);
+
+ if (page_zip) {
+ mach_write_to_8(FIL_PAGE_LSN
+ + page_zip->data, end_lsn);
+ }
+ }
+
+ if (recv->len > RECV_DATA_BLOCK_SIZE) {
+ mem_free(buf);
+ }
+
+ recv = UT_LIST_GET_NEXT(rec_list, recv);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ ut_a(!page_zip
+ || page_zip_validate_low(page_zip, page, NULL, FALSE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+ if (modification_to_page) {
+ ut_a(block);
+
+ log_flush_order_mutex_enter();
+ buf_flush_recv_note_modification(block, start_lsn, end_lsn);
+ log_flush_order_mutex_exit();
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Make sure that committing mtr does not change the modification
+ lsn values of page */
+
+ mtr.modifications = FALSE;
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_max_page_lsn < page_lsn) {
+ recv_max_page_lsn = page_lsn;
+ }
+
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ mutex_exit(&(recv_sys->mutex));
+}
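+
+/* Illustrative only: callers normally go through the recv_recover_page()
+macro (see log0recv.h), which supplies or drops the just_read_in argument
+depending on UNIV_HOTBACKUP, e.g.
+
+	recv_recover_page(TRUE, block);		i/o handler, freshly read page
+	recv_recover_page(FALSE, block);	page already in the buffer pool
+*/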
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Reads in pages which have hashed log records, from an area around a given
+page number.
+@return number of pages found */
+static
+ulint
+recv_read_in_area(
+/*==============*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no)/*!< in: page number */
+{
+ recv_addr_t* recv_addr;
+ ulint page_nos[RECV_READ_AHEAD_AREA];
+ ulint low_limit;
+ ulint n;
+
+ low_limit = page_no - (page_no % RECV_READ_AHEAD_AREA);
+
+ n = 0;
+
+ for (page_no = low_limit; page_no < low_limit + RECV_READ_AHEAD_AREA;
+ page_no++) {
+ recv_addr = recv_get_fil_addr_struct(space, page_no);
+
+ if (recv_addr && !buf_page_peek(space, page_no)) {
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_addr->state == RECV_NOT_PROCESSED) {
+ recv_addr->state = RECV_BEING_READ;
+
+ page_nos[n] = page_no;
+
+ n++;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+ }
+ }
+
+ buf_read_recv_pages(FALSE, space, zip_size, page_nos, n);
+ /*
+ fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n);
+ */
+ return(n);
+}
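+
+/* Illustrative only, assuming RECV_READ_AHEAD_AREA is 32: a call for
+page_no 70 rounds low_limit down to 64 and considers pages 64..95, so
+one call can batch up to 32 page reads via buf_read_recv_pages(). */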
+
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf) /*!< in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application;
+ the caller must in this case own the log
+ mutex */
+{
+ recv_addr_t* recv_addr;
+ ulint i;
+ ibool has_printed = FALSE;
+ mtr_t mtr;
+loop:
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_batch_on) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ os_thread_sleep(500000);
+
+ goto loop;
+ }
+
+ ut_ad(!allow_ibuf == mutex_own(&log_sys->mutex));
+
+ if (!allow_ibuf) {
+ recv_no_ibuf_operations = TRUE;
+ }
+
+ recv_sys->apply_log_recs = TRUE;
+ recv_sys->apply_batch_on = TRUE;
+
+ for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) {
+
+ for (recv_addr = static_cast<recv_addr_t*>(
+ HASH_GET_FIRST(recv_sys->addr_hash, i));
+ recv_addr != 0;
+ recv_addr = static_cast<recv_addr_t*>(
+ HASH_GET_NEXT(addr_hash, recv_addr))) {
+
+ ulint space = recv_addr->space;
+ ulint zip_size = fil_space_get_zip_size(space);
+ ulint page_no = recv_addr->page_no;
+
+ if (recv_addr->state == RECV_NOT_PROCESSED) {
+ if (!has_printed) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Starting an apply batch"
+ " of log records"
+ " to the database...");
+ fputs("InnoDB: Progress in percent: ",
+ stderr);
+ has_printed = TRUE;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+
+ if (buf_page_peek(space, page_no)) {
+ buf_block_t* block;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ space, zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(
+ block, SYNC_NO_ORDER_CHECK);
+
+ recv_recover_page(FALSE, block);
+ mtr_commit(&mtr);
+ } else {
+ recv_read_in_area(space, zip_size,
+ page_no);
+ }
+
+ mutex_enter(&(recv_sys->mutex));
+ }
+ }
+
+ if (has_printed
+ && (i * 100) / hash_get_n_cells(recv_sys->addr_hash)
+ != ((i + 1) * 100)
+ / hash_get_n_cells(recv_sys->addr_hash)) {
+
+ fprintf(stderr, "%lu ", (ulong)
+ ((i * 100)
+ / hash_get_n_cells(recv_sys->addr_hash)));
+ }
+ }
+
+ /* Wait until all the pages have been processed */
+
+ while (recv_sys->n_addrs != 0) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ os_thread_sleep(500000);
+
+ mutex_enter(&(recv_sys->mutex));
+ }
+
+ if (has_printed) {
+
+ fprintf(stderr, "\n");
+ }
+
+ if (!allow_ibuf) {
+ bool success;
+
+ /* Flush all the file pages to disk and invalidate them in
+ the buffer pool */
+
+ ut_d(recv_no_log_write = TRUE);
+ mutex_exit(&(recv_sys->mutex));
+ mutex_exit(&(log_sys->mutex));
+
+ /* Stop the recv_writer thread from issuing any LRU
+ flush batches. */
+ mutex_enter(&recv_sys->writer_mutex);
+
+ /* Wait for any currently run batch to end. */
+ buf_flush_wait_LRU_batch_end();
+
+ success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+
+ ut_a(success);
+
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ buf_pool_invalidate();
+
+ /* Allow batches from recv_writer thread. */
+ mutex_exit(&recv_sys->writer_mutex);
+
+ mutex_enter(&(log_sys->mutex));
+ mutex_enter(&(recv_sys->mutex));
+ ut_d(recv_no_log_write = FALSE);
+
+ recv_no_ibuf_operations = FALSE;
+ }
+
+ recv_sys->apply_log_recs = FALSE;
+ recv_sys->apply_batch_on = FALSE;
+
+ recv_sys_empty_hash();
+
+ if (has_printed) {
+ fprintf(stderr, "InnoDB: Apply batch completed\n");
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+}
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void)
+/*================================*/
+{
+ recv_addr_t* recv_addr;
+ ulint n_hash_cells;
+ buf_block_t* block;
+ ulint actual_size;
+ ibool success;
+ ulint error;
+ ulint i;
+
+ recv_sys->apply_log_recs = TRUE;
+ recv_sys->apply_batch_on = TRUE;
+
+ block = back_block1;
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Starting an apply batch of log records to the database...");
+
+ fputs("InnoDB: Progress in percent: ", stderr);
+
+ n_hash_cells = hash_get_n_cells(recv_sys->addr_hash);
+
+ for (i = 0; i < n_hash_cells; i++) {
+ /* The address hash table is externally chained */
+ recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node;
+
+ while (recv_addr != NULL) {
+
+ ulint zip_size
+ = fil_space_get_zip_size(recv_addr->space);
+
+ if (zip_size == ULINT_UNDEFINED) {
+#if 0
+ fprintf(stderr,
+ "InnoDB: Warning: cannot apply"
+ " log record to"
+ " tablespace %lu page %lu,\n"
+ "InnoDB: because tablespace with"
+ " that id does not exist.\n",
+ recv_addr->space, recv_addr->page_no);
+#endif
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ goto skip_this_recv_addr;
+ }
+
+ /* We simulate a page read made by the buffer pool, to
+ make sure the recovery apparatus works ok. We must init
+ the block. */
+
+ buf_page_init_for_backup_restore(
+ recv_addr->space, recv_addr->page_no,
+ zip_size, block);
+
+ /* Extend the tablespace's last file if the page_no
+ does not fall inside its bounds; we assume the last
+ file is auto-extending, and mysqlbackup copied the file
+			when it was still smaller */
+
+ success = fil_extend_space_to_desired_size(
+ &actual_size,
+ recv_addr->space, recv_addr->page_no + 1);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot extend"
+					" tablespace %lu to hold %lu pages\n",
+					(ulong) recv_addr->space,
+					(ulong) recv_addr->page_no);
+
+ exit(1);
+ }
+
+ /* Read the page from the tablespace file using the
+ fil0fil.cc routines */
+
+ if (zip_size) {
+ error = fil_io(OS_FILE_READ, true,
+ recv_addr->space, zip_size,
+ recv_addr->page_no, 0, zip_size,
+ block->page.zip.data, NULL);
+ if (error == DB_SUCCESS
+ && !buf_zip_decompress(block, TRUE)) {
+ exit(1);
+ }
+ } else {
+ error = fil_io(OS_FILE_READ, true,
+ recv_addr->space, 0,
+ recv_addr->page_no, 0,
+ UNIV_PAGE_SIZE,
+ block->frame, NULL);
+ }
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read"
+ " from tablespace"
+ " %lu page number %lu\n",
+ (ulong) recv_addr->space,
+ (ulong) recv_addr->page_no);
+
+ exit(1);
+ }
+
+ /* Apply the log records to this page */
+ recv_recover_page(FALSE, block);
+
+ /* Write the page back to the tablespace file using the
+ fil0fil.cc routines */
+
+ buf_flush_init_for_writing(
+ block->frame, buf_block_get_page_zip(block),
+ mach_read_from_8(block->frame + FIL_PAGE_LSN));
+
+ if (zip_size) {
+ error = fil_io(OS_FILE_WRITE, true,
+ recv_addr->space, zip_size,
+ recv_addr->page_no, 0,
+ zip_size,
+ block->page.zip.data, NULL);
+ } else {
+ error = fil_io(OS_FILE_WRITE, true,
+ recv_addr->space, 0,
+ recv_addr->page_no, 0,
+ UNIV_PAGE_SIZE,
+ block->frame, NULL);
+ }
+skip_this_recv_addr:
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if ((100 * i) / n_hash_cells
+ != (100 * (i + 1)) / n_hash_cells) {
+ fprintf(stderr, "%lu ",
+ (ulong) ((100 * i) / n_hash_cells));
+ fflush(stderr);
+ }
+ }
+
+ recv_sys_empty_hash();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Tries to parse a single log record and returns its length.
+@return length of the record, or 0 if the record was not complete */
+static
+ulint
+recv_parse_log_rec(
+/*===============*/
+ byte* ptr, /*!< in: pointer to a buffer */
+ byte* end_ptr,/*!< in: pointer to the buffer end */
+ byte* type, /*!< out: type */
+ ulint* space, /*!< out: space id */
+ ulint* page_no,/*!< out: page number */
+ byte** body) /*!< out: log record body start */
+{
+ byte* new_ptr;
+
+ *body = NULL;
+
+ if (ptr == end_ptr) {
+
+ return(0);
+ }
+
+ if (*ptr == MLOG_MULTI_REC_END) {
+
+ *type = *ptr;
+
+ return(1);
+ }
+
+ if (*ptr == MLOG_DUMMY_RECORD) {
+ *type = *ptr;
+
+ *space = ULINT_UNDEFINED - 1; /* For debugging */
+
+ return(1);
+ }
+
+ new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space,
+ page_no);
+ *body = new_ptr;
+
+ if (UNIV_UNLIKELY(!new_ptr)) {
+
+ return(0);
+ }
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ if (*type == MLOG_LSN) {
+ lsn_t lsn = (lsn_t) *space << 32 | *page_no;
+# ifdef UNIV_LOG_DEBUG
+ ut_a(lsn == log_sys->old_lsn);
+# else /* UNIV_LOG_DEBUG */
+ ut_a(lsn == recv_sys->recovered_lsn);
+# endif /* UNIV_LOG_DEBUG */
+ }
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr,
+ NULL, NULL, *space);
+ if (UNIV_UNLIKELY(new_ptr == NULL)) {
+
+ return(0);
+ }
+
+ if (*page_no > recv_max_parsed_page_no) {
+ recv_max_parsed_page_no = *page_no;
+ }
+
+ return(new_ptr - ptr);
+}
+
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+static
+lsn_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+ lsn_t lsn, /*!< in: old lsn */
+	ib_uint64_t	len)	/*!< in: this many bytes of data are
+				added, log block headers not included */
+{
+ ulint frag_len;
+ ib_uint64_t lsn_len;
+
+ frag_len = (lsn % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
+ ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
+ - LOG_BLOCK_TRL_SIZE);
+ lsn_len = len;
+ lsn_len += (lsn_len + frag_len)
+ / (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
+ - LOG_BLOCK_TRL_SIZE)
+ * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
+
+ return(lsn + lsn_len);
+}
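+
+/* Illustrative only, assuming 512-byte log blocks with a 12-byte header
+and a 4-byte trailer (496 payload bytes per block): adding len = 1000 at
+lsn = 524 (the start of a block's payload) spills into two further
+blocks, so lsn_len = 1000 + 2 * (12 + 4) = 1032 and the function returns
+524 + 1032 = 1556, i.e. offset 20 within the third block. */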
+
+#ifdef UNIV_LOG_DEBUG
+/*******************************************************//**
+Checks that the parser recognizes incomplete initial segments of a log
+record as incomplete. */
+static
+void
+recv_check_incomplete_log_recs(
+/*===========================*/
+ byte* ptr, /*!< in: pointer to a complete log record */
+ ulint len) /*!< in: length of the log record */
+{
+ ulint i;
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+
+ for (i = 0; i < len; i++) {
+ ut_a(0 == recv_parse_log_rec(ptr, ptr + i, &type, &space,
+ &page_no, &body));
+ }
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/*******************************************************//**
+Prints diagnostic info of corrupt log. */
+static
+void
+recv_report_corrupt_log(
+/*====================*/
+ byte* ptr, /*!< in: pointer to corrupt log record */
+ byte type, /*!< in: type of the record */
+ ulint space, /*!< in: space id, this may also be garbage */
+ ulint page_no)/*!< in: page number, this may also be garbage */
+{
+ fprintf(stderr,
+ "InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
+ "InnoDB: Log record type %lu, space id %lu, page number %lu\n"
+ "InnoDB: Log parsing proceeded successfully up to " LSN_PF "\n"
+ "InnoDB: Previous log record type %lu, is multi %lu\n"
+ "InnoDB: Recv offset %lu, prev %lu\n",
+ (ulong) type, (ulong) space, (ulong) page_no,
+ recv_sys->recovered_lsn,
+ (ulong) recv_previous_parsed_rec_type,
+ (ulong) recv_previous_parsed_rec_is_multi,
+ (ulong) (ptr - recv_sys->buf),
+ (ulong) recv_previous_parsed_rec_offset);
+
+ if ((ulint)(ptr - recv_sys->buf + 100)
+ > recv_previous_parsed_rec_offset
+ && (ulint)(ptr - recv_sys->buf + 100
+ - recv_previous_parsed_rec_offset)
+ < 200000) {
+ fputs("InnoDB: Hex dump of corrupt log starting"
+ " 100 bytes before the start\n"
+ "InnoDB: of the previous log rec,\n"
+ "InnoDB: and ending 100 bytes after the start"
+ " of the corrupt rec:\n",
+ stderr);
+
+ ut_print_buf(stderr,
+ recv_sys->buf
+ + recv_previous_parsed_rec_offset - 100,
+ ptr - recv_sys->buf + 200
+ - recv_previous_parsed_rec_offset);
+ putc('\n', stderr);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set innodb_force_recovery"
+ " to ignore this error.\n", stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ fputs("InnoDB: WARNING: the log file may have been corrupt and it\n"
+ "InnoDB: is possible that the log scan did not proceed\n"
+ "InnoDB: far enough in recovery! Please run CHECK TABLE\n"
+ "InnoDB: on your InnoDB tables to check that they are ok!\n"
+ "InnoDB: If mysqld crashes after this recovery, look at\n"
+ "InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ fflush(stderr);
+}
+
+/*******************************************************//**
+Parses log records from a buffer and stores them in a hash table, to
+await merging to file pages.
+@return currently always returns FALSE */
+static
+ibool
+recv_parse_log_recs(
+/*================*/
+ ibool store_to_hash) /*!< in: TRUE if the records should be stored
+ to the hash table; this is set to FALSE if just
+ debug checking is needed */
+{
+ byte* ptr;
+ byte* end_ptr;
+ ulint single_rec;
+ ulint len;
+ ulint total_len;
+ lsn_t new_recovered_lsn;
+ lsn_t old_lsn;
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+ ulint n_recs;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(recv_sys->parse_start_lsn != 0);
+loop:
+ ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+ end_ptr = recv_sys->buf + recv_sys->len;
+
+ if (ptr == end_ptr) {
+
+ return(FALSE);
+ }
+
+ single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG;
+
+ if (single_rec || *ptr == MLOG_DUMMY_RECORD) {
+ /* The mtr only modified a single page, or this is a file op */
+
+ old_lsn = recv_sys->recovered_lsn;
+
+ /* Try to parse a log record, fetching its type, space id,
+ page no, and a pointer to the body of the log record */
+
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+
+ if (len == 0 || recv_sys->found_corrupt_log) {
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(ptr,
+ type, space, page_no);
+ }
+
+ return(FALSE);
+ }
+
+ new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len);
+
+ if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that the next log block has also been scanned
+			in */
+
+ return(FALSE);
+ }
+
+ recv_previous_parsed_rec_type = (ulint) type;
+ recv_previous_parsed_rec_offset = recv_sys->recovered_offset;
+ recv_previous_parsed_rec_is_multi = 0;
+
+ recv_sys->recovered_offset += len;
+ recv_sys->recovered_lsn = new_recovered_lsn;
+
+ DBUG_PRINT("ib_log",
+ ("scan " DBUG_LSN_PF ": log rec %u len %u "
+ "page %u:%u", old_lsn,
+ (unsigned) type, (unsigned) len,
+ (unsigned) space, (unsigned) page_no));
+
+ if (type == MLOG_DUMMY_RECORD) {
+ /* Do nothing */
+
+ } else if (!store_to_hash) {
+ /* In debug checking, update a replicate page
+ according to the log record, and check that it
+			becomes identical to the original page */
+#ifdef UNIV_LOG_DEBUG
+ recv_check_incomplete_log_recs(ptr, len);
+#endif/* UNIV_LOG_DEBUG */
+
+ } else if (type == MLOG_FILE_CREATE
+ || type == MLOG_FILE_CREATE2
+ || type == MLOG_FILE_RENAME
+ || type == MLOG_FILE_DELETE) {
+ ut_a(space);
+#ifdef UNIV_HOTBACKUP
+ if (recv_replay_file_ops) {
+
+ /* In mysqlbackup --apply-log, replay an .ibd
+ file operation, if possible; note that
+ fil_path_to_mysql_datadir is set in mysqlbackup
+ to point to the datadir we should use there */
+
+ if (NULL == fil_op_log_parse_or_replay(
+ body, end_ptr, type,
+ space, page_no)) {
+ fprintf(stderr,
+ "InnoDB: Error: file op"
+ " log record of type %lu"
+ " space %lu not complete in\n"
+ "InnoDB: the replay phase."
+ " Path %s\n",
+ (ulint) type, space,
+ (char*)(body + 2));
+
+ ut_error;
+ }
+ }
+#endif
+ /* In normal mysqld crash recovery we do not try to
+ replay file operations */
+#ifdef UNIV_LOG_LSN_DEBUG
+ } else if (type == MLOG_LSN) {
+ /* Do not add these records to the hash table.
+ The page number and space id fields are misused
+ for something else. */
+#endif /* UNIV_LOG_LSN_DEBUG */
+ } else {
+ recv_add_to_hash_table(type, space, page_no, body,
+ ptr + len, old_lsn,
+ recv_sys->recovered_lsn);
+ }
+ } else {
+ /* Check that all the records associated with the single mtr
+ are included within the buffer */
+
+ total_len = 0;
+ n_recs = 0;
+
+ for (;;) {
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+ if (len == 0 || recv_sys->found_corrupt_log) {
+
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(
+ ptr, type, space, page_no);
+ }
+
+ return(FALSE);
+ }
+
+ recv_previous_parsed_rec_type = (ulint) type;
+ recv_previous_parsed_rec_offset
+ = recv_sys->recovered_offset + total_len;
+ recv_previous_parsed_rec_is_multi = 1;
+
+#ifdef UNIV_LOG_DEBUG
+ if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) {
+ recv_check_incomplete_log_recs(ptr, len);
+ }
+#endif /* UNIV_LOG_DEBUG */
+
+ DBUG_PRINT("ib_log",
+ ("scan " DBUG_LSN_PF ": multi-log rec %u "
+ "len %u page %u:%u",
+ recv_sys->recovered_lsn,
+ (unsigned) type, (unsigned) len,
+ (unsigned) space, (unsigned) page_no));
+
+ total_len += len;
+ n_recs++;
+
+ ptr += len;
+
+ if (type == MLOG_MULTI_REC_END) {
+
+ /* Found the end mark for the records */
+
+ break;
+ }
+ }
+
+ new_recovered_lsn = recv_calc_lsn_on_data_add(
+ recv_sys->recovered_lsn, total_len);
+
+ if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that the next log block has also been scanned
+			in */
+
+ return(FALSE);
+ }
+
+ /* Add all the records to the hash table */
+
+ ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+ for (;;) {
+ old_lsn = recv_sys->recovered_lsn;
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(ptr,
+ type, space, page_no);
+ }
+
+ ut_a(len != 0);
+ ut_a(0 == ((ulint)*ptr & MLOG_SINGLE_REC_FLAG));
+
+ recv_sys->recovered_offset += len;
+ recv_sys->recovered_lsn
+ = recv_calc_lsn_on_data_add(old_lsn, len);
+ if (type == MLOG_MULTI_REC_END) {
+
+ /* Found the end mark for the records */
+
+ break;
+ }
+
+ if (store_to_hash
+#ifdef UNIV_LOG_LSN_DEBUG
+ && type != MLOG_LSN
+#endif /* UNIV_LOG_LSN_DEBUG */
+ ) {
+ recv_add_to_hash_table(type, space, page_no,
+ body, ptr + len,
+ old_lsn,
+ new_recovered_lsn);
+ }
+
+ ptr += len;
+ }
+ }
+
+ goto loop;
+}
+
+/*******************************************************//**
+Adds data from a new log block to the parsing buffer of recv_sys if
+recv_sys->parse_start_lsn is non-zero.
+@return TRUE if more data added */
+static
+ibool
+recv_sys_add_to_parsing_buf(
+/*========================*/
+ const byte* log_block, /*!< in: log block */
+	lsn_t		scanned_lsn)	/*!< in: lsn up to which data
+					was found in this log block */
+{
+ ulint more_len;
+ ulint data_len;
+ ulint start_offset;
+ ulint end_offset;
+
+ ut_ad(scanned_lsn >= recv_sys->scanned_lsn);
+
+ if (!recv_sys->parse_start_lsn) {
+		/* Cannot start parsing yet because no starting point
+		has been found for it */
+
+ return(FALSE);
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if (recv_sys->parse_start_lsn >= scanned_lsn) {
+
+ return(FALSE);
+
+ } else if (recv_sys->scanned_lsn >= scanned_lsn) {
+
+ return(FALSE);
+
+ } else if (recv_sys->parse_start_lsn > recv_sys->scanned_lsn) {
+ more_len = (ulint) (scanned_lsn - recv_sys->parse_start_lsn);
+ } else {
+ more_len = (ulint) (scanned_lsn - recv_sys->scanned_lsn);
+ }
+
+ if (more_len == 0) {
+
+ return(FALSE);
+ }
+
+ ut_ad(data_len >= more_len);
+
+ start_offset = data_len - more_len;
+
+ if (start_offset < LOG_BLOCK_HDR_SIZE) {
+ start_offset = LOG_BLOCK_HDR_SIZE;
+ }
+
+ end_offset = data_len;
+
+ if (end_offset > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+ end_offset = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+ }
+
+ ut_ad(start_offset <= end_offset);
+
+ if (start_offset < end_offset) {
+ ut_memcpy(recv_sys->buf + recv_sys->len,
+ log_block + start_offset, end_offset - start_offset);
+
+ recv_sys->len += end_offset - start_offset;
+
+ ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE);
+ }
+
+ return(TRUE);
+}
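+
+/* Illustrative sketch (not live code): the function above copies only
+the payload of each log block into the parsing buffer. A fixed-size
+block consists of a header, the log record payload, and a checksum
+trailer; the header and trailer bytes are skipped. Assuming the usual
+constants of this code base (512-byte blocks, a 12-byte
+LOG_BLOCK_HDR_SIZE and a 4-byte LOG_BLOCK_TRL_SIZE), the copyable
+payload of one fully written block is: */
+#if 0
+static const ulint	block_size = 512;	/* OS_FILE_LOG_BLOCK_SIZE */
+static const ulint	hdr_size = 12;		/* LOG_BLOCK_HDR_SIZE */
+static const ulint	trl_size = 4;		/* LOG_BLOCK_TRL_SIZE */
+
+/* 512 - 12 - 4 = 496 payload bytes per fully written block */
+static const ulint	payload = block_size - hdr_size - trl_size;
+#endif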
+
+/*******************************************************//**
+Moves the parsing buffer data left to the buffer start. */
+static
+void
+recv_sys_justify_left_parsing_buf(void)
+/*===================================*/
+{
+ ut_memmove(recv_sys->buf, recv_sys->buf + recv_sys->recovered_offset,
+ recv_sys->len - recv_sys->recovered_offset);
+
+ recv_sys->len -= recv_sys->recovered_offset;
+
+ recv_sys->recovered_offset = 0;
+}
+
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data is found. Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or no more data can be
+scanned in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+	ulint		available_memory,/*!< in: maximum size to which we
+					let the hash table of records grow */
+ ibool store_to_hash, /*!< in: TRUE if the records should be
+ stored to the hash table; this is set
+ to FALSE if just debug checking is
+ needed */
+ const byte* buf, /*!< in: buffer containing a log
+ segment or garbage */
+ ulint len, /*!< in: buffer length */
+ lsn_t start_lsn, /*!< in: buffer start lsn */
+ lsn_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ lsn_t* group_scanned_lsn)/*!< out: scanning succeeded up to
+ this lsn */
+{
+ const byte* log_block;
+ ulint no;
+ lsn_t scanned_lsn;
+ ibool finished;
+ ulint data_len;
+ ibool more_data;
+
+ ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE);
+ ut_a(store_to_hash <= TRUE);
+
+ finished = FALSE;
+
+ log_block = buf;
+ scanned_lsn = start_lsn;
+ more_data = FALSE;
+
+ do {
+ no = log_block_get_hdr_no(log_block);
+ /*
+ fprintf(stderr, "Log block header no %lu\n", no);
+
+ fprintf(stderr, "Scanned lsn no %lu\n",
+ log_block_convert_lsn_to_no(scanned_lsn));
+ */
+ if (no != log_block_convert_lsn_to_no(scanned_lsn)
+ || !log_block_checksum_is_ok_or_old_format(log_block)) {
+
+ if (no == log_block_convert_lsn_to_no(scanned_lsn)
+ && !log_block_checksum_is_ok_or_old_format(
+ log_block)) {
+ fprintf(stderr,
+ "InnoDB: Log block no %lu at"
+ " lsn " LSN_PF " has\n"
+ "InnoDB: ok header, but checksum field"
+ " contains %lu, should be %lu\n",
+ (ulong) no,
+ scanned_lsn,
+ (ulong) log_block_get_checksum(
+ log_block),
+ (ulong) log_block_calc_checksum(
+ log_block));
+ }
+
+ /* Garbage or an incompletely written log block */
+
+ finished = TRUE;
+
+ break;
+ }
+
+ if (log_block_get_flush_bit(log_block)) {
+			/* This block was the start of a log flush operation:
+ we know that the previous flush operation must have
+ been completed for all log groups before this block
+ can have been flushed to any of the groups. Therefore,
+ we know that log data is contiguous up to scanned_lsn
+ in all non-corrupt log groups. */
+
+ if (scanned_lsn > *contiguous_lsn) {
+ *contiguous_lsn = scanned_lsn;
+ }
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if ((store_to_hash || (data_len == OS_FILE_LOG_BLOCK_SIZE))
+ && scanned_lsn + data_len > recv_sys->scanned_lsn
+ && (recv_sys->scanned_checkpoint_no > 0)
+ && (log_block_get_checkpoint_no(log_block)
+ < recv_sys->scanned_checkpoint_no)
+ && (recv_sys->scanned_checkpoint_no
+ - log_block_get_checkpoint_no(log_block)
+ > 0x80000000UL)) {
+
+ /* Garbage from a log buffer flush which was made
+ before the most recent database recovery */
+
+ finished = TRUE;
+#ifdef UNIV_LOG_DEBUG
+ /* This is not really an error, but currently
+ we stop here in the debug version: */
+
+ ut_error;
+#endif /* UNIV_LOG_DEBUG */
+ break;
+ }
+
+ if (!recv_sys->parse_start_lsn
+ && (log_block_get_first_rec_group(log_block) > 0)) {
+
+ /* We found a point from which to start the parsing
+ of log records */
+
+ recv_sys->parse_start_lsn = scanned_lsn
+ + log_block_get_first_rec_group(log_block);
+ recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+ recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+ }
+
+ scanned_lsn += data_len;
+
+ if (scanned_lsn > recv_sys->scanned_lsn) {
+
+			/* We have found more entries. If this scan is
+			of startup type, we must initialize the crash
+			recovery environment before parsing these log
+			records. */
+
+#ifndef UNIV_HOTBACKUP
+ if (recv_log_scan_is_startup_type
+ && !recv_needed_recovery) {
+
+ if (!srv_read_only_mode) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Log scan progressed past the "
+						"checkpoint lsn " LSN_PF,
+ recv_sys->scanned_lsn);
+
+ recv_init_crash_recovery();
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Recovery skipped, "
+ "--innodb-read-only set!");
+
+ return(TRUE);
+ }
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* We were able to find more log data: add it to the
+ parsing buffer if parse_start_lsn is already
+ non-zero */
+
+ if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE
+ >= RECV_PARSING_BUF_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: log parsing"
+ " buffer overflow."
+ " Recovery may have failed!\n");
+
+ recv_sys->found_corrupt_log = TRUE;
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set"
+ " innodb_force_recovery"
+ " to ignore this error.\n",
+ stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ } else if (!recv_sys->found_corrupt_log) {
+ more_data = recv_sys_add_to_parsing_buf(
+ log_block, scanned_lsn);
+ }
+
+ recv_sys->scanned_lsn = scanned_lsn;
+ recv_sys->scanned_checkpoint_no
+ = log_block_get_checkpoint_no(log_block);
+ }
+
+ if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+ /* Log data for this group ends here */
+
+ finished = TRUE;
+ break;
+ } else {
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ }
+ } while (log_block < buf + len && !finished);
+
+ *group_scanned_lsn = scanned_lsn;
+
+ if (recv_needed_recovery
+ || (recv_is_from_backup && !recv_is_making_a_backup)) {
+ recv_scan_print_counter++;
+
+ if (finished || (recv_scan_print_counter % 80 == 0)) {
+
+ fprintf(stderr,
+ "InnoDB: Doing recovery: scanned up to"
+ " log sequence number " LSN_PF "\n",
+ *group_scanned_lsn);
+ }
+ }
+
+ if (more_data && !recv_sys->found_corrupt_log) {
+ /* Try to parse more log records */
+
+ recv_parse_log_recs(store_to_hash);
+
+#ifndef UNIV_HOTBACKUP
+ if (store_to_hash
+ && mem_heap_get_size(recv_sys->heap) > available_memory) {
+
+ /* Hash table of log records has grown too big:
+ empty it; FALSE means no ibuf operations
+ allowed, as we cannot add new records to the
+ log yet: they would be produced by ibuf
+ operations */
+
+ recv_apply_hashed_log_recs(FALSE);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) {
+ /* Move parsing buffer data to the buffer start */
+
+ recv_sys_justify_left_parsing_buf();
+ }
+ }
+
+ return(finished);
+}
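+
+/* Illustrative sketch (not live code): the "garbage from before the
+most recent recovery" test above compares checkpoint numbers with
+wraparound in mind. A block is rejected only when its checkpoint
+number is numerically behind the most recently scanned one by more
+than half of the 32-bit number space; a minimal model of that
+comparison is: */
+#if 0
+static ibool
+checkpoint_no_is_stale(ulint scanned_no, ulint block_no)
+{
+	return(scanned_no > 0
+	       && block_no < scanned_no
+	       && scanned_no - block_no > 0x80000000UL);
+}
+#endif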
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer. Parses
+and hashes the log records if new data found. */
+static
+void
+recv_group_scan_log_recs(
+/*=====================*/
+ log_group_t* group, /*!< in: log group */
+ lsn_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ lsn_t* group_scanned_lsn)/*!< out: scanning succeeded up to
+ this lsn */
+{
+ ibool finished;
+ lsn_t start_lsn;
+ lsn_t end_lsn;
+
+ finished = FALSE;
+
+ start_lsn = *contiguous_lsn;
+
+ while (!finished) {
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+ group, start_lsn, end_lsn);
+
+ finished = recv_scan_log_recs(
+ (buf_pool_get_n_pages()
+ - (recv_n_pool_free_frames * srv_buf_pool_instances))
+ * UNIV_PAGE_SIZE,
+ TRUE, log_sys->buf, RECV_SCAN_SIZE,
+ start_lsn, contiguous_lsn, group_scanned_lsn);
+ start_lsn = end_lsn;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Scanned group %lu up to"
+ " log sequence number " LSN_PF "\n",
+ (ulong) group->id,
+ *group_scanned_lsn);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/*******************************************************//**
+Initializes the crash recovery environment. Must be called only when
+recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void)
+/*==========================*/
+{
+ ut_ad(!srv_read_only_mode);
+ ut_a(!recv_needed_recovery);
+
+ recv_needed_recovery = TRUE;
+
+	ib_logf(IB_LOG_LEVEL_INFO, "Database was not shut down normally!");
+ ib_logf(IB_LOG_LEVEL_INFO, "Starting crash recovery.");
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Reading tablespace information from the .ibd files...");
+
+ fil_load_single_table_tablespaces();
+
+ /* If we are using the doublewrite method, we will
+ check if there are half-written pages in data files,
+ and restore them from the doublewrite buffer if
+ possible */
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Restoring possible half-written data pages ");
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "from the doublewrite buffer...");
+
+ buf_dblwr_process();
+
+ /* Spawn the background thread to flush dirty pages
+ from the buffer pools. */
+ recv_writer_thread_handle = os_thread_create(
+ recv_writer_thread, 0, 0);
+ }
+}
+
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint type, /*!< in: LOG_CHECKPOINT or LOG_ARCHIVE */
+ lsn_t limit_lsn, /*!< in: recover up to this lsn if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t min_flushed_lsn,/*!< in: min flushed lsn from data files */
+ lsn_t max_flushed_lsn)/*!< in: max flushed lsn from data files */
+{
+ log_group_t* group;
+ log_group_t* max_cp_group;
+ ulint max_cp_field;
+ lsn_t checkpoint_lsn;
+ ib_uint64_t checkpoint_no;
+ lsn_t group_scanned_lsn = 0;
+ lsn_t contiguous_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+ log_group_t* up_to_date_group;
+ lsn_t archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+ byte* buf;
+ byte log_hdr_buf[LOG_FILE_HDR_SIZE];
+ dberr_t err;
+ ut_when_dtor<recv_dblwr_t> tmp(recv_sys->dblwr);
+
+#ifdef UNIV_LOG_ARCHIVE
+ ut_ad(type != LOG_CHECKPOINT || limit_lsn == LSN_MAX);
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT (type == LOG_CHECKPOINT)
+/** Recover up to this log sequence number */
+# define LIMIT_LSN limit_lsn
+#else /* UNIV_LOG_ARCHIVE */
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT 1
+/** Recover up to this log sequence number */
+# define LIMIT_LSN LSN_MAX
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The user has set SRV_FORCE_NO_LOG_REDO on, "
+ "skipping log redo");
+
+ return(DB_SUCCESS);
+ }
+
+ recv_recovery_on = TRUE;
+
+ recv_sys->limit_lsn = LIMIT_LSN;
+
+ mutex_enter(&(log_sys->mutex));
+
+ /* Look for the latest checkpoint from any of the log groups */
+
+ err = recv_find_max_checkpoint(&max_cp_group, &max_cp_field);
+
+ if (err != DB_SUCCESS) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(err);
+ }
+
+ log_group_read_checkpoint_info(max_cp_group, max_cp_field);
+
+ buf = log_sys->checkpoint_buf;
+
+ checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
+ checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
+#ifdef UNIV_LOG_ARCHIVE
+ archived_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* Read the first log file header to print a note if this is
+ a recovery from a restored InnoDB Hot Backup */
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0,
+ 0, 0, LOG_FILE_HDR_SIZE,
+ log_hdr_buf, max_cp_group);
+
+ if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ (byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
+
+ if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot restore from mysqlbackup, InnoDB "
+ "running in read-only mode!");
+
+ return(DB_ERROR);
+ }
+
+ /* This log file was created by mysqlbackup --restore: print
+ a note to the user about it */
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The log file was created by mysqlbackup --apply-log "
+ "at %s. The following crash recovery is part of a "
+ "normal restore.",
+ log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
+
+ /* Wipe over the label now */
+
+ memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ ' ', 4);
+ /* Write to the log file to wipe over the label */
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, true,
+ max_cp_group->space_id, 0,
+ 0, 0, OS_FILE_LOG_BLOCK_SIZE,
+ log_hdr_buf, max_cp_group);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ log_checkpoint_get_nth_group_info(buf, group->id,
+ &(group->archived_file_no),
+ &(group->archived_offset));
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (TYPE_CHECKPOINT) {
+ /* Start reading the log groups from the checkpoint lsn up. The
+ variable contiguous_lsn contains an lsn up to which the log is
+ known to be contiguously written to all log groups. */
+
+ recv_sys->parse_start_lsn = checkpoint_lsn;
+ recv_sys->scanned_lsn = checkpoint_lsn;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = checkpoint_lsn;
+
+ srv_start_lsn = checkpoint_lsn;
+ }
+
+ contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+#ifdef UNIV_LOG_ARCHIVE
+ if (TYPE_CHECKPOINT) {
+ up_to_date_group = max_cp_group;
+ } else {
+ ulint capacity;
+
+ /* Try to recover the remaining part from logs: first from
+ the logs of the archived group */
+
+ group = recv_sys->archive_group;
+ capacity = log_group_get_capacity(group);
+
+ if (recv_sys->scanned_lsn > checkpoint_lsn + capacity
+ || checkpoint_lsn > recv_sys->scanned_lsn + capacity) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* The group does not contain enough log: probably
+ an archived log file was missing or corrupt */
+
+ return(DB_ERROR);
+ }
+
+ recv_group_scan_log_recs(group, &contiguous_lsn,
+ &group_scanned_lsn);
+ if (recv_sys->scanned_lsn < checkpoint_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* The group did not contain enough log: an archived
+ log file was missing or invalid, or the log group
+ was corrupt */
+
+ return(DB_ERROR);
+ }
+
+ group->scanned_lsn = group_scanned_lsn;
+ up_to_date_group = group;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+#ifdef UNIV_LOG_ARCHIVE
+ if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) {
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* Set the flag to publish that we are doing startup scan. */
+ recv_log_scan_is_startup_type = TYPE_CHECKPOINT;
+ while (group) {
+#ifdef UNIV_LOG_ARCHIVE
+ lsn_t old_scanned_lsn = recv_sys->scanned_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ recv_group_scan_log_recs(group, &contiguous_lsn,
+ &group_scanned_lsn);
+ group->scanned_lsn = group_scanned_lsn;
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (old_scanned_lsn < group_scanned_lsn) {
+ /* We found a more up-to-date group */
+
+ up_to_date_group = group;
+ }
+
+ if ((type == LOG_ARCHIVE)
+ && (group == recv_sys->archive_group)) {
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ /* Done with startup scan. Clear the flag. */
+ recv_log_scan_is_startup_type = FALSE;
+ if (TYPE_CHECKPOINT) {
+		/* NOTE: we always do a 'recovery' at startup, but we
+		only print a message to the user about recovery if
+		something is wrong: */
+
+ if (checkpoint_lsn != max_flushed_lsn
+ || checkpoint_lsn != min_flushed_lsn) {
+
+ if (checkpoint_lsn < max_flushed_lsn) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "The log sequence number "
+ "in the ibdata files is higher "
+ "than the log sequence number "
+ "in the ib_logfiles! Are you sure "
+ "you are using the right "
+					"ib_logfiles to start up the "
+					"database? Log sequence number in "
+					"the ib_logfiles is " LSN_PF ", log "
+					"sequence numbers stamped "
+					"to ibdata file headers are between "
+					LSN_PF " and " LSN_PF ".",
+ checkpoint_lsn,
+ min_flushed_lsn,
+ max_flushed_lsn);
+ }
+
+ if (!recv_needed_recovery) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The log sequence numbers "
+ LSN_PF " and " LSN_PF
+ " in ibdata files do not match"
+ " the log sequence number "
+ LSN_PF
+ " in the ib_logfiles!",
+ min_flushed_lsn,
+ max_flushed_lsn,
+ checkpoint_lsn);
+
+ if (!srv_read_only_mode) {
+ recv_init_crash_recovery();
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Can't initiate database "
+ "recovery, running "
+						"in read-only mode.");
+ return(DB_READ_ONLY);
+ }
+ }
+ }
+ }
+
+ /* We currently have only one log group */
+ if (group_scanned_lsn < checkpoint_lsn
+ || group_scanned_lsn < recv_max_page_lsn) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "We scanned the log up to "
+ LSN_PF ". A checkpoint was at " LSN_PF
+ " and the maximum LSN on a database page was " LSN_PF
+ ". It is possible that the database is now corrupt!",
+ group_scanned_lsn, checkpoint_lsn, recv_max_page_lsn);
+ }
+
+ if (recv_sys->recovered_lsn < checkpoint_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (recv_sys->recovered_lsn >= LIMIT_LSN) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* No harm in trying to do RO access. */
+ if (!srv_read_only_mode) {
+ ut_error;
+ }
+
+ return(DB_ERROR);
+ }
+
+ /* Synchronize the uncorrupted log groups to the most up-to-date log
+ group; we also copy checkpoint info to groups */
+
+ log_sys->next_checkpoint_lsn = checkpoint_lsn;
+ log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->archived_lsn = archived_lsn;
+
+ recv_synchronize_groups(up_to_date_group);
+#else /* UNIV_LOG_ARCHIVE */
+ recv_synchronize_groups();
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (!recv_needed_recovery) {
+ ut_a(checkpoint_lsn == recv_sys->recovered_lsn);
+ } else {
+ srv_start_lsn = recv_sys->recovered_lsn;
+ }
+
+ log_sys->lsn = recv_sys->recovered_lsn;
+
+ ut_memcpy(log_sys->buf, recv_sys->last_block, OS_FILE_LOG_BLOCK_SIZE);
+
+ log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE;
+ log_sys->buf_next_to_write = log_sys->buf_free;
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->last_checkpoint_lsn = checkpoint_lsn;
+
+ MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+ log_sys->lsn - log_sys->last_checkpoint_lsn);
+
+ log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (archived_lsn == LSN_MAX) {
+
+ log_sys->archiving_state = LOG_ARCH_OFF;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_enter(&recv_sys->mutex);
+
+ recv_sys->apply_log_recs = TRUE;
+
+ mutex_exit(&recv_sys->mutex);
+
+ mutex_exit(&log_sys->mutex);
+
+ recv_lsn_checks_on = TRUE;
+
+ /* The database is now ready to start almost normal processing of user
+ transactions: transaction rollbacks and the application of the log
+	records in the hash table can be run in the background. */
+
+ return(DB_SUCCESS);
+
+#undef TYPE_CHECKPOINT
+#undef LIMIT_LSN
+}
+
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void)
+/*======================================*/
+{
+ /* Apply the hashed log records to the respective file pages */
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+ DBUG_PRINT("ib_log", ("apply completed"));
+
+ if (recv_needed_recovery) {
+ trx_sys_print_mysql_master_log_pos();
+ trx_sys_print_mysql_binlog_offset();
+ }
+
+ if (recv_sys->found_corrupt_log) {
+
+ fprintf(stderr,
+ "InnoDB: WARNING: the log file may have been"
+ " corrupt and it\n"
+ "InnoDB: is possible that the log scan or parsing"
+ " did not proceed\n"
+ "InnoDB: far enough in recovery. Please run"
+ " CHECK TABLE\n"
+ "InnoDB: on your InnoDB tables to check that"
+ " they are ok!\n"
+ "InnoDB: It may be safest to recover your"
+ " InnoDB database from\n"
+ "InnoDB: a backup!\n");
+ }
+
+ /* Make sure that the recv_writer thread is done. This is
+ required because it grabs various mutexes and we want to
+ ensure that when we enable sync_order_checks there is no
+ mutex currently held by any thread. */
+ mutex_enter(&recv_sys->writer_mutex);
+
+ /* Free the resources of the recovery system */
+ recv_recovery_on = FALSE;
+
+	/* By acquiring the mutex we ensure that the recv_writer thread
+	won't trigger any more LRU batches. Now wait for the currently
+	in-progress batches to finish. */
+ buf_flush_wait_LRU_batch_end();
+
+ mutex_exit(&recv_sys->writer_mutex);
+
+ ulint count = 0;
+ while (recv_writer_thread_active) {
+ ++count;
+ os_thread_sleep(100000);
+ if (srv_print_verbose_log && count > 600) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for recv_writer to "
+ "finish flushing of buffer pool");
+ count = 0;
+ }
+ }
+
+#ifdef __WIN__
+ if (recv_writer_thread_handle) {
+ CloseHandle(recv_writer_thread_handle);
+ }
+#endif /* __WIN__ */
+
+#ifndef UNIV_LOG_DEBUG
+ recv_sys_debug_free();
+#endif /* !UNIV_LOG_DEBUG */
+ /* Roll back any recovered data dictionary transactions, so
+ that the data dictionary tables will be free of any locks.
+ The data dictionary latch should guarantee that there is at
+ most one data dictionary transaction active at a time. */
+ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+ trx_rollback_or_clean_recovered(FALSE);
+ }
+}
+
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void)
+/*===============================*/
+{
+#ifdef UNIV_SYNC_DEBUG
+ /* Wait for a while so that created threads have time to suspend
+ themselves before we switch the latching order checks on */
+ os_thread_sleep(1000000);
+
+ ut_ad(!recv_writer_thread_active);
+
+ /* Switch latching order checks on in sync0sync.cc */
+ sync_order_checks_on = TRUE;
+#endif /* UNIV_SYNC_DEBUG */
+	/* We can't start any (DDL) transactions if UNDO logging
+	has been disabled; additionally, ROLLBACK of recovered
+	user transactions is disabled. */
+ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO
+ && !srv_read_only_mode) {
+
+ /* Drop partially created indexes. */
+ row_merge_drop_temp_indexes();
+ /* Drop temporary tables. */
+ row_mysql_drop_temp_tables();
+
+ /* Drop any auxiliary tables that were not dropped when the
+ parent table was dropped. This can happen if the parent table
+ was dropped but the server crashed before the auxiliary tables
+ were dropped. */
+ fts_drop_orphaned_tables();
+
+		/* Roll back the uncommitted transactions which have
+		no user session */
+
+ trx_rollback_or_clean_is_active = true;
+ os_thread_create(trx_rollback_or_clean_all_recovered, 0, 0);
+ }
+}
+
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no, /*!< in: next archived log file number */
+ ibool new_logs_created,/*!< in: TRUE if resetting logs
+ is done at the log creation;
+ FALSE if it is done after
+ archive recovery */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t lsn) /*!< in: reset to this lsn
+ rounded up to be divisible by
+ OS_FILE_LOG_BLOCK_SIZE, after
+ which we add
+ LOG_BLOCK_HDR_SIZE */
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ group->lsn = log_sys->lsn;
+ group->lsn_offset = LOG_FILE_HDR_SIZE;
+#ifdef UNIV_LOG_ARCHIVE
+ group->archived_file_no = arch_log_no;
+ group->archived_offset = 0;
+
+ if (!new_logs_created) {
+ recv_truncate_group(group, group->lsn, group->lsn,
+ group->lsn, group->lsn);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ log_sys->buf_next_to_write = 0;
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->next_checkpoint_no = 0;
+ log_sys->last_checkpoint_lsn = 0;
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->archived_lsn = log_sys->lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ log_block_init(log_sys->buf, log_sys->lsn);
+ log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+ log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+ log_sys->lsn += LOG_BLOCK_HDR_SIZE;
+
+ MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
+ (log_sys->lsn - log_sys->last_checkpoint_lsn));
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Reset the checkpoint fields in logs */
+
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+}
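+
+/* Worked example of the lsn adjustment above: resetting to lsn = 1000
+first aligns it up to the 512-byte block boundary, giving
+log_sys->lsn = 1024; a fresh block is initialized there, and the
+usable log starts after the block header, so log_sys->lsn finally
+becomes 1024 + LOG_BLOCK_HDR_SIZE. */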
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+ const char* log_dir, /*!< in: log file directory path */
+ ulint n_log_files, /*!< in: number of log files */
+ lsn_t log_file_size, /*!< in: log file size */
+ lsn_t lsn) /*!< in: new start lsn, must be
+ divisible by OS_FILE_LOG_BLOCK_SIZE */
+{
+ os_file_t log_file;
+ ibool success;
+ byte* buf;
+ ulint i;
+ ulint log_dir_len;
+ char name[5000];
+ static const char ib_logfile_basename[] = "ib_logfile";
+
+ log_dir_len = strlen(log_dir);
+ /* full path name of ib_logfile consists of log dir path + basename
+ + number. This must fit in the name buffer.
+ */
+ ut_a(log_dir_len + strlen(ib_logfile_basename) + 11 < sizeof(name));
+
+ buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
+ for (i = 0; i < n_log_files; i++) {
+
+ sprintf(name, "%s%s%lu", log_dir,
+ ib_logfile_basename, (ulong) i);
+
+ log_file = os_file_create_simple(innodb_file_log_key,
+ name, OS_FILE_CREATE,
+ OS_FILE_READ_WRITE,
+ &success);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Cannot create %s. Check that"
+ " the file does not exist yet.\n", name);
+
+ exit(1);
+ }
+
+ fprintf(stderr,
+ "Setting log file size to %llu\n",
+ log_file_size);
+
+ success = os_file_set_size(name, log_file, log_file_size);
+
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Cannot set %s size to %llu\n",
+ name, log_file_size);
+ exit(1);
+ }
+
+ os_file_flush(log_file);
+ os_file_close(log_file);
+ }
+
+ /* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */
+
+ log_reset_first_header_and_checkpoint(buf, lsn);
+
+ log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
+ log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
+ LOG_BLOCK_HDR_SIZE);
+ sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0);
+
+ log_file = os_file_create_simple(innodb_file_log_key,
+ name, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE, &success);
+ if (!success) {
+ fprintf(stderr, "InnoDB: Cannot open %s.\n", name);
+
+ exit(1);
+ }
+
+ os_file_write(name, log_file, buf, 0,
+ LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ os_file_flush(log_file);
+ os_file_close(log_file);
+
+ ut_free(buf);
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifdef UNIV_LOG_ARCHIVE
+/* Dead code */
+/******************************************************//**
+Reads from the archive of a log group and performs recovery.
+@return TRUE if no more complete consistent archive files */
+static
+ibool
+log_group_recover_from_archive_file(
+/*================================*/
+ log_group_t* group) /*!< in: log group */
+{
+ os_file_t file_handle;
+ ib_uint64_t start_lsn;
+ ib_uint64_t file_end_lsn;
+ ib_uint64_t dummy_lsn;
+ ib_uint64_t scanned_lsn;
+ ulint len;
+ ibool ret;
+ byte* buf;
+ os_offset_t read_offset;
+ os_offset_t file_size;
+ int input_char;
+ char name[10000];
+
+ ut_a(0);
+
+try_open_again:
+ buf = log_sys->buf;
+
+ /* Add the file to the archive file space; open the file */
+
+ log_archived_file_name_gen(name, group->id, group->archived_file_no);
+
+ file_handle = os_file_create(innodb_file_log_key,
+ name, OS_FILE_OPEN,
+ OS_FILE_LOG, OS_FILE_AIO, &ret);
+
+ if (ret == FALSE) {
+ask_again:
+ fprintf(stderr,
+ "InnoDB: Do you want to copy additional"
+ " archived log files\n"
+ "InnoDB: to the directory\n");
+ fprintf(stderr,
+ "InnoDB: or were these all the files needed"
+ " in recovery?\n");
+ fprintf(stderr,
+ "InnoDB: (Y == copy more files; N == this is all)?");
+
+ input_char = getchar();
+
+ if (input_char == (int) 'N') {
+
+ return(TRUE);
+ } else if (input_char == (int) 'Y') {
+
+ goto try_open_again;
+ } else {
+ goto ask_again;
+ }
+ }
+
+ file_size = os_file_get_size(file_handle);
+ ut_a(file_size != (os_offset_t) -1);
+
+ fprintf(stderr, "InnoDB: Opened archived log file %s\n", name);
+
+ ret = os_file_close(file_handle);
+
+ if (file_size < LOG_FILE_HDR_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Archive file header incomplete %s\n", name);
+
+ return(TRUE);
+ }
+
+ ut_a(ret);
+
+ /* Add the archive file as a node to the space */
+
+ fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
+ group->archive_space_id, FALSE);
+#if RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE
+# error "RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE"
+#endif
+
+ /* Read the archive file header */
+ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0,
+ LOG_FILE_HDR_SIZE, buf, NULL);
+
+ /* Check if the archive file header is consistent */
+
+ if (mach_read_from_4(buf + LOG_GROUP_ID) != group->id
+ || mach_read_from_4(buf + LOG_FILE_NO)
+ != group->archived_file_no) {
+ fprintf(stderr,
+ "InnoDB: Archive file header inconsistent %s\n", name);
+
+ return(TRUE);
+ }
+
+ if (!mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED)) {
+ fprintf(stderr,
+ "InnoDB: Archive file not completely written %s\n",
+ name);
+
+ return(TRUE);
+ }
+
+ start_lsn = mach_read_from_8(buf + LOG_FILE_START_LSN);
+ file_end_lsn = mach_read_from_8(buf + LOG_FILE_END_LSN);
+
+ if (!recv_sys->scanned_lsn) {
+
+ if (recv_sys->parse_start_lsn < start_lsn) {
+ fprintf(stderr,
+ "InnoDB: Archive log file %s"
+				" starts from too large an lsn\n",
+ name);
+ return(TRUE);
+ }
+
+ recv_sys->scanned_lsn = start_lsn;
+ }
+
+ if (recv_sys->scanned_lsn != start_lsn) {
+
+ fprintf(stderr,
+ "InnoDB: Archive log file %s starts from"
+			" the wrong lsn\n",
+ name);
+ return(TRUE);
+ }
+
+ read_offset = LOG_FILE_HDR_SIZE;
+
+ for (;;) {
+ len = RECV_SCAN_SIZE;
+
+ if (read_offset + len > file_size) {
+ len = ut_calc_align_down(file_size - read_offset,
+ OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ if (len == 0) {
+
+ break;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Archive read starting at"
+ " lsn %llu, len %lu from file %s\n",
+ start_lsn,
+ (ulong) len, name);
+ }
+#endif /* UNIV_DEBUG */
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, true,
+ group->archive_space_id, read_offset / UNIV_PAGE_SIZE,
+ read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
+
+ ret = recv_scan_log_recs(
+ (buf_pool_get_n_pages()
+ - (recv_n_pool_free_frames * srv_buf_pool_instances))
+ * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn,
+ &dummy_lsn, &scanned_lsn);
+
+ if (scanned_lsn == file_end_lsn) {
+
+ return(FALSE);
+ }
+
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Archive log file %s"
+				" could not be scanned correctly\n",
+ name);
+ return(TRUE);
+ }
+
+ read_offset += len;
+ start_lsn += len;
+
+ ut_ad(start_lsn == scanned_lsn);
+ }
+
+ return(FALSE);
+}
+
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn field from the
+ data files */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn if
+ possible */
+ ulint first_log_no) /*!< in: number of the first archived
+ log file to use in the recovery; the
+ file will be searched from
+ INNOBASE_LOG_ARCH_DIR specified in
+ server config file */
+{
+ log_group_t* group;
+ ulint group_id;
+ ulint trunc_len;
+ ibool ret;
+ ulint err;
+
+ ut_a(0);
+
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+
+ recv_recovery_on = TRUE;
+ recv_recovery_from_backup_on = TRUE;
+
+ recv_sys->limit_lsn = limit_lsn;
+
+ group_id = 0;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ if (group->id == group_id) {
+
+ break;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ if (!group) {
+ fprintf(stderr,
+ "InnoDB: There is no log group defined with id %lu!\n",
+ (ulong) group_id);
+ return(DB_ERROR);
+ }
+
+ group->archived_file_no = first_log_no;
+
+ recv_sys->parse_start_lsn = min_flushed_lsn;
+
+ recv_sys->scanned_lsn = 0;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+
+ recv_sys->archive_group = group;
+
+ ret = FALSE;
+
+ mutex_enter(&(log_sys->mutex));
+
+ while (!ret) {
+ ret = log_group_recover_from_archive_file(group);
+
+		/* Close and truncate a possibly processed archive file
+		from the file space */
+
+ trunc_len = UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id);
+ if (trunc_len > 0) {
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_len);
+ }
+
+ group->archived_file_no++;
+ }
+
+ if (recv_sys->recovered_lsn < limit_lsn) {
+
+ if (!recv_sys->scanned_lsn) {
+
+ recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE,
+ limit_lsn,
+ LSN_MAX,
+ LSN_MAX);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+ }
+
+ if (limit_lsn != LSN_MAX) {
+
+ recv_apply_hashed_log_recs(FALSE);
+
+ recv_reset_logs(0, FALSE, recv_sys->recovered_lsn);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void)
+/*===================================*/
+{
+ recv_recovery_from_checkpoint_finish();
+
+ recv_recovery_from_backup_on = FALSE;
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+
+void recv_dblwr_t::add(byte* page)
+{
+ pages.push_back(page);
+}
+
+byte* recv_dblwr_t::find_page(ulint space_id, ulint page_no)
+{
+ std::vector<byte*> matches;
+ byte* result = 0;
+
+ for (std::list<byte*>::iterator i = pages.begin();
+ i != pages.end(); ++i) {
+
+ if ((page_get_space_id(*i) == space_id)
+ && (page_get_page_no(*i) == page_no)) {
+ matches.push_back(*i);
+ }
+ }
+
+ if (matches.size() == 1) {
+ result = matches[0];
+ } else if (matches.size() > 1) {
+
+ lsn_t max_lsn = 0;
+ lsn_t page_lsn = 0;
+
+ for (std::vector<byte*>::iterator i = matches.begin();
+ i != matches.end(); ++i) {
+
+ page_lsn = mach_read_from_8(*i + FIL_PAGE_LSN);
+
+ if (page_lsn > max_lsn) {
+ max_lsn = page_lsn;
+ result = *i;
+ }
+ }
+ }
+
+ return(result);
+}
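+
+/* Hypothetical usage sketch (not live code): when several doublewrite
+copies of the same page survive, find_page() returns the copy with the
+highest FIL_PAGE_LSN, i.e. the most recently written one, which is the
+correct copy to restore over a torn page: */
+#if 0
+byte*	copy = recv_sys->dblwr.find_page(space_id, page_no);
+
+if (copy != NULL) {
+	/* overwrite the corrupt page in the data file
+	with the newest doublewrite copy */
+}
+#endif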
+
diff --git a/storage/innobase/mach/mach0data.cc b/storage/innobase/mach/mach0data.cc
new file mode 100644
index 00000000000..df68aab8a18
--- /dev/null
+++ b/storage/innobase/mach/mach0data.cc
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file mach/mach0data.cc
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "mach0data.h"
+
+#ifdef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ulint* val) /*!< out: read value (< 2^32) */
+{
+ ulint flag;
+
+ ut_ad(ptr && end_ptr && val);
+
+ if (ptr >= end_ptr) {
+
+ return(NULL);
+ }
+
+ flag = mach_read_from_1(ptr);
+
+ if (flag < 0x80UL) {
+ *val = flag;
+ return(ptr + 1);
+
+ } else if (flag < 0xC0UL) {
+ if (end_ptr < ptr + 2) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_2(ptr) & 0x7FFFUL;
+
+ return(ptr + 2);
+
+ } else if (flag < 0xE0UL) {
+ if (end_ptr < ptr + 3) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_3(ptr) & 0x3FFFFFUL;
+
+ return(ptr + 3);
+ } else if (flag < 0xF0UL) {
+ if (end_ptr < ptr + 4) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL;
+
+ return(ptr + 4);
+ } else {
+ ut_ad(flag == 0xF0UL);
+
+ if (end_ptr < ptr + 5) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_4(ptr + 1);
+ return(ptr + 5);
+ }
+}
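+
+/* Illustrative sketch (not live code) of the inverse operation: the
+compressed format parsed above stores a value big-endian with the
+length folded into the leading bits: 1 byte for values below 2^7,
+2 bytes below 2^14, 3 bytes below 2^21, 4 bytes below 2^28, and
+otherwise a 0xF0 marker followed by the full 4-byte value. This
+mirrors the writer side, mach_write_compressed() in mach0data.ic: */
+#if 0
+ulint
+mach_write_compressed_sketch(byte* b, ulint n)
+{
+	if (n < 0x80UL) {
+		mach_write_to_1(b, n);
+		return(1);
+	} else if (n < 0x4000UL) {
+		mach_write_to_2(b, n | 0x8000UL);
+		return(2);
+	} else if (n < 0x200000UL) {
+		mach_write_to_3(b, n | 0xC00000UL);
+		return(3);
+	} else if (n < 0x10000000UL) {
+		mach_write_to_4(b, n | 0xE0000000UL);
+		return(4);
+	} else {
+		mach_write_to_1(b, 0xF0UL);
+		mach_write_to_4(b + 1, n);
+		return(5);
+	}
+}
+#endif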
diff --git a/storage/innobase/mem/mem0dbg.cc b/storage/innobase/mem/mem0dbg.cc
new file mode 100644
index 00000000000..308c2979551
--- /dev/null
+++ b/storage/innobase/mem/mem0dbg.cc
@@ -0,0 +1,1050 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0dbg.cc
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+/* In the debug version, the mutex which protects the hash table
+containing the list of live memory heaps, and also the global
+variables below. */
+UNIV_INTERN ib_mutex_t mem_hash_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mem_hash_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t mem_hash_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+# endif /* !UNIV_HOTBACKUP */
+
+/* The following variables contain information about the
+extent of memory allocations. Only used in the debug version.
+Protected by mem_hash_mutex above. */
+
+static ulint mem_n_created_heaps = 0;
+static ulint mem_n_allocations = 0;
+static ulint mem_total_allocated_memory = 0;
+UNIV_INTERN ulint mem_current_allocated_memory = 0;
+static ulint mem_max_allocated_memory = 0;
+# ifndef UNIV_HOTBACKUP
+static ulint mem_last_print_info = 0;
+static ibool mem_hash_initialized = FALSE;
+# endif /* !UNIV_HOTBACKUP */
+
+/* Size of the hash table for memory management tracking */
+#define MEM_HASH_SIZE 997
+
+/* The node of the list containing currently allocated memory heaps */
+
+struct mem_hash_node_t {
+ UT_LIST_NODE_T(mem_hash_node_t)
+ list; /*!< hash list node */
+ mem_heap_t* heap; /*!< memory heap */
+	const char*	file_name;/*!< file where heap was created */
+	ulint		line;	/*!< file line of creation */
+	ulint		nth_heap;/*!< this is the nth heap created */
+	UT_LIST_NODE_T(mem_hash_node_t)
+			all_list;/*!< list of all created heaps */
+};
+
+typedef UT_LIST_BASE_NODE_T(mem_hash_node_t) mem_hash_cell_t;
+
+/* The hash table of allocated heaps */
+static mem_hash_cell_t mem_hash_table[MEM_HASH_SIZE];
+
+/* The base node of the list of all allocated heaps */
+static mem_hash_cell_t mem_all_list_base;
+
+
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i);
+
+/* Accessor function for the hash table. Returns a pointer to the
+table cell. */
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i)
+{
+ ut_a(i < MEM_HASH_SIZE);
+
+ return(&(mem_hash_table[i]));
+}
+
+/* Accessor functions for a memory field in the debug version */
+UNIV_INTERN
+void
+mem_field_header_set_len(byte* field, ulint len)
+{
+ mach_write_to_4(field - 2 * sizeof(ulint), len);
+}
+
+UNIV_INTERN
+ulint
+mem_field_header_get_len(byte* field)
+{
+ return(mach_read_from_4(field - 2 * sizeof(ulint)));
+}
+
+UNIV_INTERN
+void
+mem_field_header_set_check(byte* field, ulint check)
+{
+ mach_write_to_4(field - sizeof(ulint), check);
+}
+
+UNIV_INTERN
+ulint
+mem_field_header_get_check(byte* field)
+{
+ return(mach_read_from_4(field - sizeof(ulint)));
+}
+
+UNIV_INTERN
+void
+mem_field_trailer_set_check(byte* field, ulint check)
+{
+ mach_write_to_4(field + mem_field_header_get_len(field), check);
+}
+
+UNIV_INTERN
+ulint
+mem_field_trailer_get_check(byte* field)
+{
+ return(mach_read_from_4(field
+ + mem_field_header_get_len(field)));
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+ ulint size) /*!< in: common pool size in bytes */
+{
+#ifdef UNIV_MEM_DEBUG
+
+ ulint i;
+
+ /* Initialize the hash table */
+ ut_a(FALSE == mem_hash_initialized);
+
+ mutex_create(mem_hash_mutex_key, &mem_hash_mutex, SYNC_MEM_HASH);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+ UT_LIST_INIT(*mem_hash_get_nth_cell(i));
+ }
+
+ UT_LIST_INIT(mem_all_list_base);
+
+ mem_hash_initialized = TRUE;
+#endif /* UNIV_MEM_DEBUG */
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ /* When innodb_use_sys_malloc is set, the
+ mem_comm_pool won't be used for any allocations. We
+ create a dummy mem_comm_pool, because some statistics
+ and debugging code relies on it being initialized. */
+ size = 1;
+ }
+
+ mem_comm_pool = mem_pool_create(size);
+}
+
+/******************************************************************//**
+Closes the memory system. */
+UNIV_INTERN
+void
+mem_close(void)
+/*===========*/
+{
+ mem_pool_free(mem_comm_pool);
+ mem_comm_pool = NULL;
+#ifdef UNIV_MEM_DEBUG
+ mutex_free(&mem_hash_mutex);
+ mem_hash_initialized = FALSE;
+#endif /* UNIV_MEM_DEBUG */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MEM_DEBUG
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /*!< in: memory field */
+ ulint n) /*!< in: how many bytes the user requested */
+{
+ ulint rnd;
+ byte* usr_buf;
+
+ usr_buf = buf + MEM_FIELD_HEADER_SIZE;
+
+ /* In the debug version write the length field and the
+ check fields to the start and the end of the allocated storage.
+ The field header consists of a length field and
+ a random number field, in this order. The field trailer contains
+ the same random number as a check field. */
+
+ mem_field_header_set_len(usr_buf, n);
+
+ rnd = ut_rnd_gen_ulint();
+
+ mem_field_header_set_check(usr_buf, rnd);
+ mem_field_trailer_set_check(usr_buf, rnd);
+
+ /* Update the memory allocation information */
+
+ mutex_enter(&mem_hash_mutex);
+
+ mem_total_allocated_memory += n;
+ mem_current_allocated_memory += n;
+ mem_n_allocations++;
+
+ if (mem_current_allocated_memory > mem_max_allocated_memory) {
+ mem_max_allocated_memory = mem_current_allocated_memory;
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ /* In the debug version set the buffer to a random
+ combination of 0xBA and 0xBE */
+
+ mem_init_buf(usr_buf, n);
+}
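+
+/* Illustrative diagram (not live code): the accessors above implement
+the following layout around each allocation in the debug version:
+
+	| len | check |  user data (len bytes)  | check |
+	'---- header ----'                       trailer
+
+Both check fields hold the same random number, so a buffer overrun or
+corruption is detected when they no longer match (see
+mem_heap_validate_or_print() below). */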
+
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /*!< in: memory field */
+ ulint n __attribute__((unused)))
+ /*!< in: how many bytes the user requested */
+{
+ byte* usr_buf;
+
+ usr_buf = buf + MEM_FIELD_HEADER_SIZE;
+
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= n;
+ mutex_exit(&mem_hash_mutex);
+
+ /* Check that the field lengths agree */
+ ut_ad(n == (ulint) mem_field_header_get_len(usr_buf));
+
+ /* In the debug version, set the freed space to a random
+ combination of 0xDE and 0xAD */
+
+ mem_erase_buf(buf, MEM_SPACE_NEEDED(n));
+}
+
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n) /*!< in: length of buffer */
+{
+ byte* ptr;
+
+ UNIV_MEM_ASSERT_W(buf, n);
+
+ for (ptr = buf; ptr < buf + n; ptr++) {
+
+ if (ut_rnd_gen_ibool()) {
+ *ptr = 0xBA;
+ } else {
+ *ptr = 0xBE;
+ }
+ }
+
+ UNIV_MEM_INVALID(buf, n);
+}
+
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n) /*!< in: length of buffer */
+{
+ byte* ptr;
+
+ UNIV_MEM_ASSERT_W(buf, n);
+
+ for (ptr = buf; ptr < buf + n; ptr++) {
+ if (ut_rnd_gen_ibool()) {
+ *ptr = 0xDE;
+ } else {
+ *ptr = 0xAD;
+ }
+ }
+
+ UNIV_MEM_FREE(buf, n);
+}
+
+/***************************************************************//**
+Inserts a created memory heap to the hash table of current allocated
+memory heaps. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /*!< in: the created heap */
+ const char* file_name, /*!< in: file name of creation */
+ ulint line) /*!< in: line where created */
+{
+ mem_hash_node_t* new_node;
+	ulint		cell_no;
+
+ ut_ad(mem_heap_check(heap));
+
+ mutex_enter(&mem_hash_mutex);
+
+ cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
+
+ /* Allocate a new node to the list */
+ new_node = static_cast<mem_hash_node_t*>(ut_malloc(sizeof(*new_node)));
+
+ new_node->heap = heap;
+ new_node->file_name = file_name;
+ new_node->line = line;
+ new_node->nth_heap = mem_n_created_heaps;
+
+ /* Insert into lists */
+ UT_LIST_ADD_FIRST(list, *mem_hash_get_nth_cell(cell_no), new_node);
+
+ UT_LIST_ADD_LAST(all_list, mem_all_list_base, new_node);
+
+ mem_n_created_heaps++;
+
+ mutex_exit(&mem_hash_mutex);
+}
+
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps, and subtracts from the allocation
+accounting the memory in bytes that was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /*!< in: the heap to be freed */
+ const char* file_name, /*!< in: file name of freeing */
+ ulint line) /*!< in: line where freed */
+{
+ mem_hash_node_t* node;
+ ulint cell_no;
+ ibool error;
+ ulint size;
+
+ ut_ad(mem_heap_check(heap));
+
+ mutex_enter(&mem_hash_mutex);
+
+ cell_no = ut_hash_ulint((ulint) heap, MEM_HASH_SIZE);
+
+ /* Look for the heap in the hash table list */
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no));
+
+ while (node != NULL) {
+ if (node->heap == heap) {
+
+ break;
+ }
+
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+
+ if (node == NULL) {
+ fprintf(stderr,
+ "Memory heap or buffer freed in %s line %lu"
+ " did not exist.\n",
+ innobase_basename(file_name), (ulong) line);
+ ut_error;
+ }
+
+ /* Remove from lists */
+ UT_LIST_REMOVE(list, *mem_hash_get_nth_cell(cell_no), node);
+
+ UT_LIST_REMOVE(all_list, mem_all_list_base, node);
+
+ /* Validate the heap which will be freed */
+ mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size,
+ NULL, NULL);
+ if (error) {
+ fprintf(stderr,
+ "Inconsistency in memory heap or"
+ " buffer n:o %lu created\n"
+ "in %s line %lu and tried to free in %s line %lu.\n"
+ "Hex dump of 400 bytes around memory heap"
+ " first block start:\n",
+ node->nth_heap,
+ innobase_basename(node->file_name), (ulong) node->line,
+ innobase_basename(file_name), (ulong) line);
+ ut_print_buf(stderr, (byte*) node->heap - 200, 400);
+ fputs("\nDump of the mem heap:\n", stderr);
+ mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
+ &size, NULL, NULL);
+ ut_error;
+ }
+
+ /* Free the memory occupied by the node struct */
+ ut_free(node);
+
+ mem_current_allocated_memory -= size;
+
+ mutex_exit(&mem_hash_mutex);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ byte* top __attribute__((unused)),
+ /*!< in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /*!< in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /*!< out: TRUE if error */
+ ulint* us_size,/*!< out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+					non-debug version this is always 0 */
+ ulint* ph_size,/*!< out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks) /*!< out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+{
+ mem_block_t* block;
+ ulint total_len = 0;
+ ulint block_count = 0;
+ ulint phys_len = 0;
+#ifdef UNIV_MEM_DEBUG
+ ulint len;
+ byte* field;
+ byte* user_field;
+ ulint check_field;
+#endif
+
+ /* Pessimistically, we set the parameters to error values */
+ if (us_size != NULL) {
+ *us_size = 0;
+ }
+ if (ph_size != NULL) {
+ *ph_size = 0;
+ }
+ if (n_blocks != NULL) {
+ *n_blocks = 0;
+ }
+ *error = TRUE;
+
+ block = heap;
+
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ return;
+ }
+
+ if (print) {
+ fputs("Memory heap:", stderr);
+ }
+
+ while (block != NULL) {
+ phys_len += mem_block_get_len(block);
+
+ if ((block->type == MEM_HEAP_BUFFER)
+ && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: mem block %p"
+ " length %lu > UNIV_PAGE_SIZE\n",
+ (void*) block,
+ (ulong) mem_block_get_len(block));
+ /* error */
+
+ return;
+ }
+
+#ifdef UNIV_MEM_DEBUG
+ /* We can trace the fields of the block only in the debug
+ version */
+ if (print) {
+			fprintf(stderr, " Block %lu:", (ulong) block_count);
+ }
+
+ field = (byte*) block + mem_block_get_start(block);
+
+ if (top && (field == top)) {
+
+ goto completed;
+ }
+
+ while (field < (byte*) block + mem_block_get_free(block)) {
+
+ /* Calculate the pointer to the storage
+ which was given to the user */
+
+ user_field = field + MEM_FIELD_HEADER_SIZE;
+
+ len = mem_field_header_get_len(user_field);
+
+ if (print) {
+ ut_print_buf(stderr, user_field, len);
+ putc('\n', stderr);
+ }
+
+ total_len += len;
+ check_field = mem_field_header_get_check(user_field);
+
+ if (check_field
+ != mem_field_trailer_get_check(user_field)) {
+ /* error */
+
+ fprintf(stderr,
+ "InnoDB: Error: block %lx mem"
+ " field %lx len %lu\n"
+ "InnoDB: header check field is"
+ " %lx but trailer %lx\n",
+ (ulint) block,
+ (ulint) field, len, check_field,
+ mem_field_trailer_get_check(
+ user_field));
+
+ return;
+ }
+
+ /* Move to next field */
+ field = field + MEM_SPACE_NEEDED(len);
+
+ if (top && (field == top)) {
+
+ goto completed;
+ }
+
+ }
+
+ /* At the end check that we have arrived to the first free
+ position */
+
+ if (field != (byte*) block + mem_block_get_free(block)) {
+ /* error */
+
+ fprintf(stderr,
+ "InnoDB: Error: block %lx end of"
+ " mem fields %lx\n"
+ "InnoDB: but block free at %lx\n",
+ (ulint) block, (ulint) field,
+ (ulint)((byte*) block
+ + mem_block_get_free(block)));
+
+ return;
+ }
+
+#endif
+
+ block = UT_LIST_GET_NEXT(list, block);
+ block_count++;
+ }
+#ifdef UNIV_MEM_DEBUG
+completed:
+#endif
+ if (us_size != NULL) {
+ *us_size = total_len;
+ }
+ if (ph_size != NULL) {
+ *ph_size = phys_len;
+ }
+ if (n_blocks != NULL) {
+ *n_blocks = block_count;
+ }
+ *error = FALSE;
+}
+
+/**************************************************************//**
+Prints the contents of a memory heap. */
+static
+void
+mem_heap_print(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ibool error;
+ ulint us_size;
+ ulint phys_size;
+ ulint n_blocks;
+
+ ut_ad(mem_heap_check(heap));
+
+ mem_heap_validate_or_print(heap, NULL, TRUE, &error,
+ &us_size, &phys_size, &n_blocks);
+ fprintf(stderr,
+ "\nheap type: %lu; size: user size %lu;"
+ " physical size %lu; blocks %lu.\n",
+ (ulong) heap->type, (ulong) us_size,
+ (ulong) phys_size, (ulong) n_blocks);
+ ut_a(!error);
+}
+
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ibool error;
+ ulint us_size;
+ ulint phys_size;
+ ulint n_blocks;
+
+ ut_ad(mem_heap_check(heap));
+
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size,
+ &phys_size, &n_blocks);
+ if (error) {
+ mem_heap_print(heap);
+ }
+
+ ut_a(!error);
+
+ return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it).
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ut_a(heap->magic_n == MEM_BLOCK_MAGIC_N);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void)
+/*===============*/
+{
+ mem_hash_node_t* node;
+ ulint heap_count = 0;
+ ulint i;
+
+ mem_validate();
+
+ mutex_enter(&mem_hash_mutex);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i));
+ while (node != NULL) {
+ heap_count++;
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ if (heap_count == 0) {
+# ifndef UNIV_HOTBACKUP
+ ut_a(mem_pool_get_reserved(mem_comm_pool) == 0);
+# endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+ } else {
+ return(FALSE);
+ }
+}
+
+/*****************************************************************//**
+Validates the dynamic memory allocation system.
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void)
+/*========================*/
+{
+ mem_hash_node_t* node;
+ ulint n_heaps = 0;
+ ulint allocated_mem;
+ ulint ph_size;
+ ulint total_allocated_mem = 0;
+ ibool error = FALSE;
+ ulint n_blocks;
+ ulint i;
+
+# ifndef UNIV_HOTBACKUP
+ mem_pool_validate(mem_comm_pool);
+# endif /* !UNIV_HOTBACKUP */
+
+ mutex_enter(&mem_hash_mutex);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i));
+
+ while (node != NULL) {
+ n_heaps++;
+
+ mem_heap_validate_or_print(node->heap, NULL,
+ FALSE, &error,
+ &allocated_mem,
+ &ph_size, &n_blocks);
+
+ if (error) {
+ fprintf(stderr,
+ "\nERROR!!!!!!!!!!!!!!!!!!!"
+ "!!!!!!!!!!!!!!!!!!!!!!!\n\n"
+ "Inconsistency in memory heap"
+ " or buffer created\n"
+ "in %s line %lu.\n",
+ innobase_basename(node->file_name),
+ node->line);
+
+ mutex_exit(&mem_hash_mutex);
+
+ return(TRUE);
+ }
+
+ total_allocated_mem += allocated_mem;
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+ }
+
+ if ((n_heaps == 0) && (mem_current_allocated_memory != 0)) {
+ error = TRUE;
+ }
+
+ if (mem_total_allocated_memory < mem_current_allocated_memory) {
+ error = TRUE;
+ }
+
+ if (mem_max_allocated_memory > mem_total_allocated_memory) {
+ error = TRUE;
+ }
+
+ if (mem_n_created_heaps < n_heaps) {
+ error = TRUE;
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ return(error);
+}
+
+/************************************************************//**
+Validates the dynamic memory allocation system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void)
+/*==============*/
+{
+ ut_a(!mem_validate_no_assert());
+
+ return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr) /*!< in: pointer to place of possible corruption */
+{
+ byte* p;
+ ulint i;
+ ulint dist;
+
+ fputs("InnoDB: Apparent memory corruption: mem dump ", stderr);
+ ut_print_buf(stderr, (byte*) ptr - 250, 500);
+
+ fputs("\nInnoDB: Scanning backward trying to find"
+ " previous allocated mem blocks\n", stderr);
+
+ p = (byte*) ptr;
+ dist = 0;
+
+ for (i = 0; i < 10; i++) {
+ for (;;) {
+ if (((ulint) p) % 4 == 0) {
+
+ if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Mem block at - %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+
+ if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Freed mem block at - %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+ }
+
+ p--;
+ dist++;
+ }
+
+ p--;
+ dist++;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Scanning forward trying to find next"
+ " allocated mem blocks\n");
+
+ p = (byte*) ptr;
+ dist = 0;
+
+ for (i = 0; i < 10; i++) {
+ for (;;) {
+ if (((ulint) p) % 4 == 0) {
+
+ if (*((ulint*) p) == MEM_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Mem block at + %lu, file %s,"
+ " line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+
+ if (*((ulint*) p) == MEM_FREED_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Freed mem block at + %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+ }
+
+ p++;
+ dist++;
+ }
+
+ p++;
+ dist++;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated
+memory heaps or buffers. Can only be used in the debug version. */
+static
+void
+mem_print_info_low(
+/*===============*/
+ ibool print_all) /*!< in: if TRUE, all heaps are printed,
+ else only the heaps allocated after the
+ previous call of this function */
+{
+#ifdef UNIV_MEM_DEBUG
+ mem_hash_node_t* node;
+ ulint n_heaps = 0;
+ ulint allocated_mem;
+ ulint ph_size;
+ ulint total_allocated_mem = 0;
+ ibool error;
+ ulint n_blocks;
+#endif
+ FILE* outfile;
+
+ /* outfile = fopen("ibdebug", "a"); */
+
+ outfile = stdout;
+
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "________________________________________________________\n");
+ fprintf(outfile, "MEMORY ALLOCATION INFORMATION\n\n");
+
+#ifndef UNIV_MEM_DEBUG
+
+ UT_NOT_USED(print_all);
+
+ mem_pool_print_info(outfile, mem_comm_pool);
+
+ fprintf(outfile,
+ "Sorry, non-debug version cannot give more memory info\n");
+
+ /* fclose(outfile); */
+
+ return;
+#else
+ mutex_enter(&mem_hash_mutex);
+
+ fprintf(outfile, "LIST OF CREATED HEAPS AND ALLOCATED BUFFERS: \n\n");
+
+ if (!print_all) {
+ fprintf(outfile, "AFTER THE LAST PRINT INFO\n");
+ }
+
+ node = UT_LIST_GET_FIRST(mem_all_list_base);
+
+ while (node != NULL) {
+ n_heaps++;
+
+ if (!print_all && node->nth_heap < mem_last_print_info) {
+
+ goto next_heap;
+ }
+
+ mem_heap_validate_or_print(node->heap, NULL,
+ FALSE, &error, &allocated_mem,
+ &ph_size, &n_blocks);
+ total_allocated_mem += allocated_mem;
+
+ fprintf(outfile,
+ "%lu: file %s line %lu of size %lu phys.size %lu"
+ " with %lu blocks, type %lu\n",
+ node->nth_heap,
+ innobase_basename(node->file_name), node->line,
+ allocated_mem, ph_size, n_blocks,
+ (node->heap)->type);
+next_heap:
+ node = UT_LIST_GET_NEXT(all_list, node);
+ }
+
+ fprintf(outfile, "\n");
+
+ fprintf(outfile, "Current allocated memory : %lu\n",
+ mem_current_allocated_memory);
+ fprintf(outfile, "Current allocated heaps and buffers : %lu\n",
+ n_heaps);
+ fprintf(outfile, "Cumulative allocated memory : %lu\n",
+ mem_total_allocated_memory);
+ fprintf(outfile, "Maximum allocated memory : %lu\n",
+ mem_max_allocated_memory);
+ fprintf(outfile, "Cumulative created heaps and buffers : %lu\n",
+ mem_n_created_heaps);
+ fprintf(outfile, "Cumulative number of allocations : %lu\n",
+ mem_n_allocations);
+
+ mem_last_print_info = mem_n_created_heaps;
+
+ mutex_exit(&mem_hash_mutex);
+
+ mem_pool_print_info(outfile, mem_comm_pool);
+
+ /* mem_validate(); */
+
+ /* fclose(outfile); */
+#endif
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void)
+/*================*/
+{
+ mem_print_info_low(TRUE);
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void)
+/*====================*/
+{
+ mem_print_info_low(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc
new file mode 100644
index 00000000000..e066aff5b30
--- /dev/null
+++ b/storage/innobase/mem/mem0mem.cc
@@ -0,0 +1,583 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0mem.cc
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#ifdef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "mem0dbg.cc"
+#include <stdarg.h>
+
+/*
+ THE MEMORY MANAGEMENT
+ =====================
+
+The basic element of the memory management is called a memory
+heap. A memory heap is conceptually a
+stack from which memory can be allocated. The stack may grow infinitely.
+The top element of the stack may be freed, or
+the whole stack can be freed at one time. The advantage of the
+memory heap concept is that we can avoid using the malloc and free
+functions of C, which are quite expensive: for example, on the Solaris + GCC
+system (50 MHz Sparc, 1993) the pair takes 3 microseconds, and
+on Win NT + 100 MHz Pentium, 2.5 microseconds.
+When we use a memory heap,
+we can allocate larger blocks of memory at a time and thus
+reduce overhead. The method is slightly more efficient when we
+allocate the memory from the index page buffer pool, as we can
+claim a new page fast. This is called buffer allocation.
+When we allocate the memory from the dynamic memory of the
+C environment, that is called dynamic allocation.
+
+The default way of operation of the memory heap is the following.
+First, when the heap is created, an initial block of memory is
+allocated. In dynamic allocation this may be about 50 bytes.
+If more space is needed, additional blocks are allocated
+and they are put into a linked list.
+After the initial block, each allocated block is twice the size of the
+previous, until a threshold is attained, after which the sizes
+of the blocks stay the same. An exception is, of course, the case
+where the caller requests a memory buffer whose size is
+bigger than the threshold. In that case a block big enough must
+be allocated.
+
+The heap is physically arranged so that if the current block
+becomes full, a new block is allocated and always inserted in the
+chain of blocks as the last block.
+
+In the debug version of the memory management, all the allocated
+heaps are kept in a list (which is implemented as a hash table).
+Thus we can notice if the caller tries to free an already freed
+heap. In addition, each buffer given to the caller contains
+a start field at the start and a trailer field at the end of the buffer.
+
+The start field has the following content:
+A. sizeof(ulint) bytes of field length (in the standard byte order)
+B. sizeof(ulint) bytes of check field (a random number)
+
+The trailer field contains:
+A. sizeof(ulint) bytes of check field (the same random number as at the start)
+
+Thus we can notice if something has been copied over the
+borders of the buffer, which is illegal.
+The memory in the buffers is initialized to a random byte sequence.
+After freeing, all the blocks in the heap are set to random bytes
+to help us discover errors which result from the use of
+buffers in an already freed heap. */
+
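+/* A minimal usage sketch of the heap interface described above. This
+is illustrative only; it assumes the public mem0mem.h API
+(mem_heap_create(), mem_heap_alloc(), mem_heap_free()) and
+mem_heap_strdup() defined later in this file:
+
+	mem_heap_t*	heap = mem_heap_create(100);
+	byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, 40));
+	char*	copy = mem_heap_strdup(heap, "hello");
+
+	(no individual frees are needed: the single call below releases
+	buf, copy and every other allocation made from the heap)
+
+	mem_heap_free(heap);
+*/
+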
+#ifdef MEM_PERIODIC_CHECK
+
+ibool mem_block_list_inited;
+/* List of all mem blocks allocated; protected by the mem_comm_pool mutex */
+UT_LIST_BASE_NODE_T(mem_block_t) mem_block_list;
+
+#endif
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str) /*!< in: string to be copied */
+{
+ return(static_cast<char*>(mem_heap_dup(heap, str, strlen(str) + 1)));
+}
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+ mem_heap_t* heap, /*!< in: memory heap where copy is allocated */
+ const void* data, /*!< in: data to be copied */
+ ulint len) /*!< in: length of data, in bytes */
+{
+ return(memcpy(mem_heap_alloc(heap, len), data, len));
+}
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2) /*!< in: string 2 */
+{
+ char* s;
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+
+ s = static_cast<char*>(mem_heap_alloc(heap, s1_len + s2_len + 1));
+
+ memcpy(s, s1, s1_len);
+ memcpy(s + s1_len, s2, s2_len);
+
+ s[s1_len + s2_len] = '\0';
+
+ return(s);
+}
+
+
+/****************************************************************//**
+Helper function for mem_heap_printf.
+@return length of formatted string, including terminating NUL */
+static
+ulint
+mem_heap_printf_low(
+/*================*/
+ char* buf, /*!< in/out: buffer to store formatted string
+ in, or NULL to just calculate length */
+ const char* format, /*!< in: format string */
+ va_list ap) /*!< in: arguments */
+{
+ ulint len = 0;
+
+ while (*format) {
+
+ /* Does this format specifier have the 'l' length modifier. */
+ ibool is_long = FALSE;
+
+ /* Length of one parameter. */
+ size_t plen;
+
+ if (*format++ != '%') {
+ /* Non-format character. */
+
+ len++;
+
+ if (buf) {
+ *buf++ = *(format - 1);
+ }
+
+ continue;
+ }
+
+ if (*format == 'l') {
+ is_long = TRUE;
+ format++;
+ }
+
+ switch (*format++) {
+ case 's':
+ /* string */
+ {
+ char* s = va_arg(ap, char*);
+
+			/* "%ls" is a nonsensical format specifier. */
+ ut_a(!is_long);
+
+ plen = strlen(s);
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, s, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case 'u':
+ /* unsigned int */
+ {
+ char tmp[32];
+ unsigned long val;
+
+ /* We only support 'long' values for now. */
+ ut_a(is_long);
+
+ val = va_arg(ap, unsigned long);
+
+ plen = sprintf(tmp, "%lu", val);
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, tmp, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case '%':
+
+		/* "%l%" is a nonsensical format specifier. */
+ ut_a(!is_long);
+
+ len++;
+
+ if (buf) {
+ *buf++ = '%';
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ /* For the NUL character. */
+ len++;
+
+ if (buf) {
+ *buf = '\0';
+ }
+
+ return(len);
+}
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...)
+{
+ va_list ap;
+ char* str;
+ ulint len;
+
+ /* Calculate length of string */
+ len = 0;
+ va_start(ap, format);
+ len = mem_heap_printf_low(NULL, format, ap);
+ va_end(ap);
+
+ /* Now create it for real. */
+ str = static_cast<char*>(mem_heap_alloc(heap, len));
+ va_start(ap, format);
+ mem_heap_printf_low(str, format, ap);
+ va_end(ap);
+
+ return(str);
+}
+
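+/* A minimal usage sketch of the function above (name and n_rows are
+placeholder variables); only the specifiers implemented by
+mem_heap_printf_low() ('%s', '%lu' and '%%') may be used, and the
+result is freed together with the heap, not separately:
+
+	char*	msg = mem_heap_printf(heap, "table %s has %lu rows",
+				      name, (ulong) n_rows);
+*/
+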
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if it did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+ const char* file_name,/*!< in: file name where created */
+ ulint line, /*!< in: line where created */
+#endif /* UNIV_DEBUG */
+ ulint type) /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+{
+#ifndef UNIV_HOTBACKUP
+ buf_block_t* buf_block = NULL;
+#endif /* !UNIV_HOTBACKUP */
+ mem_block_t* block;
+ ulint len;
+
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ if (heap && heap->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(heap);
+ }
+
+ /* In dynamic allocation, calculate the size: block header + data. */
+ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+#ifndef UNIV_HOTBACKUP
+ if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+
+ ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+ block = static_cast<mem_block_t*>(
+ mem_area_alloc(&len, mem_comm_pool));
+ } else {
+ len = UNIV_PAGE_SIZE;
+
+ if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+ /* We cannot allocate the block from the
+ buffer pool, but must get the free block from
+ the heap header free block field */
+
+ buf_block = static_cast<buf_block_t*>(heap->free_block);
+ heap->free_block = NULL;
+
+ if (UNIV_UNLIKELY(!buf_block)) {
+
+ return(NULL);
+ }
+ } else {
+ buf_block = buf_block_alloc(NULL);
+ }
+
+ block = (mem_block_t*) buf_block->frame;
+ }
+
+	if (!block) {
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"Unable to allocate memory of size %lu.",
+			(ulong) len);
+	}
+ block->buf_block = buf_block;
+ block->free_block = NULL;
+#else /* !UNIV_HOTBACKUP */
+ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+ block = ut_malloc(len);
+ ut_ad(block);
+#endif /* !UNIV_HOTBACKUP */
+
+ block->magic_n = MEM_BLOCK_MAGIC_N;
+ ut_d(ut_strlcpy_rev(block->file_name, file_name,
+ sizeof(block->file_name)));
+ ut_d(block->line = line);
+
+#ifdef MEM_PERIODIC_CHECK
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ if (!mem_block_list_inited) {
+ mem_block_list_inited = TRUE;
+ UT_LIST_INIT(mem_block_list);
+ }
+
+ UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block);
+
+ mutex_exit(&(mem_comm_pool->mutex));
+#endif
+ mem_block_set_len(block, len);
+ mem_block_set_type(block, type);
+ mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+ mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(heap == NULL)) {
+ /* This is the first block of the heap. The field
+ total_size should be initialized here */
+ block->total_size = len;
+ } else {
+ /* Not the first allocation for the heap. This block's
+ total_length field should be set to undefined. */
+ ut_d(block->total_size = ULINT_UNDEFINED);
+ UNIV_MEM_INVALID(&block->total_size,
+ sizeof block->total_size);
+
+ heap->total_size += len;
+ }
+
+ ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+ return(block);
+}
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if it did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes user needs */
+{
+ mem_block_t* block;
+ mem_block_t* new_block;
+ ulint new_size;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* We have to allocate a new block. The size is always at least
+ doubled until the standard size is reached. After that the size
+ stays the same, except in cases where the caller needs more space. */
+
+ new_size = 2 * mem_block_get_len(block);
+
+ if (heap->type != MEM_HEAP_DYNAMIC) {
+ /* From the buffer pool we allocate buffer frames */
+ ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+ if (new_size > MEM_MAX_ALLOC_IN_BUF) {
+ new_size = MEM_MAX_ALLOC_IN_BUF;
+ }
+ } else if (new_size > MEM_BLOCK_STANDARD_SIZE) {
+
+ new_size = MEM_BLOCK_STANDARD_SIZE;
+ }
+
+ if (new_size < n) {
+ new_size = n;
+ }
+
+ new_block = mem_heap_create_block(heap, new_size, heap->type,
+ heap->file_name, heap->line);
+ if (new_block == NULL) {
+
+ return(NULL);
+ }
+
+ /* Add the new block as the last block */
+
+ UT_LIST_INSERT_AFTER(list, heap->base, block, new_block);
+
+ return(new_block);
+}
+
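+/* A worked example of the sizing rule above, for a MEM_HEAP_DYNAMIC
+heap (the concrete standard size is defined in mem0mem.h; 8000 bytes
+is used here for illustration only): if the last block is 1024 bytes
+long, subsequent blocks are 2048, 4096, 8000, 8000, ... bytes, unless
+a single request n exceeds the doubled size, in which case the new
+block is sized to fit that request exactly. */
+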
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block) /*!< in: block to free */
+{
+ ulint type;
+ ulint len;
+#ifndef UNIV_HOTBACKUP
+ buf_block_t* buf_block;
+
+ buf_block = static_cast<buf_block_t*>(block->buf_block);
+#endif /* !UNIV_HOTBACKUP */
+
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(block);
+ }
+
+ UT_LIST_REMOVE(list, heap->base, block);
+
+#ifdef MEM_PERIODIC_CHECK
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ UT_LIST_REMOVE(mem_block_list, mem_block_list, block);
+
+ mutex_exit(&(mem_comm_pool->mutex));
+#endif
+
+ ut_ad(heap->total_size >= block->len);
+ heap->total_size -= block->len;
+
+ type = heap->type;
+ len = block->len;
+ block->magic_n = MEM_FREED_BLOCK_MAGIC_N;
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_use_sys_malloc) {
+#ifdef UNIV_MEM_DEBUG
+ /* In the debug version we set the memory to a random
+ combination of hex 0xDE and 0xAD. */
+
+ mem_erase_buf((byte*) block, len);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+
+ }
+ if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+
+ ut_ad(!buf_block);
+ mem_area_free(block, mem_comm_pool);
+ } else {
+ ut_ad(type & MEM_HEAP_BUFFER);
+
+ buf_block_free(buf_block);
+ }
+#else /* !UNIV_HOTBACKUP */
+#ifdef UNIV_MEM_DEBUG
+ /* In the debug version we set the memory to a random
+ combination of hex 0xDE and 0xAD. */
+
+ mem_erase_buf((byte*) block, len);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+ ut_free(block);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ if (UNIV_LIKELY_NULL(heap->free_block)) {
+
+ buf_block_free(static_cast<buf_block_t*>(heap->free_block));
+
+ heap->free_block = NULL;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void)
+/*=========================*/
+{
+ mem_block_t* block;
+
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ block = UT_LIST_GET_FIRST(mem_block_list);
+
+ while (block) {
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(block);
+ }
+
+ block = UT_LIST_GET_NEXT(mem_block_list, block);
+ }
+
+ mutex_exit(&(mem_comm_pool->mutex));
+}
+#endif
diff --git a/storage/innobase/mem/mem0pool.cc b/storage/innobase/mem/mem0pool.cc
new file mode 100644
index 00000000000..fe9a84d21fa
--- /dev/null
+++ b/storage/innobase/mem/mem0pool.cc
@@ -0,0 +1,727 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0pool.cc
+The lowest-level memory management
+
+Created 5/12/1997 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0pool.h"
+#ifdef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "srv0start.h"
+
+/* We would also like to use the buffer frames to allocate memory. This
+would be desirable, because then the memory consumption of the database
+would be fixed, and we might even lock the buffer pool into main memory.
+The problem here is that the buffer management routines can themselves call
+memory allocation, while the buffer pool mutex is reserved.
+
+The main components of the memory consumption are:
+
+1. buffer pool,
+2. parsed and optimized SQL statements,
+3. data dictionary cache,
+4. log buffer,
+5. locks for each transaction,
+6. hash table for the adaptive index,
+7. state and buffers for each SQL query currently being executed,
+8. session for each user, and
+9. stack for each OS thread.
+
+Items 1 and 2 are managed by an LRU algorithm. Items 5 and 6 can potentially
+consume a great deal of memory. Items 7 and 8 should consume quite little
+memory, and the OS should take care of item 9, which should also consume
+little memory.
+
+A solution to the memory management:
+
+1. the buffer pool size is set separately;
+2. log buffer size is set separately;
+3. the common pool size for all the other entries, except 8, is set separately.
+
+Problems: we may waste memory if the common pool is set too big. Another
+problem is the locks, which may take a great deal of space in big
+transactions; the common pool size would then have to be set very big.
+We can allow locks to take
+space from the buffer pool, but the SQL optimizer is then unaware of the
+usable size of the buffer pool. We could also combine the objects in the
+common pool and the buffers in the buffer pool into a single LRU list and
+manage it uniformly, but this approach does not take into account the parsing
+and other costs unique to SQL statements.
+
+The locks for a transaction can be seen as a part of the state of the
+transaction. Hence, they should be stored in the common pool. We still
+have the problem of a very big update transaction, for example, which
+will set very many x-locks on rows, and the locks will consume a lot
+of memory, say, half of the buffer pool size.
+
+Another problem is what to do if we are not able to malloc a requested
+block of memory from the common pool. Then we can request memory from
+the operating system. If it does not help, a system error results.
+
+Because 5 and 6 may potentially consume very much memory, we let them grow
+into the buffer pool. We may let the locks of a transaction take frames
+from the buffer pool, when the corresponding memory heap block has grown to
+the size of a buffer frame. Similarly for the hash node cells of the locks,
+and for the adaptive index. Thus, for each individual transaction, its locks
+can occupy at most about the size of the buffer frame of memory in the common
+pool, and after that its locks will grow into the buffer pool. */
+
+/** Mask used to extract the free bit from area->size */
+#define MEM_AREA_FREE 1
+
+/** The smallest memory area total size */
+#define MEM_AREA_MIN_SIZE (2 * MEM_AREA_EXTRA_SIZE)
+
+
+/** Data structure for a memory pool. The space is allocated using the buddy
+algorithm, where free list i contains areas of size 2 to the power i. */
+struct mem_pool_t{
+ byte* buf; /*!< memory pool */
+ ulint size; /*!< memory common pool size */
+ ulint reserved; /*!< amount of currently allocated
+ memory */
+ ib_mutex_t mutex; /*!< mutex protecting this struct */
+ UT_LIST_BASE_NODE_T(mem_area_t)
+ free_list[64]; /*!< lists of free memory areas: an
+ area is put to the list whose number
+ is the 2-logarithm of the area size */
+};
+
+/** The common memory pool */
+UNIV_INTERN mem_pool_t* mem_comm_pool = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register mutex in mem_pool_t with performance schema */
+UNIV_INTERN mysql_pfs_key_t mem_pool_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* We use this counter to check that the mem pool mutex does not leak;
+this is to track a strange assertion failure reported at
+mysql@lists.mysql.com */
+
+UNIV_INTERN ulint mem_n_threads_inside = 0;
+
+/********************************************************************//**
+Reserves the mem pool mutex if we are not in server shutdown. Use
+this function only in memory free functions, since only memory
+free functions are used during server shutdown. */
+UNIV_INLINE
+void
+mem_pool_mutex_enter(
+/*=================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) {
+ mutex_enter(&(pool->mutex));
+ }
+}
+
+/********************************************************************//**
+Releases the mem pool mutex if we are not in server shutdown. Like
+its corresponding mem_pool_mutex_enter() function, use it only
+in memory free functions. */
+UNIV_INLINE
+void
+mem_pool_mutex_exit(
+/*================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) {
+ mutex_exit(&(pool->mutex));
+ }
+}
+
+/********************************************************************//**
+Returns memory area size.
+@return size */
+UNIV_INLINE
+ulint
+mem_area_get_size(
+/*==============*/
+ mem_area_t* area) /*!< in: area */
+{
+ return(area->size_and_free & ~MEM_AREA_FREE);
+}
+
+/********************************************************************//**
+Sets memory area size. */
+UNIV_INLINE
+void
+mem_area_set_size(
+/*==============*/
+ mem_area_t* area, /*!< in: area */
+ ulint size) /*!< in: size */
+{
+ area->size_and_free = (area->size_and_free & MEM_AREA_FREE)
+ | size;
+}
+
+/********************************************************************//**
+Returns memory area free bit.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+mem_area_get_free(
+/*==============*/
+ mem_area_t* area) /*!< in: area */
+{
+#if TRUE != MEM_AREA_FREE
+# error "TRUE != MEM_AREA_FREE"
+#endif
+ return(area->size_and_free & MEM_AREA_FREE);
+}
+
+/********************************************************************//**
+Sets memory area free bit. */
+UNIV_INLINE
+void
+mem_area_set_free(
+/*==============*/
+ mem_area_t* area, /*!< in: area */
+ ibool free) /*!< in: free bit value */
+{
+#if TRUE != MEM_AREA_FREE
+# error "TRUE != MEM_AREA_FREE"
+#endif
+ area->size_and_free = (area->size_and_free & ~MEM_AREA_FREE)
+ | free;
+}
+
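+/* The packing above relies on every area size being a power of two
+that is at least MEM_AREA_MIN_SIZE, so bit 0 of the size is always
+zero and is free to carry the flag. Illustrative values:
+
+	size 512, free bit set	 -> size_and_free == 0x201
+	size 512, free bit clear -> size_and_free == 0x200
+*/
+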
+/********************************************************************//**
+Creates a memory pool.
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size) /*!< in: pool size in bytes */
+{
+ mem_pool_t* pool;
+ mem_area_t* area;
+ ulint i;
+ ulint used;
+
+ pool = static_cast<mem_pool_t*>(ut_malloc(sizeof(mem_pool_t)));
+
+ pool->buf = static_cast<byte*>(ut_malloc_low(size, TRUE));
+ pool->size = size;
+
+ mutex_create(mem_pool_mutex_key, &pool->mutex, SYNC_MEM_POOL);
+
+ /* Initialize the free lists */
+
+ for (i = 0; i < 64; i++) {
+
+ UT_LIST_INIT(pool->free_list[i]);
+ }
+
+ used = 0;
+
+ while (size - used >= MEM_AREA_MIN_SIZE) {
+
+ i = ut_2_log(size - used);
+
+ if (ut_2_exp(i) > size - used) {
+
+ /* ut_2_log rounds upward */
+
+ i--;
+ }
+
+ area = (mem_area_t*)(pool->buf + used);
+
+ mem_area_set_size(area, ut_2_exp(i));
+ mem_area_set_free(area, TRUE);
+ UNIV_MEM_FREE(MEM_AREA_EXTRA_SIZE + (byte*) area,
+ ut_2_exp(i) - MEM_AREA_EXTRA_SIZE);
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);
+
+ used = used + ut_2_exp(i);
+ }
+
+ ut_ad(size >= used);
+
+ pool->reserved = 0;
+
+ return(pool);
+}
+
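+/* A worked example of the carving loop above, with illustrative
+numbers (it assumes MEM_AREA_MIN_SIZE <= 8, which need not hold for
+the real MEM_AREA_EXTRA_SIZE): for size = 1000, the loop creates free
+areas of 512, 256, 128, 64, 32 and 8 bytes, in that order, so that
+used == 1000; each area is inserted into free_list[ut_2_log(area
+size)]. */
+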
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+ mem_pool_t* pool) /*!< in, own: memory pool */
+{
+ ut_free(pool->buf);
+ ut_free(pool);
+}
+
+/********************************************************************//**
+Fills the specified free list.
+@return TRUE if we were able to insert a block to the free list */
+static
+ibool
+mem_pool_fill_free_list(
+/*====================*/
+ ulint i, /*!< in: free list index */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* area2;
+ ibool ret;
+
+ ut_ad(mutex_own(&(pool->mutex)));
+
+ if (UNIV_UNLIKELY(i >= 63)) {
+ /* We come here when we have run out of space in the
+		memory pool. */
+
+ return(FALSE);
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+
+ if (area == NULL) {
+ if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: mem pool free list %lu"
+ " length is %lu\n"
+ "InnoDB: though the list is empty!\n",
+ (ulong) i + 1,
+ (ulong)
+ UT_LIST_GET_LEN(pool->free_list[i + 1]));
+ }
+
+ ret = mem_pool_fill_free_list(i + 1, pool);
+
+ if (ret == FALSE) {
+
+ return(FALSE);
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+ }
+
+ if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) {
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);
+
+ area2 = (mem_area_t*)(((byte*) area) + ut_2_exp(i));
+ UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);
+
+ mem_area_set_size(area2, ut_2_exp(i));
+ mem_area_set_free(area2, TRUE);
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2);
+
+ mem_area_set_size(area, ut_2_exp(i));
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ ulint size;
+ ulint n;
+ ibool ret;
+
+	/* If we are using the OS allocator, just make a simple call
+	to malloc */
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ return(malloc(*psize));
+ }
+
+ size = *psize;
+ n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE));
+
+ mutex_enter(&(pool->mutex));
+ mem_n_threads_inside++;
+
+ ut_a(mem_n_threads_inside == 1);
+
+ area = UT_LIST_GET_FIRST(pool->free_list[n]);
+
+ if (area == NULL) {
+ ret = mem_pool_fill_free_list(n, pool);
+
+ if (ret == FALSE) {
+ /* Out of memory in memory pool: we try to allocate
+ from the operating system with the regular malloc: */
+
+ mem_n_threads_inside--;
+ mutex_exit(&(pool->mutex));
+
+ return(ut_malloc(size));
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[n]);
+ }
+
+ if (!mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Error: Removing element from mem pool"
+ " free list %lu though the\n"
+ "InnoDB: element is not marked free!\n",
+ (ulong) n);
+
+ mem_analyze_corruption(area);
+
+ /* Try to analyze a strange assertion failure reported at
+ mysql@lists.mysql.com where the free bit IS 1 in the
+ hex dump above */
+
+ if (mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Probably a race condition"
+ " because now the area is marked free!\n");
+ }
+
+ ut_error;
+ }
+
+ if (UT_LIST_GET_LEN(pool->free_list[n]) == 0) {
+ fprintf(stderr,
+ "InnoDB: Error: Removing element from mem pool"
+ " free list %lu\n"
+ "InnoDB: though the list length is 0!\n",
+ (ulong) n);
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+
+ ut_ad(mem_area_get_size(area) == ut_2_exp(n));
+
+ mem_area_set_free(area, FALSE);
+
+ UT_LIST_REMOVE(free_list, pool->free_list[n], area);
+
+ pool->reserved += mem_area_get_size(area);
+
+ mem_n_threads_inside--;
+ mutex_exit(&(pool->mutex));
+
+ ut_ad(mem_pool_validate(pool));
+
+ *psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE;
+ UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*) area, *psize);
+
+ return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*) area)));
+}
+
+/********************************************************************//**
+Gets the buddy of an area, if it exists in pool.
+@return the buddy, NULL if no buddy in pool */
+UNIV_INLINE
+mem_area_t*
+mem_area_get_buddy(
+/*===============*/
+ mem_area_t* area, /*!< in: memory area */
+ ulint size, /*!< in: memory area size */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* buddy;
+
+ ut_ad(size != 0);
+
+ if (((((byte*) area) - pool->buf) % (2 * size)) == 0) {
+
+ /* The buddy is in a higher address */
+
+ buddy = (mem_area_t*)(((byte*) area) + size);
+
+ if ((((byte*) buddy) - pool->buf) + size > pool->size) {
+
+ /* The buddy is not wholly contained in the pool:
+ there is no buddy */
+
+ buddy = NULL;
+ }
+ } else {
+		/* The buddy is in a lower address; NOTE that area cannot
+		be at the pool lower end, because then we would have ended
+		up in the upper branch of this if-clause: the remainder
+		would be 0 */
+
+ buddy = (mem_area_t*)(((byte*) area) - size);
+ }
+
+ return(buddy);
+}
+
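+/* A worked example of the address arithmetic above, treating
+pool->buf as offset 0: for size = 1024, an area at offset 2048
+satisfies 2048 % 2048 == 0, so its buddy is the higher area at offset
+3072; the area at offset 3072 gives 3072 % 2048 == 1024 != 0, so its
+buddy is the lower area at offset 2048. Merging the pair always
+yields a 2048-byte area aligned on a 2048-byte boundary. */
+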
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* buddy;
+ void* new_ptr;
+ ulint size;
+ ulint n;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ free(ptr);
+
+ return;
+ }
+
+ /* It may be that the area was really allocated from the OS with
+ regular malloc: check if ptr points within our memory pool */
+
+ if ((byte*) ptr < pool->buf || (byte*) ptr >= pool->buf + pool->size) {
+ ut_free(ptr);
+
+ return;
+ }
+
+ area = (mem_area_t*) (((byte*) ptr) - MEM_AREA_EXTRA_SIZE);
+
+ if (mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Error: Freeing element to mem pool"
+ " free list though the\n"
+ "InnoDB: element is marked free!\n");
+
+ mem_analyze_corruption(area);
+ ut_error;
+ }
+
+ size = mem_area_get_size(area);
+ UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE);
+
+ if (size == 0) {
+ fprintf(stderr,
+ "InnoDB: Error: Mem area size is 0. Possibly a"
+ " memory overrun of the\n"
+ "InnoDB: previous allocated area!\n");
+
+ mem_analyze_corruption(area);
+ ut_error;
+ }
+
+#ifdef UNIV_LIGHT_MEM_DEBUG
+ if (((byte*) area) + size < pool->buf + pool->size) {
+
+ ulint next_size;
+
+ next_size = mem_area_get_size(
+ (mem_area_t*)(((byte*) area) + size));
+ if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
+ fprintf(stderr,
+ "InnoDB: Error: Memory area size %lu,"
+ " next area size %lu not a power of 2!\n"
+ "InnoDB: Possibly a memory overrun of"
+ " the buffer being freed here.\n",
+ (ulong) size, (ulong) next_size);
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+ }
+#endif
+ buddy = mem_area_get_buddy(area, size, pool);
+
+ n = ut_2_log(size);
+
+ mem_pool_mutex_enter(pool);
+ mem_n_threads_inside++;
+
+ ut_a(mem_n_threads_inside == 1);
+
+ if (buddy && mem_area_get_free(buddy)
+ && (size == mem_area_get_size(buddy))) {
+
+ /* The buddy is in a free list */
+
+ if ((byte*) buddy < (byte*) area) {
+ new_ptr = ((byte*) buddy) + MEM_AREA_EXTRA_SIZE;
+
+ mem_area_set_size(buddy, 2 * size);
+ mem_area_set_free(buddy, FALSE);
+ } else {
+ new_ptr = ptr;
+
+ mem_area_set_size(area, 2 * size);
+ }
+
+ /* Remove the buddy from its free list and merge it to area */
+
+ UT_LIST_REMOVE(free_list, pool->free_list[n], buddy);
+
+ pool->reserved += ut_2_exp(n);
+
+ mem_n_threads_inside--;
+ mem_pool_mutex_exit(pool);
+
+ mem_area_free(new_ptr, pool);
+
+ return;
+ } else {
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area);
+
+ mem_area_set_free(area, TRUE);
+
+ ut_ad(pool->reserved >= size);
+
+ pool->reserved -= size;
+ }
+
+ mem_n_threads_inside--;
+ mem_pool_mutex_exit(pool);
+
+ ut_ad(mem_pool_validate(pool));
+}
+
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* buddy;
+ ulint free;
+ ulint i;
+
+ mem_pool_mutex_enter(pool);
+
+ free = 0;
+
+ for (i = 0; i < 64; i++) {
+
+ UT_LIST_CHECK(free_list, mem_area_t, pool->free_list[i]);
+
+ for (area = UT_LIST_GET_FIRST(pool->free_list[i]);
+ area != 0;
+ area = UT_LIST_GET_NEXT(free_list, area)) {
+
+ ut_a(mem_area_get_free(area));
+ ut_a(mem_area_get_size(area) == ut_2_exp(i));
+
+ buddy = mem_area_get_buddy(area, ut_2_exp(i), pool);
+
+ ut_a(!buddy || !mem_area_get_free(buddy)
+ || (ut_2_exp(i) != mem_area_get_size(buddy)));
+
+ free += ut_2_exp(i);
+ }
+ }
+
+ ut_a(free + pool->reserved == pool->size);
+
+ mem_pool_mutex_exit(pool);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ ulint i;
+
+ mem_pool_validate(pool);
+
+ fprintf(outfile, "INFO OF A MEMORY POOL\n");
+
+ mutex_enter(&(pool->mutex));
+
+ for (i = 0; i < 64; i++) {
+ if (UT_LIST_GET_LEN(pool->free_list[i]) > 0) {
+
+ fprintf(outfile,
+ "Free list length %lu for"
+ " blocks of size %lu\n",
+ (ulong) UT_LIST_GET_LEN(pool->free_list[i]),
+ (ulong) ut_2_exp(i));
+ }
+ }
+
+ fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size,
+ (ulong) pool->reserved);
+ mutex_exit(&(pool->mutex));
+}
+
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ ulint reserved;
+
+ mutex_enter(&(pool->mutex));
+
+ reserved = pool->reserved;
+
+ mutex_exit(&(pool->mutex));
+
+ return(reserved);
+}
diff --git a/storage/innobase/mtr/mtr0log.cc b/storage/innobase/mtr/mtr0log.cc
new file mode 100644
index 00000000000..5335cb4c9ef
--- /dev/null
+++ b/storage/innobase/mtr/mtr0log.cc
@@ -0,0 +1,609 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0log.cc
+Mini-transaction log routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "dict0boot.h"
+
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+{
+ dyn_array_t* mlog;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+ dyn_push_string(mlog, str, len);
+}
+
+/********************************************************//**
+Writes the initial part of a log record consisting of one-byte item
+type and four-byte space and page numbers. Also pushes info
+to the mtr memo that a buffer page has been modified. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(type > MLOG_8BYTES);
+
+ log_ptr = mlog_open(mtr, 11);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
+
+ mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* type, /*!< out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /*!< out: space id */
+ ulint* page_no)/*!< out: page number */
+{
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ *type = (byte)((ulint)*ptr & ~MLOG_SINGLE_REC_FLAG);
+ ut_ad(*type <= MLOG_BIGGEST_TYPE);
+
+ ptr++;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, space);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, page_no);
+
+ return(ptr);
+}
+
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_ull.
+@return parsed record end, NULL if not a complete record or a corrupt record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ ulint type, /*!< in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint offset;
+ ulint val;
+ ib_uint64_t dval;
+
+ ut_a(type <= MLOG_8BYTES);
+ ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (offset >= UNIV_PAGE_SIZE) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (type == MLOG_8BYTES) {
+ ptr = mach_ull_parse_compressed(ptr, end_ptr, &dval);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (page_zip) {
+ mach_write_to_8
+ (((page_zip_des_t*) page_zip)->data
+ + offset, dval);
+ }
+ mach_write_to_8(page + offset, dval);
+ }
+
+ return(ptr);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &val);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ switch (type) {
+ case MLOG_1BYTE:
+ if (UNIV_UNLIKELY(val > 0xFFUL)) {
+ goto corrupt;
+ }
+ if (page) {
+ if (page_zip) {
+ mach_write_to_1
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_1(page + offset, val);
+ }
+ break;
+ case MLOG_2BYTES:
+ if (UNIV_UNLIKELY(val > 0xFFFFUL)) {
+ goto corrupt;
+ }
+ if (page) {
+ if (page_zip) {
+ mach_write_to_2
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_2(page + offset, val);
+ }
+ break;
+ case MLOG_4BYTES:
+ if (page) {
+ if (page_zip) {
+ mach_write_to_4
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_4(page + offset, val);
+ }
+ break;
+ default:
+ corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+ ptr = NULL;
+ }
+
+ return(ptr);
+}
+
+/********************************************************//**
+Writes 1, 2 or 4 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log if mtr is not NULL. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /*!< in: pointer where to write */
+ ulint val, /*!< in: value to write */
+ byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ switch (type) {
+ case MLOG_1BYTE:
+ mach_write_to_1(ptr, val);
+ break;
+ case MLOG_2BYTES:
+ mach_write_to_2(ptr, val);
+ break;
+ case MLOG_4BYTES:
+ mach_write_to_4(ptr, val);
+ break;
+ default:
+ ut_error;
+ }
+
+ if (mtr != 0) {
+ byte* log_ptr = mlog_open(mtr, 11 + 2 + 5);
+
+ /* If no logging is requested, we may return now */
+
+ if (log_ptr != 0) {
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ ptr, type, log_ptr, mtr);
+
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+ }
+ }
+}
+
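+/* A minimal usage sketch of the function above (a pattern used by the
+page layer; FIL_PAGE_TYPE and FIL_PAGE_INDEX are defined in fil0fil.h),
+together with the worst-case layout of the record that gets logged:
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INDEX,
+			 MLOG_2BYTES, mtr);
+
+	record: type (1 byte) | space id, page no (compressed, <= 10)
+		| page offset (2) | value (compressed, <= 5)
+*/
+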
+/********************************************************//**
+Writes 8 bytes to a file page. Writes the corresponding log
+record to the mini-transaction log only if mtr is not NULL. */
+UNIV_INTERN
+void
+mlog_write_ull(
+/*===========*/
+ byte* ptr, /*!< in: pointer where to write */
+ ib_uint64_t val, /*!< in: value to write */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ mach_write_to_8(ptr, val);
+
+ if (mtr != 0) {
+ byte* log_ptr = mlog_open(mtr, 11 + 2 + 9);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr != 0) {
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ ptr, MLOG_8BYTES, log_ptr, mtr);
+
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ log_ptr += mach_ull_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+ }
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ const byte* str, /*!< in: string to write */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(ptr && mtr);
+ ut_a(len < UNIV_PAGE_SIZE);
+
+ memcpy(ptr, str, len);
+
+ mlog_log_string(ptr, len, mtr);
+}
+
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+ byte* ptr, /*!< in: pointer written to */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(ptr && mtr);
+ ut_ad(len <= UNIV_PAGE_SIZE);
+
+ log_ptr = mlog_open(mtr, 30);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, ptr, len);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint offset;
+ ulint len;
+
+ ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+ len = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (page_zip) {
+ memcpy(((page_zip_des_t*) page_zip)->data
+ + offset, ptr, len);
+ }
+ memcpy(page + offset, ptr, len);
+ }
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* rec, /*!< in: index record or page */
+ const dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: log item type */
+ ulint size) /*!< in: requested buffer size in bytes
+ (if 0, calls mlog_close() and
+ returns NULL) */
+{
+ byte* log_ptr;
+ const byte* log_start;
+ const byte* log_end;
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ if (!page_rec_is_comp(rec)) {
+ log_start = log_ptr = mlog_open(mtr, 11 + size);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_ptr = mlog_write_initial_log_record_fast(rec, type,
+ log_ptr, mtr);
+ log_end = log_ptr + 11 + size;
+ } else {
+ ulint i;
+ ulint n = dict_index_get_n_fields(index);
+ /* total size needed */
+ ulint total = 11 + size + (n + 2) * 2;
+ ulint alloc = total;
+ /* allocate at most DYN_ARRAY_DATA_SIZE at a time */
+ if (alloc > DYN_ARRAY_DATA_SIZE) {
+ alloc = DYN_ARRAY_DATA_SIZE;
+ }
+ log_start = log_ptr = mlog_open(mtr, alloc);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_end = log_ptr + alloc;
+ log_ptr = mlog_write_initial_log_record_fast(rec, type,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, n);
+ log_ptr += 2;
+ mach_write_to_2(log_ptr,
+ dict_index_get_n_unique_in_tree(index));
+ log_ptr += 2;
+ for (i = 0; i < n; i++) {
+ dict_field_t* field;
+ const dict_col_t* col;
+ ulint len;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ len = field->fixed_len;
+ ut_ad(len < 0x7fff);
+ if (len == 0
+ && (col->len > 255 || col->mtype == DATA_BLOB)) {
+ /* variable-length field
+ with maximum length > 255 */
+ len = 0x7fff;
+ }
+ if (col->prtype & DATA_NOT_NULL) {
+ len |= 0x8000;
+ }
+ if (log_ptr + 2 > log_end) {
+ mlog_close(mtr, log_ptr);
+ ut_a(total > (ulint) (log_ptr - log_start));
+ total -= log_ptr - log_start;
+ alloc = total;
+ if (alloc > DYN_ARRAY_DATA_SIZE) {
+ alloc = DYN_ARRAY_DATA_SIZE;
+ }
+ log_start = log_ptr = mlog_open(mtr, alloc);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_end = log_ptr + alloc;
+ }
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ }
+ }
+ if (size == 0) {
+ mlog_close(mtr, log_ptr);
+ log_ptr = NULL;
+ } else if (log_ptr + size > log_end) {
+ mlog_close(mtr, log_ptr);
+ log_ptr = mlog_open(mtr, size);
+ }
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+ byte* ptr, /*!< in: buffer */
+ const byte* end_ptr,/*!< in: buffer end */
+ ibool comp, /*!< in: TRUE=compact row format */
+ dict_index_t** index) /*!< out, own: dummy index */
+{
+ ulint i, n, n_uniq;
+ dict_table_t* table;
+ dict_index_t* ind;
+
+ ut_ad(comp == FALSE || comp == TRUE);
+
+ if (comp) {
+ if (end_ptr < ptr + 4) {
+ return(NULL);
+ }
+ n = mach_read_from_2(ptr);
+ ptr += 2;
+ n_uniq = mach_read_from_2(ptr);
+ ptr += 2;
+ ut_ad(n_uniq <= n);
+ if (end_ptr < ptr + n * 2) {
+ return(NULL);
+ }
+ } else {
+ n = n_uniq = 1;
+ }
+ table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n,
+ comp ? DICT_TF_COMPACT : 0, 0);
+ ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+ ind->table = table;
+ ind->n_uniq = (unsigned int) n_uniq;
+ if (n_uniq != n) {
+ ut_a(n_uniq + DATA_ROLL_PTR <= n);
+ ind->type = DICT_CLUSTERED;
+ }
+ if (comp) {
+ for (i = 0; i < n; i++) {
+ ulint len = mach_read_from_2(ptr);
+ ptr += 2;
+ /* The high-order bit of len is the NOT NULL flag;
+ the rest is 0 or 0x7fff for variable-length fields,
+ and 1..0x7ffe for fixed-length fields. */
+ dict_mem_table_add_col(
+ table, NULL, NULL,
+ ((len + 1) & 0x7fff) <= 1
+ ? DATA_BINARY : DATA_FIXBINARY,
+ len & 0x8000 ? DATA_NOT_NULL : 0,
+ len & 0x7fff);
+
+ dict_index_add_col(ind, table,
+ dict_table_get_nth_col(table, i),
+ 0);
+ }
+ dict_table_add_system_columns(table, table->heap);
+ if (n_uniq != n) {
+ /* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */
+ ut_a(DATA_TRX_ID_LEN
+ == dict_index_get_nth_col(ind, DATA_TRX_ID - 1
+ + n_uniq)->len);
+ ut_a(DATA_ROLL_PTR_LEN
+ == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1
+ + n_uniq)->len);
+ ind->fields[DATA_TRX_ID - 1 + n_uniq].col
+ = &table->cols[n + DATA_TRX_ID];
+ ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col
+ = &table->cols[n + DATA_ROLL_PTR];
+ }
+ }
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ ind->cached = TRUE;
+ *index = ind;
+ return(ptr);
+}
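+
+/* Illustrative decodings of the 2-byte field encoding parsed above:
+0x0004 is a nullable fixed-length column of 4 bytes, and 0x8004 is the
+same column declared NOT NULL; 0x0000 and 0x7fff denote variable-length
+columns (0x7fff when the maximum length exceeds 255 bytes or the type
+is a BLOB), again with 0x8000 added for NOT NULL. */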
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
new file mode 100644
index 00000000000..869586bcd90
--- /dev/null
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -0,0 +1,439 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.cc
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "page0types.h"
+#include "mtr0log.h"
+#include "log0log.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "log0recv.h"
+
+/***************************************************//**
+Checks if a mini-transaction is dirtying a clean page.
+@return TRUE if the mtr is dirtying a clean page. */
+UNIV_INTERN
+ibool
+mtr_block_dirtied(
+/*==============*/
+ const buf_block_t* block) /*!< in: block being x-fixed */
+{
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+
+ /* It is OK to read oldest_modification because no
+ other thread can be performing a write of it and it
+ is only during write that the value is reset to 0. */
+ return(block->page.oldest_modification == 0);
+}
+
+/*****************************************************************//**
+Releases the item in the slot given. */
+static __attribute__((nonnull))
+void
+mtr_memo_slot_release_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+#endif /* UNIV_DEBUG */
+ mtr_memo_slot_t* slot) /*!< in: memo slot */
+{
+ void* object = slot->object;
+ slot->object = NULL;
+
+ /* slot release is a local operation for the current mtr.
+ We must not be holding the flush_order mutex while
+ doing this. */
+ ut_ad(!log_flush_order_mutex_own());
+
+ switch (slot->type) {
+ case MTR_MEMO_PAGE_S_FIX:
+ case MTR_MEMO_PAGE_X_FIX:
+ case MTR_MEMO_BUF_FIX:
+ buf_page_release((buf_block_t*) object, slot->type);
+ break;
+ case MTR_MEMO_S_LOCK:
+ rw_lock_s_unlock((rw_lock_t*) object);
+ break;
+ case MTR_MEMO_X_LOCK:
+ rw_lock_x_unlock((rw_lock_t*) object);
+ break;
+#ifdef UNIV_DEBUG
+ default:
+ ut_ad(slot->type == MTR_MEMO_MODIFY);
+ ut_ad(mtr_memo_contains(mtr, object, MTR_MEMO_PAGE_X_FIX));
+#endif /* UNIV_DEBUG */
+ }
+}
+
+#ifdef UNIV_DEBUG
+# define mtr_memo_slot_release(mtr, slot) mtr_memo_slot_release_func(mtr, slot)
+#else /* UNIV_DEBUG */
+# define mtr_memo_slot_release(mtr, slot) mtr_memo_slot_release_func(slot)
+#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Releases the mlocks and other objects stored in an mtr memo.
+They are released in the order opposite to which they were pushed
+to the memo. */
+static __attribute__((nonnull))
+void
+mtr_memo_pop_all(
+/*=============*/
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+
+ for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
+ block;
+ block = dyn_array_get_prev_block(&mtr->memo, block)) {
+ const mtr_memo_slot_t* start
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block));
+ mtr_memo_slot_t* slot
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block)
+ + dyn_block_get_used(block));
+
+ ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
+
+ while (slot-- != start) {
+ if (slot->object != NULL) {
+ mtr_memo_slot_release(mtr, slot);
+ }
+ }
+ }
+}
+
+/*****************************************************************//**
+Notes the modification of a buffer page in the slot given, if any. */
+static
+void
+mtr_memo_slot_note_modification(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ mtr_memo_slot_t* slot) /*!< in: memo slot */
+{
+ ut_ad(mtr->modifications);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ if (slot->object != NULL && slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_block_t* block = (buf_block_t*) slot->object;
+
+ ut_ad(!mtr->made_dirty || log_flush_order_mutex_own());
+ buf_flush_note_modification(block, mtr);
+ }
+}
+
+/**********************************************************//**
+Add the modified pages to the buffer flush list. They are processed
+in the order opposite to which they were pushed to the memo. NOTE! It is
+essential that the x-rw-lock on a modified buffer page is not released
+before buf_flush_note_modification is called for that page! Otherwise,
+some thread might race to modify it, and the flush list sort order on
+lsn would be destroyed. */
+static
+void
+mtr_memo_note_modifications(
+/*========================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+
+ for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
+ block;
+ block = dyn_array_get_prev_block(&mtr->memo, block)) {
+ const mtr_memo_slot_t* start
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block));
+ mtr_memo_slot_t* slot
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block)
+ + dyn_block_get_used(block));
+
+ ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
+
+ while (slot-- != start) {
+ if (slot->object != NULL) {
+ mtr_memo_slot_note_modification(mtr, slot);
+ }
+ }
+ }
+}
+
+/************************************************************//**
+Append the dirty pages to the flush list. */
+static
+void
+mtr_add_dirtied_pages_to_flush_list(
+/*================================*/
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* No need to acquire log_flush_order_mutex if this mtr has
+ not dirtied a clean page. log_flush_order_mutex is used to
+ ensure ordered insertions in the flush_list. We need to
+ insert in the flush_list iff the page in question was clean
+ before modifications. */
+ if (mtr->made_dirty) {
+ log_flush_order_mutex_enter();
+ }
+
+ /* It is now safe to release the log mutex because the
+ flush_order mutex will ensure that we are the first one
+ to insert into the flush list. */
+ log_release();
+
+ if (mtr->modifications) {
+ mtr_memo_note_modifications(mtr);
+ }
+
+ if (mtr->made_dirty) {
+ log_flush_order_mutex_exit();
+ }
+}
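+
+/* In summary, when made_dirty is set the sequence above is: (1) take
+the flush_order mutex while still holding the log mutex, (2) release
+the log mutex, (3) insert the dirtied pages, (4) release the
+flush_order mutex. Because step (1) happens under the log mutex,
+committing mtrs acquire the flush_order mutex in commit-lsn order, and
+the flush list stays sorted on oldest_modification even though the log
+mutex itself is released early. */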
+
+/************************************************************//**
+Writes the contents of a mini-transaction log, if any, to the database log. */
+static
+void
+mtr_log_reserve_and_write(
+/*======================*/
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ dyn_array_t* mlog;
+ ulint data_size;
+ byte* first_data;
+
+ ut_ad(!srv_read_only_mode);
+
+ mlog = &(mtr->log);
+
+ first_data = dyn_block_get_data(mlog);
+
+ if (mtr->n_log_recs > 1) {
+ mlog_catenate_ulint(mtr, MLOG_MULTI_REC_END, MLOG_1BYTE);
+ } else {
+ *first_data = (byte)((ulint)*first_data
+ | MLOG_SINGLE_REC_FLAG);
+ }
+
+ if (mlog->heap == NULL) {
+ ulint len;
+
+ len = mtr->log_mode != MTR_LOG_NO_REDO
+ ? dyn_block_get_used(mlog) : 0;
+
+ mtr->end_lsn = log_reserve_and_write_fast(
+ first_data, len, &mtr->start_lsn);
+
+ if (mtr->end_lsn) {
+
+ /* Success. We have the log mutex.
+ Add pages to flush list and exit */
+ mtr_add_dirtied_pages_to_flush_list(mtr);
+
+ return;
+ }
+ }
+
+ data_size = dyn_array_get_data_size(mlog);
+
+ /* Open the database log for log_write_low */
+ mtr->start_lsn = log_reserve_and_open(data_size);
+
+ if (mtr->log_mode == MTR_LOG_ALL) {
+
+ for (dyn_block_t* block = mlog;
+ block != 0;
+ block = dyn_array_get_next_block(mlog, block)) {
+
+ log_write_low(
+ dyn_block_get_data(block),
+ dyn_block_get_used(block));
+ }
+
+ } else {
+ ut_ad(mtr->log_mode == MTR_LOG_NONE
+ || mtr->log_mode == MTR_LOG_NO_REDO);
+ /* Do nothing */
+ }
+
+ mtr->end_lsn = log_close();
+
+ mtr_add_dirtied_pages_to_flush_list(mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
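+
+/* An illustrative note on the two log body shapes produced above. A
+single-record mtr has MLOG_SINGLE_REC_FLAG ORed into its first type
+byte:
+
+ [type | MLOG_SINGLE_REC_FLAG][space][page_no][body]
+
+A multi-record mtr is terminated instead:
+
+ [rec 1][rec 2]...[rec N][MLOG_MULTI_REC_END]
+
+Recovery applies a multi-record group only after seeing the trailing
+MLOG_MULTI_REC_END, which keeps the whole group atomic. */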
+
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(!mtr->inside_ibuf);
+ ut_d(mtr->state = MTR_COMMITTING);
+
+#ifndef UNIV_HOTBACKUP
+ /* This is a dirty read, for debugging. */
+ ut_ad(!recv_no_log_write);
+
+ if (mtr->modifications && mtr->n_log_recs) {
+ ut_ad(!srv_read_only_mode);
+ mtr_log_reserve_and_write(mtr);
+ }
+
+ mtr_memo_pop_all(mtr);
+#endif /* !UNIV_HOTBACKUP */
+
+ dyn_array_free(&(mtr->memo));
+ dyn_array_free(&(mtr->log));
+#ifdef UNIV_DEBUG_VALGRIND
+ /* Declare everything uninitialized except
+ mtr->start_lsn, mtr->end_lsn and mtr->state. */
+ {
+ lsn_t start_lsn = mtr->start_lsn;
+ lsn_t end_lsn = mtr->end_lsn;
+ UNIV_MEM_INVALID(mtr, sizeof *mtr);
+ mtr->start_lsn = start_lsn;
+ mtr->end_lsn = end_lsn;
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+ ut_d(mtr->state = MTR_COMMITTED);
+}
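+
+/* An illustrative usage sketch of the mini-transaction interface,
+assuming mtr_start() from mtr0mtr.ic and a page fetched with
+buf_page_get(); space, zip_size and page_no are placeholders: */
+#if 0
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, &mtr);
+ /* modify the page through mlog_write_* so that redo is logged */
+ mtr_commit(&mtr); /* writes the log and releases the latches */
+#endif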
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************//**
+Releases an object in the memo stack.
+@return true if released */
+UNIV_INTERN
+bool
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ /* We cannot release a page that has been written to in the
+ middle of a mini-transaction. */
+ ut_ad(!mtr->modifications || type != MTR_MEMO_PAGE_X_FIX);
+
+ for (const dyn_block_t* block = dyn_array_get_last_block(&mtr->memo);
+ block;
+ block = dyn_array_get_prev_block(&mtr->memo, block)) {
+ const mtr_memo_slot_t* start
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block));
+ mtr_memo_slot_t* slot
+ = reinterpret_cast<mtr_memo_slot_t*>(
+ dyn_block_get_data(block)
+ + dyn_block_get_used(block));
+
+ ut_ad(!(dyn_block_get_used(block) % sizeof(mtr_memo_slot_t)));
+
+ while (slot-- != start) {
+ if (object == slot->object && type == slot->type) {
+ mtr_memo_slot_release(mtr, slot);
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Reads 1 to 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+
+ return(mach_read_ulint(ptr, type));
+}
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given page.
+@return TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* ptr, /*!< in: pointer to buffer frame */
+ ulint type) /*!< in: type of object */
+{
+ return(mtr_memo_contains(mtr, buf_block_align(ptr), type));
+}
+
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fprintf(stderr,
+ "Mini-transaction handle: memo size %lu bytes"
+ " log size %lu bytes\n",
+ (ulong) dyn_array_get_data_size(&(mtr->memo)),
+ (ulong) dyn_array_get_data_size(&(mtr->log)));
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
new file mode 100644
index 00000000000..fb7e8ca1eb7
--- /dev/null
+++ b/storage/innobase/os/os0file.cc
@@ -0,0 +1,5807 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.cc
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+
+#ifdef UNIV_NONINL
+#include "os0file.ic"
+#endif
+
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "srv0mon.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+# include "os0thread.h"
+#else /* !UNIV_HOTBACKUP */
+# ifdef __WIN__
+/* Add includes for the _stat() call to compile on Windows */
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <errno.h>
+# endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(LINUX_NATIVE_AIO)
+#include <libaio.h>
+#endif
+
+/** Insert buffer segment id */
+static const ulint IO_IBUF_SEGMENT = 0;
+
+/** Log segment id */
+static const ulint IO_LOG_SEGMENT = 1;
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = 0;
+#endif /* __WIN__ */
+
+#ifndef UNIV_HOTBACKUP
+/* We use these mutexes to protect lseek + file i/o operations, if the
+OS does not provide an atomic pread or pwrite, or similar */
+#define OS_FILE_N_SEEK_MUTEXES 16
+UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+
+/* In simulated aio, merge at most this many consecutive i/os */
+#define OS_AIO_MERGE_N_CONSECUTIVE 64
+
+/**********************************************************************
+
+InnoDB AIO Implementation:
+=========================
+
+We support native AIO for Windows and Linux. On the rest of the platforms
+we simulate AIO with special io-threads servicing the IO requests.
+
+Simulated AIO:
+==============
+
+On platforms where we 'simulate' AIO, the following is a rough explanation
+of the high-level design.
+There are four io-threads (for ibuf, log, read, write).
+All synchronous IO requests are serviced by the calling thread using
+os_file_write/os_file_read. Asynchronous requests are queued up
+in an array (there are four such arrays) by the calling thread.
+Later these requests are picked up by the io-thread and are serviced
+synchronously.
+
+Windows native AIO:
+==================
+
+If srv_use_native_aio is not set then Windows follows the same
+code path as simulated AIO. If the flag is set then the native AIO
+interface is used. On Windows, one of the limitations is that if a file
+is opened for AIO, no synchronous IO can be done on it. Therefore we
+have an extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made, the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE, we follow the code path of native AIO; otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made, the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+
+**********************************************************************/
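+
+/* An illustrative walk-through of one simulated-AIO read, assuming the
+public entry point os_aio(): the calling thread reserves a slot in
+os_aio_read_array and returns immediately; an io-helper thread running
+os_aio_simulated_handle() later picks up to OS_AIO_MERGE_N_CONSECUTIVE
+adjacent slots, services them with one synchronous os_file_read(), and
+hands back message1/message2 so the waiter can identify the completed
+request. */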
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool os_aio_print_debug = FALSE;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+#endif /* UNIV_PFS_IO */
+
+/** The asynchronous i/o array slot structure */
+struct os_aio_slot_t{
+ ibool is_read; /*!< TRUE if a read operation */
+ ulint pos; /*!< index of the slot in the aio
+ array */
+ ibool reserved; /*!< TRUE if this slot is reserved */
+ time_t reservation_time;/*!< time when reserved */
+ ulint len; /*!< length of the block to read or
+ write */
+ byte* buf; /*!< buffer used in i/o */
+ ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
+ os_offset_t offset; /*!< file offset in bytes */
+ os_file_t file; /*!< file where to read or write */
+ const char* name; /*!< file name or path */
+ ibool io_already_done;/*!< used only in simulated aio:
+ TRUE if the physical i/o already
+ made and only the slot message
+ needs to be passed to the caller
+ of os_aio_simulated_handle */
+ fil_node_t* message1; /*!< message given by the requester
+ of an aio operation; used together with
+ message2 to identify which pending aio
+ operation was completed */
+ void* message2; /*!< second message given by the requester
+ of the aio operation; see message1 */
+#ifdef WIN_ASYNC_IO
+ HANDLE handle; /*!< handle object we need in the
+ OVERLAPPED struct */
+ OVERLAPPED control; /*!< Windows control block for the
+ aio request */
+#elif defined(LINUX_NATIVE_AIO)
+ struct iocb control; /* Linux control block for aio */
+ int n_bytes; /* bytes written/read. */
+ int ret; /* AIO return code */
+#endif /* WIN_ASYNC_IO */
+};
+
+/** The asynchronous i/o array structure */
+struct os_aio_array_t{
+ os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */
+ os_event_t not_full;
+ /*!< The event which is set to the
+ signaled state when there is space in
+ the aio outside the ibuf segment */
+ os_event_t is_empty;
+ /*!< The event which is set to the
+ signaled state when there are no
+ pending i/os in this array */
+ ulint n_slots;/*!< Total number of slots in the aio
+ array. This must be divisible by
+ n_threads. */
+ ulint n_segments;
+ /*!< Number of segments in the aio
+ array of pending aio requests. A
+ thread can wait separately for any one
+ of the segments. */
+ ulint cur_seg;/*!< We reserve IO requests in round
+ robin fashion to different segments.
+ This points to the segment that is to
+ be used to service next IO request. */
+ ulint n_reserved;
+ /*!< Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
+#ifdef __WIN__
+ HANDLE* handles;
+ /*!< Pointer to an array of OS native
+ event handles where we copied the
+ handles from slots, in the same
+ order. This can be used in
+ WaitForMultipleObjects; used only in
+ Windows */
+#endif /* __WIN__ */
+
+#if defined(LINUX_NATIVE_AIO)
+ io_context_t* aio_ctx;
+ /* completion queue for IO. There is
+ one such queue per segment. Each thread
+ will work on one ctx exclusively. */
+ struct io_event* aio_events;
+ /* The array to collect completed IOs.
+ There is one such event for each
+ possible pending IO. The size of the
+ array is equal to n_slots. */
+#endif /* LINUX_NATIVE_AIO */
+};
+
+#if defined(LINUX_NATIVE_AIO)
+/** timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT (500000000UL)
+
+/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
+
+/** number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
+#endif
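+
+/* An illustrative sketch of how the three constants above combine
+around io_setup(); io_ctx and max_events are placeholders: */
+# if 0
+ for (ulint i = 0; i < OS_AIO_IO_SETUP_RETRY_ATTEMPTS; ++i) {
+ int ret = io_setup(max_events, io_ctx);
+
+ if (ret == 0) {
+ break; /* context created */
+ } else if (ret != -EAGAIN) {
+ break; /* hard failure, give up */
+ }
+
+ os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+ }
+# endif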
+
+/** Array of events used in simulated aio */
+static os_event_t* os_aio_segment_wait_events = NULL;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
+static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
+static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
+static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
+static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
+/* @} */
+
+/** Number of asynchronous I/O segments. Set by os_aio_init(). */
+static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+/** If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests have been posted */
+static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+UNIV_INTERN ulint os_n_file_reads = 0;
+UNIV_INTERN ulint os_bytes_read_since_printout = 0;
+UNIV_INTERN ulint os_n_file_writes = 0;
+UNIV_INTERN ulint os_n_fsyncs = 0;
+UNIV_INTERN ulint os_n_file_reads_old = 0;
+UNIV_INTERN ulint os_n_file_writes_old = 0;
+UNIV_INTERN ulint os_n_fsyncs_old = 0;
+UNIV_INTERN time_t os_last_printout;
+
+UNIV_INTERN ibool os_has_said_disk_full = FALSE;
+
+#if !defined(UNIV_HOTBACKUP) \
+ && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8)
+/** The mutex protecting the following counts of pending I/O operations */
+static os_ib_mutex_t os_file_count_mutex;
+#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */
+
+/** Number of pending os_file_pread() operations */
+UNIV_INTERN ulint os_file_n_pending_preads = 0;
+/** Number of pending os_file_pwrite() operations */
+UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
+/** Number of pending write operations */
+UNIV_INTERN ulint os_n_pending_writes = 0;
+/** Number of pending read operations */
+UNIV_INTERN ulint os_n_pending_reads = 0;
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Validates the consistency of the aio system some of the time.
+@return TRUE if ok or the check was skipped */
+UNIV_INTERN
+ibool
+os_aio_validate_skip(void)
+/*======================*/
+{
+/** Try os_aio_validate() every this many times */
+# define OS_AIO_VALIDATE_SKIP 13
+
+ /** The os_aio_validate() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly os_aio_validate()
+ check in debug builds. */
+ if (--os_aio_validate_count > 0) {
+ return(TRUE);
+ }
+
+ os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
+ return(os_aio_validate());
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+ OSVERSIONINFO os_info;
+
+ os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+ ut_a(GetVersionEx(&os_info));
+
+ if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
+ return(OS_WIN31);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
+ return(OS_WIN95);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+ switch (os_info.dwMajorVersion) {
+ case 3:
+ case 4:
+ return(OS_WINNT);
+ case 5:
+ return (os_info.dwMinorVersion == 0)
+ ? OS_WIN2000 : OS_WINXP;
+ case 6:
+ return (os_info.dwMinorVersion == 0)
+ ? OS_WINVISTA : OS_WIN7;
+ default:
+ return(OS_WIN7);
+ }
+ } else {
+ ut_error;
+ return(0);
+ }
+}
+#endif /* __WIN__ */
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+OS_FILE_ERROR_MAX + the OS error number is returned.
+@return error number, or OS_FILE_ERROR_MAX + OS error number */
+static
+ulint
+os_file_get_last_error_low(
+/*=======================*/
+ bool report_all_errors, /*!< in: true if we want an error
+ message printed for all errors */
+ bool on_error_silent) /*!< in: if true, do not print any
+ diagnostic to the log */
+{
+#ifdef __WIN__
+
+ ulint err = (ulint) GetLastError();
+ if (err == ERROR_SUCCESS) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (!on_error_silent
+ && err != ERROR_DISK_FULL
+ && err != ERROR_FILE_EXISTS)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %lu"
+ " in a file operation.\n", (ulong) err);
+
+ if (err == ERROR_PATH_NOT_FOUND) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == ERROR_ACCESS_DENIED) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory. It may also be"
+ " you have created a subdirectory\n"
+ "InnoDB: of the same name as a data file.\n");
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ fprintf(stderr,
+ "InnoDB: The error means that another program"
+ " is using InnoDB's files.\n"
+ "InnoDB: This might be a backup or antivirus"
+ " software or another instance\n"
+ "InnoDB: of MySQL."
+ " Please close it to get rid of this error.\n");
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ fprintf(stderr,
+ "InnoDB: The error means that there are no"
+ " sufficient system resources or quota to"
+ " complete the operation.\n");
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ fprintf(stderr,
+ "InnoDB: The error means that the I/O"
+ " operation has been aborted\n"
+ "InnoDB: because of either a thread exit"
+ " or an application request.\n"
+ "InnoDB: Retry attempt is made.\n");
+ } else {
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ if (err == ERROR_FILE_NOT_FOUND) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == ERROR_DISK_FULL) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ERROR_FILE_EXISTS) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ return(OS_FILE_SHARING_VIOLATION);
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ return(OS_FILE_INSUFFICIENT_RESOURCE);
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ return(OS_FILE_OPERATION_ABORTED);
+ } else if (err == ERROR_ACCESS_DENIED) {
+ return(OS_FILE_ACCESS_VIOLATION);
+ } else {
+ return(OS_FILE_ERROR_MAX + err);
+ }
+#else
+ int err = errno;
+ if (err == 0) {
+ return(0);
+ }
+
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %d"
+ " in a file operation.\n", err);
+
+ if (err == ENOENT) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == EACCES) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory.\n");
+ } else {
+ if (strerror(err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d"
+ " means '%s'.\n",
+ err, strerror(err));
+ }
+
+
+ fprintf(stderr,
+ "InnoDB: Some operating system"
+ " error numbers are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ switch (err) {
+ case ENOSPC:
+ return(OS_FILE_DISK_FULL);
+ case ENOENT:
+ return(OS_FILE_NOT_FOUND);
+ case EEXIST:
+ return(OS_FILE_ALREADY_EXISTS);
+ case EXDEV:
+ case ENOTDIR:
+ case EISDIR:
+ return(OS_FILE_PATH_ERROR);
+ case EAGAIN:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_RESOURCES_RESERVED);
+ }
+ break;
+ case EINTR:
+ if (srv_use_native_aio) {
+ return(OS_FILE_AIO_INTERRUPTED);
+ }
+ break;
+ case EACCES:
+ return(OS_FILE_ACCESS_VIOLATION);
+ }
+ return(OS_FILE_ERROR_MAX + err);
+#endif
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+OS_FILE_ERROR_MAX + the OS error number is returned.
+@return error number, or OS_FILE_ERROR_MAX + OS error number */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ bool report_all_errors) /*!< in: true if we want an error
+ message printed for all errors */
+{
+ return(os_file_get_last_error_low(report_all_errors, false));
+}
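+
+/* An illustrative usage sketch; the error code must be fetched before
+any further OS call overwrites it: */
+#if 0
+ file = os_file_create_simple(
+ innodb_file_data_key, name,
+ OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+
+ if (!success) {
+ switch (os_file_get_last_error(false)) {
+ case OS_FILE_NOT_FOUND:
+ /* handle a missing file */
+ break;
+ default:
+ /* report and give up */
+ break;
+ }
+ }
+#endif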
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+Conditionally exits (calling exit(3)) based on the should_exit value and
+the error type; if should_exit is TRUE, then on_error_silent is ignored.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool should_exit, /*!< in: call exit(3) if unknown error
+ and this parameter is TRUE */
+ ibool on_error_silent)/*!< in: if TRUE then don't print
+ any message to the log iff it is
+ an unknown non-fatal error */
+{
+ ulint err;
+
+ err = os_file_get_last_error_low(false, on_error_silent);
+
+ switch (err) {
+ case OS_FILE_DISK_FULL:
+ /* We only print a warning about disk full once */
+
+ if (os_has_said_disk_full) {
+
+ return(FALSE);
+ }
+
+ /* Disk full error is reported irrespective of the
+ on_error_silent setting. */
+
+ if (name) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Encountered a problem with"
+ " file %s\n", name);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Disk is full. Try to clean the disk"
+ " to free space.\n");
+
+ os_has_said_disk_full = TRUE;
+
+ fflush(stderr);
+
+ return(FALSE);
+
+ case OS_FILE_AIO_RESOURCES_RESERVED:
+ case OS_FILE_AIO_INTERRUPTED:
+
+ return(TRUE);
+
+ case OS_FILE_PATH_ERROR:
+ case OS_FILE_ALREADY_EXISTS:
+ case OS_FILE_ACCESS_VIOLATION:
+
+ return(FALSE);
+
+ case OS_FILE_SHARING_VIOLATION:
+
+ os_thread_sleep(10000000); /* 10 sec */
+ return(TRUE);
+
+ case OS_FILE_OPERATION_ABORTED:
+ case OS_FILE_INSUFFICIENT_RESOURCE:
+
+ os_thread_sleep(100000); /* 100 ms */
+ return(TRUE);
+
+ default:
+
+ /* If it is an operation that can crash on error then it
+ is better to ignore on_error_silent and print an error message
+ to the log. */
+
+ if (should_exit || !on_error_silent) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
+ "error " ULINTPF ".%s", name ? name : "(unknown)",
+ operation, err, should_exit
+ ? " Cannot continue operation" : "");
+ }
+
+ if (should_exit) {
+ exit(1);
+ }
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation) /*!< in: operation */
+{
+ /* exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool on_error_silent)/*!< in: if TRUE then don't print
+ any message to the log. */
+{
+ /* don't exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(
+ name, operation, FALSE, on_error_silent));
+}
+
+#undef USE_FILE_LOCK
+#define USE_FILE_LOCK
+#if defined(UNIV_HOTBACKUP) || defined(__WIN__)
+/* InnoDB Hot Backup does not lock the data files.
+ * On Windows, mandatory locking is used.
+ */
+# undef USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/****************************************************************//**
+Obtain an exclusive lock on a file.
+@return 0 on success */
+static
+int
+os_file_lock(
+/*=========*/
+ int fd, /*!< in: file descriptor */
+ const char* name) /*!< in: file name */
+{
+ struct flock lk;
+
+ ut_ad(!srv_read_only_mode);
+
+ lk.l_type = F_WRLCK;
+ lk.l_whence = SEEK_SET;
+ lk.l_start = lk.l_len = 0;
+
+ if (fcntl(fd, F_SETLK, &lk) == -1) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to lock %s, error: %d", name, errno);
+
+ if (errno == EAGAIN || errno == EACCES) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Check that you do not already have "
+ "another mysqld process using the "
+ "same InnoDB data or log files.");
+ }
+
+ return(-1);
+ }
+
+ return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void)
+/*===================*/
+{
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
+ os_file_count_mutex = os_mutex_create();
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
+
+ for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
+ os_file_seek_mutexes[i] = os_mutex_create();
+ }
+}
+
+/***********************************************************************//**
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+@return temporary file handle, or NULL on error */
+UNIV_INTERN
+FILE*
+os_file_create_tmpfile(void)
+/*========================*/
+{
+ FILE* file = NULL;
+ int fd = innobase_mysql_tmpfile();
+
+ ut_ad(!srv_read_only_mode);
+
+ if (fd >= 0) {
+ file = fdopen(fd, "w+b");
+ }
+
+ if (!file) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: unable to create temporary file;"
+ " errno: %d\n", errno);
+ if (fd >= 0) {
+ close(fd);
+ }
+ }
+
+ return(file);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal) /*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+{
+ os_file_dir_t dir;
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ char path[OS_FILE_MAX_PATH + 3];
+
+ ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
+
+ strcpy(path, dirname);
+ strcpy(path + strlen(path), "\\*");
+
+ /* Note that in Windows opening the 'directory stream' also retrieves
+ the first entry in the directory. Since it is '.', that is no problem,
+ as we will skip over the '.' and '..' entries anyway. */
+
+ lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+ ut_malloc(sizeof(WIN32_FIND_DATA)));
+
+ dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
+
+ ut_free(lpFindFileData);
+
+ if (dir == INVALID_HANDLE_VALUE) {
+
+ if (error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(NULL);
+ }
+
+ return(dir);
+#else
+ dir = opendir(dirname);
+
+ if (dir == NULL && error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(dir);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir) /*!< in: directory stream */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ret = FindClose(dir);
+
+ if (!ret) {
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+
+ return(-1);
+ }
+
+ return(0);
+#else
+ int ret;
+
+ ret = closedir(dir);
+
+ if (ret) {
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ }
+
+ return(ret);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+This function returns information about the next file in the directory. We
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ BOOL ret;
+
+ lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
+ ut_malloc(sizeof(WIN32_FIND_DATA)));
+next_file:
+ ret = FindNextFile(dir, lpFindFileData);
+
+ if (ret) {
+ ut_a(strlen((char*) lpFindFileData->cFileName)
+ < OS_FILE_MAX_PATH);
+
+ if (strcmp((char*) lpFindFileData->cFileName, ".") == 0
+ || strcmp((char*) lpFindFileData->cFileName, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, (char*) lpFindFileData->cFileName);
+
+ info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
+ + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
+ << 32);
+
+ if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_REPARSE_POINT) {
+ /* TODO: test Windows symlinks */
+ /* TODO: MySQL has apparently its own symlink
+ implementation in Windows, dbname.sym can
+ redirect a database directory:
+ REFMAN "windows-symbolic-links.html" */
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_DIRECTORY) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else {
+ /* It is probably safest to assume that all other
+ file types are normal. Better to check them rather
+ than blindly skip them. */
+
+ info->type = OS_FILE_TYPE_FILE;
+ }
+ }
+
+ ut_free(lpFindFileData);
+
+ if (ret) {
+ return(0);
+ } else if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+ return(1);
+ } else {
+ os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
+ return(-1);
+ }
+#else
+ struct dirent* ent;
+ char* full_path;
+ int ret;
+ struct stat statinfo;
+#ifdef HAVE_READDIR_R
+ char dirent_buf[sizeof(struct dirent)
+ + _POSIX_PATH_MAX + 100];
+ /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
+ the max file name len; but in most standards, the
+ length is NAME_MAX; we add 100 to be even safer */
+#endif
+
+next_file:
+
+#ifdef HAVE_READDIR_R
+ ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
+
+ if (ret != 0
+#ifdef UNIV_AIX
+ /* On AIX, a failed readdir_r() call is indicated
+ only by a non-NULL 'ent' (result) value together
+ with a non-zero 'ret' (return) value. A NULL 'ent'
+ with a non-zero 'ret' indicates that the end of
+ the directory was reached. */
+ && ent != NULL
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: cannot read directory %s, error %lu\n",
+ dirname, (ulong) ret);
+
+ return(-1);
+ }
+
+ if (ent == NULL) {
+ /* End of directory */
+
+ return(1);
+ }
+
+ ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
+#else
+ ent = readdir(dir);
+
+ if (ent == NULL) {
+
+ return(1);
+ }
+#endif
+ ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+ if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, ent->d_name);
+
+ full_path = static_cast<char*>(
+ ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10));
+
+ sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+ ret = stat(full_path, &statinfo);
+
+ if (ret) {
+
+ if (errno == ENOENT) {
+ /* readdir() returned a file that does not exist,
+ it must have been deleted in the meantime. Do what
+ would have happened if the file was deleted before
+ readdir() - ignore and go to the next entry.
+ If this is the last entry then info->name will still
+ contain the name of the deleted file when this
+ function returns, but this is not an issue since the
+ caller shouldn't be looking at info when end of
+ directory is returned. */
+
+ ut_free(full_path);
+
+ goto next_file;
+ }
+
+ os_file_handle_error_no_exit(full_path, "stat", FALSE);
+
+ ut_free(full_path);
+
+ return(-1);
+ }
+
+ info->size = (ib_int64_t) statinfo.st_size;
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_FILE;
+ } else {
+ info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ ut_free(full_path);
+
+ return(0);
+#endif
+}
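+
+/* An illustrative directory scan combining the three functions above;
+path is a placeholder: */
+#if 0
+ os_file_dir_t dir = os_file_opendir(path, TRUE);
+ os_file_stat_t info;
+
+ while (dir != NULL
+ && os_file_readdir_next_file(path, dir, &info) == 0) {
+ if (info.type == OS_FILE_TYPE_FILE) {
+ /* process info.name and info.size */
+ }
+ }
+
+ if (dir != NULL) {
+ os_file_closedir(dir);
+ }
+#endif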
+
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new
+directory gets default permissions. On Unix the permissions are
+(0770 & ~umask). If the directory exists already, nothing is done and
+the call succeeds, unless the fail_if_exists argument is true.
+If another error occurs, such as a permission error, this does not crash,
+but reports the error and returns FALSE.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+ const char* pathname, /*!< in: directory name as
+ null-terminated string */
+ ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
+ is treated as an error. */
+{
+#ifdef __WIN__
+ BOOL rcode;
+
+ rcode = CreateDirectory((LPCTSTR) pathname, NULL);
+ if (!(rcode != 0
+ || (GetLastError() == ERROR_ALREADY_EXISTS
+ && !fail_if_exists))) {
+
+ os_file_handle_error_no_exit(
+ pathname, "CreateDirectory", FALSE);
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#else
+ int rcode;
+
+ rcode = mkdir(pathname, 0770);
+
+ if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
+ /* failure */
+ os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
+
+ return(FALSE);
+ }
+
+ return (TRUE);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_func(
+/*=======================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ os_file_t file;
+ ibool retry;
+
+ *success = FALSE;
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ ut_a(!srv_read_only_mode);
+
+ /* Create subdirs along the path if needed */
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ create_flag = CREATE_NEW;
+ create_mode = OS_FILE_CREATE;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "read only mode set. Unable to "
+ "open file '%s' in RW mode, trying RO mode", name);
+
+ access = GENERIC_READ;
+
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+
+ file = CreateFile(
+ (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
+ create_flag, attributes, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+
+ *success = FALSE;
+
+ retry = os_file_handle_error(
+ name, create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#else /* __WIN__ */
+ int create_flag;
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ create_flag = O_RDONLY;
+ } else if (srv_read_only_mode) {
+ create_flag = O_RDONLY;
+ } else {
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+
+ /* Create subdirs along the path if needed */
+
+ *success = os_file_create_subdirs_if_needed(name);
+
+ if (!*success) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create subdirectories '%s'",
+ name);
+
+ return((os_file_t) -1);
+ }
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ create_mode = OS_FILE_CREATE;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ do {
+ file = ::open(name, create_flag, os_innodb_umask);
+
+ if (file == -1) {
+ *success = FALSE;
+
+ retry = os_file_handle_error(
+ name,
+ create_mode == OS_FILE_OPEN
+ ? "open" : "create");
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling_func(
+/*=========================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ os_file_t file;
+
+ *success = FALSE;
+#ifdef __WIN__
+ DWORD access;
+ DWORD create_flag;
+ DWORD attributes = 0;
+ DWORD share_mode = FILE_SHARE_READ;
+
+ ut_a(name);
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+ create_flag = OPEN_EXISTING;
+ } else if (srv_read_only_mode) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (srv_read_only_mode) {
+ access = GENERIC_READ;
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+
+ ut_a(!srv_read_only_mode);
+
+ access = GENERIC_READ;
+
+ /*!< A backup program has to give mysqld the maximum
+ freedom to do what it likes with the file */
+
+ share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file access type (%lu) for file '%s'",
+ access_type, name);
+
+ return((os_file_t) -1);
+ }
+
+ file = CreateFile((LPCTSTR) name,
+ access,
+ share_mode,
+ NULL, // Security attributes
+ create_flag,
+ attributes,
+ NULL); // No template file
+
+ *success = (file != INVALID_HANDLE_VALUE);
+#else /* __WIN__ */
+ int create_flag;
+
+ ut_a(name);
+
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
+ ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
+
+ if (create_mode == OS_FILE_OPEN) {
+
+ if (access_type == OS_FILE_READ_ONLY) {
+
+ create_flag = O_RDONLY;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else {
+
+ ut_a(access_type == OS_FILE_READ_WRITE
+ || access_type == OS_FILE_READ_ALLOW_DELETE);
+
+ create_flag = O_RDWR;
+ }
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ file = ::open(name, create_flag, os_innodb_umask);
+
+ *success = file == -1 ? FALSE : TRUE;
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
+
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd /*!< in: file descriptor to alter */
+ __attribute__((unused)),
+ const char* file_name /*!< in: used in the diagnostic
+ message */
+ __attribute__((unused)),
+ const char* operation_name __attribute__((unused)))
+ /*!< in: "open" or "create"; used
+ in the diagnostic message */
+{
+ /* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+ if (directio(fd, DIRECTIO_ON) == -1) {
+ int errno_save = errno;
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Failed to set DIRECTIO_ON on file %s: %s: %s, "
+ "continuing anyway.",
+ file_name, operation_name, strerror(errno_save));
+ }
+#elif defined(O_DIRECT)
+ if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+ int errno_save = errno;
+ static bool warning_message_printed = false;
+ if (errno_save == EINVAL) {
+ if (!warning_message_printed) {
+ warning_message_printed = true;
+# ifdef UNIV_LINUX
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to set O_DIRECT on file "
+ "%s: %s: %s, continuing anyway. "
+ "O_DIRECT is known to result "
+ "in 'Invalid argument' on Linux on "
+ "tmpfs, see MySQL Bug#26662.",
+ file_name, operation_name,
+ strerror(errno_save));
+# else /* UNIV_LINUX */
+ goto short_warning;
+# endif /* UNIV_LINUX */
+ }
+ } else {
+# ifndef UNIV_LINUX
+short_warning:
+# endif
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Failed to set O_DIRECT on file %s: %s: %s, "
+ "continuing anyway.",
+ file_name, operation_name, strerror(errno_save));
+ }
+ }
+#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
+}
+
+/****************************************************************//**
+NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_func(
+/*================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: create mode */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+ os_file_t file;
+ ibool retry;
+ ibool on_error_no_exit;
+ ibool on_error_silent;
+
+#ifdef __WIN__
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ SetLastError(ERROR_DISK_FULL);
+ return((os_file_t) -1);
+ );
+#else /* __WIN__ */
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_disk_full",
+ *success = FALSE;
+ errno = ENOSPC;
+ return((os_file_t) -1);
+ );
+#endif /* __WIN__ */
+
+#ifdef __WIN__
+ DWORD create_flag;
+ DWORD share_mode = FILE_SHARE_READ;
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ if (create_mode == OS_FILE_OPEN_RAW) {
+
+ ut_a(!srv_read_only_mode);
+
+ create_flag = OPEN_EXISTING;
+
+	/* On Windows, physical devices require admin privileges and
+	must have the write-share mode set. See the remarks
+ section for the CreateFile() function documentation in MSDN. */
+
+ share_mode |= FILE_SHARE_WRITE;
+
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (srv_read_only_mode) {
+
+ create_flag = OPEN_EXISTING;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ create_flag = CREATE_NEW;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ create_flag = CREATE_ALWAYS;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ DWORD attributes = 0;
+
+#ifdef UNIV_HOTBACKUP
+ attributes |= FILE_FLAG_NO_BUFFERING;
+#else
+ if (purpose == OS_FILE_AIO) {
+
+#ifdef WIN_ASYNC_IO
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+
+ if (srv_use_native_aio) {
+ attributes |= FILE_FLAG_OVERLAPPED;
+ }
+#endif /* WIN_ASYNC_IO */
+
+ } else if (purpose == OS_FILE_NORMAL) {
+ /* Use default setting. */
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown purpose flag (%lu) while opening file '%s'",
+ purpose, name);
+
+ return((os_file_t)(-1));
+ }
+
+#ifdef UNIV_NON_BUFFERED_IO
+	// TODO: File a bug. This check looks wrong, because the
+	// flush-log parameter is dynamic and can change at runtime.
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+
+ /* Do not use unbuffered i/o for the log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+
+ } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
+
+ attributes |= FILE_FLAG_NO_BUFFERING;
+ }
+#endif /* UNIV_NON_BUFFERED_IO */
+
+#endif /* UNIV_HOTBACKUP */
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ do {
+ /* Use default security attributes and no template file. */
+ file = CreateFile(
+ (LPCTSTR) name, access, share_mode, NULL,
+ create_flag, attributes, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = FALSE;
+ }
+
+ } while (retry);
+
+#else /* __WIN__ */
+ int create_flag;
+ const char* mode_str = NULL;
+
+ on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
+ ? TRUE : FALSE;
+ on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
+ ? TRUE : FALSE;
+
+ create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
+ create_mode &= ~OS_FILE_ON_ERROR_SILENT;
+
+ if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+
+ mode_str = "OPEN";
+
+ create_flag = srv_read_only_mode ? O_RDONLY : O_RDWR;
+
+ } else if (srv_read_only_mode) {
+
+ mode_str = "OPEN";
+
+ create_flag = O_RDONLY;
+
+ } else if (create_mode == OS_FILE_CREATE) {
+
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unknown file create mode (%lu) for file '%s'",
+ create_mode, name);
+
+ return((os_file_t) -1);
+ }
+
+ ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE);
+ ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
+
+#ifdef O_SYNC
+ /* We let O_SYNC only affect log files; note that we map O_DSYNC to
+ O_SYNC because the datasync options seemed to corrupt files in 2001
+ in both Linux and Solaris */
+
+ if (!srv_read_only_mode
+ && type == OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
+ create_flag |= O_SYNC;
+ }
+#endif /* O_SYNC */
+
+ do {
+ file = ::open(name, create_flag, os_innodb_umask);
+
+ if (file == -1) {
+ const char* operation;
+
+ operation = (create_mode == OS_FILE_CREATE
+ && !srv_read_only_mode)
+ ? "create" : "open";
+
+ *success = FALSE;
+
+ if (on_error_no_exit) {
+ retry = os_file_handle_error_no_exit(
+ name, operation, on_error_silent);
+ } else {
+ retry = os_file_handle_error(name, operation);
+ }
+ } else {
+ *success = TRUE;
+ retry = false;
+ }
+
+ } while (retry);
+
+ /* We disable OS caching (O_DIRECT) only on data files */
+
+ if (!srv_read_only_mode
+ && *success
+ && type != OS_LOG_FILE
+ && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
+ || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (!srv_read_only_mode
+ && *success
+ && create_mode != OS_FILE_OPEN_RAW
+ && os_file_lock(file, name)) {
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+
+ ut_a(!srv_read_only_mode);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Retrying to lock the first data file");
+
+ for (int i = 0; i < 100; i++) {
+ os_thread_sleep(1000000);
+
+ if (!os_file_lock(file, name)) {
+ *success = TRUE;
+ return(file);
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Unable to open the first data file");
+ }
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+#endif /* __WIN__ */
+
+ return(file);
+}
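+
+/* Usage sketch (illustrative comment only): open an existing data file
+through the os_file_create() macro, as the NOTE above requires. The
+file name is an assumption, and innodb_file_data_key is the PFS key the
+macro expects when performance-schema I/O instrumentation is compiled
+in (it is ignored otherwise):
+
+	ibool		success;
+	os_file_t	fh;
+
+	fh = os_file_create(
+		innodb_file_data_key, "ibdata1",
+		OS_FILE_OPEN, OS_FILE_NORMAL, OS_DATA_FILE, &success);
+
+	if (success) {
+		... use fh, then os_file_close(fh) ...
+	}
+*/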
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_if_exists_func(
+/*==========================*/
+ const char* name) /*!< in: file path as a null-terminated
+ string */
+{
+#ifdef __WIN__
+ bool ret;
+ ulint count = 0;
+loop:
+ /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
+ it */
+
+ ret = DeleteFile((LPCTSTR) name);
+
+ if (ret) {
+ return(true);
+ }
+
+ DWORD lasterr = GetLastError();
+ if (lasterr == ERROR_FILE_NOT_FOUND
+ || lasterr == ERROR_PATH_NOT_FOUND) {
+		/* the file does not exist; this is not an error */
+
+ return(true);
+ }
+
+ count++;
+
+ if (count > 100 && 0 == (count % 10)) {
+ os_file_get_last_error(true); /* print error information */
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name);
+ }
+
+ os_thread_sleep(500000); /* sleep for 0.5 second */
+
+ if (count > 2000) {
+
+ return(false);
+ }
+
+ goto loop;
+#else
+ int ret;
+
+ ret = unlink(name);
+
+ if (ret != 0 && errno != ENOENT) {
+ os_file_handle_error_no_exit(name, "delete", FALSE);
+
+ return(false);
+ }
+
+ return(true);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+bool
+os_file_delete_func(
+/*================*/
+ const char* name) /*!< in: file path as a null-terminated
+ string */
+{
+#ifdef __WIN__
+ BOOL ret;
+ ulint count = 0;
+loop:
+ /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying
+ it */
+
+ ret = DeleteFile((LPCTSTR) name);
+
+ if (ret) {
+ return(true);
+ }
+
+ if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+ /* If the file does not exist, we classify this as a 'mild'
+ error and return */
+
+ return(false);
+ }
+
+ count++;
+
+ if (count > 100 && 0 == (count % 10)) {
+ os_file_get_last_error(true); /* print error information */
+
+ fprintf(stderr,
+ "InnoDB: Warning: cannot delete file %s\n"
+ "InnoDB: Are you running mysqlbackup"
+ " to back up the file?\n", name);
+ }
+
+ os_thread_sleep(1000000); /* sleep for a second */
+
+ if (count > 2000) {
+
+ return(false);
+ }
+
+ goto loop;
+#else
+ int ret;
+
+ ret = unlink(name);
+
+ if (ret != 0) {
+ os_file_handle_error_no_exit(name, "delete", FALSE);
+
+ return(false);
+ }
+
+ return(true);
+#endif
+}
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_rename(), not directly this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename_func(
+/*================*/
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath)/*!< in: new file path */
+{
+#ifdef UNIV_DEBUG
+ os_file_type_t type;
+ ibool exists;
+
+ /* New path must not exist. */
+ ut_ad(os_file_status(newpath, &exists, &type));
+ ut_ad(!exists);
+
+ /* Old path must exist. */
+ ut_ad(os_file_status(oldpath, &exists, &type));
+ ut_ad(exists);
+#endif /* UNIV_DEBUG */
+
+#ifdef __WIN__
+ BOOL ret;
+
+ ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = rename(oldpath, newpath);
+
+ if (ret != 0) {
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif /* __WIN__ */
+}
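+
+/* Usage sketch (illustrative comment only; both paths and the PFS key
+argument of the os_file_rename() macro are assumptions of the example):
+
+	if (!os_file_rename(innodb_file_data_key,
+			    "test/t1.ibd", "test/t2.ibd")) {
+		... the error has already been reported; handle it ...
+	}
+*/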
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_close(), not directly this function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_func(
+/*===============*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ ret = CloseHandle(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "close");
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = close(file);
+
+ if (ret == -1) {
+ os_file_handle_error(NULL, "close");
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif /* __WIN__ */
+}
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ ret = CloseHandle(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = close(file);
+
+ if (ret == -1) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif /* __WIN__ */
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Gets a file size.
+@return file size, or (os_offset_t) -1 on failure */
+UNIV_INTERN
+os_offset_t
+os_file_get_size(
+/*=============*/
+ os_file_t file) /*!< in: handle to a file */
+{
+#ifdef __WIN__
+ os_offset_t offset;
+ DWORD high;
+ DWORD low;
+
+ low = GetFileSize(file, &high);
+
+ if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
+ return((os_offset_t) -1);
+ }
+
+ offset = (os_offset_t) low | ((os_offset_t) high << 32);
+
+ return(offset);
+#else
+ return((os_offset_t) lseek(file, 0, SEEK_END));
+#endif /* __WIN__ */
+}
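+
+/* Usage sketch (illustrative comment only; fh is assumed to be a valid
+handle returned by os_file_create()):
+
+	os_offset_t	size = os_file_get_size(fh);
+
+	if (size == (os_offset_t) -1) {
+		... the size could not be determined ...
+	}
+*/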
+
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ os_offset_t size) /*!< in: file size */
+{
+ os_offset_t current_size;
+ ibool ret;
+ byte* buf;
+ byte* buf2;
+ ulint buf_size;
+
+ current_size = 0;
+
+ /* Write up to 1 megabyte at a time. */
+ buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE))
+ * UNIV_PAGE_SIZE;
+ buf2 = static_cast<byte*>(ut_malloc(buf_size + UNIV_PAGE_SIZE));
+
+ /* Align the buffer for possible raw i/o */
+ buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
+
+ /* Write buffer full of zeros */
+ memset(buf, 0, buf_size);
+
+ if (size >= (os_offset_t) 100 << 20) {
+
+ fprintf(stderr, "InnoDB: Progress in MB:");
+ }
+
+ while (current_size < size) {
+ ulint n_bytes;
+
+ if (size - current_size < (os_offset_t) buf_size) {
+ n_bytes = (ulint) (size - current_size);
+ } else {
+ n_bytes = buf_size;
+ }
+
+ ret = os_file_write(name, file, buf, current_size, n_bytes);
+ if (!ret) {
+ ut_free(buf2);
+ goto error_handling;
+ }
+
+ /* Print about progress for each 100 MB written */
+ if ((current_size + n_bytes) / (100 << 20)
+ != current_size / (100 << 20)) {
+
+ fprintf(stderr, " %lu00",
+ (ulong) ((current_size + n_bytes)
+ / (100 << 20)));
+ }
+
+ current_size += n_bytes;
+ }
+
+ if (size >= (os_offset_t) 100 << 20) {
+
+ fprintf(stderr, "\n");
+ }
+
+ ut_free(buf2);
+
+ ret = os_file_flush(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+error_handling:
+ return(FALSE);
+}
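+
+/* Usage sketch (illustrative comment only; the 10 MB initial size is
+an arbitrary assumption): after creating a file, zero-fill it to its
+initial size before using it:
+
+	if (!os_file_set_size(name, fh, 10 * 1024 * 1024ULL)) {
+		... treat the create as failed and clean up ...
+	}
+*/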
+
+/***********************************************************************//**
+Truncates a file at its current position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+ FILE* file) /*!< in: file to be truncated */
+{
+#ifdef __WIN__
+ HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
+ return(SetEndOfFile(h));
+#else /* __WIN__ */
+ return(!ftruncate(fileno(file), ftell(file)));
+#endif /* __WIN__ */
+}
+
+#ifndef __WIN__
+/***********************************************************************//**
+Wrapper to fsync(2) that retries the call on some errors.
+If the call fails, the global variable errno is set to indicate the error.
+@return 0 if success, -1 otherwise */
+static
+int
+os_file_fsync(
+/*==========*/
+ os_file_t file) /*!< in: handle to a file */
+{
+ int ret;
+ int failures;
+ ibool retry;
+
+ failures = 0;
+
+ do {
+ ret = fsync(file);
+
+ os_n_fsyncs++;
+
+ if (ret == -1 && errno == ENOLCK) {
+
+ if (failures % 100 == 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: fsync(): "
+ "No locks available; retrying\n");
+ }
+
+ os_thread_sleep(200000 /* 0.2 sec */);
+
+ failures++;
+
+ retry = TRUE;
+ } else {
+
+ retry = FALSE;
+ }
+ } while (retry);
+
+ return(ret);
+}
+#endif /* !__WIN__ */
+
+/***********************************************************************//**
+NOTE! Use the corresponding macro os_file_flush(), not directly this function!
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush_func(
+/*===============*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ os_n_fsyncs++;
+
+ ret = FlushFileBuffers(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+
+ if (srv_start_raw_disk_in_use && GetLastError()
+ == ERROR_INVALID_FUNCTION) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#else
+ int ret;
+
+#if defined(HAVE_DARWIN_THREADS)
+# ifndef F_FULLFSYNC
+ /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
+# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
+# elif F_FULLFSYNC != 51
+# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
+# endif
+ /* Apple has disabled fsync() for internal disk drives in OS X. That
+ caused corruption for a user when he tested a power outage. Let us in
+ OS X use a nonstandard flush method recommended by an Apple
+ engineer. */
+
+ if (!srv_have_fullfsync) {
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+ ret = os_file_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+ ret = os_file_fsync(file);
+ }
+ }
+#else
+ ret = os_file_fsync(file);
+#endif
+
+ if (ret == 0) {
+ return(TRUE);
+ }
+
+ /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+ return(TRUE);
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#endif
+}
+
+#ifndef __WIN__
+/*******************************************************************//**
+Does a synchronous read operation in Posix.
+@return number of bytes read, -1 if error */
+static __attribute__((nonnull, warn_unused_result))
+ssize_t
+os_file_pread(
+/*==========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint n, /*!< in: number of bytes to read */
+ os_offset_t offset) /*!< in: file offset from where to read */
+{
+ off_t offs;
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+ ssize_t n_bytes;
+#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
+
+ ut_ad(n);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit offset */
+ offs = (off_t) offset;
+
+ if (sizeof(off_t) <= 4) {
+ if (offset != (os_offset_t) offs) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File read at offset > 4 GB");
+ }
+ }
+
+ os_n_file_reads++;
+
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+ (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+ (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1);
+ MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_preads++;
+ os_n_pending_reads++;
+ MONITOR_INC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
+
+ n_bytes = pread(file, buf, n, offs);
+
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+ (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+ (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1);
+ MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
+#else
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_preads--;
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
+
+ return(n_bytes);
+#else
+ {
+ off_t ret_offset;
+ ssize_t ret;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+ (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
+ MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
+#else
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ MONITOR_INC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1;
+ } else {
+ ret = read(file, buf, (ssize_t) n);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8
+ (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
+		MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
+#else
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */
+
+ return(ret);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+@return number of bytes written, -1 if error */
+static __attribute__((nonnull, warn_unused_result))
+ssize_t
+os_file_pwrite(
+/*===========*/
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from where to write */
+ ulint n, /*!< in: number of bytes to write */
+ os_offset_t offset) /*!< in: file offset where to write */
+{
+ ssize_t ret;
+ off_t offs;
+
+ ut_ad(n);
+ ut_ad(!srv_read_only_mode);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit offset */
+ offs = (off_t) offset;
+
+ if (sizeof(off_t) <= 4) {
+ if (offset != (os_offset_t) offs) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File write at offset > 4 GB.");
+ }
+ }
+
+ os_n_file_writes++;
+
+#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_pwrites++;
+ os_n_pending_writes++;
+ MONITOR_INC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+#else
+ (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
+ (void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1);
+ MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
+
+ ret = pwrite(file, buf, (ssize_t) n, offs);
+
+#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_pwrites--;
+ os_n_pending_writes--;
+ MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+#else
+ (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
+ (void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1);
+ MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
+#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */
+
+ return(ret);
+#else
+ {
+ off_t ret_offset;
+# ifndef UNIV_HOTBACKUP
+ ulint i;
+# endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes++;
+ MONITOR_INC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+
+# ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1;
+
+ goto func_exit;
+ }
+
+ ret = write(file, buf, (ssize_t) n);
+
+func_exit:
+# ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+
+ return(ret);
+ }
+#endif /* HAVE_PWRITE && !HAVE_BROKEN_PREAD */
+}
+#endif /* !__WIN__ */
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous positioned read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_func(
+/*==============*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n) /*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n);
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset & 0xFFFFFFFF;
+ high = (DWORD) (offset >> 32);
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ MONITOR_INC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+ return(TRUE);
+ } else if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in system call pread(). The operating"
+			" system error number is %lu.", (ulint) errno);
+	} else {
+		/* Partial read occurred */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tried to read " ULINTPF " bytes at offset "
+ UINT64PF ". Was only able to read %ld.",
+ n, offset, (lint) ret);
+ }
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error(NULL, "read");
+
+ if (retry) {
+ goto try_again;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read from file."
+ " OS error number %lu.\n",
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif /* __WIN__ */
+ );
+ fflush(stderr);
+
+ ut_error;
+
+ return(FALSE);
+}
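+
+/* Usage sketch (illustrative comment only; fh is an assumption, and a
+real caller would align the buffer with ut_align() in case O_DIRECT is
+in effect): read the first page of a file into memory:
+
+	byte*	page = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE));
+
+	if (os_file_read(fh, page, 0, UNIV_PAGE_SIZE)) {
+		... page 0 is now in the buffer ...
+	}
+
+	ut_free(page);
+*/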
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling_func(
+/*================================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ os_offset_t offset, /*!< in: file offset where to read */
+ ulint n) /*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n);
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset & 0xFFFFFFFF;
+ high = (DWORD) (offset >> 32);
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ MONITOR_INC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ MONITOR_DEC(MONITOR_OS_PENDING_READS);
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+ return(TRUE);
+ } else if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in system call pread(). The operating"
+			" system error number is %lu.", (ulint) errno);
+	} else {
+		/* Partial read occurred */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tried to read " ULINTPF " bytes at offset "
+ UINT64PF ". Was only able to read %ld.",
+ n, offset, (lint) ret);
+ }
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
+
+ if (retry) {
+ goto try_again;
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size) /*!< in: size of buffer */
+{
+ size_t flen;
+
+ if (size == 0) {
+ return;
+ }
+
+ rewind(file);
+ flen = fread(str, 1, size - 1, file);
+ str[flen] = '\0';
+}
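+
+/* Usage sketch (illustrative comment only; tmpf is assumed to be a
+temporary FILE* that some command output was written to). After the
+call, buf holds at most sizeof(buf) - 1 bytes and is NUL-terminated
+even if the read failed:
+
+	char	buf[1024];
+
+	os_file_read_string(tmpf, buf, sizeof(buf));
+*/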
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ os_offset_t offset, /*!< in: file offset where to write */
+ ulint n) /*!< in: number of bytes to write */
+{
+ ut_ad(!srv_read_only_mode);
+
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ulint n_retries = 0;
+ ulint err;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+ no more than 32 bits. */
+ ut_a((n & 0xFFFFFFFFUL) == n);
+
+ os_n_file_writes++;
+
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+retry:
+ low = (DWORD) offset & 0xFFFFFFFF;
+ high = (DWORD) (offset >> 32);
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes++;
+ MONITOR_INC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(
+ file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: File pointer positioning to"
+ " file %s failed at\n"
+ "InnoDB: offset %llu. Operating system"
+ " error number %lu.\n"
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n",
+ name, offset, (ulong) GetLastError());
+
+ return(FALSE);
+ }
+
+ ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ MONITOR_DEC(MONITOR_OS_PENDING_WRITES);
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+
+ return(TRUE);
+ }
+
+ /* If some background file system backup tool is running, then, at
+ least in Windows 2000, we may get here a specific error. Let us
+ retry the operation 100 times, with 1 second waits. */
+
+ if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+ os_thread_sleep(1000000);
+
+ n_retries++;
+
+ goto retry;
+ }
+
+ if (!os_has_said_disk_full) {
+
+ err = (ulint) GetLastError();
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset %llu.\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %lu were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, offset,
+ (ulong) n, (ulong) len, (ulong) err);
+
+ if (strerror((int) err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %lu means '%s'.\n",
+ (ulong) err, strerror((int) err));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#else
+ ssize_t ret;
+
+ ret = os_file_pwrite(file, buf, n, offset);
+
+ if ((ulint) ret == n) {
+
+ return(TRUE);
+ }
+
+ if (!os_has_said_disk_full) {
+
+ ut_print_timestamp(stderr);
+
+		if (ret == -1) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Failure of system call pwrite(). Operating"
+ " system error number is %lu.",
+ (ulint) errno);
+ } else {
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset " UINT64PF ".\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %ld were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, offset, n, (lint) ret,
+ (ulint) errno);
+ }
+
+ if (strerror(errno) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %d means '%s'.\n",
+ errno, strerror(errno));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type) /*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+ int ret;
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(FALSE);
+ }
+
+ if (_S_IFDIR & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#else
+ int ret;
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(FALSE);
+ }
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#endif
+}
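+
+/* Usage sketch (illustrative comment only; the path is an assumption):
+check whether a tablespace file is already present:
+
+	ibool		exists;
+	os_file_type_t	type;
+
+	if (os_file_status("test/t1.ibd", &exists, &type)
+	    && exists
+	    && type == OS_FILE_TYPE_FILE) {
+		... the file exists and is a regular file ...
+	}
+*/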
+
+/*******************************************************************//**
+This function returns information about the specified file
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info, /*!< information of a file in a
+ directory */
+ bool check_rw_perm) /*!< in: for testing whether the
+ file can be opened in RW mode */
+{
+ int ret;
+
+#ifdef __WIN__
+ struct _stat64 statinfo;
+
+ ret = _stat64(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ } else if (_S_IFDIR & statinfo.st_mode) {
+ stat_info->type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+
+ DWORD access = GENERIC_READ;
+
+ if (!srv_read_only_mode) {
+ access |= GENERIC_WRITE;
+ }
+
+ stat_info->type = OS_FILE_TYPE_FILE;
+
+ /* Check if we can open it in read-only mode. */
+
+ if (check_rw_perm) {
+ HANDLE fh;
+
+ fh = CreateFile(
+ (LPCTSTR) path, // File to open
+ access,
+ 0, // No sharing
+ NULL, // Default security
+ OPEN_EXISTING, // Existing file only
+ FILE_ATTRIBUTE_NORMAL, // Normal file
+ NULL); // No attr. template
+
+ if (fh == INVALID_HANDLE_VALUE) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ CloseHandle(fh);
+ }
+ }
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+#else
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(DB_NOT_FOUND);
+
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat", FALSE);
+
+ return(DB_FAIL);
+
+ }
+
+ switch (statinfo.st_mode & S_IFMT) {
+ case S_IFDIR:
+ stat_info->type = OS_FILE_TYPE_DIR;
+ break;
+ case S_IFLNK:
+ stat_info->type = OS_FILE_TYPE_LINK;
+ break;
+ case S_IFBLK:
+ stat_info->type = OS_FILE_TYPE_BLOCK;
+ break;
+ case S_IFREG:
+ stat_info->type = OS_FILE_TYPE_FILE;
+ break;
+ default:
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+
+ if (check_rw_perm && (stat_info->type == OS_FILE_TYPE_FILE
+ || stat_info->type == OS_FILE_TYPE_BLOCK)) {
+ int fh;
+ int access;
+
+ access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
+
+ fh = ::open(path, access, os_innodb_umask);
+
+ if (fh == -1) {
+ stat_info->rw_perm = false;
+ } else {
+ stat_info->rw_perm = true;
+ close(fh);
+ }
+ }
+
+#endif /* __WIN__ */
+
+ stat_info->ctime = statinfo.st_ctime;
+ stat_info->atime = statinfo.st_atime;
+ stat_info->mtime = statinfo.st_mtime;
+ stat_info->size = statinfo.st_size;
+
+ return(DB_SUCCESS);
+}
+
+/* path name separator character */
+#ifdef __WIN__
+# define OS_FILE_PATH_SEPARATOR '\\'
+#else
+# define OS_FILE_PATH_SEPARATOR '/'
+#endif
+
+/****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return own: new full pathname */
+UNIV_INTERN
+char*
+os_file_make_new_pathname(
+/*======================*/
+ const char* old_path, /*!< in: pathname */
+ const char* tablename) /*!< in: contains new base name */
+{
+ ulint dir_len;
+ char* last_slash;
+ char* base_name;
+ char* new_path;
+ ulint new_path_len;
+
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ last_slash = strrchr((char*) tablename, '/');
+ base_name = last_slash ? last_slash + 1 : (char*) tablename;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR);
+ dir_len = last_slash ? last_slash - old_path : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
+ new_path = static_cast<char*>(mem_alloc(new_path_len));
+ memcpy(new_path, old_path, dir_len);
+
+ ut_snprintf(new_path + dir_len,
+ new_path_len - dir_len,
+ "%c%s.ibd",
+ OS_FILE_PATH_SEPARATOR,
+ base_name);
+
+ return(new_path);
+}
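+
+/* Worked example (illustrative comment only; the paths are assumptions):
+with old_path = "/data/db1/t1.ibd" and tablename = "db2/t2", the
+directory component "/data/db1" is kept, the old basename "t1.ibd" is
+stripped, and the returned path is "/data/db1/t2.ibd". */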
+
+/****************************************************************//**
+This function returns a remote path name by combining a data directory
+path provided in a DATA DIRECTORY clause with the tablename which is
+in the form 'database/tablename'. It strips the file basename (which
+is the tablename) found after the last directory in the path provided.
+The full filepath created will include the database name as a directory
+under the path provided. The filename is the tablename with the '.ibd'
+extension. All input and output strings are null-terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@return own: A full pathname; data_dir_path/databasename/tablename.ibd */
+UNIV_INTERN
+char*
+os_file_make_remote_pathname(
+/*=========================*/
+ const char* data_dir_path, /*!< in: pathname */
+ const char* tablename, /*!< in: tablename */
+	const char*	extention)	/*!< in: file extension; ibd,cfg */
+{
+ ulint data_dir_len;
+ char* last_slash;
+ char* new_path;
+ ulint new_path_len;
+
+ ut_ad(extention && strlen(extention) == 3);
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename or tablename which starts after that slash. */
+ last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ new_path_len = data_dir_len + strlen(tablename)
+ + sizeof "/." + strlen(extention);
+ new_path = static_cast<char*>(mem_alloc(new_path_len));
+ memcpy(new_path, data_dir_path, data_dir_len);
+ ut_snprintf(new_path + data_dir_len,
+ new_path_len - data_dir_len,
+ "%c%s.%s",
+ OS_FILE_PATH_SEPARATOR,
+ tablename,
+ extention);
+
+ srv_normalize_path_for_win(new_path);
+
+ return(new_path);
+}
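+
+/* Worked example (illustrative comment only; the paths are assumptions):
+with data_dir_path = "/data/remote/t1" (the path sent by MySQL ends in
+the tablename, which is stripped off), tablename = "db1/t1" and
+extention = "ibd", the returned path is "/data/remote/db1/t1.ibd". */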
+
+/****************************************************************//**
+This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command. */
+UNIV_INTERN
+void
+os_file_make_data_dir_path(
+/*========================*/
+ char* data_dir_path) /*!< in/out: full path/data_dir_path */
+{
+ char* ptr;
+ char* tablename;
+ ulint tablename_len;
+
+ /* Replace the period before the extension with a null byte. */
+ ptr = strrchr((char*) data_dir_path, '.');
+ if (!ptr) {
+ return;
+ }
+ ptr[0] = '\0';
+
+ /* The tablename starts after the last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ if (!ptr) {
+ return;
+ }
+ ptr[0] = '\0';
+ tablename = ptr + 1;
+
+ /* The databasename starts after the next to last slash. */
+ ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR);
+ if (!ptr) {
+ return;
+ }
+ tablename_len = ut_strlen(tablename);
+
+ ut_memmove(++ptr, tablename, tablename_len);
+
+ ptr[tablename_len] = '\0';
+}
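+
+/* Worked example (illustrative comment only; the path is an assumption):
+given "/data/remote/db1/t1.ibd", the buffer is rewritten in place to
+"/data/remote/t1", i.e. the inverse of os_file_make_remote_pathname(). */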
+
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path) /*!< in: pathname */
+{
+ /* Find the offset of the last slash */
+ const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
+ if (!last_slash) {
+ /* No slash in the path, return "." */
+
+ return(mem_strdup("."));
+ }
+
+ /* Ok, there is a slash */
+
+ if (last_slash == path) {
+ /* last slash is the first char of the path */
+
+ return(mem_strdup("/"));
+ }
+
+ /* Non-trivial directory component */
+
+ return(mem_strdupl(path, last_slash - path));
+}
+
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded, FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path) /*!< in: path name */
+{
+ if (srv_read_only_mode) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "read only mode set. Can't create subdirectories '%s'",
+ path);
+
+ return(FALSE);
+
+ }
+
+ char* subdir = os_file_dirname(path);
+
+ if (strlen(subdir) == 1
+ && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
+ /* subdir is root or cwd, nothing to do */
+ mem_free(subdir);
+
+ return(TRUE);
+ }
+
+ /* Test if subdir exists */
+ os_file_type_t type;
+ ibool subdir_exists;
+ ibool success = os_file_status(subdir, &subdir_exists, &type);
+
+ if (success && !subdir_exists) {
+
+ /* subdir does not exist, create it */
+ success = os_file_create_subdirs_if_needed(subdir);
+
+ if (!success) {
+ mem_free(subdir);
+
+ return(FALSE);
+ }
+
+ success = os_file_create_directory(subdir, FALSE);
+ }
+
+ mem_free(subdir);
+
+ return(success);
+}
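+
+/* Usage sketch (illustrative comment only; the path is an assumption):
+before creating "/data/remote/db1/t1.ibd", make sure that all of its
+parent directories exist:
+
+	if (!os_file_create_subdirs_if_needed("/data/remote/db1/t1.ibd")) {
+		... fail the tablespace creation ...
+	}
+*/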
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Returns a pointer to the nth slot in the aio array.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+ os_aio_array_t* array, /*!< in: aio array */
+ ulint index) /*!< in: index of the slot */
+{
+ ut_a(index < array->n_slots);
+
+ return(&array->slots[index]);
+}
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+Creates an io_context for native linux AIO.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_create_io_ctx(
+/*=======================*/
+ ulint max_events, /*!< in: number of events. */
+ io_context_t* io_ctx) /*!< out: io_ctx to initialize. */
+{
+ int ret;
+ ulint retries = 0;
+
+retry:
+ memset(io_ctx, 0x0, sizeof(*io_ctx));
+
+ /* Initialize the io_ctx. Tell it how many pending
+ IO requests this context will handle. */
+
+ ret = io_setup(max_events, io_ctx);
+ if (ret == 0) {
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "InnoDB: Linux native AIO:"
+ " initialized io_ctx for segment\n");
+#endif
+ /* Success. Return now. */
+ return(TRUE);
+ }
+
+ /* If we hit EAGAIN we'll make a few attempts before failing. */
+
+ switch (ret) {
+ case -EAGAIN:
+ if (retries == 0) {
+ /* First time around. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: io_setup() failed"
+ " with EAGAIN. Will make %d attempts"
+ " before giving up.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ }
+
+ if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
+ ++retries;
+ fprintf(stderr,
+ "InnoDB: Warning: io_setup() attempt"
+ " %lu failed.\n",
+ retries);
+ os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
+ goto retry;
+ }
+
+ /* Have tried enough. Better call it a day. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: io_setup() failed"
+ " with EAGAIN after %d attempts.\n",
+ OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
+ break;
+
+ case -ENOSYS:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO interface"
+ " is not supported on this platform. Please"
+ " check your OS documentation and install"
+ " appropriate binary of InnoDB.\n");
+
+ break;
+
+ default:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Linux Native AIO setup"
+ " returned following error[%d]\n", -ret);
+ break;
+ }
+
+ fprintf(stderr,
+ "InnoDB: You can disable Linux Native AIO by"
+ " setting innodb_use_native_aio = 0 in my.cnf\n");
+ return(FALSE);
+}
+
+/******************************************************************//**
+Checks if the system supports native linux aio. On some kernel
+versions where native aio is supported it won't work on tmpfs. In such
+cases we can't use native aio as it is not possible to mix simulated
+and native aio.
+@return TRUE if supported, FALSE otherwise. */
+static
+ibool
+os_aio_native_aio_supported(void)
+/*=============================*/
+{
+ int fd;
+ io_context_t io_ctx;
+ char name[1000];
+
+ if (!os_aio_linux_create_io_ctx(1, &io_ctx)) {
+ /* The platform does not support native aio. */
+ return(FALSE);
+ } else if (!srv_read_only_mode) {
+ /* Now check if tmpdir supports native aio ops. */
+ fd = innobase_mysql_tmpfile();
+
+ if (fd < 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to create temp file to check "
+ "native AIO support.");
+
+ return(FALSE);
+ }
+ } else {
+
+ srv_normalize_path_for_win(srv_log_group_home_dir);
+
+ ulint dirnamelen = strlen(srv_log_group_home_dir);
+ ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
+ memcpy(name, srv_log_group_home_dir, dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ strcpy(name + dirnamelen, "ib_logfile0");
+
+ fd = ::open(name, O_RDONLY);
+
+ if (fd == -1) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to open \"%s\" to check "
+ "native AIO read support.", name);
+
+ return(FALSE);
+ }
+ }
+
+ struct io_event io_event;
+
+ memset(&io_event, 0x0, sizeof(io_event));
+
+ byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
+ byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+
+ struct iocb iocb;
+
+ /* Suppress valgrind warning. */
+ memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
+ memset(&iocb, 0x0, sizeof(iocb));
+
+ struct iocb* p_iocb = &iocb;
+
+ if (!srv_read_only_mode) {
+ io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
+ } else {
+ ut_a(UNIV_PAGE_SIZE >= 512);
+ io_prep_pread(p_iocb, fd, ptr, 512, 0);
+ }
+
+ int err = io_submit(io_ctx, 1, &p_iocb);
+
+ if (err >= 1) {
+ /* Now collect the submitted IO request. */
+ err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
+ }
+
+ ut_free(buf);
+ close(fd);
+
+ switch (err) {
+ case 1:
+ return(TRUE);
+
+ case -EINVAL:
+ case -ENOSYS:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO not supported. You can either "
+ "move %s to a file system that supports native "
+ "AIO or you can set innodb_use_native_aio to "
+ "FALSE to avoid this message.",
+ srv_read_only_mode ? name : "tmpdir");
+
+ /* fall through. */
+ default:
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Linux Native AIO check on %s returned error[%d]",
+ srv_read_only_mode ? name : "tmpdir", -err);
+ }
+
+ return(FALSE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/******************************************************************//**
+Creates an aio wait array. Note that we return NULL in case of failure.
+We don't care about freeing memory here because we assume that a
+failure will result in the server refusing to start up.
+@return own: aio array, NULL on failure */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+ ulint n, /*!< in: maximum number of pending aio
+ operations allowed; n must be
+ divisible by n_segments */
+ ulint n_segments) /*!< in: number of segments in the aio array */
+{
+ os_aio_array_t* array;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* over;
+#elif defined(LINUX_NATIVE_AIO)
+ struct io_event* io_event = NULL;
+#endif /* WIN_ASYNC_IO */
+ ut_a(n > 0);
+ ut_a(n_segments > 0);
+
+ array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array)));
+ memset(array, 0x0, sizeof(*array));
+
+ array->mutex = os_mutex_create();
+ array->not_full = os_event_create();
+ array->is_empty = os_event_create();
+
+ os_event_set(array->is_empty);
+
+ array->n_slots = n;
+ array->n_segments = n_segments;
+
+ array->slots = static_cast<os_aio_slot_t*>(
+ ut_malloc(n * sizeof(*array->slots)));
+
+	memset(array->slots, 0x0, n * sizeof(*array->slots));
+#ifdef __WIN__
+ array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE)));
+#endif /* __WIN__ */
+
+#if defined(LINUX_NATIVE_AIO)
+ array->aio_ctx = NULL;
+ array->aio_events = NULL;
+
+ /* If we are not using native aio interface then skip this
+ part of initialization. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Initialize the io_context array. One io_context
+ per segment in the array. */
+
+ array->aio_ctx = static_cast<io_context**>(
+ ut_malloc(n_segments * sizeof(*array->aio_ctx)));
+
+ for (ulint i = 0; i < n_segments; ++i) {
+ if (!os_aio_linux_create_io_ctx(n/n_segments,
+ &array->aio_ctx[i])) {
+ /* If something bad happened during aio setup
+ we should call it a day and return right away.
+ We don't care about any leaks because a failure
+ to initialize the io subsystem means that the
+			server (or at least the innodb storage engine)
+			is not going to start up. */
+ return(NULL);
+ }
+ }
+
+ /* Initialize the event array. One event per slot. */
+ io_event = static_cast<struct io_event*>(
+ ut_malloc(n * sizeof(*io_event)));
+
+ memset(io_event, 0x0, sizeof(*io_event) * n);
+ array->aio_events = io_event;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ slot->pos = i;
+ slot->reserved = FALSE;
+#ifdef WIN_ASYNC_IO
+		slot->handle = CreateEvent(NULL, TRUE, FALSE, NULL);
+
+ over = &slot->control;
+
+ over->hEvent = slot->handle;
+
+ array->handles[i] = over->hEvent;
+
+#elif defined(LINUX_NATIVE_AIO)
+ memset(&slot->control, 0x0, sizeof(slot->control));
+ slot->n_bytes = 0;
+ slot->ret = 0;
+#endif /* WIN_ASYNC_IO */
+ }
+
+ return(array);
+}
+
+/************************************************************************//**
+Frees an aio wait array. */
+static
+void
+os_aio_array_free(
+/*==============*/
+ os_aio_array_t*& array) /*!< in, own: array to free */
+{
+#ifdef WIN_ASYNC_IO
+ ulint i;
+
+ for (i = 0; i < array->n_slots; i++) {
+ os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
+ CloseHandle(slot->handle);
+ }
+#endif /* WIN_ASYNC_IO */
+
+#ifdef __WIN__
+ ut_free(array->handles);
+#endif /* __WIN__ */
+ os_mutex_free(array->mutex);
+ os_event_free(array->not_full);
+ os_event_free(array->is_empty);
+
+#if defined(LINUX_NATIVE_AIO)
+ if (srv_use_native_aio) {
+ ut_free(array->aio_events);
+ ut_free(array->aio_ctx);
+ }
+#endif /* LINUX_NATIVE_AIO */
+
+ ut_free(array->slots);
+ ut_free(array);
+
+ array = 0;
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write, each of
+which is divided logically into n_read_segs and n_write_segs segments
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that. */
+UNIV_INTERN
+ibool
+os_aio_init(
+/*========*/
+ ulint n_per_seg, /*!< in: maximum number of pending aio
+ operations allowed per segment */
+ ulint n_read_segs, /*!< in: number of reader threads */
+ ulint n_write_segs, /*!< in: number of writer threads */
+ ulint n_slots_sync) /*!< in: number of slots in the sync aio
+ array */
+{
+ os_io_init_simple();
+
+#if defined(LINUX_NATIVE_AIO)
+ /* Check if native aio is supported on this system and tmpfs */
+ if (srv_use_native_aio && !os_aio_native_aio_supported()) {
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled.");
+
+ srv_use_native_aio = FALSE;
+ }
+#endif /* LINUX_NATIVE_AIO */
+
+ srv_reset_io_thread_op_info();
+
+ os_aio_read_array = os_aio_array_create(
+ n_read_segs * n_per_seg, n_read_segs);
+
+ if (os_aio_read_array == NULL) {
+ return(FALSE);
+ }
+
+ ulint start = (srv_read_only_mode) ? 0 : 2;
+ ulint n_segs = n_read_segs + start;
+
+ /* 0 is the ibuf (insert buffer) segment and 1 is the log segment. */
+ for (ulint i = start; i < n_segs; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+ ulint n_segments = n_read_segs;
+
+ if (!srv_read_only_mode) {
+
+ os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_log_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[1] = "log thread";
+
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ if (os_aio_ibuf_array == NULL) {
+ return(FALSE);
+ }
+
+ ++n_segments;
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+ os_aio_write_array = os_aio_array_create(
+ n_write_segs * n_per_seg, n_write_segs);
+
+ if (os_aio_write_array == NULL) {
+ return(FALSE);
+ }
+
+ n_segments += n_write_segs;
+
+ for (ulint i = start + n_read_segs; i < n_segments; ++i) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "write thread";
+ }
+
+ ut_ad(n_segments >= 4);
+ } else {
+ ut_ad(n_segments > 0);
+ }
+
+ os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
+ if (os_aio_sync_array == NULL) {
+ return(FALSE);
+ }
+
+ os_aio_n_segments = n_segments;
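+
+ /* For example, with 4 read and 4 write segments the global
+ segment numbering set up above is: 0 = ibuf, 1 = log,
+ 2..5 = read, 6..9 = write, so n_segments == 10. In read-only
+ mode only the read segments exist and they start at 0. */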
+
+ os_aio_validate();
+
+ os_aio_segment_wait_events = static_cast<os_event_t*>(
+ ut_malloc(n_segments * sizeof *os_aio_segment_wait_events));
+
+ for (ulint i = 0; i < n_segments; ++i) {
+ os_aio_segment_wait_events[i] = os_event_create();
+ }
+
+ os_last_printout = ut_time();
+
+ return(TRUE);
+
+}
+
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void)
+/*=============*/
+{
+ if (os_aio_ibuf_array != 0) {
+ os_aio_array_free(os_aio_ibuf_array);
+ }
+
+ if (os_aio_log_array != 0) {
+ os_aio_array_free(os_aio_log_array);
+ }
+
+ if (os_aio_write_array != 0) {
+ os_aio_array_free(os_aio_write_array);
+ }
+
+ if (os_aio_sync_array != 0) {
+ os_aio_array_free(os_aio_sync_array);
+ }
+
+ os_aio_array_free(os_aio_read_array);
+
+ for (ulint i = 0; i < os_aio_n_segments; i++) {
+ os_event_free(os_aio_segment_wait_events[i]);
+ }
+
+ ut_free(os_aio_segment_wait_events);
+ os_aio_segment_wait_events = 0;
+ os_aio_n_segments = 0;
+}
+
+#ifdef WIN_ASYNC_IO
+/************************************************************************//**
+Wakes up all async i/o threads in the array in Windows async i/o at
+shutdown. */
+static
+void
+os_aio_array_wake_win_aio_at_shutdown(
+/*==================================*/
+ os_aio_array_t* array) /*!< in: aio array */
+{
+ ulint i;
+
+ for (i = 0; i < array->n_slots; i++) {
+
+ SetEvent((array->slots + i)->handle);
+ }
+}
+#endif
+
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void)
+/*=====================================*/
+{
+#ifdef WIN_ASYNC_IO
+ /* This code wakes up all i/o threads in Windows native aio */
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
+ if (os_aio_write_array != 0) {
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
+ }
+
+ if (os_aio_ibuf_array != 0) {
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
+ }
+
+ if (os_aio_log_array != 0) {
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+ }
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* When using native AIO interface the io helper threads
+ wait on io_getevents with a timeout value of 500ms. At
+ each wake up these threads check the server status.
+ No need to do anything to wake them up. */
+
+ if (srv_use_native_aio) {
+ return;
+ }
+
+ /* Fall through to simulated AIO handler wakeup if we are
+ not using native AIO. */
+#endif /* WIN_ASYNC_IO */
+
+ /* This loop wakes up all simulated i/o threads */
+
+ for (ulint i = 0; i < os_aio_n_segments; i++) {
+
+ os_event_set(os_aio_segment_wait_events[i]);
+ }
+}
+
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+ ut_ad(!srv_read_only_mode);
+ os_event_wait(os_aio_write_array->is_empty);
+}
+
+/**********************************************************************//**
+Calculates segment number for a slot.
+@return segment number (which is the number used by, for example,
+i/o-handler threads) */
+static
+ulint
+os_aio_get_segment_no_from_slot(
+/*============================*/
+ os_aio_array_t* array, /*!< in: aio wait array */
+ os_aio_slot_t* slot) /*!< in: slot in this array */
+{
+ ulint segment;
+ ulint seg_len;
+
+ if (array == os_aio_ibuf_array) {
+ ut_ad(!srv_read_only_mode);
+
+ segment = IO_IBUF_SEGMENT;
+
+ } else if (array == os_aio_log_array) {
+ ut_ad(!srv_read_only_mode);
+
+ segment = IO_LOG_SEGMENT;
+
+ } else if (array == os_aio_read_array) {
+ seg_len = os_aio_read_array->n_slots
+ / os_aio_read_array->n_segments;
+
+ segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
+ } else {
+ ut_ad(!srv_read_only_mode);
+ ut_a(array == os_aio_write_array);
+
+ seg_len = os_aio_write_array->n_slots
+ / os_aio_write_array->n_segments;
+
+ segment = os_aio_read_array->n_segments + 2
+ + slot->pos / seg_len;
+ }
+
+ return(segment);
+}
+
+/**********************************************************************//**
+Calculates local segment number and aio array from global segment number.
+@return local segment number within the aio array */
+static
+ulint
+os_aio_get_array_and_local_segment(
+/*===============================*/
+ os_aio_array_t** array, /*!< out: aio wait array */
+ ulint global_segment)/*!< in: global segment number */
+{
+ ulint segment;
+
+ ut_a(global_segment < os_aio_n_segments);
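+
+ /* Example of the reverse mapping (assuming 4 read segments and
+ not read-only mode): global segment 0 maps to the ibuf array,
+ 1 to the log array, 2..5 to read array locals 0..3, and 6 and
+ up to the write array. */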
+
+ if (srv_read_only_mode) {
+ *array = os_aio_read_array;
+
+ return(global_segment);
+ } else if (global_segment == IO_IBUF_SEGMENT) {
+ *array = os_aio_ibuf_array;
+ segment = 0;
+
+ } else if (global_segment == IO_LOG_SEGMENT) {
+ *array = os_aio_log_array;
+ segment = 0;
+
+ } else if (global_segment < os_aio_read_array->n_segments + 2) {
+ *array = os_aio_read_array;
+
+ segment = global_segment - 2;
+ } else {
+ *array = os_aio_write_array;
+
+ segment = global_segment - (os_aio_read_array->n_segments + 2);
+ }
+
+ return(segment);
+}
+
+/*******************************************************************//**
+Requests a slot in the aio array. If no slot is available, waits until
+the not_full event becomes signaled.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_reserve_slot(
+/*======================*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ os_aio_array_t* array, /*!< in: aio array */
+ fil_node_t* message1,/*!< in: message to be passed along with
+ the aio operation */
+ void* message2,/*!< in: message to be passed along with
+ the aio operation */
+ os_file_t file, /*!< in: file handle */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset */
+ ulint len) /*!< in: length of the block to read or write */
+{
+ os_aio_slot_t* slot = NULL;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* control;
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ struct iocb* iocb;
+ off_t aio_offset;
+
+#endif /* WIN_ASYNC_IO */
+ ulint i;
+ ulint counter;
+ ulint slots_per_seg;
+ ulint local_seg;
+
+#ifdef WIN_ASYNC_IO
+ ut_a((len & 0xFFFFFFFFUL) == len);
+#endif /* WIN_ASYNC_IO */
+
+ /* No need for a mutex. Only reading constant fields */
+ slots_per_seg = array->n_slots / array->n_segments;
+
+ /* We attempt to keep adjacent blocks in the same local
+ segment. This can help in merging IO requests when we are
+ doing simulated AIO */
+ local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
+ % array->n_segments;
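+
+ /* For instance, with 16 kB pages (UNIV_PAGE_SIZE_SHIFT == 14)
+ shifting by UNIV_PAGE_SIZE_SHIFT + 6 groups the file into runs
+ of 64 pages (1 MB), so offsets within the same 1 MB run prefer
+ the same local segment. */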
+
+loop:
+ os_mutex_enter(array->mutex);
+
+ if (array->n_reserved == array->n_slots) {
+ os_mutex_exit(array->mutex);
+
+ if (!srv_use_native_aio) {
+ /* If the handler threads are suspended, wake them
+ so that we get more slots */
+
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ os_event_wait(array->not_full);
+
+ goto loop;
+ }
+
+ /* We start our search for an available slot from our preferred
+ local segment and do a full scan of the array. We are
+ guaranteed to find a slot in a full scan. */
+ for (i = local_seg * slots_per_seg, counter = 0;
+ counter < array->n_slots;
+ i++, counter++) {
+
+ i %= array->n_slots;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved == FALSE) {
+ goto found;
+ }
+ }
+
+ /* We MUST always be able to get hold of a free slot. */
+ ut_error;
+
+found:
+ ut_a(slot->reserved == FALSE);
+ array->n_reserved++;
+
+ if (array->n_reserved == 1) {
+ os_event_reset(array->is_empty);
+ }
+
+ if (array->n_reserved == array->n_slots) {
+ os_event_reset(array->not_full);
+ }
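+
+ /* The two events now reflect the array state: is_empty was
+ reset above on the first reservation and is set again only
+ when os_aio_array_free_slot() releases the last reserved slot;
+ not_full is reset when the array fills up and set again as
+ soon as a slot is freed. */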
+
+ slot->reserved = TRUE;
+ slot->reservation_time = ut_time();
+ slot->message1 = message1;
+ slot->message2 = message2;
+ slot->file = file;
+ slot->name = name;
+ slot->len = len;
+ slot->type = type;
+ slot->buf = static_cast<byte*>(buf);
+ slot->offset = offset;
+ slot->io_already_done = FALSE;
+
+#ifdef WIN_ASYNC_IO
+ control = &slot->control;
+ control->Offset = (DWORD) offset & 0xFFFFFFFF;
+ control->OffsetHigh = (DWORD) (offset >> 32);
+ ResetEvent(slot->handle);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ /* If we are not using native AIO skip this part. */
+ if (!srv_use_native_aio) {
+ goto skip_native_aio;
+ }
+
+ /* Check if we are dealing with 64 bit arch.
+ If not then make sure that offset fits in 32 bits. */
+ aio_offset = (off_t) offset;
+
+ ut_a(sizeof(aio_offset) >= sizeof(offset)
+ || ((os_offset_t) aio_offset) == offset);
+
+ iocb = &slot->control;
+
+ if (type == OS_FILE_READ) {
+ io_prep_pread(iocb, file, buf, len, aio_offset);
+ } else {
+ ut_a(type == OS_FILE_WRITE);
+ io_prep_pwrite(iocb, file, buf, len, aio_offset);
+ }
+
+ iocb->data = (void*) slot;
+ slot->n_bytes = 0;
+ slot->ret = 0;
+
+skip_native_aio:
+#endif /* LINUX_NATIVE_AIO */
+ os_mutex_exit(array->mutex);
+
+ return(slot);
+}
+
+/*******************************************************************//**
+Frees a slot in the aio array. */
+static
+void
+os_aio_array_free_slot(
+/*===================*/
+ os_aio_array_t* array, /*!< in: aio array */
+ os_aio_slot_t* slot) /*!< in: pointer to slot */
+{
+ os_mutex_enter(array->mutex);
+
+ ut_ad(slot->reserved);
+
+ slot->reserved = FALSE;
+
+ array->n_reserved--;
+
+ if (array->n_reserved == array->n_slots - 1) {
+ os_event_set(array->not_full);
+ }
+
+ if (array->n_reserved == 0) {
+ os_event_set(array->is_empty);
+ }
+
+#ifdef WIN_ASYNC_IO
+
+ ResetEvent(slot->handle);
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ if (srv_use_native_aio) {
+ memset(&slot->control, 0x0, sizeof(slot->control));
+ slot->n_bytes = 0;
+ slot->ret = 0;
+ /*fprintf(stderr, "Freed up Linux native slot.\n");*/
+ } else {
+ /* These fields should not be used if we are not
+ using native AIO. */
+ ut_ad(slot->n_bytes == 0);
+ ut_ad(slot->ret == 0);
+ }
+
+#endif
+ os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Wakes up a simulated aio i/o-handler thread if it has something to do. */
+static
+void
+os_aio_simulated_wake_handler_thread(
+/*=================================*/
+ ulint global_segment) /*!< in: the number of the segment in the aio
+ arrays */
+{
+ os_aio_array_t* array;
+ ulint segment;
+
+ ut_ad(!srv_use_native_aio);
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+ ulint n = array->n_slots / array->n_segments;
+
+ segment *= n;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ os_mutex_enter(array->mutex);
+
+ for (ulint i = 0; i < n; ++i) {
+ const os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, segment + i);
+
+ if (slot->reserved) {
+
+ /* Found an i/o request */
+
+ os_mutex_exit(array->mutex);
+
+ os_event_t event;
+
+ event = os_aio_segment_wait_events[global_segment];
+
+ os_event_set(event);
+
+ return;
+ }
+ }
+
+ os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void)
+/*=======================================*/
+{
+ if (srv_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+ return;
+ }
+
+ os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ for (ulint i = 0; i < os_aio_n_segments; i++) {
+ os_aio_simulated_wake_handler_thread(i);
+ }
+}
+
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void)
+/*============================================*/
+{
+
+/* The idea of putting background IO threads to sleep is only for
+Windows when using simulated AIO. Windows XP seems to schedule
+background threads too eagerly to allow for coalescing during
+readahead requests. */
+#ifdef __WIN__
+ os_aio_array_t* array;
+
+ if (srv_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+ return;
+ }
+
+ os_aio_recommend_sleep_for_read_threads = TRUE;
+
+ for (ulint i = 0; i < os_aio_n_segments; i++) {
+ os_aio_get_array_and_local_segment(&array, i);
+
+ if (array == os_aio_read_array) {
+
+ os_event_reset(os_aio_segment_wait_events[i]);
+ }
+ }
+#endif /* __WIN__ */
+}
+
+#if defined(LINUX_NATIVE_AIO)
+/*******************************************************************//**
+Dispatch an AIO request to the kernel.
+@return TRUE on success. */
+static
+ibool
+os_aio_linux_dispatch(
+/*==================*/
+ os_aio_array_t* array, /*!< in: io request array. */
+ os_aio_slot_t* slot) /*!< in: an already reserved slot. */
+{
+ int ret;
+ ulint io_ctx_index;
+ struct iocb* iocb;
+
+ ut_ad(slot != NULL);
+ ut_ad(array);
+
+ ut_a(slot->reserved);
+
+ /* Find out what we are going to work with.
+ The iocb struct is directly in the slot.
+ The io_context is one per segment. */
+
+ iocb = &slot->control;
+ io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
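+
+ /* E.g. with 256 slots and 4 segments, slots 0..63 map to
+ aio_ctx[0], slots 64..127 to aio_ctx[1], and so on. */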
+
+ ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
+
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
+ array->aio_ctx[io_ctx_index], (ulong) io_ctx_index);
+#endif
+
+ /* io_submit returns number of successfully
+ queued requests or -errno. */
+ if (UNIV_UNLIKELY(ret != 1)) {
+ errno = -ret;
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_aio(), not this function directly!
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio_func(
+/*========*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ os_offset_t offset, /*!< in: file offset where to read or write */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2)/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ ibool retval;
+ BOOL ret = TRUE;
+ DWORD len = (DWORD) n;
+ struct fil_node_t* dummy_mess1;
+ void* dummy_mess2;
+ ulint dummy_type;
+#endif /* WIN_ASYNC_IO */
+ ulint wake_later;
+
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+ ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(os_aio_validate_skip());
+#ifdef WIN_ASYNC_IO
+ ut_ad((n & 0xFFFFFFFFUL) == n);
+#endif
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+
+ if (mode == OS_AIO_SYNC
+#ifdef WIN_ASYNC_IO
+ && !srv_use_native_aio
+#endif /* WIN_ASYNC_IO */
+ ) {
+ /* This is actually an ordinary synchronous read or write:
+ no need to use an i/o-handler thread. NOTE that if we use
+ Windows async i/o, Windows does not allow us to use
+ ordinary synchronous os_file_read etc. on the same file,
+ therefore we have built a special mechanism for synchronous
+ wait in the Windows case.
+ Also note that the Performance Schema instrumentation has
+ been performed by current os_aio_func()'s wrapper function
+ pfs_os_aio_func(). So we would no longer need to call
+ Performance Schema instrumented os_file_read() and
+ os_file_write(). Instead, we should use os_file_read_func()
+ and os_file_write_func() */
+
+ if (type == OS_FILE_READ) {
+ return(os_file_read_func(file, buf, offset, n));
+ }
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(type == OS_FILE_WRITE);
+
+ return(os_file_write_func(name, file, buf, offset, n));
+ }
+
+try_again:
+ switch (mode) {
+ case OS_AIO_NORMAL:
+ if (type == OS_FILE_READ) {
+ array = os_aio_read_array;
+ } else {
+ ut_ad(!srv_read_only_mode);
+ array = os_aio_write_array;
+ }
+ break;
+ case OS_AIO_IBUF:
+ ut_ad(type == OS_FILE_READ);
+ /* Reduce probability of deadlock bugs in connection with ibuf:
+ do not let the ibuf i/o handler sleep */
+
+ wake_later = FALSE;
+
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_ibuf_array;
+ }
+ break;
+ case OS_AIO_LOG:
+ if (srv_read_only_mode) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_log_array;
+ }
+ break;
+ case OS_AIO_SYNC:
+ array = os_aio_sync_array;
+#if defined(LINUX_NATIVE_AIO)
+ /* In Linux native AIO we don't use sync IO array. */
+ ut_a(!srv_use_native_aio);
+#endif /* LINUX_NATIVE_AIO */
+ break;
+ default:
+ ut_error;
+ array = NULL; /* Eliminate compiler warning */
+ }
+
+ slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+ name, buf, offset, n);
+ if (type == OS_FILE_READ) {
+ if (srv_use_native_aio) {
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+#ifdef WIN_ASYNC_IO
+ ret = ReadFile(file, buf, (DWORD) n, &len,
+ &(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ if (srv_use_native_aio) {
+ os_n_file_writes++;
+#ifdef WIN_ASYNC_IO
+ ret = WriteFile(file, buf, (DWORD) n, &len,
+ &(slot->control));
+
+#elif defined(LINUX_NATIVE_AIO)
+ if (!os_aio_linux_dispatch(array, slot)) {
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else {
+ ut_error;
+ }
+
+#ifdef WIN_ASYNC_IO
+ if (srv_use_native_aio) {
+ if ((ret && len == n)
+ || (!ret && GetLastError() == ERROR_IO_PENDING)) {
+ /* aio was queued successfully! */
+
+ if (mode == OS_AIO_SYNC) {
+ /* We want a synchronous i/o operation on a
+ file where we also use async i/o: in Windows
+ we must use the same wait mechanism as for
+ async i/o */
+
+ retval = os_aio_windows_handle(
+ ULINT_UNDEFINED, slot->pos,
+ &dummy_mess1, &dummy_mess2,
+ &dummy_type);
+
+ return(retval);
+ }
+
+ return(TRUE);
+ }
+
+ goto err_exit;
+ }
+#endif /* WIN_ASYNC_IO */
+ /* aio was queued successfully! */
+ return(TRUE);
+
+#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
+err_exit:
+#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
+ os_aio_array_free_slot(array, slot);
+
+ if (os_file_handle_error(
+ name, type == OS_FILE_READ ? "aio read" : "aio write")) {
+
+ goto try_again;
+ }
+
+ return(FALSE);
+}
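+
+/* A hypothetical read request through the os_aio() macro could look
+roughly like this; the fil_node_t* variable "node" and the aligned
+buffer "buf" here are illustrative, not part of this file:
+
+ os_aio(OS_FILE_READ, OS_AIO_NORMAL, node->name, node->handle,
+ buf, offset, UNIV_PAGE_SIZE, node, NULL);
+
+The completion is later reaped by an i/o-handler thread through
+os_aio_windows_handle(), os_aio_linux_handle() or
+os_aio_simulated_handle(). */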
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /*!< in: this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ ulint orig_seg = segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+ ibool ret_val;
+ BOOL ret;
+ DWORD len;
+ BOOL retry = FALSE;
+
+ if (segment == ULINT_UNDEFINED) {
+ segment = 0;
+ array = os_aio_sync_array;
+ } else {
+ segment = os_aio_get_array_and_local_segment(&array, segment);
+ }
+
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ ut_ad(os_aio_validate_skip());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots / array->n_segments;
+
+ if (array == os_aio_sync_array) {
+
+ WaitForSingleObject(
+ os_aio_array_get_nth_slot(array, pos)->handle,
+ INFINITE);
+
+ i = pos;
+
+ } else {
+ if (orig_seg != ULINT_UNDEFINED) {
+ srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+ }
+
+ i = WaitForMultipleObjects(
+ (DWORD) n, array->handles + segment * n,
+ FALSE, INFINITE);
+ }
+
+ os_mutex_enter(array->mutex);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
+ && array->n_reserved == 0) {
+ *message1 = NULL;
+ *message2 = NULL;
+ os_mutex_exit(array->mutex);
+ return(TRUE);
+ }
+
+ ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n);
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ ut_a(slot->reserved);
+
+ if (orig_seg != ULINT_UNDEFINED) {
+ srv_set_io_thread_op_info(
+ orig_seg, "get windows aio return value");
+ }
+
+ ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ if (ret && len == slot->len) {
+
+ ret_val = TRUE;
+ } else if (os_file_handle_error(slot->name, "Windows aio")) {
+
+ retry = TRUE;
+ } else {
+
+ ret_val = FALSE;
+ }
+
+ os_mutex_exit(array->mutex);
+
+ if (retry) {
+ /* retry failed read/write operation synchronously.
+ No need to hold array->mutex. */
+
+#ifdef UNIV_PFS_IO
+ /* This read/write does not go through os_file_read
+ and os_file_write APIs, need to register with
+ performance schema explicitly here. */
+ struct PSI_file_locker* locker = NULL;
+ register_pfs_file_io_begin(locker, slot->file, slot->len,
+ (slot->type == OS_FILE_WRITE)
+ ? PSI_FILE_WRITE
+ : PSI_FILE_READ,
+ __FILE__, __LINE__);
+#endif
+
+ ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
+
+ switch (slot->type) {
+ case OS_FILE_WRITE:
+ ret = WriteFile(slot->file, slot->buf,
+ (DWORD) slot->len, &len,
+ &(slot->control));
+
+ break;
+ case OS_FILE_READ:
+ ret = ReadFile(slot->file, slot->buf,
+ (DWORD) slot->len, &len,
+ &(slot->control));
+
+ break;
+ default:
+ ut_error;
+ }
+
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, len);
+#endif
+
+ if (!ret && GetLastError() == ERROR_IO_PENDING) {
+ /* aio was queued successfully!
+ We want a synchronous i/o operation on a
+ file where we also use async i/o: in Windows
+ we must use the same wait mechanism as for
+ async i/o */
+
+ ret = GetOverlappedResult(slot->file,
+ &(slot->control),
+ &len, TRUE);
+ }
+
+ ret_val = ret && len == slot->len;
+ }
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret_val);
+}
+#endif
+
+#if defined(LINUX_NATIVE_AIO)
+/******************************************************************//**
+This function is only used in Linux native asynchronous i/o. This is
+called from within the io-thread. If there are no completed IO requests
+in the slot array, the thread calls this function to collect more
+requests from the kernel.
+The io-thread waits on io_getevents(), which is a blocking call, with
+a timeout value. Unless the system is very heavily loaded, keeping the
+io-thread very busy, the io-thread will spend most of its time waiting
+in this function.
+The io-thread also exits in this function. It checks the server status
+at each wakeup, and that is why we use a timed wait in io_getevents(). */
+static
+void
+os_aio_linux_collect(
+/*=================*/
+ os_aio_array_t* array, /*!< in/out: slot array. */
+ ulint segment, /*!< in: local segment no. */
+ ulint seg_size) /*!< in: segment size. */
+{
+ int i;
+ int ret;
+ ulint start_pos;
+ ulint end_pos;
+ struct timespec timeout;
+ struct io_event* events;
+ struct io_context* io_ctx;
+
+ /* sanity checks. */
+ ut_ad(array != NULL);
+ ut_ad(seg_size > 0);
+ ut_ad(segment < array->n_segments);
+
+ /* Which part of event array we are going to work on. */
+ events = &array->aio_events[segment * seg_size];
+
+ /* Which io_context we are going to use. */
+ io_ctx = array->aio_ctx[segment];
+
+ /* Starting point of the segment we will be working on. */
+ start_pos = segment * seg_size;
+
+ /* End point. */
+ end_pos = start_pos + seg_size;
+
+retry:
+
+ /* Initialize the events. The timeout value is arbitrary.
+ We probably need to experiment with it a little. */
+ memset(events, 0, sizeof(*events) * seg_size);
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
+
+ ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
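+
+ /* The call blocks until at least 1 and at most seg_size
+ completions are available, or until the timeout expires,
+ whichever comes first. */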
+
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ os_aio_slot_t* slot;
+ struct iocb* control;
+
+ control = (struct iocb*) events[i].obj;
+ ut_a(control != NULL);
+
+ slot = (os_aio_slot_t*) control->data;
+
+ /* Some sanity checks. */
+ ut_a(slot != NULL);
+ ut_a(slot->reserved);
+
+#if defined(UNIV_AIO_DEBUG)
+ fprintf(stderr,
+ "io_getevents[%c]: slot[%p] ctx[%p]"
+ " seg[%lu]\n",
+ (slot->type == OS_FILE_WRITE) ? 'w' : 'r',
+ slot, io_ctx, segment);
+#endif
+
+ /* We are not scribbling on the previous segment. */
+ ut_a(slot->pos >= start_pos);
+
+ /* We have not overstepped into the next segment. */
+ ut_a(slot->pos < end_pos);
+
+ /* Mark this request as completed. The error handling
+ will be done in the calling function. */
+ os_mutex_enter(array->mutex);
+ slot->n_bytes = events[i].res;
+ slot->ret = events[i].res2;
+ slot->io_already_done = TRUE;
+ os_mutex_exit(array->mutex);
+ }
+ return;
+ }
+
+ if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ return;
+ }
+
+ /* This error handling is for any error in collecting the
+ IO requests. The errors, if any, for any particular IO
+ request are simply passed on to the calling routine. */
+
+ switch (ret) {
+ case -EAGAIN:
+ /* Not enough resources! Try again. */
+ case -EINTR:
+ /* Interrupted! I have tested the behaviour in case of an
+ interrupt. If we have some completed IOs available then
+ the return code will be the number of IOs. We get EINTR only
+ if there are no completed IOs and we have been interrupted. */
+ case 0:
+ /* No pending request! Go back and check again. */
+ goto retry;
+ }
+
+ /* All other errors should cause a trap for now. */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
+ ret);
+ ut_error;
+}
+
+/**********************************************************************//**
+This function is only used in Linux native asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait for
+the completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the IO was successful */
+UNIV_INTERN
+ibool
+os_aio_linux_handle(
+/*================*/
+ ulint global_seg, /*!< in: segment number in the aio array
+ to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 is log i/o thread,
+ then follow the non-ibuf read threads,
+ and the last are the non-ibuf write
+ threads. */
+ fil_node_t**message1, /*!< out: the messages passed with the
+ aio request; note that in case the
+ aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation. */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ ulint segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+ ibool ret = FALSE;
+
+ /* Should never be doing Sync IO here. */
+ ut_a(global_seg != ULINT_UNDEFINED);
+
+ /* Find the array and the local segment. */
+ segment = os_aio_get_array_and_local_segment(&array, global_seg);
+ n = array->n_slots / array->n_segments;
+
+ /* Loop until we have found a completed request. */
+ for (;;) {
+ ibool any_reserved = FALSE;
+ os_mutex_enter(array->mutex);
+ for (i = 0; i < n; ++i) {
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
+ /* Something for us to work on. */
+ goto found;
+ } else {
+ any_reserved = TRUE;
+ }
+ }
+
+ os_mutex_exit(array->mutex);
+
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (UNIV_UNLIKELY
+ (!any_reserved
+ && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ /* Wait for some request. Note that we return
+ from wait iff we have found a request. */
+
+ srv_set_io_thread_op_info(global_seg,
+ "waiting for completed aio requests");
+ os_aio_linux_collect(array, segment, n);
+ }
+
+found:
+ /* Note that there may be more than one completed IO request.
+ We process them one at a time. We might improve performance
+ slightly by dealing with all requests in one sweep. */
+ srv_set_io_thread_op_info(global_seg,
+ "processing completed aio requests");
+
+ /* Ensure that we are scribbling only our segment. */
+ ut_a(i < n);
+
+ ut_ad(slot != NULL);
+ ut_ad(slot->reserved);
+ ut_ad(slot->io_already_done);
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
+
+ ret = TRUE;
+ } else {
+ errno = -slot->ret;
+
+ /* os_file_handle_error does tell us if we should retry
+ this IO. As it stands now, we don't do this retry when
+ reaping requests from a different context than
+ the dispatcher. This non-retry logic is the same for
+ windows and linux native AIO.
+ We should probably look into this to transparently
+ re-submit the IO. */
+ os_file_handle_error(slot->name, "Linux aio");
+
+ ret = FALSE;
+ }
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret);
+}
+#endif /* LINUX_NATIVE_AIO */
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint global_segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ os_aio_array_t* array;
+ ulint segment;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ os_offset_t lowest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+ byte* combined_buf2;
+ ibool ret;
+ ibool any_reserved;
+ ulint n;
+ os_aio_slot_t* aio_slot;
+
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate_skip());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots / array->n_segments;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+
+ /* Give other threads chance to add several i/os to the array
+ at once. */
+
+ goto recommended_sleep;
+ }
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (b)");
+
+ /* Check if there is a slot for which the i/o has already been
+ done */
+ any_reserved = FALSE;
+
+ os_mutex_enter(array->mutex);
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (!slot->reserved) {
+ continue;
+ } else if (slot->io_already_done) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: i/o for slot %lu"
+ " already done, returning\n",
+ (ulong) i);
+ }
+
+ aio_slot = slot;
+ ret = TRUE;
+ goto slot_io_done;
+ } else {
+ any_reserved = TRUE;
+ }
+ }
+
+ /* There is no completed request.
+ If there is no pending request at all,
+ and the system is being shut down, exit. */
+ if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_mutex_exit(array->mutex);
+ *message1 = NULL;
+ *message2 = NULL;
+ return(TRUE);
+ }
+
+ n_consecutive = 0;
+
+ /* If there are requests that are at least 2 seconds old, pick
+ the oldest one to prevent starvation. If several requests have
+ the same age, pick the one at the lowest offset. */
+
+ biggest_age = 0;
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (slot->reserved) {
+
+ age = (ulint) difftime(
+ ut_time(), slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+ && slot->offset < lowest_offset)) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ biggest_age = age;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+ /* There were no old requests. Look for an i/o request at the
+ lowest offset in the array (we ignore the high 32 bits of the
+ offset in these heuristics) */
+
+ lowest_offset = IB_UINT64_MAX;
+
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(
+ array, i + segment * n);
+
+ if (slot->reserved && slot->offset < lowest_offset) {
+
+ /* Found an i/o request */
+ consecutive_ios[0] = slot;
+
+ n_consecutive = 1;
+
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (n_consecutive == 0) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+ /* if n_consecutive != 0, then we have assigned
+ something valid to consecutive_ios[0] */
+ ut_ad(n_consecutive != 0);
+ ut_ad(consecutive_ios[0] != NULL);
+
+ aio_slot = consecutive_ios[0];
+
+ /* Check if there are several consecutive blocks to read or write */
+
+consecutive_loop:
+ for (ulint i = 0; i < n; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i + segment * n);
+
+ if (slot->reserved
+ && slot != aio_slot
+ && slot->offset == aio_slot->offset + aio_slot->len
+ && slot->type == aio_slot->type
+ && slot->file == aio_slot->file) {
+
+ /* Found a consecutive i/o request */
+
+ consecutive_ios[n_consecutive] = slot;
+ n_consecutive++;
+
+ aio_slot = slot;
+
+ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+
+ goto consecutive_loop;
+ } else {
+ break;
+ }
+ }
+ }
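+
+ /* For example, three reserved read slots on the same file at
+ offsets off, off + 16 kB and off + 32 kB (each 16 kB long)
+ would be merged above into one 48 kB read; at most
+ OS_AIO_MERGE_N_CONSECUTIVE slots are merged. */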
+
+ srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
+
+ /* We have now collected n_consecutive i/o requests in the array;
+ allocate a single buffer which can hold all data, and perform the
+ i/o */
+
+ total_len = 0;
+ aio_slot = consecutive_ios[0];
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
+ }
+
+ if (n_consecutive == 1) {
+ /* We can use the buffer of the i/o request */
+ combined_buf = aio_slot->buf;
+ combined_buf2 = NULL;
+ } else {
+ combined_buf2 = static_cast<byte*>(
+ ut_malloc(total_len + UNIV_PAGE_SIZE));
+
+ ut_a(combined_buf2);
+
+ combined_buf = static_cast<byte*>(
+ ut_align(combined_buf2, UNIV_PAGE_SIZE));
+ }
+
+ /* We release the array mutex for the time of the i/o: NOTE that
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
+ os_mutex_exit(array->mutex);
+
+ if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+ /* Copy the buffers to the combined buffer */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
+ consecutive_ios[i]->len);
+
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "doing file i/o");
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (aio_slot->type == OS_FILE_WRITE) {
+ ut_ad(!srv_read_only_mode);
+ ret = os_file_write(
+ aio_slot->name, aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+ } else {
+ ret = os_file_read(
+ aio_slot->file, combined_buf,
+ aio_slot->offset, total_len);
+ }
+
+ ut_a(ret);
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+
+ if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
+ /* Copy the combined buffer to individual buffers */
+ offs = 0;
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
+ consecutive_ios[i]->len);
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ if (combined_buf2) {
+ ut_free(combined_buf2);
+ }
+
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (ulint i = 0; i < n_consecutive; i++) {
+ consecutive_ios[i]->io_already_done = TRUE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+ several slots, the messages will be returned with subsequent calls
+ of this function */
+
+slot_io_done:
+
+ ut_a(aio_slot->reserved);
+
+ *message1 = aio_slot->message1;
+ *message2 = aio_slot->message2;
+
+ *type = aio_slot->type;
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, aio_slot);
+
+ return(ret);
+
+wait_for_io:
+ srv_set_io_thread_op_info(global_segment, "resetting wait event");
+
+ /* We wait here until there again can be i/os in the segment
+ of this thread */
+
+ os_event_reset(os_aio_segment_wait_events[global_segment]);
+
+ os_mutex_exit(array->mutex);
+
+recommended_sleep:
+ srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
+
+ os_event_wait(os_aio_segment_wait_events[global_segment]);
+
+ goto restart;
+}
+
+/**********************************************************************//**
+Validates the consistency of an aio array.
+@return true if ok */
+static
+bool
+os_aio_array_validate(
+/*==================*/
+ os_aio_array_t* array) /*!< in: aio wait array */
+{
+ ulint i;
+ ulint n_reserved = 0;
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ for (i = 0; i < array->n_slots; i++) {
+ os_aio_slot_t* slot;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved) {
+ n_reserved++;
+ ut_a(slot->len > 0);
+ }
+ }
+
+ ut_a(array->n_reserved == n_reserved);
+
+ os_mutex_exit(array->mutex);
+
+ return(true);
+}
+
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void)
+/*=================*/
+{
+ os_aio_array_validate(os_aio_read_array);
+
+ if (os_aio_write_array != 0) {
+ os_aio_array_validate(os_aio_write_array);
+ }
+
+ if (os_aio_ibuf_array != 0) {
+ os_aio_array_validate(os_aio_ibuf_array);
+ }
+
+ if (os_aio_log_array != 0) {
+ os_aio_array_validate(os_aio_log_array);
+ }
+
+ if (os_aio_sync_array != 0) {
+ os_aio_array_validate(os_aio_sync_array);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Prints pending IO requests per segment of an aio array.
+We probably don't need per-segment statistics, but they can help us
+during the development phase to see if the IO requests are being
+distributed as expected. */
+static
+void
+os_aio_print_segment_info(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ ulint* n_seg, /*!< in: pending IO array */
+ os_aio_array_t* array) /*!< in: array to process */
+{
+ ulint i;
+
+ ut_ad(array);
+ ut_ad(n_seg);
+ ut_ad(array->n_segments > 0);
+
+ if (array->n_segments == 1) {
+ return;
+ }
+
+ fprintf(file, " [");
+ for (i = 0; i < array->n_segments; i++) {
+ if (i != 0) {
+ fprintf(file, ", ");
+ }
+
+ fprintf(file, "%lu", n_seg[i]);
+ }
+ fprintf(file, "] ");
+}
+
+/**********************************************************************//**
+Prints info about the aio array. */
+UNIV_INTERN
+void
+os_aio_print_array(
+/*==============*/
+ FILE* file, /*!< in: file where to print */
+ os_aio_array_t* array) /*!< in: aio array to print */
+{
+ ulint n_reserved = 0;
+ ulint n_res_seg[SRV_MAX_N_IO_THREADS];
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ memset(n_res_seg, 0x0, sizeof(n_res_seg));
+
+ for (ulint i = 0; i < array->n_slots; ++i) {
+ os_aio_slot_t* slot;
+ ulint seg_no;
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ seg_no = (i * array->n_segments) / array->n_slots;
+
+ if (slot->reserved) {
+ ++n_reserved;
+ ++n_res_seg[seg_no];
+
+ ut_a(slot->len > 0);
+ }
+ }
+
+ ut_a(array->n_reserved == n_reserved);
+
+ fprintf(file, " %lu", (ulong) n_reserved);
+
+ os_aio_print_segment_info(file, n_res_seg, array);
+
+ os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file) /*!< in: file where to print */
+{
+ time_t current_time;
+ double time_elapsed;
+ double avg_bytes_read;
+
+ for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
+ fprintf(file, "I/O thread %lu state: %s (%s)",
+ (ulong) i,
+ srv_io_thread_op_info[i],
+ srv_io_thread_function[i]);
+
+#ifndef __WIN__
+ if (os_aio_segment_wait_events[i]->is_set) {
+ fprintf(file, " ev set");
+ }
+#endif /* __WIN__ */
+
+ fprintf(file, "\n");
+ }
+
+ fputs("Pending normal aio reads:", file);
+
+ os_aio_print_array(file, os_aio_read_array);
+
+ if (os_aio_write_array != 0) {
+ fputs(", aio writes:", file);
+ os_aio_print_array(file, os_aio_write_array);
+ }
+
+ if (os_aio_ibuf_array != 0) {
+ fputs(",\n ibuf aio reads:", file);
+ os_aio_print_array(file, os_aio_ibuf_array);
+ }
+
+ if (os_aio_log_array != 0) {
+ fputs(", log i/o's:", file);
+ os_aio_print_array(file, os_aio_log_array);
+ }
+
+ if (os_aio_sync_array != 0) {
+ fputs(", sync i/o's:", file);
+ os_aio_print_array(file, os_aio_sync_array);
+ }
+
+ putc('\n', file);
+ current_time = ut_time();
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
+ "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes,
+ (ulong) os_n_file_reads,
+ (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
+
+ if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
+ fprintf(file,
+ "%lu pending preads, %lu pending pwrites\n",
+ (ulong) os_file_n_pending_preads,
+ (ulong) os_file_n_pending_pwrites);
+ }
+
+ if (os_n_file_reads == os_n_file_reads_old) {
+ avg_bytes_read = 0.0;
+ } else {
+ avg_bytes_read = (double) os_bytes_read_since_printout
+ / (os_n_file_reads - os_n_file_reads_old);
+ }
+
+ fprintf(file,
+ "%.2f reads/s, %lu avg bytes/read,"
+ " %.2f writes/s, %.2f fsyncs/s\n",
+ (os_n_file_reads - os_n_file_reads_old)
+ / time_elapsed,
+ (ulong) avg_bytes_read,
+ (os_n_file_writes - os_n_file_writes_old)
+ / time_elapsed,
+ (os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
+
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = current_time;
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void)
+/*======================*/
+{
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = time(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations.
+@return TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+ os_aio_array_t* array;
+ ulint n_res = 0;
+
+ array = os_aio_read_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ if (!srv_read_only_mode) {
+ ut_a(os_aio_write_array != 0);
+
+ array = os_aio_write_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ ut_a(os_aio_ibuf_array != 0);
+
+ array = os_aio_ibuf_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ ut_a(os_aio_log_array != 0);
+
+ array = os_aio_log_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+ }
+
+ array = os_aio_sync_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ if (n_res == 0) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc
new file mode 100644
index 00000000000..ff6d65e4ae6
--- /dev/null
+++ b/storage/innobase/os/os0proc.cc
@@ -0,0 +1,232 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0proc.cc
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0proc.h"
+#ifdef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+/* FreeBSD, for example, has only MAP_ANON; Linux has both MAP_ANONYMOUS
+and MAP_ANON, but MAP_ANON is marked as deprecated */
+#if defined(MAP_ANONYMOUS)
+#define OS_MAP_ANON MAP_ANONYMOUS
+#elif defined(MAP_ANON)
+#define OS_MAP_ANON MAP_ANON
+#endif
+
+UNIV_INTERN ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+UNIV_INTERN ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that
+the number is unique. In Linux, this returns the 'process number' of the
+current thread, i.e. the number one sees in 'top'. In Linux the thread
+id is not the same as the id one sees in 'top'.
+@return process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void)
+/*====================*/
+{
+#ifdef __WIN__
+ return((ulint)GetCurrentProcessId());
+#else
+ return((ulint) getpid());
+#endif
+}
+
+/****************************************************************//**
+Allocates large pages memory.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+ ulint* n) /*!< in/out: number of bytes */
+{
+ void* ptr;
+ ulint size;
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+ int shmid;
+ struct shmid_ds buf;
+
+ if (!os_use_large_pages || !os_large_page_size) {
+ goto skip;
+ }
+
+ /* Align block size to os_large_page_size */
+ ut_ad(ut_is_2pow(os_large_page_size));
+ size = ut_2pow_round(*n + (os_large_page_size - 1),
+ os_large_page_size);
+
+ shmid = shmget(IPC_PRIVATE, (size_t) size, SHM_HUGETLB | SHM_R | SHM_W);
+ if (shmid < 0) {
+ fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
+ " %lu bytes. errno %d\n", size, errno);
+ ptr = NULL;
+ } else {
+ ptr = shmat(shmid, NULL, 0);
+ if (ptr == (void*)-1) {
+ fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
+ " attach shared memory segment, errno %d\n",
+ errno);
+ ptr = NULL;
+ }
+
+ /* Remove the shared memory segment so that it will be
+ automatically freed after memory is detached or
+ process exits */
+ shmctl(shmid, IPC_RMID, &buf);
+ }
+
+ if (ptr) {
+ *n = size;
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ return(ptr);
+ }
+
+ fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional"
+ " memory pool\n");
+skip:
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+
+#ifdef __WIN__
+ SYSTEM_INFO system_info;
+ GetSystemInfo(&system_info);
+
+ /* Align block size to system page size */
+ ut_ad(ut_is_2pow(system_info.dwPageSize));
+ /* system_info.dwPageSize is only 32-bit. Casting to ulint is required
+ on 64-bit Windows. */
+ size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1),
+ (ulint) system_info.dwPageSize);
+ ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE,
+ PAGE_READWRITE);
+ if (!ptr) {
+ fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;"
+ " Windows error %lu\n",
+ (ulong) size, (ulong) GetLastError());
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ }
+#elif !defined OS_MAP_ANON
+ size = *n;
+ ptr = ut_malloc_low(size, TRUE, FALSE);
+#else
+# ifdef HAVE_GETPAGESIZE
+ size = getpagesize();
+# else
+ size = UNIV_PAGE_SIZE;
+# endif
+ /* Align block size to system page size */
+ ut_ad(ut_is_2pow(size));
+ size = *n = ut_2pow_round(*n + (size - 1), size);
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | OS_MAP_ANON, -1, 0);
+ if (UNIV_UNLIKELY(ptr == (void*) -1)) {
+ fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
+ " errno %lu\n",
+ (ulong) size, (ulong) errno);
+ ptr = NULL;
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ }
+#endif
+ return(ptr);
+}
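+
+/* A minimal usage sketch (variable names illustrative): the requested
+size is rounded up in place, and the possibly larger size must be
+passed back when freeing:
+
+ ulint n = 10 * 1024 * 1024;
+ void* p = os_mem_alloc_large(&n);
+
+ if (p != NULL) {
+ os_mem_free_large(p, n);
+ }
+*/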
+
+/****************************************************************//**
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+ void *ptr, /*!< in: pointer returned by
+ os_mem_alloc_large() */
+ ulint size) /*!< in: size returned by
+ os_mem_alloc_large() */
+{
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+ if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ return;
+ }
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+#ifdef __WIN__
+ /* When RELEASE memory, the size parameter must be 0.
+ Do not use MEM_RELEASE with MEM_DECOMMIT. */
+ if (!VirtualFree(ptr, 0, MEM_RELEASE)) {
+ fprintf(stderr, "InnoDB: VirtualFree(%p, %lu) failed;"
+ " Windows error %lu\n",
+ ptr, (ulong) size, (ulong) GetLastError());
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ }
+#elif !defined OS_MAP_ANON
+ ut_free(ptr);
+#else
+# if defined(UNIV_SOLARIS)
+ if (munmap(static_cast<caddr_t>(ptr), size)) {
+# else
+ if (munmap(ptr, size)) {
+# endif /* UNIV_SOLARIS */
+ fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;"
+ " errno %lu\n",
+ ptr, (ulong) size, (ulong) errno);
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ }
+#endif
+}
diff --git a/storage/innobase/os/os0sync.cc b/storage/innobase/os/os0sync.cc
new file mode 100644
index 00000000000..e42c5900c0c
--- /dev/null
+++ b/storage/innobase/os/os0sync.cc
@@ -0,0 +1,934 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0sync.cc
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0sync.h"
+#ifdef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#endif
+
+#include "ut0mem.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+
+/* Type definition for an operating system mutex struct */
+struct os_mutex_t{
+	os_event_t	event;	/*!< Used by sync0arr.cc for queuing threads */
+ void* handle; /*!< OS handle to mutex */
+ ulint count; /*!< we use this counter to check
+ that the same thread does not
+ recursively lock the mutex: we
+ do not assume that the OS mutex
+ supports recursive locking, though
+ NT seems to do that */
+ UT_LIST_NODE_T(os_mutex_t) os_mutex_list;
+ /* list of all 'slow' OS mutexes created */
+};
+
+/** Mutex protecting counts and the lists of OS mutexes and events */
+UNIV_INTERN os_ib_mutex_t os_sync_mutex;
+/** TRUE if os_sync_mutex has been initialized */
+static ibool os_sync_mutex_inited = FALSE;
+/** TRUE when os_sync_free() is being executed */
+static ibool os_sync_free_called = FALSE;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+UNIV_INTERN ulint os_thread_count = 0;
+
+/** The list of all events created */
+static UT_LIST_BASE_NODE_T(os_event) os_event_list;
+
+/** The list of all OS 'slow' mutexes */
+static UT_LIST_BASE_NODE_T(os_mutex_t) os_mutex_list;
+
+UNIV_INTERN ulint os_event_count = 0;
+UNIV_INTERN ulint os_mutex_count = 0;
+UNIV_INTERN ulint os_fast_mutex_count = 0;
+
+/* The number of microseconds in a second. */
+static const ulint MICROSECS_IN_A_SECOND = 1000000;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t event_os_mutex_key;
+UNIV_INTERN mysql_pfs_key_t os_mutex_key;
+#endif
+
+/* Because a mutex is embedded inside an event and an event is embedded
+inside a mutex, freeing one generates a recursive call. This version of
+the event free function does not acquire the global lock */
+static void os_event_free_internal(os_event_t event);
+
+/* On Windows (Vista and later), load function pointers for condition
+variable handling. Those functions are not available in prior versions,
+so we have to use them via runtime loading, as long as we support XP. */
+static void os_cond_module_init(void);
+
+#ifdef __WIN__
+/* Prototypes and function pointers for condition variable functions */
+typedef VOID (WINAPI* InitializeConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static InitializeConditionVariableProc initialize_condition_variable;
+
+typedef BOOL (WINAPI* SleepConditionVariableCSProc)
+ (PCONDITION_VARIABLE ConditionVariable,
+ PCRITICAL_SECTION CriticalSection,
+ DWORD dwMilliseconds);
+static SleepConditionVariableCSProc sleep_condition_variable;
+
+typedef VOID (WINAPI* WakeAllConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static WakeAllConditionVariableProc wake_all_condition_variable;
+
+typedef VOID (WINAPI* WakeConditionVariableProc)
+ (PCONDITION_VARIABLE ConditionVariable);
+static WakeConditionVariableProc wake_condition_variable;
+#endif
+
+/*********************************************************//**
+Initializes a condition variable */
+UNIV_INLINE
+void
+os_cond_init(
+/*=========*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(initialize_condition_variable != NULL);
+ initialize_condition_variable(cond);
+#else
+ ut_a(pthread_cond_init(cond, NULL) == 0);
+#endif
+}
+
+/*********************************************************//**
+Do a timed wait on condition variable.
+@return TRUE if timed out, FALSE otherwise */
+UNIV_INLINE
+ibool
+os_cond_wait_timed(
+/*===============*/
+ os_cond_t* cond, /*!< in: condition variable. */
+ os_fast_mutex_t* fast_mutex, /*!< in: fast mutex */
+#ifndef __WIN__
+ const struct timespec* abstime /*!< in: timeout */
+#else
+ DWORD time_in_ms /*!< in: timeout in
+ milliseconds*/
+#endif /* !__WIN__ */
+)
+{
+ fast_mutex_t* mutex = &fast_mutex->mutex;
+#ifdef __WIN__
+ BOOL ret;
+ DWORD err;
+
+ ut_a(sleep_condition_variable != NULL);
+
+ ret = sleep_condition_variable(cond, mutex, time_in_ms);
+
+ if (!ret) {
+ err = GetLastError();
+ /* From http://msdn.microsoft.com/en-us/library/ms686301%28VS.85%29.aspx,
+ "Condition variables are subject to spurious wakeups
+ (those not associated with an explicit wake) and stolen wakeups
+ (another thread manages to run before the woken thread)."
+ Check for both types of timeouts.
+ Conditions are checked by the caller.*/
+ if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) {
+ return(TRUE);
+ }
+ }
+
+ ut_a(ret);
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = pthread_cond_timedwait(cond, mutex, abstime);
+
+ switch (ret) {
+ case 0:
+ case ETIMEDOUT:
+ /* We play it safe by checking for EINTR even though
+ according to the POSIX documentation it can't return EINTR. */
+ case EINTR:
+ break;
+
+ default:
+ fprintf(stderr, " InnoDB: pthread_cond_timedwait() returned: "
+ "%d: abstime={%lu,%lu}\n",
+ ret, (ulong) abstime->tv_sec, (ulong) abstime->tv_nsec);
+ ut_error;
+ }
+
+ return(ret == ETIMEDOUT);
+#endif
+}
+
+/*********************************************************//**
+Wait on condition variable */
+UNIV_INLINE
+void
+os_cond_wait(
+/*=========*/
+ os_cond_t* cond, /*!< in: condition variable. */
+ os_fast_mutex_t* fast_mutex)/*!< in: fast mutex */
+{
+ fast_mutex_t* mutex = &fast_mutex->mutex;
+ ut_a(cond);
+ ut_a(mutex);
+
+#ifdef __WIN__
+ ut_a(sleep_condition_variable != NULL);
+ ut_a(sleep_condition_variable(cond, mutex, INFINITE));
+#else
+ ut_a(pthread_cond_wait(cond, mutex) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes all threads waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_broadcast(
+/*==============*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(wake_all_condition_variable != NULL);
+ wake_all_condition_variable(cond);
+#else
+ ut_a(pthread_cond_broadcast(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes one thread waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_signal(
+/*==========*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+ ut_a(cond);
+
+#ifdef __WIN__
+ ut_a(wake_condition_variable != NULL);
+ wake_condition_variable(cond);
+#else
+ ut_a(pthread_cond_signal(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Destroys condition variable */
+UNIV_INLINE
+void
+os_cond_destroy(
+/*============*/
+ os_cond_t* cond) /*!< in: condition variable. */
+{
+#ifdef __WIN__
+ /* Do nothing */
+#else
+ ut_a(pthread_cond_destroy(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+On Windows (Vista and later), load function pointers for condition variable
+handling. Those functions are not available in prior versions, so we have to
+use them via runtime loading, as long as we support XP. */
+static
+void
+os_cond_module_init(void)
+/*=====================*/
+{
+#ifdef __WIN__
+ HMODULE h_dll;
+
+ if (!srv_use_native_conditions)
+ return;
+
+ h_dll = GetModuleHandle("kernel32");
+
+ initialize_condition_variable = (InitializeConditionVariableProc)
+ GetProcAddress(h_dll, "InitializeConditionVariable");
+ sleep_condition_variable = (SleepConditionVariableCSProc)
+ GetProcAddress(h_dll, "SleepConditionVariableCS");
+ wake_all_condition_variable = (WakeAllConditionVariableProc)
+ GetProcAddress(h_dll, "WakeAllConditionVariable");
+ wake_condition_variable = (WakeConditionVariableProc)
+ GetProcAddress(h_dll, "WakeConditionVariable");
+
+ /* When using native condition variables, check function pointers */
+ ut_a(initialize_condition_variable);
+ ut_a(sleep_condition_variable);
+ ut_a(wake_all_condition_variable);
+ ut_a(wake_condition_variable);
+#endif
+}
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void)
+/*==============*/
+{
+ UT_LIST_INIT(os_event_list);
+ UT_LIST_INIT(os_mutex_list);
+
+ os_sync_mutex = NULL;
+ os_sync_mutex_inited = FALSE;
+
+	/* This is currently only needed on Windows */
+ os_cond_module_init();
+
+ os_sync_mutex = os_mutex_create();
+
+ os_sync_mutex_inited = TRUE;
+}
+
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void)
+/*==============*/
+{
+ os_event_t event;
+ os_ib_mutex_t mutex;
+
+ os_sync_free_called = TRUE;
+ event = UT_LIST_GET_FIRST(os_event_list);
+
+ while (event) {
+
+ os_event_free(event);
+
+ event = UT_LIST_GET_FIRST(os_event_list);
+ }
+
+ mutex = UT_LIST_GET_FIRST(os_mutex_list);
+
+ while (mutex) {
+ if (mutex == os_sync_mutex) {
+ /* Set the flag to FALSE so that we do not try to
+ reserve os_sync_mutex any more in remaining freeing
+ operations in shutdown */
+ os_sync_mutex_inited = FALSE;
+ }
+
+ os_mutex_free(mutex);
+
+ mutex = UT_LIST_GET_FIRST(os_mutex_list);
+ }
+ os_sync_free_called = FALSE;
+}
+
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two
+states: signaled and nonsignaled. The created event is manual reset: it
+must be reset explicitly by calling os_event_reset().
+@return the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(void)
+/*==================*/
+{
+ os_event_t event;
+
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+
+ event = static_cast<os_event_t>(ut_malloc(sizeof(*event)));
+
+ event->handle = CreateEvent(NULL, TRUE, FALSE, NULL);
+ if (!event->handle) {
+ fprintf(stderr,
+ "InnoDB: Could not create a Windows event"
+ " semaphore; Windows error %lu\n",
+ (ulong) GetLastError());
+ }
+ } else /* Windows with condition variables */
+#endif
+ {
+ event = static_cast<os_event_t>(ut_malloc(sizeof *event));
+
+#ifndef PFS_SKIP_EVENT_MUTEX
+ os_fast_mutex_init(event_os_mutex_key, &event->os_mutex);
+#else
+ os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &event->os_mutex);
+#endif
+
+ os_cond_init(&(event->cond_var));
+
+ event->is_set = FALSE;
+
+		/* We return this value in os_event_reset(), which can then
+		be passed to os_event_wait_low(). The value of zero
+ is reserved in os_event_wait_low() for the case when the
+ caller does not want to pass any signal_count value. To
+ distinguish between the two cases we initialize signal_count
+ to 1 here. */
+ event->signal_count = 1;
+ }
+
+ /* The os_sync_mutex can be NULL because during startup an event
+ can be created [ because it's embedded in the mutex/rwlock ] before
+ this module has been initialized */
+ if (os_sync_mutex != NULL) {
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ /* Put to the list of events */
+ UT_LIST_ADD_FIRST(os_event_list, os_event_list, event);
+
+ os_event_count++;
+
+ if (os_sync_mutex != NULL) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+ return(event);
+}
+
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+ os_event_t event) /*!< in: event to set */
+{
+ ut_a(event);
+
+#ifdef __WIN__
+ if (!srv_use_native_conditions) {
+ ut_a(SetEvent(event->handle));
+ return;
+ }
+#endif
+
+ os_fast_mutex_lock(&(event->os_mutex));
+
+ if (event->is_set) {
+ /* Do nothing */
+ } else {
+ event->is_set = TRUE;
+ event->signal_count += 1;
+ os_cond_broadcast(&(event->cond_var));
+ }
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+}
+
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state. Threads that
+subsequently wait for the event will block.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low().
+@return current signal_count. */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event) /*!< in: event to reset */
+{
+ ib_int64_t ret = 0;
+
+ ut_a(event);
+
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+ ut_a(ResetEvent(event->handle));
+ return(0);
+ }
+#endif
+
+ os_fast_mutex_lock(&(event->os_mutex));
+
+ if (!event->is_set) {
+ /* Do nothing */
+ } else {
+ event->is_set = FALSE;
+ }
+ ret = event->signal_count;
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+ return(ret);
+}
+
+/**********************************************************//**
+Frees an event object, without acquiring the global lock. */
+static
+void
+os_event_free_internal(
+/*===================*/
+ os_event_t event) /*!< in: event to free */
+{
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+ ut_a(event);
+ ut_a(CloseHandle(event->handle));
+ } else
+#endif
+ {
+ ut_a(event);
+
+ /* This is to avoid freeing the mutex twice */
+ os_fast_mutex_free(&(event->os_mutex));
+
+ os_cond_destroy(&(event->cond_var));
+ }
+
+ /* Remove from the list of events */
+ UT_LIST_REMOVE(os_event_list, os_event_list, event);
+
+ os_event_count--;
+
+ ut_free(event);
+}
+
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+ os_event_t event) /*!< in: event to free */
+{
+ ut_a(event);
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+		ut_a(CloseHandle(event->handle));
+	} else /* Windows with condition variables */
+#endif
+ {
+ os_fast_mutex_free(&(event->os_mutex));
+
+ os_cond_destroy(&(event->cond_var));
+ }
+
+ /* Remove from the list of events */
+ os_mutex_enter(os_sync_mutex);
+
+ UT_LIST_REMOVE(os_event_list, os_event_list, event);
+
+ os_event_count--;
+
+ os_mutex_exit(os_sync_mutex);
+
+ ut_free(event);
+}
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state.
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ ib_int64_t reset_sig_count)/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+{
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+ DWORD err;
+
+ ut_a(event);
+
+ UT_NOT_USED(reset_sig_count);
+
+ /* Specify an infinite wait */
+ err = WaitForSingleObject(event->handle, INFINITE);
+
+ ut_a(err == WAIT_OBJECT_0);
+ return;
+ }
+#endif
+
+ os_fast_mutex_lock(&event->os_mutex);
+
+ if (!reset_sig_count) {
+ reset_sig_count = event->signal_count;
+ }
+
+ while (!event->is_set && event->signal_count == reset_sig_count) {
+ os_cond_wait(&(event->cond_var), &(event->os_mutex));
+
+		/* The Solaris manual says that spurious wakeups may
+		occur: we must check whether the event really has been
+		signaled when the wait returns */
+ }
+
+ os_fast_mutex_unlock(&event->os_mutex);
+}
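
A sketch of the reset/wait protocol described above, assuming the
declarations in os0sync.h; capturing the signal count at reset time
closes the window in the thread-A/thread-C scenario:

	#include "os0sync.h"

	void
	wait_for_work(os_event_t event)
	{
		/* Capture the signal count at reset time. */
		ib_int64_t	sig_count = os_event_reset(event);

		/* ... re-check the guarded condition here ... */

		/* If os_event_set() ran between the reset and this
		call, the signal counts differ and the wait returns
		immediately instead of blocking forever. */
		os_event_wait_low(event, sig_count);
	}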
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time_low(
+/*===================*/
+ os_event_t event, /*!< in: event to wait */
+ ulint time_in_usec, /*!< in: timeout in
+ microseconds, or
+ OS_SYNC_INFINITE_TIME */
+ ib_int64_t reset_sig_count) /*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+{
+ ibool timed_out = FALSE;
+
+#ifdef __WIN__
+ DWORD time_in_ms;
+
+ if (!srv_use_native_conditions) {
+ DWORD err;
+
+ ut_a(event);
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ time_in_ms = static_cast<DWORD>(time_in_usec / 1000);
+ err = WaitForSingleObject(event->handle, time_in_ms);
+ } else {
+ err = WaitForSingleObject(event->handle, INFINITE);
+ }
+
+ if (err == WAIT_OBJECT_0) {
+ return(0);
+ } else if ((err == WAIT_TIMEOUT) || (err == ERROR_TIMEOUT)) {
+ return(OS_SYNC_TIME_EXCEEDED);
+ }
+
+ ut_error;
+ /* Dummy value to eliminate compiler warning. */
+ return(42);
+ } else {
+ ut_a(sleep_condition_variable != NULL);
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ time_in_ms = static_cast<DWORD>(time_in_usec / 1000);
+ } else {
+ time_in_ms = INFINITE;
+ }
+ }
+#else
+ struct timespec abstime;
+
+ if (time_in_usec != OS_SYNC_INFINITE_TIME) {
+ struct timeval tv;
+ int ret;
+ ulint sec;
+ ulint usec;
+
+ ret = ut_usectime(&sec, &usec);
+ ut_a(ret == 0);
+
+ tv.tv_sec = sec;
+ tv.tv_usec = usec;
+
+ tv.tv_usec += time_in_usec;
+
+ if ((ulint) tv.tv_usec >= MICROSECS_IN_A_SECOND) {
+			tv.tv_sec += tv.tv_usec / MICROSECS_IN_A_SECOND;
+ tv.tv_usec %= MICROSECS_IN_A_SECOND;
+ }
+
+ abstime.tv_sec = tv.tv_sec;
+ abstime.tv_nsec = tv.tv_usec * 1000;
+ } else {
+ abstime.tv_nsec = 999999999;
+ abstime.tv_sec = (time_t) ULINT_MAX;
+ }
+
+ ut_a(abstime.tv_nsec <= 999999999);
+
+#endif /* __WIN__ */
+
+ os_fast_mutex_lock(&event->os_mutex);
+
+ if (!reset_sig_count) {
+ reset_sig_count = event->signal_count;
+ }
+
+ do {
+ if (event->is_set || event->signal_count != reset_sig_count) {
+
+ break;
+ }
+
+ timed_out = os_cond_wait_timed(
+ &event->cond_var, &event->os_mutex,
+#ifndef __WIN__
+ &abstime
+#else
+ time_in_ms
+#endif /* !__WIN__ */
+ );
+
+ } while (!timed_out);
+
+ os_fast_mutex_unlock(&event->os_mutex);
+
+ return(timed_out ? OS_SYNC_TIME_EXCEEDED : 0);
+}
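
The timed variant composes the same way; a sketch that waits up to one
second and maps the result onto a boolean, with OS_SYNC_TIME_EXCEEDED
and the microsecond unit taken from the declarations above:

	#include "os0sync.h"

	ibool
	wait_up_to_one_second(os_event_t event)
	{
		ib_int64_t	sig_count = os_event_reset(event);

		/* 1000000 microseconds = 1 second */
		ulint	ret = os_event_wait_time_low(
			event, 1000000, sig_count);

		return(ret != OS_SYNC_TIME_EXCEEDED);
	}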
+
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (ib_mutex_t) should be used where possible.
+@return the mutex handle */
+UNIV_INTERN
+os_ib_mutex_t
+os_mutex_create(void)
+/*=================*/
+{
+ os_fast_mutex_t* mutex;
+ os_ib_mutex_t mutex_str;
+
+ mutex = static_cast<os_fast_mutex_t*>(
+ ut_malloc(sizeof(os_fast_mutex_t)));
+
+ os_fast_mutex_init(os_mutex_key, mutex);
+
+ mutex_str = static_cast<os_ib_mutex_t>(ut_malloc(sizeof *mutex_str));
+
+ mutex_str->handle = mutex;
+ mutex_str->count = 0;
+ mutex_str->event = os_event_create();
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When creating os_sync_mutex itself we cannot reserve it */
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str);
+
+ os_mutex_count++;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+ return(mutex_str);
+}
+
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+ os_ib_mutex_t mutex) /*!< in: mutex to acquire */
+{
+ os_fast_mutex_lock(static_cast<os_fast_mutex_t*>(mutex->handle));
+
+ (mutex->count)++;
+
+ ut_a(mutex->count == 1);
+}
+
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+ os_ib_mutex_t mutex) /*!< in: mutex to release */
+{
+ ut_a(mutex);
+
+ ut_a(mutex->count == 1);
+
+ (mutex->count)--;
+ os_fast_mutex_unlock(static_cast<os_fast_mutex_t*>(mutex->handle));
+}
+
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+ os_ib_mutex_t mutex) /*!< in: mutex to free */
+{
+ ut_a(mutex);
+
+ if (UNIV_LIKELY(!os_sync_free_called)) {
+ os_event_free_internal(mutex->event);
+ }
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex);
+
+ os_mutex_count--;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+ os_fast_mutex_free(static_cast<os_fast_mutex_t*>(mutex->handle));
+ ut_free(mutex->handle);
+ ut_free(mutex);
+}
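
Taken together, the slow OS mutex API has the usual lifecycle; a short
sketch (per the comment on os_mutex_create(), ib_mutex_t is preferred
on hot paths):

	#include "os0sync.h"

	void
	slow_mutex_demo(void)
	{
		os_ib_mutex_t	mutex = os_mutex_create();

		os_mutex_enter(mutex);	/* asserts count == 1, i.e.
					no recursive locking */
		/* ... critical section ... */
		os_mutex_exit(mutex);

		os_mutex_free(mutex);
	}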
+
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init_func(
+/*====================*/
+ fast_mutex_t* fast_mutex) /*!< in: fast mutex */
+{
+#ifdef __WIN__
+ ut_a(fast_mutex);
+
+ InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST));
+#endif
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When creating os_sync_mutex itself (in Unix) we cannot
+ reserve it */
+
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ os_fast_mutex_count++;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+}
+
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock_func(
+/*====================*/
+ fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+ EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ pthread_mutex_lock(fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock_func(
+/*======================*/
+ fast_mutex_t* fast_mutex) /*!< in: mutex to release */
+{
+#ifdef __WIN__
+ LeaveCriticalSection(fast_mutex);
+#else
+ pthread_mutex_unlock(fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free_func(
+/*====================*/
+ fast_mutex_t* fast_mutex) /*!< in: mutex to free */
+{
+#ifdef __WIN__
+ ut_a(fast_mutex);
+
+ DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ int ret;
+
+ ret = pthread_mutex_destroy(fast_mutex);
+
+ if (UNIV_UNLIKELY(ret != 0)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: error: return value %lu when calling\n"
+ "InnoDB: pthread_mutex_destroy().\n", (ulint) ret);
+ fprintf(stderr,
+ "InnoDB: Byte contents of the pthread mutex at %p:\n",
+ (void*) fast_mutex);
+ ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t));
+ putc('\n', stderr);
+ }
+#endif
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When freeing the last mutexes, we have
+ already freed os_sync_mutex */
+
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ ut_ad(os_fast_mutex_count > 0);
+ os_fast_mutex_count--;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+}
diff --git a/storage/innobase/os/os0thread.cc b/storage/innobase/os/os0thread.cc
new file mode 100644
index 00000000000..772336215c9
--- /dev/null
+++ b/storage/innobase/os/os0thread.cc
@@ -0,0 +1,263 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0thread.cc
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0thread.h"
+#ifdef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "srv0srv.h"
+#include "os0sync.h"
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+ os_thread_id_t a, /*!< in: OS thread or thread id */
+ os_thread_id_t b) /*!< in: OS thread or thread id */
+{
+#ifdef __WIN__
+ if (a == b) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#else
+ if (pthread_equal(a, b)) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#endif
+}
+
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+ os_thread_id_t a) /*!< in: OS thread identifier */
+{
+#ifdef UNIV_HPUX10
+ /* In HP-UX-10.20 a pthread_t is a struct of 3 fields: field1, field2,
+ field3. We do not know if field1 determines the thread uniquely. */
+
+ return((ulint)(a.field1));
+#else
+ return((ulint) a);
+#endif
+}
+
+/*****************************************************************//**
+Returns the thread identifier of current thread. Currently the thread
+identifier in Unix is the thread handle itself. Note that in HP-UX
+pthread_t is a struct of 3 fields.
+@return current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void)
+/*=======================*/
+{
+#ifdef __WIN__
+ return(GetCurrentThreadId());
+#else
+ return(pthread_self());
+#endif
+}
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns an ulint.
+@return handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create_func(
+/*==================*/
+ os_thread_func_t func, /*!< in: pointer to function
+ from which to start */
+ void* arg, /*!< in: argument to start
+ function */
+ os_thread_id_t* thread_id) /*!< out: id of the created
+ thread, or NULL */
+{
+	/* Make sure that memory writes made so far are visible to
+	the new thread. */
+ os_wmb;
+
+#ifdef __WIN__
+ os_thread_t thread;
+ DWORD win_thread_id;
+
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count++;
+ os_mutex_exit(os_sync_mutex);
+
+ thread = CreateThread(NULL, /* no security attributes */
+ 0, /* default size stack */
+ func,
+ arg,
+ 0, /* thread runs immediately */
+ &win_thread_id);
+
+ if (thread_id) {
+ *thread_id = win_thread_id;
+ }
+
+ return(thread);
+#else
+ int ret;
+ os_thread_t pthread;
+ pthread_attr_t attr;
+
+#ifndef UNIV_HPUX10
+ pthread_attr_init(&attr);
+#endif
+
+#ifdef UNIV_AIX
+ /* We must make sure a thread stack is at least 32 kB, otherwise
+ InnoDB might crash; we do not know if the default stack size on
+ AIX is always big enough. An empirical test on AIX-4.3 suggested
+ the size was 96 kB, though. */
+
+ ret = pthread_attr_setstacksize(&attr,
+ (size_t)(PTHREAD_STACK_MIN
+ + 32 * 1024));
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_attr_setstacksize"
+ " returned %d\n", ret);
+ exit(1);
+ }
+#endif
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count++;
+ os_mutex_exit(os_sync_mutex);
+
+#ifdef UNIV_HPUX10
+ ret = pthread_create(&pthread, pthread_attr_default, func, arg);
+#else
+ ret = pthread_create(&pthread, &attr, func, arg);
+#endif
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_create returned %d\n", ret);
+ exit(1);
+ }
+
+#ifndef UNIV_HPUX10
+ pthread_attr_destroy(&attr);
+#endif
+
+ ut_a(os_thread_count <= OS_THREAD_MAX_N);
+
+ if (thread_id) {
+ *thread_id = pthread;
+ }
+
+ return(pthread);
+#endif
+}
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+ void* exit_value) /*!< in: exit value; in Windows this void*
+ is cast as a DWORD */
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Thread exits, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count--;
+ os_mutex_exit(os_sync_mutex);
+
+#ifdef __WIN__
+ ExitThread((DWORD) exit_value);
+#else
+ pthread_detach(pthread_self());
+ pthread_exit(exit_value);
+#endif
+}
+
+/*****************************************************************//**
+Advises the OS to give up the remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void)
+/*=================*/
+{
+#if defined(__WIN__)
+ SwitchToThread();
+#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
+ sched_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
+ pthread_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ONE_ARG)
+ pthread_yield(0);
+#else
+ os_thread_sleep(0);
+#endif
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+ ulint tm) /*!< in: time in microseconds */
+{
+#ifdef __WIN__
+ Sleep((DWORD) tm / 1000);
+#else
+ struct timeval t;
+
+ t.tv_sec = tm / 1000000;
+ t.tv_usec = tm % 1000000;
+
+ select(0, NULL, NULL, NULL, &t);
+#endif
+}
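
A sketch combining the primitives above, assuming the usual
os_thread_ret_t and OS_THREAD_DUMMY_RETURN conventions from
os0thread.h:

	#include "os0thread.h"

	/* A thread body matching os_thread_func_t: it takes void*
	and must terminate via os_thread_exit(). */
	extern "C" os_thread_ret_t
	worker_thread(void* arg)
	{
		(void) arg;			/* unused in this sketch */

		os_thread_sleep(100000);	/* sleep >= 100 ms */

		os_thread_exit(NULL);		/* decrements
						os_thread_count */

		OS_THREAD_DUMMY_RETURN;		/* not reached */
	}

	void
	spawn_worker(void)
	{
		os_thread_id_t	id;

		os_thread_create_func(worker_thread, NULL, &id);
	}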
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
new file mode 100644
index 00000000000..f5f7e1299ce
--- /dev/null
+++ b/storage/innobase/page/page0cur.cc
@@ -0,0 +1,2145 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file page/page0cur.cc
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0cur.h"
+#ifdef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+#include "ut0ut.h"
+#ifndef UNIV_HOTBACKUP
+#include "rem0cmp.h"
+
+#ifdef PAGE_CUR_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+static ulint page_cur_short_succ = 0;
+# endif /* UNIV_SEARCH_PERF_STAT */
+
+/*******************************************************************//**
+This is a linear congruential generator PRNG. Returns a pseudo random
+number between 0 and 2^64-1 inclusive. The formula and the constants
+being used are:
+X[n+1] = (a * X[n] + c) mod m
+where:
+X[0] = ut_time_us(NULL)
+a = 1103515245 (3^5 * 5 * 7 * 129749)
+c = 12345 (3 * 5 * 823)
+m = 18446744073709551616 (2^64)
+
+@return number between 0 and 2^64-1 */
+static
+ib_uint64_t
+page_cur_lcg_prng(void)
+/*===================*/
+{
+#define LCG_a 1103515245
+#define LCG_c 12345
+ static ib_uint64_t lcg_current = 0;
+ static ibool initialized = FALSE;
+
+ if (!initialized) {
+ lcg_current = (ib_uint64_t) ut_time_us(NULL);
+ initialized = TRUE;
+ }
+
+	/* no need to "% 2^64" explicitly because lcg_current is
+	a 64-bit unsigned integer and overflow wraps modulo 2^64 anyway */
+ lcg_current = LCG_a * lcg_current + LCG_c;
+
+ return(lcg_current);
+}
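
The recurrence is easy to check in isolation; a standalone sketch with
plain uint64_t arithmetic, where unsigned overflow supplies the
"mod 2^64" for free (fixed seed here instead of ut_time_us(NULL)):

	#include <cstdint>
	#include <cstdio>

	int main()
	{
		const uint64_t a = 1103515245;	/* LCG_a */
		const uint64_t c = 12345;	/* LCG_c */
		uint64_t x = 42;		/* arbitrary seed */

		for (int i = 0; i < 3; i++) {
			/* X[n+1] = (a * X[n] + c) mod 2^64; the modulo
			is implicit in 64-bit unsigned arithmetic. */
			x = a * x + c;
			printf("%llu\n", (unsigned long long) x);
		}
		return 0;
	}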
+
+/****************************************************************//**
+Tries a search shortcut based on the last insert.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_cur_try_search_shortcut(
+/*=========================*/
+ const buf_block_t* block, /*!< in: index page */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint low_bytes;
+ ulint up_match;
+ ulint up_bytes;
+#ifdef UNIV_SEARCH_DEBUG
+ page_cur_t cursor2;
+#endif
+ ibool success = FALSE;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ offsets = rec_get_offsets(rec, index, offsets,
+ dtuple_get_n_fields(tuple), &heap);
+
+ ut_ad(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ ut_pair_min(&low_match, &low_bytes,
+ *ilow_matched_fields, *ilow_matched_bytes,
+ *iup_matched_fields, *iup_matched_bytes);
+
+ up_match = low_match;
+ up_bytes = low_bytes;
+
+ if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets,
+ &low_match, &low_bytes) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets,
+ &up_match, &up_bytes) >= 0) {
+ goto exit_func;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+#ifdef UNIV_SEARCH_DEBUG
+ page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG,
+ iup_matched_fields,
+ iup_matched_bytes,
+ ilow_matched_fields,
+ ilow_matched_bytes,
+ &cursor2);
+ ut_a(cursor2.rec == cursor->rec);
+
+ if (!page_rec_is_supremum(next_rec)) {
+
+ ut_a(*iup_matched_fields == up_match);
+ ut_a(*iup_matched_bytes == up_bytes);
+ }
+
+ ut_a(*ilow_matched_fields == low_match);
+ ut_a(*ilow_matched_bytes == low_bytes);
+#endif
+ if (!page_rec_is_supremum(next_rec)) {
+
+ *iup_matched_fields = up_match;
+ *iup_matched_bytes = up_bytes;
+ }
+
+ *ilow_matched_fields = low_match;
+ *ilow_matched_bytes = low_bytes;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+ success = TRUE;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+#endif
+
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+/****************************************************************//**
+Checks if the nth field in a record is a character type field which extends
+the nth field in tuple, i.e., the field is longer or equal in length and has
+common first characters.
+@return TRUE if rec field extends tuple field */
+static
+ibool
+page_cur_rec_field_extends(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: compare nth field */
+{
+ const dtype_t* type;
+ const dfield_t* dfield;
+ const byte* rec_f;
+ ulint rec_f_len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ dfield = dtuple_get_nth_field(tuple, n);
+
+ type = dfield_get_type(dfield);
+
+ rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len);
+
+ if (type->mtype == DATA_VARCHAR
+ || type->mtype == DATA_CHAR
+ || type->mtype == DATA_FIXBINARY
+ || type->mtype == DATA_BINARY
+ || type->mtype == DATA_BLOB
+ || type->mtype == DATA_VARMYSQL
+ || type->mtype == DATA_MYSQL) {
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL
+ && rec_f_len != UNIV_SQL_NULL
+ && rec_f_len >= dfield_get_len(dfield)
+ && !cmp_data_data_slow(type->mtype, type->prtype,
+ dfield_get_data(dfield),
+ dfield_get_len(dfield),
+ rec_f, dfield_get_len(dfield))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint up_matched_bytes;
+ ulint low_matched_fields;
+ ulint low_matched_bytes;
+ ulint cur_matched_fields;
+ ulint cur_matched_bytes;
+ int cmp;
+#ifdef UNIV_SEARCH_DEBUG
+ int dbg_cmp;
+ ulint dbg_matched_fields;
+ ulint dbg_matched_bytes;
+#endif
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes
+ && ilow_matched_fields && ilow_matched_bytes && cursor);
+ ut_ad(dtuple_validate(tuple));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE);
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_check_dir(page);
+
+#ifdef PAGE_CUR_ADAPT
+ if (page_is_leaf(page)
+ && (mode == PAGE_CUR_LE)
+ && (page_header_get_field(page, PAGE_N_DIRECTION) > 3)
+ && (page_header_get_ptr(page, PAGE_LAST_INSERT))
+ && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) {
+
+ if (page_cur_try_search_shortcut(
+ block, index, tuple,
+ iup_matched_fields, iup_matched_bytes,
+ ilow_matched_fields, ilow_matched_bytes,
+ cursor)) {
+ return;
+ }
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
+
+ up_matched_fields = *iup_matched_fields;
+ up_matched_bytes = *iup_matched_bytes;
+ low_matched_fields = *ilow_matched_fields;
+ low_matched_bytes = *ilow_matched_bytes;
+
+ /* Perform binary search. First the search is done through the page
+ directory, after that as a linear search in the list of records
+ owned by the upper limit directory slot. */
+
+ low = 0;
+ up = page_dir_get_n_slots(page) - 1;
+
+ /* Perform binary search until the lower and upper limit directory
+ slots come to the distance 1 of each other */
+
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(mid_rec, index, offsets,
+ dtuple_get_n_fields_cmp(tuple),
+ &heap);
+
+ cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+ &cur_matched_fields,
+ &cur_matched_bytes);
+ if (UNIV_LIKELY(cmp > 0)) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (UNIV_EXPECT(cmp, -1)) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+ /* Perform linear search until the upper and lower records come to
+ distance 1 of each other. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(mid_rec, index, offsets,
+ dtuple_get_n_fields_cmp(tuple),
+ &heap);
+
+ cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+ &cur_matched_fields,
+ &cur_matched_bytes);
+ if (UNIV_LIKELY(cmp > 0)) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (UNIV_EXPECT(cmp, -1)) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+
+ /* Check that the lower and upper limit records have the
+ right alphabetical order compared to tuple. */
+ dbg_matched_fields = 0;
+ dbg_matched_bytes = 0;
+
+ offsets = rec_get_offsets(low_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets,
+ &dbg_matched_fields,
+ &dbg_matched_bytes);
+ if (mode == PAGE_CUR_G) {
+ ut_a(dbg_cmp >= 0);
+ } else if (mode == PAGE_CUR_GE) {
+ ut_a(dbg_cmp == 1);
+ } else if (mode == PAGE_CUR_L) {
+ ut_a(dbg_cmp == 1);
+ } else if (mode == PAGE_CUR_LE) {
+ ut_a(dbg_cmp >= 0);
+ }
+
+ if (!page_rec_is_infimum(low_rec)) {
+
+ ut_a(low_matched_fields == dbg_matched_fields);
+ ut_a(low_matched_bytes == dbg_matched_bytes);
+ }
+
+ dbg_matched_fields = 0;
+ dbg_matched_bytes = 0;
+
+ offsets = rec_get_offsets(up_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets,
+ &dbg_matched_fields,
+ &dbg_matched_bytes);
+ if (mode == PAGE_CUR_G) {
+ ut_a(dbg_cmp == -1);
+ } else if (mode == PAGE_CUR_GE) {
+ ut_a(dbg_cmp <= 0);
+ } else if (mode == PAGE_CUR_L) {
+ ut_a(dbg_cmp <= 0);
+ } else if (mode == PAGE_CUR_LE) {
+ ut_a(dbg_cmp == -1);
+ }
+
+ if (!page_rec_is_supremum(up_rec)) {
+
+ ut_a(up_matched_fields == dbg_matched_fields);
+ ut_a(up_matched_bytes == dbg_matched_bytes);
+ }
+#endif
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *iup_matched_bytes = up_matched_bytes;
+ *ilow_matched_fields = low_matched_fields;
+ *ilow_matched_bytes = low_matched_bytes;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
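
For intuition, the same two-phase strategy on a plain sorted array: a
hypothetical sketch (not InnoDB types) of a binary search over sparse
"directory" entries followed by a linear scan, here for the PAGE_CUR_LE
case and assuming keys[dir[0]] <= key < keys[dir[n_slots - 1]], as the
infimum and supremum records guarantee on a page:

	#include <cstddef>

	static size_t
	two_phase_search(const int* keys, const size_t* dir,
			 size_t n_slots, int key)
	{
		size_t	low = 0;
		size_t	up = n_slots - 1;

		/* Phase 1: binary search until the two directory
		slots are adjacent. */
		while (up - low > 1) {
			size_t	mid = (low + up) / 2;

			if (keys[dir[mid]] <= key) {
				low = mid;
			} else {
				up = mid;
			}
		}

		/* Phase 2: linear scan of the entries between the
		two remaining slots. */
		size_t	i = dir[low];

		while (i + 1 < dir[up] && keys[i + 1] <= key) {
			i++;
		}

		return(i);	/* last position with keys[i] <= key */
	}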
+
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint rnd;
+ ulint n_recs = page_get_n_recs(buf_block_get_frame(block));
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return;
+ }
+
+ rnd = (ulint) (page_cur_lcg_prng() % n_recs);
+
+ do {
+ page_cur_move_to_next(cursor);
+ } while (rnd--);
+}
+
+/***********************************************************//**
+Writes the log record of a record insert on a page. */
+static
+void
+page_cur_insert_rec_write_log(
+/*==========================*/
+ rec_t* insert_rec, /*!< in: inserted physical record */
+ ulint rec_size, /*!< in: insert_rec size */
+ rec_t* cursor_rec, /*!< in: record the
+ cursor is pointing to */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint cur_rec_size;
+ ulint extra_size;
+ ulint cur_extra_size;
+ const byte* ins_ptr;
+ byte* log_ptr;
+ const byte* log_end;
+ ulint i;
+
+ ut_a(rec_size < UNIV_PAGE_SIZE);
+ ut_ad(page_align(insert_rec) == page_align(cursor_rec));
+ ut_ad(!page_rec_is_comp(insert_rec)
+ == !dict_table_is_comp(index->table));
+
+ {
+ mem_heap_t* heap = NULL;
+ ulint cur_offs_[REC_OFFS_NORMAL_SIZE];
+ ulint ins_offs_[REC_OFFS_NORMAL_SIZE];
+
+ ulint* cur_offs;
+ ulint* ins_offs;
+
+ rec_offs_init(cur_offs_);
+ rec_offs_init(ins_offs_);
+
+ cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_,
+ ULINT_UNDEFINED, &heap);
+ ins_offs = rec_get_offsets(insert_rec, index, ins_offs_,
+ ULINT_UNDEFINED, &heap);
+
+ extra_size = rec_offs_extra_size(ins_offs);
+ cur_extra_size = rec_offs_extra_size(cur_offs);
+ ut_ad(rec_size == rec_offs_size(ins_offs));
+ cur_rec_size = rec_offs_size(cur_offs);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ ins_ptr = insert_rec - extra_size;
+
+ i = 0;
+
+ if (cur_extra_size == extra_size) {
+ ulint min_rec_size = ut_min(cur_rec_size, rec_size);
+
+ const byte* cur_ptr = cursor_rec - cur_extra_size;
+
+ /* Find out the first byte in insert_rec which differs from
+ cursor_rec; skip the bytes in the record info */
+
+ do {
+ if (*ins_ptr == *cur_ptr) {
+ i++;
+ ins_ptr++;
+ cur_ptr++;
+ } else if ((i < extra_size)
+ && (i >= extra_size
+ - page_rec_get_base_extra_size
+ (insert_rec))) {
+ i = extra_size;
+ ins_ptr = insert_rec;
+ cur_ptr = cursor_rec;
+ } else {
+ break;
+ }
+ } while (i < min_rec_size);
+ }
+
+ if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) {
+
+ if (page_rec_is_comp(insert_rec)) {
+ log_ptr = mlog_open_and_write_index(
+ mtr, insert_rec, index, MLOG_COMP_REC_INSERT,
+ 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ /* Logging in mtr is switched off
+ during crash recovery: in that case
+ mlog_open returns NULL */
+ return;
+ }
+ } else {
+ log_ptr = mlog_open(mtr, 11
+ + 2 + 5 + 1 + 5 + 5
+ + MLOG_BUF_MARGIN);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ /* Logging in mtr is switched off
+ during crash recovery: in that case
+ mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ insert_rec, MLOG_REC_INSERT, log_ptr, mtr);
+ }
+
+ log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+ /* Write the cursor rec offset as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(cursor_rec));
+ log_ptr += 2;
+ } else {
+ log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash
+ recovery: in that case mlog_open returns NULL */
+ return;
+ }
+ log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+ }
+
+ if (page_rec_is_comp(insert_rec)) {
+ if (UNIV_UNLIKELY
+ (rec_get_info_and_status_bits(insert_rec, TRUE)
+ != rec_get_info_and_status_bits(cursor_rec, TRUE))) {
+
+ goto need_extra_info;
+ }
+ } else {
+ if (UNIV_UNLIKELY
+ (rec_get_info_and_status_bits(insert_rec, FALSE)
+ != rec_get_info_and_status_bits(cursor_rec, FALSE))) {
+
+ goto need_extra_info;
+ }
+ }
+
+ if (extra_size != cur_extra_size || rec_size != cur_rec_size) {
+need_extra_info:
+ /* Write the record end segment length
+ and the extra info storage flag */
+ log_ptr += mach_write_compressed(log_ptr,
+ 2 * (rec_size - i) + 1);
+
+ /* Write the info bits */
+ mach_write_to_1(log_ptr,
+ rec_get_info_and_status_bits(
+ insert_rec,
+ page_rec_is_comp(insert_rec)));
+ log_ptr++;
+
+ /* Write the record origin offset */
+ log_ptr += mach_write_compressed(log_ptr, extra_size);
+
+ /* Write the mismatch index */
+ log_ptr += mach_write_compressed(log_ptr, i);
+
+ ut_a(i < UNIV_PAGE_SIZE);
+ ut_a(extra_size < UNIV_PAGE_SIZE);
+ } else {
+ /* Write the record end segment length
+ and the extra info storage flag */
+ log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i));
+ }
+
+ /* Write to the log the inserted index record end segment which
+ differs from the cursor record */
+
+ rec_size -= i;
+
+ if (log_ptr + rec_size <= log_end) {
+ memcpy(log_ptr, ins_ptr, rec_size);
+ mlog_close(mtr, log_ptr + rec_size);
+ } else {
+ mlog_close(mtr, log_ptr);
+ ut_a(rec_size < UNIV_PAGE_SIZE);
+ mlog_catenate_string(mtr, ins_ptr, rec_size);
+ }
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_insert_rec_write_log(ins_rec,size,cur,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
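
For orientation, the byte layout the function above emits in the
non-short-insert case, reconstructed from the writes shown (a
hypothetical annotation, not an authoritative redo format spec):

	/* MLOG_REC_INSERT / MLOG_COMP_REC_INSERT body:
	2 bytes        cursor record offset within the page
	compressed     2 * (rec_size - i), plus 1 if extra info follows
	-- only if the extra info flag is set: --
	1 byte         info and status bits of insert_rec
	compressed     record origin offset (extra_size)
	compressed     mismatch index i
	rec_size - i   end segment of insert_rec that differs
	               from cursor_rec */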
+
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ ibool is_short,/*!< in: TRUE if short inserts */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint origin_offset;
+ ulint end_seg_len;
+ ulint mismatch_index;
+ page_t* page;
+ rec_t* cursor_rec;
+ byte buf1[1024];
+ byte* buf;
+ byte* ptr2 = ptr;
+ ulint info_and_status_bits = 0; /* remove warning */
+ page_cur_t cursor;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = block ? buf_block_get_frame(block) : NULL;
+
+ if (is_short) {
+ cursor_rec = page_rec_get_prev(page_get_supremum_rec(page));
+ } else {
+ ulint offset;
+
+ /* Read the cursor rec offset as a 2-byte ulint */
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + 2)) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ cursor_rec = page + offset;
+
+ if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) {
+
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (end_seg_len & 0x1UL) {
+ /* Read the info bits */
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_and_status_bits = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(origin_offset < UNIV_PAGE_SIZE);
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(mismatch_index < UNIV_PAGE_SIZE);
+ }
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) {
+
+ return(NULL);
+ }
+
+ if (!block) {
+
+ return(ptr + (end_seg_len >> 1));
+ }
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+ ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+ /* Read from the log the inserted index record end segment which
+ differs from the cursor record */
+
+ offsets = rec_get_offsets(cursor_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!(end_seg_len & 0x1UL)) {
+ info_and_status_bits = rec_get_info_and_status_bits(
+ cursor_rec, page_is_comp(page));
+ origin_offset = rec_offs_extra_size(offsets);
+ mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1);
+ }
+
+ end_seg_len >>= 1;
+
+ if (mismatch_index + end_seg_len < sizeof buf1) {
+ buf = buf1;
+ } else {
+ buf = static_cast<byte*>(
+ mem_alloc(mismatch_index + end_seg_len));
+ }
+
+ /* Build the inserted record to buf */
+
+ if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "Is short %lu, info_and_status_bits %lu, offset %lu, "
+ "o_offset %lu\n"
+ "mismatch index %lu, end_seg_len %lu\n"
+ "parsed len %lu\n",
+ (ulong) is_short, (ulong) info_and_status_bits,
+ (ulong) page_offset(cursor_rec),
+ (ulong) origin_offset,
+ (ulong) mismatch_index, (ulong) end_seg_len,
+ (ulong) (ptr - ptr2));
+
+ fputs("Dump of 300 bytes of log:\n", stderr);
+ ut_print_buf(stderr, ptr2, 300);
+ putc('\n', stderr);
+
+ buf_page_print(page, 0, 0);
+
+ ut_error;
+ }
+
+ ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index);
+ ut_memcpy(buf + mismatch_index, ptr, end_seg_len);
+
+ if (page_is_comp(page)) {
+ rec_set_info_and_status_bits(buf + origin_offset,
+ info_and_status_bits);
+ } else {
+ rec_set_info_bits_old(buf + origin_offset,
+ info_and_status_bits);
+ }
+
+ page_cur_position(cursor_rec, block, &cursor);
+
+ offsets = rec_get_offsets(buf + origin_offset, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor,
+ buf + origin_offset,
+ index, offsets, mtr))) {
+ /* The redo log record should only have been written
+ after the write was successful. */
+ ut_error;
+ }
+
+ if (buf != buf1) {
+
+ mem_free(buf);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(ptr + end_seg_len);
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor on an uncompressed page.
+Returns a pointer to the inserted record if it succeeds, i.e., if enough
+space is available; NULL otherwise. The cursor stays at the same position.
+@return pointer to record on success, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ rec_t* current_rec,/*!< in: pointer to current record after
+ which the new record is inserted */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ byte* insert_buf;
+ ulint rec_size;
+ page_t* page; /*!< the relevant page */
+ rec_t* last_insert; /*!< cursor position at previous
+ insert */
+ rec_t* free_rec; /*!< a free record that was reused,
+ or NULL */
+ rec_t* insert_rec; /*!< inserted record */
+ ulint heap_no; /*!< heap number of the inserted
+ record */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ page = page_align(current_rec);
+ ut_ad(dict_table_is_comp(index->table)
+ == (ibool) !!page_is_comp(page));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || recv_recovery_is_on()
+ || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index)));
+
+ ut_ad(!page_rec_is_supremum(current_rec));
+
+ /* 1. Get the size of the physical record in the page */
+ rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ const void* rec_start
+ = rec - rec_offs_extra_size(offsets);
+ ulint extra_size
+ = rec_offs_extra_size(offsets)
+ - (rec_offs_comp(offsets)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ /* 2. Try to find suitable space from page memory management */
+
+ free_rec = page_header_get_ptr(page, PAGE_FREE);
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* Try to allocate from the head of the free list. */
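+		/* PAGE_FREE points to a singly-linked list of records
+		that have been deleted from the page. Only the first
+		record of the list is considered here; if it is too
+		small, the allocation falls back to the heap. */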
+ ulint foffsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* foffsets = foffsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(foffsets_);
+
+ foffsets = rec_get_offsets(
+ free_rec, index, foffsets, ULINT_UNDEFINED, &heap);
+ if (rec_offs_size(foffsets) < rec_size) {
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ goto use_heap;
+ }
+
+ insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+ if (page_is_comp(page)) {
+ heap_no = rec_get_heap_no_new(free_rec);
+ page_mem_alloc_free(page, NULL,
+ rec_get_next_ptr(free_rec, TRUE),
+ rec_size);
+ } else {
+ heap_no = rec_get_heap_no_old(free_rec);
+ page_mem_alloc_free(page, NULL,
+ rec_get_next_ptr(free_rec, FALSE),
+ rec_size);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+use_heap:
+ free_rec = NULL;
+ insert_buf = page_mem_alloc_heap(page, NULL,
+ rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(insert_buf == NULL)) {
+ return(NULL);
+ }
+ }
+
+ /* 3. Create the record */
+ insert_rec = rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(current_rec != insert_rec);
+
+ {
+ /* next record after current before the insertion */
+ rec_t* next_rec = page_rec_get_next(current_rec);
+#ifdef UNIV_DEBUG
+ if (page_is_comp(page)) {
+ ut_ad(rec_get_status(current_rec)
+ <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+ }
+#endif
+ page_rec_set_next(insert_rec, next_rec);
+ page_rec_set_next(current_rec, insert_rec);
+ }
+
+ page_header_set_field(page, NULL, PAGE_N_RECS,
+ 1 + page_get_n_recs(page));
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ if (page_is_comp(page)) {
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec, heap_no);
+ } else {
+ rec_set_n_owned_old(insert_rec, 0);
+ rec_set_heap_no_old(insert_rec, heap_no);
+ }
+
+ UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+ /* 6. Update the last insertion info in page header */
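+	/* PAGE_LAST_INSERT, PAGE_DIRECTION and PAGE_N_DIRECTION track
+	whether consecutive inserts arrive in ascending or descending
+	key order. The B-tree page split code uses this hint to split a
+	sequentially filled page at the insertion point rather than in
+	the middle, e.g. for inserts of an ascending primary key. */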
+
+ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ ut_ad(!last_insert || !page_is_comp(page)
+ || rec_get_node_ptr_flag(last_insert)
+ == rec_get_node_ptr_flag(insert_rec));
+
+ if (UNIV_UNLIKELY(last_insert == NULL)) {
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+
+ } else if ((last_insert == current_rec)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_LEFT)) {
+
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_RIGHT);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+
+ } else if ((page_rec_get_next(insert_rec) == last_insert)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_RIGHT)) {
+
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_LEFT);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+ } else {
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+ }
+
+ page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec);
+
+ /* 7. It remains to update the owner record. */
+ {
+ rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
+ ulint n_owned;
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(owner_rec);
+ rec_set_n_owned_new(owner_rec, NULL, n_owned + 1);
+ } else {
+ n_owned = rec_get_n_owned_old(owner_rec);
+ rec_set_n_owned_old(owner_rec, n_owned + 1);
+ }
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
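+		/* For example, with PAGE_DIR_SLOT_MAX_N_OWNED == 8, a
+		slot that comes to own 8 records is split into two
+		slots owning about 4 records each, keeping the linear
+		scan after the directory binary search short. */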
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+ page_dir_split_slot(
+ page, NULL,
+ page_dir_find_owner_slot(owner_rec));
+ }
+ }
+
+ /* 9. Write log record of the insert */
+ if (UNIV_LIKELY(mtr != NULL)) {
+ page_cur_insert_rec_write_log(insert_rec, rec_size,
+ current_rec, index, mtr);
+ }
+
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert");
+
+ return(insert_rec);
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor on a compressed page and on
+its uncompressed copy. Returns a pointer to the inserted record on
+success, i.e., if enough space is available; NULL otherwise.
+The cursor stays at the same position.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record on success, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ page_cur_t* cursor, /*!< in/out: page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ byte* insert_buf;
+ ulint rec_size;
+ page_t* page; /*!< the relevant page */
+ rec_t* last_insert; /*!< cursor position at previous
+ insert */
+ rec_t* free_rec; /*!< a free record that was reused,
+ or NULL */
+ rec_t* insert_rec; /*!< inserted record */
+ ulint heap_no; /*!< heap number of the inserted
+ record */
+ page_zip_des_t* page_zip;
+
+ page_zip = page_cur_get_page_zip(cursor);
+ ut_ad(page_zip);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ page = page_cur_get_page(cursor);
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(page_is_comp(page));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || recv_recovery_is_on()
+ || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index)));
+
+ ut_ad(!page_cur_is_after_last(cursor));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* 1. Get the size of the physical record in the page */
+ rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ const void* rec_start
+ = rec - rec_offs_extra_size(offsets);
+ ulint extra_size
+ = rec_offs_extra_size(offsets)
+ - (rec_offs_comp(offsets)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ const bool reorg_before_insert = page_has_garbage(page)
+ && rec_size > page_get_max_insert_size(page, 1)
+ && rec_size <= page_get_max_insert_size_after_reorganize(
+ page, 1);
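+	/* reorg_before_insert means: the record does not fit into the
+	currently available free space as it is, but it would fit if
+	the space wasted by deleted records (garbage) were reclaimed
+	by reorganizing the page. */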
+
+ /* 2. Try to find suitable space from page memory management */
+ if (!page_zip_available(page_zip, dict_index_is_clust(index),
+ rec_size, 1)
+ || reorg_before_insert) {
+ /* The values can change dynamically. */
+ bool log_compressed = page_zip_log_pages;
+ ulint level = page_zip_level;
+#ifdef UNIV_DEBUG
+ rec_t* cursor_rec = page_cur_get_rec(cursor);
+#endif /* UNIV_DEBUG */
+
+ /* If we are not writing compressed page images, we
+ must reorganize the page before attempting the
+ insert. */
+ if (recv_recovery_is_on()) {
+ /* Insert into the uncompressed page only.
+ The page reorganization or creation that we
+ would attempt outside crash recovery would
+ have been covered by a previous redo log record. */
+ } else if (page_is_empty(page)) {
+ ut_ad(page_cur_is_before_first(cursor));
+
+ /* This is an empty page. Recreate it to
+ get rid of the modification log. */
+ page_create_zip(page_cur_get_block(cursor), index,
+ page_header_get_field(page, PAGE_LEVEL),
+ 0, mtr);
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+ if (page_zip_available(
+ page_zip, dict_index_is_clust(index),
+ rec_size, 1)) {
+ goto use_heap;
+ }
+
+ /* The cursor should remain on the page infimum. */
+ return(NULL);
+ } else if (!page_zip->m_nonempty && !page_has_garbage(page)) {
+ /* The page has been freshly compressed, so
+ reorganizing it will not help. */
+ } else if (log_compressed && !reorg_before_insert) {
+ /* Insert into uncompressed page only, and
+ try page_zip_reorganize() afterwards. */
+ } else if (btr_page_reorganize_low(
+ recv_recovery_is_on(), level,
+ cursor, index, mtr)) {
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
+
+ if (page_zip_available(
+ page_zip, dict_index_is_clust(index),
+ rec_size, 1)) {
+ /* After reorganizing, there is space
+ available. */
+ goto use_heap;
+ }
+ } else {
+ ut_ad(cursor->rec == cursor_rec);
+ return(NULL);
+ }
+
+ /* Try compressing the whole page afterwards. */
+ insert_rec = page_cur_insert_rec_low(
+ cursor->rec, index, rec, offsets, NULL);
+
+		/* If recovery is on, this implies that the compression
+		of the page was successful during runtime. Had that not
+		been the case, or had the redo logging of compressed
+		pages been enabled during runtime, we would have seen
+		a MLOG_ZIP_PAGE_COMPRESS redo record. Therefore, we
+		know that we do not need to reorganize the page. We do,
+		however, need to recompress the page. That will happen
+		when the next redo record is read, which must be of
+		type MLOG_ZIP_PAGE_COMPRESS_NO_DATA and must contain
+		a valid compression level value.
+		This implies that during recovery, from this point
+		until the next redo record is applied, the uncompressed
+		and compressed versions of the page are not identical,
+		and page_zip_validate() would fail. That is OK, because
+		we call page_zip_validate() only after processing all
+		changes to a page under a single mtr during recovery. */
+ if (insert_rec == NULL) {
+ /* Out of space.
+ This should never occur during crash recovery,
+ because the MLOG_COMP_REC_INSERT should only
+ be logged after a successful operation. */
+ ut_ad(!recv_recovery_is_on());
+ } else if (recv_recovery_is_on()) {
+ /* This should be followed by
+ MLOG_ZIP_PAGE_COMPRESS_NO_DATA,
+ which should succeed. */
+ rec_offs_make_valid(insert_rec, index, offsets);
+ } else {
+ ulint pos = page_rec_get_n_recs_before(insert_rec);
+ ut_ad(pos > 0);
+
+ if (!log_compressed) {
+ if (page_zip_compress(
+ page_zip, page, index,
+ level, NULL)) {
+ page_cur_insert_rec_write_log(
+ insert_rec, rec_size,
+ cursor->rec, index, mtr);
+ page_zip_compress_write_log_no_data(
+ level, page, index, mtr);
+
+ rec_offs_make_valid(
+ insert_rec, index, offsets);
+ return(insert_rec);
+ }
+
+ ut_ad(cursor->rec
+ == (pos > 1
+ ? page_rec_get_nth(
+ page, pos - 1)
+ : page + PAGE_NEW_INFIMUM));
+ } else {
+ /* We are writing entire page images
+ to the log. Reduce the redo log volume
+ by reorganizing the page at the same time. */
+ if (page_zip_reorganize(
+ cursor->block, index, mtr)) {
+ /* The page was reorganized:
+ Seek to pos. */
+ if (pos > 1) {
+ cursor->rec = page_rec_get_nth(
+ page, pos - 1);
+ } else {
+ cursor->rec = page
+ + PAGE_NEW_INFIMUM;
+ }
+
+ insert_rec = page + rec_get_next_offs(
+ cursor->rec, TRUE);
+ rec_offs_make_valid(
+ insert_rec, index, offsets);
+ return(insert_rec);
+ }
+
+				/* Theoretically, we could make one
+				last-resort attempt with
+				btr_page_reorganize_low() followed by
+				page_zip_available(), but that would
+				be very unlikely to succeed. (If even
+				the fully reorganized page failed to
+				compress, why would compressing the
+				page plus logging the insert of this
+				record succeed?) */
+ }
+
+ /* Out of space: restore the page */
+ btr_blob_dbg_remove(page, index, "insert_zip_fail");
+ if (!page_zip_decompress(page_zip, page, FALSE)) {
+ ut_error; /* Memory corrupted? */
+ }
+ ut_ad(page_validate(page, index));
+ btr_blob_dbg_add(page, index, "insert_zip_fail");
+ insert_rec = NULL;
+ }
+
+ return(insert_rec);
+ }
+
+ free_rec = page_header_get_ptr(page, PAGE_FREE);
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* Try to allocate from the head of the free list. */
+ lint extra_size_diff;
+ ulint foffsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* foffsets = foffsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(foffsets_);
+
+ foffsets = rec_get_offsets(free_rec, index, foffsets,
+ ULINT_UNDEFINED, &heap);
+ if (rec_offs_size(foffsets) < rec_size) {
+too_small:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ goto use_heap;
+ }
+
+ insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+ /* On compressed pages, do not relocate records from
+ the free list. If extra_size would grow, use the heap. */
+ extra_size_diff
+ = rec_offs_extra_size(offsets)
+ - rec_offs_extra_size(foffsets);
+
+ if (UNIV_UNLIKELY(extra_size_diff < 0)) {
+			/* The new record has a smaller header
+			(extra_size) than free_rec. Shift insert_buf
+			forward, so that the record origin remains at
+			free_rec; fall back to the heap if the data
+			part would not fit in the freed slot. */
+ if (rec_offs_size(foffsets)
+ < rec_size - extra_size_diff) {
+
+ goto too_small;
+ }
+
+ insert_buf -= extra_size_diff;
+ } else if (UNIV_UNLIKELY(extra_size_diff)) {
+ /* Do not allow extra_size to grow */
+
+ goto too_small;
+ }
+
+ heap_no = rec_get_heap_no_new(free_rec);
+ page_mem_alloc_free(page, page_zip,
+ rec_get_next_ptr(free_rec, TRUE),
+ rec_size);
+
+ if (!page_is_leaf(page)) {
+ /* Zero out the node pointer of free_rec,
+ in case it will not be overwritten by
+ insert_rec. */
+
+ ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+ if (rec_offs_extra_size(foffsets)
+ + rec_offs_data_size(foffsets) > rec_size) {
+
+ memset(rec_get_end(free_rec, foffsets)
+ - REC_NODE_PTR_SIZE, 0,
+ REC_NODE_PTR_SIZE);
+ }
+ } else if (dict_index_is_clust(index)) {
+ /* Zero out the DB_TRX_ID and DB_ROLL_PTR
+ columns of free_rec, in case it will not be
+ overwritten by insert_rec. */
+
+ ulint trx_id_col;
+ ulint trx_id_offs;
+ ulint len;
+
+ trx_id_col = dict_index_get_sys_col_pos(index,
+ DATA_TRX_ID);
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ trx_id_offs = rec_get_nth_field_offs(foffsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs
+ + rec_offs_extra_size(foffsets) > rec_size) {
+ /* We will have to zero out the
+ DB_TRX_ID and DB_ROLL_PTR, because
+ they will not be fully overwritten by
+ insert_rec. */
+
+ memset(free_rec + trx_id_offs, 0,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+
+ ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN
+ == rec_get_nth_field(free_rec, foffsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+use_heap:
+ free_rec = NULL;
+ insert_buf = page_mem_alloc_heap(page, page_zip,
+ rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(insert_buf == NULL)) {
+ return(NULL);
+ }
+
+ page_zip_dir_add_slot(page_zip, dict_index_is_clust(index));
+ }
+
+ /* 3. Create the record */
+ insert_rec = rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(cursor->rec != insert_rec);
+
+ {
+ /* next record after current before the insertion */
+ const rec_t* next_rec = page_rec_get_next_low(
+ cursor->rec, TRUE);
+ ut_ad(rec_get_status(cursor->rec)
+ <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+ page_rec_set_next(insert_rec, next_rec);
+ page_rec_set_next(cursor->rec, insert_rec);
+ }
+
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ 1 + page_get_n_recs(page));
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec, heap_no);
+
+ UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+
+ page_zip_dir_insert(page_zip, cursor->rec, free_rec, insert_rec);
+
+ /* 6. Update the last insertion info in page header */
+
+ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ ut_ad(!last_insert
+ || rec_get_node_ptr_flag(last_insert)
+ == rec_get_node_ptr_flag(insert_rec));
+
+ if (UNIV_UNLIKELY(last_insert == NULL)) {
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+ } else if ((last_insert == cursor->rec)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_LEFT)) {
+
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_RIGHT);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+
+ } else if ((page_rec_get_next(insert_rec) == last_insert)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_RIGHT)) {
+
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_LEFT);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+ } else {
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+ }
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec);
+
+ /* 7. It remains to update the owner record. */
+ {
+ rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
+ ulint n_owned;
+
+ n_owned = rec_get_n_owned_new(owner_rec);
+ rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1);
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+ page_dir_split_slot(
+ page, page_zip,
+ page_dir_find_owner_slot(owner_rec));
+ }
+ }
+
+ page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "insert_zip_ok");
+
+ /* 9. Write log record of the insert */
+ if (UNIV_LIKELY(mtr != NULL)) {
+ page_cur_insert_rec_write_log(insert_rec, rec_size,
+ cursor->rec, index, mtr);
+ }
+
+ return(insert_rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Writes a log record of copying a record list end to a newly created page.
+@return pointer to the 4-byte field where the log data length is to be
+written, or NULL if logging is disabled */
+UNIV_INLINE
+byte*
+page_copy_rec_list_to_created_page_write_log(
+/*=========================================*/
+ page_t* page, /*!< in: index page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, page, index,
+ page_is_comp(page)
+ ? MLOG_COMP_LIST_END_COPY_CREATED
+ : MLOG_LIST_END_COPY_CREATED, 4);
+ if (UNIV_LIKELY(log_ptr != NULL)) {
+ mlog_close(mtr, log_ptr + 4);
+ }
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a log record of copying a record list end to a newly created page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ byte* rec_end;
+ ulint log_data_len;
+ page_t* page;
+ page_zip_des_t* page_zip;
+
+ if (ptr + 4 > end_ptr) {
+
+ return(NULL);
+ }
+
+ log_data_len = mach_read_from_4(ptr);
+ ptr += 4;
+
+ rec_end = ptr + log_data_len;
+
+ if (rec_end > end_ptr) {
+
+ return(NULL);
+ }
+
+ if (!block) {
+
+ return(rec_end);
+ }
+
+ while (ptr < rec_end) {
+ ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr,
+ block, index, mtr);
+ }
+
+ ut_a(ptr == rec_end);
+
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+ return(rec_end);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /*!< in/out: index page to copy to */
+ rec_t* rec, /*!< in: first record to copy */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_dir_slot_t* slot = 0; /* remove warning */
+ byte* heap_top;
+ rec_t* insert_rec = 0; /* remove warning */
+ rec_t* prev_rec;
+ ulint count;
+ ulint n_recs;
+ ulint slot_index;
+ ulint rec_size;
+ ulint log_mode;
+ byte* log_ptr;
+ ulint log_data_len;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
+ ut_ad(page_align(rec) != new_page);
+ ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page));
+
+ if (page_rec_is_infimum(rec)) {
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ /* To pass the debug tests we have to set these dummy values
+ in the debug version */
+ page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2);
+ page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP,
+ new_page + UNIV_PAGE_SIZE - 1);
+#endif
+
+ log_ptr = page_copy_rec_list_to_created_page_write_log(new_page,
+ index, mtr);
+
+ log_data_len = dyn_array_get_data_size(&(mtr->log));
+
+ /* Individual inserts are logged in a shorter form */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS);
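+	/* In MTR_LOG_SHORT_INSERTS mode, the per-record redo entries
+	omit the page identification: they all apply to the page named
+	in the MLOG_LIST_END_COPY_CREATED record written above, and are
+	parsed with page_cur_parse_insert_rec(TRUE, ...) during
+	recovery. */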
+
+ prev_rec = page_get_infimum_rec(new_page);
+ if (page_is_comp(new_page)) {
+ heap_top = new_page + PAGE_NEW_SUPREMUM_END;
+ } else {
+ heap_top = new_page + PAGE_OLD_SUPREMUM_END;
+ }
+ count = 0;
+ slot_index = 0;
+ n_recs = 0;
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ insert_rec = rec_copy(heap_top, rec, offsets);
+
+ if (page_is_comp(new_page)) {
+ rec_set_next_offs_new(prev_rec,
+ page_offset(insert_rec));
+
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec,
+ PAGE_HEAP_NO_USER_LOW + n_recs);
+ } else {
+ rec_set_next_offs_old(prev_rec,
+ page_offset(insert_rec));
+
+ rec_set_n_owned_old(insert_rec, 0);
+ rec_set_heap_no_old(insert_rec,
+ PAGE_HEAP_NO_USER_LOW + n_recs);
+ }
+
+ count++;
+ n_recs++;
+
+ if (UNIV_UNLIKELY
+ (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) {
+
+ slot_index++;
+
+ slot = page_dir_get_nth_slot(new_page, slot_index);
+
+ page_dir_slot_set_rec(slot, insert_rec);
+ page_dir_slot_set_n_owned(slot, NULL, count);
+
+ count = 0;
+ }
+
+ rec_size = rec_offs_size(offsets);
+
+ ut_ad(heap_top < new_page + UNIV_PAGE_SIZE);
+
+ heap_top += rec_size;
+
+ rec_offs_make_valid(insert_rec, index, offsets);
+ btr_blob_dbg_add_rec(insert_rec, index, offsets, "copy_end");
+
+ page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
+ index, mtr);
+ prev_rec = insert_rec;
+ rec = page_rec_get_next(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if ((slot_index > 0) && (count + 1
+ + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2
+ <= PAGE_DIR_SLOT_MAX_N_OWNED)) {
+		/* We can merge the two last dir slots. This is done
+		here so that this function produces exactly the same
+		page layout as the equivalent sequence of
+		page_cur_insert_rec calls, which database recovery
+		uses to reproduce the work of this function. Imitating
+		the layout exactly makes it possible to verify the
+		correctness of recovery. */
+
+ count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ page_dir_slot_set_n_owned(slot, NULL, 0);
+
+ slot_index--;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len;
+
+ ut_a(log_data_len < 100 * UNIV_PAGE_SIZE);
+
+ if (UNIV_LIKELY(log_ptr != NULL)) {
+ mach_write_to_4(log_ptr, log_data_len);
+ }
+
+ if (page_is_comp(new_page)) {
+ rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM);
+ }
+
+ slot = page_dir_get_nth_slot(new_page, 1 + slot_index);
+
+ page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page));
+ page_dir_slot_set_n_owned(slot, NULL, count + 1);
+
+ page_dir_set_n_slots(new_page, NULL, 2 + slot_index);
+ page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top);
+ page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs);
+ page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs);
+
+ page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(new_page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0);
+
+ /* Restore the log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+}
+
+/***********************************************************//**
+Writes a log record of a record delete on a page.
+UNIV_INLINE
+void
+page_cur_delete_rec_write_log(
+/*==========================*/
+ rec_t* rec, /*!< in: record to be deleted */
+ const dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index,
+ page_rec_is_comp(rec)
+ ? MLOG_COMP_REC_DELETE
+ : MLOG_REC_DELETE, 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ /* Write the cursor rec offset as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(rec));
+
+ mlog_close(mtr, log_ptr + 2);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_delete_rec_write_log(rec,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a log record of a record delete on a page.
+@return pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint offset;
+ page_cur_t cursor;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ /* Read the cursor rec offset as a 2-byte ulint */
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (block) {
+ page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_t* rec = page + offset;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cursor);
+ ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+ page_cur_delete_rec(&cursor, index,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ return(ptr);
+}
+
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the next
+record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(
+ cursor->rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle
+ or NULL */
+{
+ page_dir_slot_t* cur_dir_slot;
+ page_dir_slot_t* prev_slot;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* current_rec;
+ rec_t* prev_rec = NULL;
+ rec_t* next_rec;
+ ulint cur_slot_no;
+ ulint cur_n_owned;
+ rec_t* rec;
+
+ page = page_cur_get_page(cursor);
+ page_zip = page_cur_get_page_zip(cursor);
+
+ /* page_zip_validate() will fail here when
+ btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
+ Then, both "page_zip" and "page" would have the min-rec-mark
+ set on the smallest user record, but "page" would additionally
+ have it set on the smallest-but-one record. Because sloppy
+ page_zip_validate_low() only ignores min-rec-flag differences
+ in the smallest user record, it cannot be used here either. */
+
+ current_rec = cursor->rec;
+ ut_ad(rec_offs_validate(current_rec, index, offsets));
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id || recv_recovery_is_on()
+ || (mtr ? mtr->inside_ibuf : dict_index_is_ibuf(index)));
+
+ /* The record must not be the supremum or infimum record. */
+ ut_ad(page_rec_is_user_rec(current_rec));
+
+ if (page_get_n_recs(page) == 1 && !recv_recovery_is_on()) {
+ /* Empty the page, unless we are applying the redo log
+ during crash recovery. During normal operation, the
+ page_create_empty() gets logged as one of MLOG_PAGE_CREATE,
+ MLOG_COMP_PAGE_CREATE, MLOG_ZIP_PAGE_COMPRESS. */
+ ut_ad(page_is_leaf(page));
+ /* Usually, this should be the root page,
+ and the whole index tree should become empty.
+ However, this could also be a call in
+ btr_cur_pessimistic_update() to delete the only
+ record in the page and to insert another one. */
+ page_cur_move_to_next(cursor);
+ ut_ad(page_cur_is_after_last(cursor));
+ page_create_empty(page_cur_get_block(cursor),
+ const_cast<dict_index_t*>(index), mtr);
+ return;
+ }
+
+ /* Save to local variables some data associated with current_rec */
+ cur_slot_no = page_dir_find_owner_slot(current_rec);
+ ut_ad(cur_slot_no > 0);
+ cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
+ cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
+
+ /* 0. Write the log record */
+ if (mtr != 0) {
+ page_cur_delete_rec_write_log(current_rec, index, mtr);
+ }
+
+ /* 1. Reset the last insert info in the page header and increment
+ the modify clock for the frame */
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+	/* The page becomes invalid for optimistic searches: increment
+	the frame modify clock, but only if a mini-transaction covers
+	the change. During IMPORT we allocate local blocks that are not
+	part of the buffer pool. */
+
+ if (mtr != 0) {
+ buf_block_modify_clock_inc(page_cur_get_block(cursor));
+ }
+
+ /* 2. Find the next and the previous record. Note that the cursor is
+ left at the next record. */
+
+ ut_ad(cur_slot_no > 0);
+ prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1);
+
+ rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+ /* rec now points to the record of the previous directory slot. Look
+ for the immediate predecessor of current_rec in a loop. */
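+	/* Because a directory slot owns at most
+	PAGE_DIR_SLOT_MAX_N_OWNED records, this loop terminates after
+	at most 8 steps. */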
+
+	while (current_rec != rec) {
+ prev_rec = rec;
+ rec = page_rec_get_next(rec);
+ }
+
+ page_cur_move_to_next(cursor);
+ next_rec = cursor->rec;
+
+ /* 3. Remove the record from the linked list of records */
+
+ page_rec_set_next(prev_rec, next_rec);
+
+ /* 4. If the deleted record is pointed to by a dir slot, update the
+ record pointer in slot. In the following if-clause we assume that
+ prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
+ >= 2. */
+
+#if PAGE_DIR_SLOT_MIN_N_OWNED < 2
+# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2"
+#endif
+ ut_ad(cur_n_owned > 1);
+
+ if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) {
+ page_dir_slot_set_rec(cur_dir_slot, prev_rec);
+ }
+
+ /* 5. Update the number of owned records of the slot */
+
+ page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
+
+ /* 6. Free the memory occupied by the record */
+ btr_blob_dbg_remove_rec(current_rec, const_cast<dict_index_t*>(index),
+ offsets, "delete");
+ page_mem_free(page, page_zip, current_rec, index, offsets);
+
+ /* 7. Now we have decremented the number of owned records of the slot.
+ If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
+ slots. */
+
+ if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+ page_dir_balance_slot(page, page_zip, cur_slot_no);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Print the first n numbers, generated by page_cur_lcg_prng() to make sure
+(visually) that it works properly. */
+void
+test_page_cur_lcg_prng(
+/*===================*/
+ int n) /*!< in: print first n numbers */
+{
+ int i;
+ unsigned long long rnd;
+
+ for (i = 0; i < n; i++) {
+ rnd = page_cur_lcg_prng();
+ printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n",
+ rnd,
+ rnd % 2,
+ rnd % 3,
+ rnd % 5,
+ rnd % 7,
+ rnd % 11);
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
new file mode 100644
index 00000000000..bd5fb36af8f
--- /dev/null
+++ b/storage/innobase/page/page0page.cc
@@ -0,0 +1,2813 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0page.cc
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0page.h"
+#ifdef UNIV_NONINL
+#include "page0page.ic"
+#endif
+#undef THIS_MODULE
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "buf0buf.h"
+#include "btr0btr.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "lock0lock.h"
+# include "fut0lst.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/* THE INDEX PAGE
+ ==============
+
+The index page consists of a page header, which contains the page's
+id and other information. On top of it are the index records,
+stored in a heap and linked into a singly-linked list in alphabetical
+order.
+
+Just below the page end is an array of pointers, which we call the page
+directory, pointing to about every sixth record in the list. The
+pointers are placed in the directory in the alphabetical order of the
+records pointed to, enabling us to do a binary search over the array.
+Each slot number I in the directory points to a record, in which a
+4-bit field contains a count of the records that are in the linear
+list between pointer I and pointer I - 1 in the directory, including
+the record pointed to by pointer I and excluding the record pointed to
+by I - 1. We say that the record pointed to by slot I, or slot I
+itself, owns these records. The count is always kept in the range 4 to
+8, with the exception that it is 1 for the first slot and 1 to 8 for
+the second slot.
+
+An essentially binary search can be performed in the list of index
+records, as if we had a pointer to every record in the page directory.
+The data structure is, however, more efficient for inserts, because
+most inserts are just pushed onto the heap. Only every 8th insert
+requires a block move in the directory pointer table, which itself is
+quite small. A record is deleted from the page by just taking it off
+the linear list, updating the number-of-owned-records field of the
+record which owns it, and updating the page directory if necessary. A
+special case is the one where the record owns itself. Because the
+overhead of inserts is so small, we may also increase the page size
+from the projected default of 8 kB to 64 kB without too much loss of
+insert efficiency. A bigger page becomes practical as the disk
+transfer rate rises relative to seek and latency times. On the present
+system, the page size is set so that the page transfer time (3 ms) is
+20 % of the disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 4 bytes = 200 bytes. */
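+
+/* A rough cost estimate for the example above: a binary search over
+the 50 directory slots takes about log2(50), i.e. 6 probes, after
+which at most 8 records (the maximum a slot can own) are scanned
+linearly in the record list. */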
+
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_t* page;
+ register uint16 rec_offs_bytes;
+ register const page_dir_slot_t* slot;
+ register const page_dir_slot_t* first_slot;
+ register const rec_t* r = rec;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+ first_slot = page_dir_get_nth_slot(page, 0);
+ slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1);
+
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(r) == 0) {
+ r = rec_get_next_ptr_const(r, TRUE);
+ ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+ ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+ }
+ } else {
+ while (rec_get_n_owned_old(r) == 0) {
+ r = rec_get_next_ptr_const(r, FALSE);
+ ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+ ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+ }
+ }
+
+ rec_offs_bytes = mach_encode_2(r - page);
+
+ while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+ if (UNIV_UNLIKELY(slot == first_slot)) {
+ fprintf(stderr,
+ "InnoDB: Probable data corruption on"
+ " page %lu\n"
+ "InnoDB: Original record ",
+ (ulong) page_get_page_no(page));
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, rec);
+ }
+
+ fputs("\n"
+ "InnoDB: on that page.\n"
+ "InnoDB: Cannot find the dir slot for record ",
+ stderr);
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, page
+ + mach_decode_2(rec_offs_bytes));
+ }
+ fputs("\n"
+ "InnoDB: on that page!\n", stderr);
+
+ buf_page_print(page, 0, 0);
+
+ ut_error;
+ }
+
+ slot += PAGE_DIR_SLOT_SIZE;
+ }
+
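+	/* The page directory grows downwards from the end of the page,
+	so slots with higher numbers reside at lower addresses; hence
+	the slot number is (first_slot - slot) / PAGE_DIR_SLOT_SIZE. */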
+ return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/**************************************************************//**
+Used to check the consistency of a directory slot.
+@return TRUE on success */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+ const page_dir_slot_t* slot) /*!< in: slot */
+{
+ const page_t* page;
+ ulint n_slots;
+ ulint n_owned;
+
+ ut_a(slot);
+
+ page = page_align(slot);
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_a(slot <= page_dir_get_nth_slot(page, 0));
+ ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+ ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+ } else {
+ n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+ }
+
+ if (slot == page_dir_get_nth_slot(page, 0)) {
+ ut_a(n_owned == 1);
+ } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+ ut_a(n_owned >= 1);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ } else {
+ ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */
+{
+ page_t* page = buf_block_get_frame(block);
+#ifndef UNIV_HOTBACKUP
+ ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#endif /* !UNIV_HOTBACKUP */
+
+ /* It is not necessary to write this change to the redo log, as
+ during a database recovery we assume that the max trx id of every
+ page is the maximum trx id assigned before the crash. */
+
+ if (page_zip) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ 8, mtr);
+#ifndef UNIV_HOTBACKUP
+ } else if (mtr) {
+ mlog_write_ull(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ trx_id, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ } else {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+ }
+}
+
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ ulint need, /*!< in: total number of bytes needed */
+ ulint* heap_no)/*!< out: this contains the heap number
+ of the allocated record
+ if allocation succeeds */
+{
+ byte* block;
+ ulint avl_space;
+
+ ut_ad(page && heap_no);
+
+ avl_space = page_get_max_insert_size(page, 1);
+
+ if (avl_space >= need) {
+ block = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP,
+ block + need);
+ *heap_no = page_dir_get_n_heap(page);
+
+ page_dir_set_n_heap(page, page_zip, 1 + *heap_no);
+
+ return(block);
+ }
+
+ return(NULL);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Writes a log record of page creation. */
+UNIV_INLINE
+void
+page_create_write_log(
+/*==================*/
+ buf_frame_t* frame, /*!< in: a buffer frame where the page is
+ created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ibool comp) /*!< in: TRUE=compact page format */
+{
+ mlog_write_initial_log_record(frame, comp
+ ? MLOG_COMP_PAGE_CREATE
+ : MLOG_PAGE_CREATE, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_create_write_log(frame,mtr,comp) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+	/* The log record body is empty; only the initial log record
+	part is present */
+
+ if (block) {
+ page_create(block, mtr, comp);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************//**
+The index page creation function.
+@return pointer to the page */
+static
+page_t*
+page_create_low(
+/*============*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ page_dir_slot_t* slot;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* heap_top;
+ rec_t* infimum_rec;
+ rec_t* supremum_rec;
+ page_t* page;
+ dict_index_t* index;
+ ulint* offsets;
+
+ ut_ad(block);
+#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA"
+#endif
+#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA"
+#endif
+
+ /* The infimum and supremum records use a dummy index. */
+ if (UNIV_LIKELY(comp)) {
+ index = dict_ind_compact;
+ } else {
+ index = dict_ind_redundant;
+ }
+
+ /* 1. INCREMENT MODIFY CLOCK */
+ buf_block_modify_clock_inc(block);
+
+ page = buf_block_get_frame(block);
+
+ fil_page_set_type(page, FIL_PAGE_INDEX);
+
+ heap = mem_heap_create(200);
+
+ /* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */
+
+ /* Create first a data tuple for infimum record */
+ tuple = dtuple_create(heap, 1);
+ dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM);
+ field = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(field, "infimum", 8);
+ dtype_set(dfield_get_type(field),
+ DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8);
+ /* Set the corresponding physical record to its place in the page
+ record heap */
+
+ heap_top = page + PAGE_DATA;
+
+ infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+ if (UNIV_LIKELY(comp)) {
+ ut_a(infimum_rec == page + PAGE_NEW_INFIMUM);
+
+ rec_set_n_owned_new(infimum_rec, NULL, 1);
+ rec_set_heap_no_new(infimum_rec, 0);
+ } else {
+ ut_a(infimum_rec == page + PAGE_OLD_INFIMUM);
+
+ rec_set_n_owned_old(infimum_rec, 1);
+ rec_set_heap_no_old(infimum_rec, 0);
+ }
+
+ offsets = rec_get_offsets(infimum_rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ heap_top = rec_get_end(infimum_rec, offsets);
+
+ /* Create then a tuple for supremum */
+
+ tuple = dtuple_create(heap, 1);
+ dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM);
+ field = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(field, "supremum", comp ? 8 : 9);
+ dtype_set(dfield_get_type(field),
+ DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9);
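+	/* Note: the length 8 for "infimum" includes the terminating
+	NUL byte, whereas "supremum" (8 characters) is stored without
+	a NUL in the compact format and with one (9 bytes) in the
+	redundant format. */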
+
+ supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+ if (UNIV_LIKELY(comp)) {
+ ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM);
+
+ rec_set_n_owned_new(supremum_rec, NULL, 1);
+ rec_set_heap_no_new(supremum_rec, 1);
+ } else {
+ ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM);
+
+ rec_set_n_owned_old(supremum_rec, 1);
+ rec_set_heap_no_old(supremum_rec, 1);
+ }
+
+ offsets = rec_get_offsets(supremum_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ heap_top = rec_get_end(supremum_rec, offsets);
+
+ ut_ad(heap_top == page
+ + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END));
+
+ mem_heap_free(heap);
+
+ /* 4. INITIALIZE THE PAGE */
+
+ page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2);
+ page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top);
+ page_header_set_field(page, NULL, PAGE_N_HEAP, comp
+ ? 0x8000 | PAGE_HEAP_NO_USER_LOW
+ : PAGE_HEAP_NO_USER_LOW);
+ page_header_set_ptr(page, NULL, PAGE_FREE, NULL);
+ page_header_set_field(page, NULL, PAGE_GARBAGE, 0);
+ page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+ page_header_set_field(page, NULL, PAGE_N_RECS, 0);
+ page_set_max_trx_id(block, NULL, 0, NULL);
+ memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START
+ - page_offset(heap_top));
+
+ /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */
+
+ /* Set the slots to point to infimum and supremum. */
+
+ slot = page_dir_get_nth_slot(page, 0);
+ page_dir_slot_set_rec(slot, infimum_rec);
+
+ slot = page_dir_get_nth_slot(page, 1);
+ page_dir_slot_set_rec(slot, supremum_rec);
+
+ /* Set the next pointers in infimum and supremum */
+
+ if (UNIV_LIKELY(comp)) {
+ rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM);
+ rec_set_next_offs_new(supremum_rec, 0);
+ } else {
+ rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM);
+ rec_set_next_offs_old(supremum_rec, 0);
+ }
+
+ return(page);
+}
+
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ page_create_write_log(buf_block_get_frame(block), mtr, comp);
+ return(page_create_low(block, comp));
+}
+
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame where the
+ page is created */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ page_t* page;
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(block);
+ ut_ad(page_zip);
+ ut_ad(index);
+ ut_ad(dict_table_is_comp(index->table));
+
+ page = page_create_low(block, TRUE);
+ mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + page, level);
+ mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + page, max_trx_id);
+
+ if (!page_zip_compress(page_zip, page, index,
+ page_zip_level, mtr)) {
+ /* The compression of a newly created page
+ should always succeed. */
+ ut_error;
+ }
+
+ return(page);
+}
+
+/**********************************************************//**
+Empty a previously created B-tree index page. */
+UNIV_INTERN
+void
+page_create_empty(
+/*==============*/
+ buf_block_t* block, /*!< in/out: B-tree block */
+ dict_index_t* index, /*!< in: the index of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_id_t max_trx_id = 0;
+ const page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ max_trx_id = page_get_max_trx_id(page);
+ ut_ad(max_trx_id);
+ }
+
+ if (page_zip) {
+ page_create_zip(block, index,
+ page_header_get_field(page, PAGE_LEVEL),
+ max_trx_id, mtr);
+ } else {
+ page_create(block, mtr, page_is_comp(page));
+
+ if (max_trx_id) {
+ page_update_max_trx_id(
+ block, page_zip, max_trx_id, mtr);
+ }
+ }
+}
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit(). */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_cur_t cur1;
+ rec_t* cur2;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ btr_assert_not_corrupted(new_block, index);
+ ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+ ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint)
+ (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+
+ cur2 = page_get_infimum_rec(buf_block_get_frame(new_block));
+
+ /* Copy records from the original page to the new page */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ rec_t* ins_rec;
+ offsets = rec_get_offsets(cur1_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ins_rec = page_cur_insert_rec_low(cur2, index,
+ cur1_rec, offsets, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ /* Track an assertion failure reported on the mailing
+ list on June 18th, 2003 */
+
+ buf_page_print(new_page, 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ buf_page_print(page_align(rec), 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ "InnoDB: rec offset %lu, cur1 offset %lu,"
+ " cur2 offset %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(page_cur_get_rec(&cur1)),
+ (ulong) page_offset(cur2));
+ ut_error;
+ }
+
+ page_cur_move_to_next(&cur1);
+ cur2 = ins_rec;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to new_page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_t* page = page_align(rec);
+ rec_t* ret = page_rec_get_next(
+ page_get_infimum_rec(new_page));
+ ulint log_mode = 0; /* remove warning */
+
+#ifdef UNIV_ZIP_DEBUG
+ if (new_page_zip) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+
+ /* Strict page_zip_validate() may fail here.
+ Furthermore, btr_compress() may set FIL_PAGE_PREV to
+ FIL_NULL on new_page while leaving it intact on
+ new_page_zip. So, we cannot validate new_page_zip. */
+ ut_a(page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(buf_block_get_frame(block) == page);
+ ut_ad(page_is_leaf(page) == page_is_leaf(new_page));
+ ut_ad(page_is_comp(page) == page_is_comp(new_page));
+ /* Here, "ret" may be pointing to a user record or the
+ predefined supremum record. */
+
+ if (new_page_zip) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) {
+ page_copy_rec_list_end_to_created_page(new_page, rec,
+ index, mtr);
+ } else {
+ page_copy_rec_list_end_no_locks(new_block, block, rec,
+ index, mtr);
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(new_page_zip, new_page,
+ index, page_zip_level, mtr)) {
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ulint ret_pos
+ = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the successor of
+ the predefined infimum record. It must still
+ have at least one predecessor (the predefined
+ infimum record, or a freshly copied record
+ that is smaller than "ret"). */
+ ut_a(ret_pos > 0);
+
+ if (!page_zip_reorganize(new_block, index, mtr)) {
+
+ btr_blob_dbg_remove(new_page, index,
+ "copy_end_reorg_fail");
+ if (!page_zip_decompress(new_page_zip,
+ new_page, FALSE)) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_end_reorg_fail");
+ return(NULL);
+ } else {
+ /* The page was reorganized:
+ Seek to ret_pos. */
+ ret = new_page + PAGE_NEW_INFIMUM;
+
+ do {
+ ret = rec_get_next_ptr(ret, TRUE);
+ } while (--ret_pos);
+ }
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ lock_move_rec_list_end(new_block, block, rec);
+
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Copies records from page to new_page, up to the given record,
+NOT including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_cur_t cur1;
+ rec_t* cur2;
+	ulint		log_mode	= 0; /* remove warning */
+ mem_heap_t* heap = NULL;
+ rec_t* ret
+ = page_rec_get_prev(page_get_supremum_rec(new_page));
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ /* Here, "ret" may be pointing to a user record or the
+ predefined infimum record. */
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(ret);
+ }
+
+ if (new_page_zip) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ cur2 = ret;
+
+ /* Copy records from the original page to the new page */
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ offsets = rec_get_offsets(cur1_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ cur2 = page_cur_insert_rec_low(cur2, index,
+ cur1_rec, offsets, mtr);
+ ut_a(cur2);
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page_align(rec))) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page_align(rec)),
+ mtr);
+ }
+
+ if (new_page_zip) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ DBUG_EXECUTE_IF("page_copy_rec_list_start_compress_fail",
+ goto zip_reorganize;);
+
+ if (!page_zip_compress(new_page_zip, new_page, index,
+ page_zip_level, mtr)) {
+
+ ulint ret_pos;
+#ifndef DBUG_OFF
+zip_reorganize:
+#endif /* !DBUG_OFF */
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ret_pos = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the predecessor
+ of the predefined supremum record. If it was
+ the predefined infimum record, then it would
+ still be the infimum, and we would have
+ ret_pos == 0. */
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index, mtr))) {
+
+ btr_blob_dbg_remove(new_page, index,
+ "copy_start_reorg_fail");
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+ btr_blob_dbg_add(new_page, index,
+ "copy_start_reorg_fail");
+ return(NULL);
+ }
+
+ /* The page was reorganized: Seek to ret_pos. */
+ ret = page_rec_get_nth(new_page, ret_pos);
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ lock_move_rec_list_start(new_block, block, rec, ret);
+
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+ return(ret);
+}
+
+/**********************************************************//**
+Writes a log record of a record list end or start deletion. */
+UNIV_INLINE
+void
+page_delete_rec_list_write_log(
+/*===========================*/
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: operation type:
+ MLOG_LIST_END_DELETE, ... */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ ut_ad(type == MLOG_LIST_END_DELETE
+ || type == MLOG_LIST_START_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE);
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2);
+ if (log_ptr) {
+ /* Write the parameter as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(rec));
+ mlog_close(mtr, log_ptr + 2);
+ }
+}
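+
+/* For reference, the redo record written above is tiny: after the
+type byte and the index information emitted by
+mlog_open_and_write_index(), only the 2-byte page offset of "rec"
+follows.  A sketch of the layout, inferred from the code rather than
+a normative format description:
+
+	[type: MLOG_(COMP_)LIST_{END,START}_DELETE]
+	[index information]
+	[page offset of rec: 2 bytes]
+
+page_parse_delete_rec_list() below consumes exactly this offset. */
+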
+#else /* !UNIV_HOTBACKUP */
+# define page_delete_rec_list_write_log(rec,index,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ byte type, /*!< in: MLOG_LIST_END_DELETE,
+ MLOG_LIST_START_DELETE,
+ MLOG_COMP_LIST_END_DELETE or
+ MLOG_COMP_LIST_START_DELETE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ page_t* page;
+ ulint offset;
+
+ ut_ad(type == MLOG_LIST_END_DELETE
+ || type == MLOG_LIST_START_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE);
+
+ /* Read the record offset as a 2-byte ulint */
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (!block) {
+
+ return(ptr);
+ }
+
+ page = buf_block_get_frame(block);
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ if (type == MLOG_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE) {
+ page_delete_rec_list_end(page + offset, block, index,
+ ULINT_UNDEFINED, ULINT_UNDEFINED,
+ mtr);
+ } else {
+ page_delete_rec_list_start(page + offset, block, index, mtr);
+ }
+
+ return(ptr);
+}
+
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_dir_slot_t*slot;
+ ulint slot_index;
+ rec_t* last_rec;
+ rec_t* prev_rec;
+ ulint n_owned;
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ page_t* page = page_align(rec);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE);
+ ut_ad(!page_zip || page_rec_is_comp(rec));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_supremum(rec)) {
+ ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
+ /* Nothing to do, there are no records bigger than the
+ page supremum. */
+ return;
+ }
+
+ if (recv_recovery_is_on()) {
+ /* If we are replaying a redo log record, we must
+ replay it exactly. Since MySQL 5.6.11, we should be
+ generating a redo log record for page creation if
+ the page would become empty. Thus, this branch should
+ only be executed when applying redo log that was
+ generated by an older version of MySQL. */
+ } else if (page_rec_is_infimum(rec)
+ || n_recs == page_get_n_recs(page)) {
+delete_all:
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ } else if (page_is_comp(page)) {
+ if (page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1) == rec) {
+ /* We are deleting everything from the first
+ user record onwards. */
+ goto delete_all;
+ }
+ } else {
+ if (page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0) == rec) {
+ /* We are deleting everything from the first
+ user record onwards. */
+ goto delete_all;
+ }
+ }
+
+ /* Reset the last insert info in the page header and increment
+ the modify clock for the frame */
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+ /* The page gets invalid for optimistic searches: increment the
+ frame modify clock */
+
+ buf_block_modify_clock_inc(block);
+
+ page_delete_rec_list_write_log(rec, index, page_is_comp(page)
+ ? MLOG_COMP_LIST_END_DELETE
+ : MLOG_LIST_END_DELETE, mtr);
+
+ if (page_zip) {
+ ulint log_mode;
+
+ ut_a(page_is_comp(page));
+ /* Individual deletes are not logged */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ do {
+ page_cur_t cur;
+ page_cur_position(rec, block, &cur);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ rec = rec_get_next_ptr(rec, TRUE);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&cur, index, offsets, mtr);
+ } while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Restore log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+ return;
+ }
+
+ prev_rec = page_rec_get_prev(rec);
+
+ last_rec = page_rec_get_prev(page_get_supremum_rec(page));
+
+ if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) {
+ rec_t* rec2 = rec;
+ /* Calculate the sum of sizes and the number of records */
+ size = 0;
+ n_recs = 0;
+
+ do {
+ ulint s;
+ offsets = rec_get_offsets(rec2, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ s = rec_offs_size(offsets);
+ ut_ad(rec2 - page + s - rec_offs_extra_size(offsets)
+ < UNIV_PAGE_SIZE);
+ ut_ad(size + s < UNIV_PAGE_SIZE);
+ size += s;
+ n_recs++;
+
+ rec2 = page_rec_get_next(rec2);
+ } while (!page_rec_is_supremum(rec2));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ ut_ad(size < UNIV_PAGE_SIZE);
+
+ /* Update the page directory; there is no need to balance the number
+ of the records owned by the supremum record, as it is allowed to be
+ less than PAGE_DIR_SLOT_MIN_N_OWNED */
+
+ if (page_is_comp(page)) {
+ rec_t* rec2 = rec;
+ ulint count = 0;
+
+ while (rec_get_n_owned_new(rec2) == 0) {
+ count++;
+
+ rec2 = rec_get_next_ptr(rec2, TRUE);
+ }
+
+ ut_ad(rec_get_n_owned_new(rec2) > count);
+
+ n_owned = rec_get_n_owned_new(rec2) - count;
+ slot_index = page_dir_find_owner_slot(rec2);
+ ut_ad(slot_index > 0);
+ slot = page_dir_get_nth_slot(page, slot_index);
+ } else {
+ rec_t* rec2 = rec;
+ ulint count = 0;
+
+ while (rec_get_n_owned_old(rec2) == 0) {
+ count++;
+
+ rec2 = rec_get_next_ptr(rec2, FALSE);
+ }
+
+ ut_ad(rec_get_n_owned_old(rec2) > count);
+
+ n_owned = rec_get_n_owned_old(rec2) - count;
+ slot_index = page_dir_find_owner_slot(rec2);
+ ut_ad(slot_index > 0);
+ slot = page_dir_get_nth_slot(page, slot_index);
+ }
+
+ page_dir_slot_set_rec(slot, page_get_supremum_rec(page));
+ page_dir_slot_set_n_owned(slot, NULL, n_owned);
+
+ page_dir_set_n_slots(page, NULL, slot_index + 1);
+
+ /* Remove the record chain segment from the record chain */
+ page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+
+ btr_blob_dbg_op(page, rec, index, "delete_end",
+ btr_blob_dbg_remove_rec);
+
+ /* Catenate the deleted chain segment to the page free list */
+
+ page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
+ page_header_set_ptr(page, NULL, PAGE_FREE, rec);
+
+ page_header_set_field(page, NULL, PAGE_GARBAGE, size
+ + page_header_get_field(page, PAGE_GARBAGE));
+
+ page_header_set_field(page, NULL, PAGE_N_RECS,
+ (ulint)(page_get_n_recs(page) - n_recs));
+}
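+
+/* A worked example of the directory fix-up above (illustrative
+numbers): let rec2 be the next owning record at or after rec, with an
+n_owned of 6, and let exactly two non-owning records (rec itself and
+one more) precede rec2 in the deleted range.  Then count == 2 and
+n_owned == 6 - 2 == 4: the slot of rec2 is redirected to the supremum
+and now owns the three surviving records plus the supremum itself.
+All later slots are dropped by page_dir_set_n_slots(), and the
+deleted chain is spliced onto the PAGE_FREE list for later reuse. */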
+
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t cur1;
+ ulint log_mode;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ byte type;
+
+ rec_offs_init(offsets_);
+
+ ut_ad((ibool) !!page_rec_is_comp(rec)
+ == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+
+ /* page_zip_validate() would detect a min_rec_mark mismatch
+ in btr_page_split_and_insert()
+ between btr_attach_half_pages() and insert_page = ...
+ when btr_page_get_split_rec_to_left() holds
+ (direction == FSP_DOWN). */
+ ut_a(!page_zip
+ || page_zip_validate_low(page_zip, page, index, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+ return;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+ /* We are deleting all records. */
+ page_create_empty(block, index, mtr);
+ return;
+ }
+
+ if (page_rec_is_comp(rec)) {
+ type = MLOG_COMP_LIST_START_DELETE;
+ } else {
+ type = MLOG_LIST_START_DELETE;
+ }
+
+ page_delete_rec_list_write_log(rec, index, type, mtr);
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ /* Individual deletes are not logged */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(page_cur_get_rec(&cur1), index,
+ offsets, ULINT_UNDEFINED, &heap);
+ page_cur_delete_rec(&cur1, index, offsets, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Restore log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+}
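+
+/* The logging idiom above is worth noting: a single logical redo
+record (written by page_delete_rec_list_write_log()) covers the whole
+multi-record operation, so the individual deletes run with logging
+disabled.  A sketch of the pattern, using names from this file:
+
+	page_delete_rec_list_write_log(rec, index, type, mtr);
+	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+	... the individual page_cur_delete_rec() calls ...
+	mtr_set_log_mode(mtr, log_mode);
+*/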
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ ulint old_data_size;
+ ulint new_data_size;
+ ulint old_n_recs;
+ ulint new_n_recs;
+
+ old_data_size = page_get_data_size(new_page);
+ old_n_recs = page_get_n_recs(new_page);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!new_page_zip == !page_zip);
+ ut_a(!new_page_zip
+ || page_zip_validate(new_page_zip, new_page, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page_align(split_rec),
+ index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ new_data_size = page_get_data_size(new_page);
+ new_n_recs = page_get_n_recs(new_page);
+
+ ut_ad(new_data_size >= old_data_size);
+
+ page_delete_rec_list_end(split_rec, block, index,
+ new_n_recs - old_n_recs,
+ new_data_size - old_data_size, mtr);
+
+ return(TRUE);
+}
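+
+/* Illustrative sketch (assumptions: the blocks, the split record and
+the mini-transaction have been set up by the caller, e.g. during a
+B-tree page split):
+
+	if (!page_move_rec_list_end(new_block, block, split_rec,
+				    index, mtr)) {
+		... compression of new_block failed and it was
+		decompressed; the caller must recover, e.g. by
+		reorganizing new_block or retrying the split ...
+	}
+*/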
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ page_delete_rec_list_start(split_rec, block, index, mtr);
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Used to delete one slot from the directory. This function also updates
+the n_owned fields in the records, so that the first slot after
+the deleted one inherits the records of the deleted slot. */
+UNIV_INLINE
+void
+page_dir_delete_slot(
+/*=================*/
+ page_t* page, /*!< in/out: the index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: slot to be deleted */
+{
+ page_dir_slot_t* slot;
+ ulint n_owned;
+ ulint i;
+ ulint n_slots;
+
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+ ut_ad(slot_no + 1 < page_dir_get_n_slots(page));
+
+ n_slots = page_dir_get_n_slots(page);
+
+ /* 1. Reset the n_owned fields of the slots to be
+ deleted */
+ slot = page_dir_get_nth_slot(page, slot_no);
+ n_owned = page_dir_slot_get_n_owned(slot);
+ page_dir_slot_set_n_owned(slot, page_zip, 0);
+
+ /* 2. Update the n_owned value of the first non-deleted slot */
+
+ slot = page_dir_get_nth_slot(page, slot_no + 1);
+ page_dir_slot_set_n_owned(slot, page_zip,
+ n_owned + page_dir_slot_get_n_owned(slot));
+
+ /* 3. Destroy the slot by copying slots */
+ for (i = slot_no + 1; i < n_slots; i++) {
+ rec_t* rec = (rec_t*)
+ page_dir_slot_get_rec(page_dir_get_nth_slot(page, i));
+ page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec);
+ }
+
+ /* 4. Zero out the last slot, which will be removed */
+ mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0);
+
+ /* 5. Update the page header */
+ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1);
+}
+
+/**************************************************************//**
+Used to add a slot to the directory. Does not set the record pointer
+in the added slot or update n_owned values: this is the responsibility
+of the caller. */
+UNIV_INLINE
+void
+page_dir_add_slot(
+/*==============*/
+ page_t* page, /*!< in/out: the index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+ ulint start) /*!< in: the slot above which the new slots
+ are added */
+{
+ page_dir_slot_t* slot;
+ ulint n_slots;
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_ad(start < n_slots - 1);
+
+ /* Update the page header */
+ page_dir_set_n_slots(page, page_zip, n_slots + 1);
+
+ /* Move slots up */
+ slot = page_dir_get_nth_slot(page, n_slots);
+ memmove(slot, slot + PAGE_DIR_SLOT_SIZE,
+ (n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE);
+}
+
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be written, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+{
+ rec_t* rec;
+ page_dir_slot_t* new_slot;
+ page_dir_slot_t* prev_slot;
+ page_dir_slot_t* slot;
+ ulint i;
+ ulint n_owned;
+
+ ut_ad(page);
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ n_owned = page_dir_slot_get_n_owned(slot);
+ ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
+
+ /* 1. We loop to find a record approximately in the middle of the
+ records owned by the slot. */
+
+ prev_slot = page_dir_get_nth_slot(page, slot_no - 1);
+ rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+ for (i = 0; i < n_owned / 2; i++) {
+ rec = page_rec_get_next(rec);
+ }
+
+ ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED);
+
+ /* 2. We add one directory slot immediately below the slot to be
+ split. */
+
+ page_dir_add_slot(page, page_zip, slot_no - 1);
+
+ /* The added slot is now number slot_no, and the old slot is
+ now number slot_no + 1 */
+
+ new_slot = page_dir_get_nth_slot(page, slot_no);
+ slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+ /* 3. We store the appropriate values to the new slot. */
+
+ page_dir_slot_set_rec(new_slot, rec);
+ page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2);
+
+ /* 4. Finally, we update the number of records field of the
+ original slot */
+
+ page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2));
+}
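+
+/* Worked example (assuming PAGE_DIR_SLOT_MAX_N_OWNED == 8 and
+PAGE_DIR_SLOT_MIN_N_OWNED == 4, the values defined in page0page.h):
+a slot is split exactly when it owns 8 + 1 == 9 records.  The new
+slot takes n_owned / 2 == 4 records, which satisfies the minimum,
+and the original slot keeps the remaining 9 - 4 == 5. */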
+
+/*************************************************************//**
+Tries to balance the given directory slot with too few records with the upper
+neighbor, so that there are at least the minimum number of records owned by
+the slot; this may result in the merging of two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+{
+ page_dir_slot_t* slot;
+ page_dir_slot_t* up_slot;
+ ulint n_owned;
+ ulint up_n_owned;
+ rec_t* old_rec;
+ rec_t* new_rec;
+
+ ut_ad(page);
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ /* The last directory slot cannot be balanced with the upper
+ neighbor, as there is none. */
+
+ if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) {
+
+ return;
+ }
+
+ up_slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+ n_owned = page_dir_slot_get_n_owned(slot);
+ up_n_owned = page_dir_slot_get_n_owned(up_slot);
+
+ ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ /* If the upper slot has the minimum value of n_owned, we will merge
+ the two slots, therefore we assert: */
+ ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED);
+
+ if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) {
+
+		/* In this case we can just transfer one record owned
+		by the upper slot to the ownership of the lower slot */
+ old_rec = (rec_t*) page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ new_rec = rec_get_next_ptr(old_rec, TRUE);
+
+ rec_set_n_owned_new(old_rec, page_zip, 0);
+ rec_set_n_owned_new(new_rec, page_zip, n_owned + 1);
+ } else {
+ new_rec = rec_get_next_ptr(old_rec, FALSE);
+
+ rec_set_n_owned_old(old_rec, 0);
+ rec_set_n_owned_old(new_rec, n_owned + 1);
+ }
+
+ page_dir_slot_set_rec(slot, new_rec);
+
+		page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned - 1);
+ } else {
+ /* In this case we may merge the two slots */
+ page_dir_delete_slot(page, page_zip, slot_no);
+ }
+}
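+
+/* Worked example (same assumed constants as above, minimum 4 and
+maximum 8): balancing runs when the slot owns 4 - 1 == 3 records.
+If the upper slot owns more than 4, one record is transferred, giving
+3 + 1 == 4 here and at least 4 there, so both slots are legal again.
+Otherwise the upper slot owns exactly 4 and the two are merged into
+one slot owning 3 + 4 == 7 records, within the maximum of 8; this is
+precisely the invariant asserted above:
+2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED. */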
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record */
+UNIV_INTERN
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+ const page_t* page, /*!< in: page */
+ ulint nth) /*!< in: nth record */
+{
+ const page_dir_slot_t* slot;
+ ulint i;
+ ulint n_owned;
+ const rec_t* rec;
+
+ if (nth == 0) {
+ return(page_get_infimum_rec(page));
+ }
+
+ ut_ad(nth < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
+
+ for (i = 0;; i++) {
+
+ slot = page_dir_get_nth_slot(page, i);
+ n_owned = page_dir_slot_get_n_owned(slot);
+
+ if (n_owned > nth) {
+ break;
+ } else {
+ nth -= n_owned;
+ }
+ }
+
+ ut_ad(i > 0);
+ slot = page_dir_get_nth_slot(page, i - 1);
+ rec = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ do {
+ rec = page_rec_get_next_low(rec, TRUE);
+ ut_ad(rec);
+ } while (nth--);
+ } else {
+ do {
+ rec = page_rec_get_next_low(rec, FALSE);
+ ut_ad(rec);
+ } while (nth--);
+ }
+
+ return(rec);
+}
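+
+/* Worked example (illustrative): suppose nth == 3 and the first
+directory slots own 1 record (slot 0, the infimum) and 5 records
+(slot 1).  Slot 0 owns 1 <= 3, so nth becomes 2; slot 1 owns 5 > 2,
+so the loop stops with i == 1.  The walk then starts from the record
+of slot 0 (the infimum) and advances nth + 1 == 3 times, arriving at
+the record with ordinal 3 when the infimum is counted as number 0. */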
+
+/***************************************************************//**
+Returns the number of records before the given record in the chain.
+The number includes the infimum and supremum records.
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_dir_slot_t* slot;
+ const rec_t* slot_rec;
+ const page_t* page;
+ ulint i;
+ lint n = 0;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, TRUE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += rec_get_n_owned_new(slot_rec);
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, FALSE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += rec_get_n_owned_old(slot_rec);
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ }
+
+ n--;
+
+ ut_ad(n >= 0);
+ ut_ad((ulong) n < UNIV_PAGE_SIZE / (REC_N_NEW_EXTRA_BYTES + 1));
+
+ return((ulint) n);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Prints record contents, including the data that is relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: record descriptor */
+{
+ ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ rec_print_new(stderr, rec, offsets);
+ if (page_rec_is_comp(rec)) {
+ fprintf(stderr,
+ " n_owned: %lu; heap_no: %lu; next rec: %lu\n",
+ (ulong) rec_get_n_owned_new(rec),
+ (ulong) rec_get_heap_no_new(rec),
+ (ulong) rec_get_next_offs(rec, TRUE));
+ } else {
+ fprintf(stderr,
+ " n_owned: %lu; heap_no: %lu; next rec: %lu\n",
+ (ulong) rec_get_n_owned_old(rec),
+ (ulong) rec_get_heap_no_old(rec),
+ (ulong) rec_get_next_offs(rec, FALSE));
+ }
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+}
+
+# ifdef UNIV_BTR_PRINT
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ ulint n;
+ ulint i;
+ page_dir_slot_t* slot;
+
+ n = page_dir_get_n_slots(page);
+
+ fprintf(stderr, "--------------------------------\n"
+ "PAGE DIRECTORY\n"
+ "Page address %p\n"
+ "Directory stack top at offs: %lu; number of slots: %lu\n",
+ page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)),
+ (ulong) n);
+ for (i = 0; i < n; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ if ((i == pr_n) && (i < n - pr_n)) {
+ fputs(" ... \n", stderr);
+ }
+ if ((i < pr_n) || (i >= n - pr_n)) {
+ fprintf(stderr,
+ "Contents of slot: %lu: n_owned: %lu,"
+ " rec offs: %lu\n",
+ (ulong) i,
+ (ulong) page_dir_slot_get_n_owned(slot),
+ (ulong)
+ page_offset(page_dir_slot_get_rec(slot)));
+ }
+ }
+ fprintf(stderr, "Total of %lu records\n"
+ "--------------------------------\n",
+ (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page)));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ page_t* page = block->frame;
+ page_cur_t cur;
+ ulint count;
+ ulint n_recs;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE RECORD LIST\n"
+ "Page address %p\n", page);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cur);
+ count = 0;
+ for (;;) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+
+ if (count == pr_n) {
+ break;
+ }
+ if (page_cur_is_after_last(&cur)) {
+ break;
+ }
+ page_cur_move_to_next(&cur);
+ count++;
+ }
+
+ if (n_recs > 2 * pr_n) {
+ fputs(" ... \n", stderr);
+ }
+
+ while (!page_cur_is_after_last(&cur)) {
+ page_cur_move_to_next(&cur);
+
+ if (count + pr_n >= n_recs) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+ }
+ count++;
+ }
+
+ fprintf(stderr,
+ "Total of %lu records \n"
+ "--------------------------------\n",
+ (ulong) (count + 1));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+ const page_t* page)
+{
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE HEADER INFO\n"
+ "Page address %p, n records %lu (%s)\n"
+ "n dir slots %lu, heap top %lu\n"
+ "Page n heap %lu, free %lu, garbage %lu\n"
+ "Page last insert %lu, direction %lu, n direction %lu\n",
+ page, (ulong) page_header_get_field(page, PAGE_N_RECS),
+ page_is_comp(page) ? "compact format" : "original format",
+ (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) page_header_get_field(page, PAGE_FREE),
+ (ulong) page_header_get_field(page, PAGE_GARBAGE),
+ (ulong) page_header_get_field(page, PAGE_LAST_INSERT),
+ (ulong) page_header_get_field(page, PAGE_DIRECTION),
+ (ulong) page_header_get_field(page, PAGE_N_DIRECTION));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+	ulint		rn)	/*!< in: print rn first and last records
+				in the record list */
+{
+ page_t* page = block->frame;
+
+ page_header_print(page);
+ page_dir_print(page, dn);
+ page_print_list(block, index, rn);
+}
+# endif /* UNIV_BTR_PRINT */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate() in that it can also check the n_owned
+and heap_no fields.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_owned;
+ ulint heap_no;
+ const page_t* page;
+
+ page = page_align(rec);
+ ut_a(!page_is_comp(page) == !rec_offs_comp(offsets));
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+
+ if (page_rec_is_comp(rec)) {
+ n_owned = rec_get_n_owned_new(rec);
+ heap_no = rec_get_heap_no_new(rec);
+ } else {
+ n_owned = rec_get_n_owned_old(rec);
+ heap_no = rec_get_heap_no_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) {
+ fprintf(stderr,
+ "InnoDB: Dir slot of rec %lu, n owned too big %lu\n",
+ (ulong) page_offset(rec), (ulong) n_owned);
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) {
+ fprintf(stderr,
+ "InnoDB: Heap no of rec %lu too big %lu %lu\n",
+ (ulong) page_offset(rec), (ulong) heap_no,
+ (ulong) page_dir_get_n_heap(page));
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint n_slots;
+ ulint infimum_offs;
+ ulint supremum_offs;
+
+ n_slots = page_dir_get_n_slots(page);
+ infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0));
+ supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) {
+
+ fprintf(stderr,
+ "InnoDB: Page directory corruption:"
+ " infimum not pointed to\n");
+ buf_page_print(page, 0, 0);
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) {
+
+ fprintf(stderr,
+ "InnoDB: Page directory corruption:"
+ " supremum not pointed to\n");
+ buf_page_print(page, 0, 0);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also resilient, so it should never crash even
+if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(!page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+ fprintf(stderr,
+ "InnoDB: Nonsensical number %lu of page dir slots\n",
+ (ulong) n_slots);
+
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap on a page,"
+ " heap top %lu, dir %lu\n",
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong)
+ page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Record %lu is above"
+ " rec heap top %lu\n",
+ (ulong)(rec - page),
+ (ulong)(rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec)
+ != own_count)) {
+
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu,"
+ " rec %lu\n",
+ (ulong) rec_get_n_owned_old(rec),
+ (ulong) own_count,
+ (ulong)(rec - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ fprintf(stderr,
+ "InnoDB: Dir slot does not point"
+ " to right rec %lu\n",
+ (ulong)(rec - page));
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset"
+ " nonsensical %lu for rec %lu\n",
+ (ulong) rec_get_next_offs(rec, FALSE),
+ (ulong) (rec - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page record list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+ fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n");
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Free list record has"
+ " a nonsensical offset %lu\n",
+ (ulong) (rec - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Free list record %lu"
+ " is above rec heap top %lu\n",
+ (ulong) (rec - page),
+ (ulong) (rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page free list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also resilient, so it should never crash even
+if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+ const page_t* page) /*!< in: index page in ROW_FORMAT!=REDUNDANT */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ const rec_t* rec;
+ const byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+ fprintf(stderr,
+ "InnoDB: Nonsensical number %lu"
+ " of page dir slots\n", (ulong) n_slots);
+
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap on a page,"
+ " heap top %lu, dir %lu\n",
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong)
+ page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Record %lu is above rec"
+ " heap top %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(rec_heap_top));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec)
+ != own_count)) {
+
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu,"
+ " rec %lu\n",
+ (ulong) rec_get_n_owned_new(rec),
+ (ulong) own_count,
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ fprintf(stderr,
+ "InnoDB: Dir slot does not point"
+ " to right rec %lu\n",
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset nonsensical %lu"
+ " for rec %lu\n",
+ (ulong) rec_get_next_offs(rec, TRUE),
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page record list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+ fprintf(stderr, "InnoDB: n owned is zero"
+ " in a supremum rec\n");
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Free list record has"
+ " a nonsensical offset %lu\n",
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Free list record %lu"
+ " is above rec heap top %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(rec_heap_top));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page free list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+ const page_t* page, /*!< in: index page */
+ dict_index_t* index) /*!< in: data dictionary index containing
+ the page record type definition */
+{
+ const page_dir_slot_t* slot;
+ mem_heap_t* heap;
+ byte* buf;
+ ulint count;
+ ulint own_count;
+ ulint rec_own_count;
+ ulint slot_no;
+ ulint data_size;
+ const rec_t* rec;
+ const rec_t* old_rec = NULL;
+ ulint offs;
+ ulint n_slots;
+ ibool ret = FALSE;
+ ulint i;
+ ulint* offsets = NULL;
+ ulint* old_offsets = NULL;
+
+ if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
+ != dict_table_is_comp(index->table))) {
+ fputs("InnoDB: 'compact format' flag mismatch\n", stderr);
+ goto func_exit2;
+ }
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(!page_simple_validate_new(page))) {
+ goto func_exit2;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!page_simple_validate_old(page))) {
+ goto func_exit2;
+ }
+ }
+
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)
+ && !page_is_empty(page)) {
+ trx_id_t max_trx_id = page_get_max_trx_id(page);
+ trx_id_t sys_max_trx_id = trx_sys_get_max_trx_id();
+
+ if (max_trx_id == 0 || max_trx_id > sys_max_trx_id) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "PAGE_MAX_TRX_ID out of bounds: "
+ TRX_ID_FMT ", " TRX_ID_FMT,
+ max_trx_id, sys_max_trx_id);
+ goto func_exit2;
+ }
+ }
+
+ heap = mem_heap_create(UNIV_PAGE_SIZE + 200);
+
+ /* The following buffer is used to check that the
+ records in the page record heap do not overlap */
+
+ buf = static_cast<byte*>(mem_heap_zalloc(heap, UNIV_PAGE_SIZE));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+ <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap"
+ " on space %lu page %lu index %s, %p, %p\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page), index->name,
+ page_header_get_ptr(page, PAGE_HEAP_TOP),
+ page_dir_get_nth_slot(page, n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that
+ it is consistent with the directory. */
+ count = 0;
+ data_size = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_is_comp(page) && page_rec_is_user_rec(rec)
+ && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+ == page_is_leaf(page))) {
+ fputs("InnoDB: node_ptr flag mismatch\n", stderr);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ goto func_exit;
+ }
+
+#ifndef UNIV_HOTBACKUP
+ /* Check that the records are in the ascending order */
+ if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW)
+ && !page_rec_is_supremum(rec)) {
+ if (UNIV_UNLIKELY
+ (1 != cmp_rec_rec(rec, old_rec,
+ offsets, old_offsets, index))) {
+ fprintf(stderr,
+ "InnoDB: Records in wrong order"
+ " on space %lu page %lu index %s\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page),
+ index->name);
+ fputs("\nInnoDB: previous record ", stderr);
+ rec_print_new(stderr, old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+
+ goto func_exit;
+ }
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ if (page_rec_is_user_rec(rec)) {
+
+ data_size += rec_offs_size(offsets);
+ }
+
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
+
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ /* No other record may overlap this */
+
+ fputs("InnoDB: Record overlaps another\n",
+ stderr);
+ goto func_exit;
+ }
+
+ buf[offs + i] = 1;
+ }
+
+ if (page_is_comp(page)) {
+ rec_own_count = rec_get_n_owned_new(rec);
+ } else {
+ rec_own_count = rec_get_n_owned_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(rec_own_count)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_own_count != own_count)) {
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu\n",
+ (ulong) rec_own_count,
+ (ulong) own_count);
+ goto func_exit;
+ }
+
+ if (page_dir_slot_get_rec(slot) != rec) {
+ fputs("InnoDB: Dir slot does not"
+ " point to right rec\n",
+ stderr);
+ goto func_exit;
+ }
+
+ page_dir_slot_check(slot);
+
+ own_count = 0;
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+ break;
+ }
+
+ count++;
+ own_count++;
+ old_rec = rec;
+ rec = page_rec_get_next_const(rec);
+
+ /* set old_offsets to offsets; recycle offsets */
+ {
+ ulint* offs = old_offsets;
+ old_offsets = offsets;
+ offsets = offs;
+ }
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+
+ goto n_owned_zero;
+ }
+ } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+n_owned_zero:
+ fputs("InnoDB: n owned is zero\n", stderr);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) {
+ fprintf(stderr,
+ "InnoDB: Summed data size %lu, returned by func %lu\n",
+ (ulong) data_size, (ulong) page_get_data_size(page));
+ goto func_exit;
+ }
+
+ /* Check then the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+
+ goto func_exit;
+ }
+
+ count++;
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
+
+ while (i--) {
+
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ fputs("InnoDB: Record overlaps another"
+ " in free list\n", stderr);
+ goto func_exit;
+ }
+
+ buf[offs + i] = 1;
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+ fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) count + 1);
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(ret == FALSE)) {
+func_exit2:
+ fprintf(stderr,
+ "InnoDB: Apparent corruption"
+ " in space %lu page %lu index %s\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page),
+ index->name);
+ buf_page_print(page, 0, 0);
+ }
+
+ return(ret);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+UNIV_INTERN
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no)/*!< in: heap number */
+{
+ const rec_t* rec;
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+		for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_new(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ }
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_old(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************//**
+Removes the record from a leaf page. This function does not log
+any changes. It is used by the IMPORT tablespace functions.
+The cursor is moved to the next record after the deleted one.
+@return true if success, i.e., the page did not become too empty */
+UNIV_INTERN
+bool
+page_delete_rec(
+/*============*/
+ const dict_index_t* index, /*!< in: The index that the record
+ belongs to */
+ page_cur_t* pcur, /*!< in/out: page cursor on record
+ to delete */
+ page_zip_des_t* page_zip,/*!< in: compressed page descriptor */
+ const ulint* offsets)/*!< in: offsets for record */
+{
+ bool no_compress_needed;
+ buf_block_t* block = pcur->block;
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(page_is_leaf(page));
+
+ if (!rec_offs_any_extern(offsets)
+ && ((page_get_data_size(page) - rec_offs_size(offsets)
+ < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || (mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL
+ && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)
+ || (page_get_n_recs(page) < 2))) {
+
+ ulint root_page_no = dict_index_get_page(index);
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ no_compress_needed = page_get_page_no(page) == root_page_no;
+ } else {
+ no_compress_needed = true;
+ }
+
+ if (no_compress_needed) {
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_cur_delete_rec(pcur, index, offsets, 0);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(no_compress_needed);
+}
+
+/** Get the last non-delete-marked record on a page.
+@param[in] page index tree leaf page
+@return the last record, not delete-marked
+@retval infimum record if all records are delete-marked */
+
+const rec_t*
+page_find_rec_max_not_deleted(
+ const page_t* page)
+{
+ const rec_t* rec = page_get_infimum_rec(page);
+ const rec_t* prev_rec = NULL; // remove warning
+
+ /* Because the page infimum is never delete-marked,
+ prev_rec will always be assigned to it first. */
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+ if (page_is_comp(page)) {
+ do {
+ if (!rec_get_deleted_flag(rec, true)) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, true);
+ } while (rec != page + PAGE_NEW_SUPREMUM);
+ } else {
+ do {
+ if (!rec_get_deleted_flag(rec, false)) {
+ prev_rec = rec;
+ }
+ rec = page_rec_get_next_low(rec, false);
+ } while (rec != page + PAGE_OLD_SUPREMUM);
+ }
+ return(prev_rec);
+}
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
new file mode 100644
index 00000000000..4fcf38e9a8c
--- /dev/null
+++ b/storage/innobase/page/page0zip.cc
@@ -0,0 +1,4948 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.cc
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+// First include (the generated) my_config.h, to get correct platform defines.
+#include "my_config.h"
+
+#include <map>
+using namespace std;
+
+#define THIS_MODULE
+#include "page0zip.h"
+#ifdef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+#undef THIS_MODULE
+#include "page0page.h"
+#include "mtr0log.h"
+#include "ut0sort.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "page0types.h"
+#include "log0recv.h"
+#include "zlib.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0buf.h"
+# include "buf0lru.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "lock0lock.h"
+# include "srv0mon.h"
+# include "srv0srv.h"
+# include "ut0crc32.h"
+#else /* !UNIV_HOTBACKUP */
+# include "buf0checksum.h"
+# define lock_move_reorganize_page(block, temp_block) ((void) 0)
+# define buf_LRU_stat_inc_unzip() ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by index->id */
+UNIV_INTERN page_zip_stat_per_index_t page_zip_stat_per_index;
+/** Mutex protecting page_zip_stat_per_index */
+UNIV_INTERN ib_mutex_t page_zip_stat_per_index_mutex;
+#ifdef HAVE_PSI_INTERFACE
+UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+#endif /* HAVE_PSI_INTERFACE */
+#endif /* !UNIV_HOTBACKUP */
+
+/* Compression level to be used by zlib. Settable by user. */
+UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL;
+
+/* Whether to log compressed page images, to guard against possible
+changes of the compression algorithm in zlib. */
+UNIV_INTERN my_bool page_zip_log_pages = true;
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compression, we verify that these records are there, and on
+decompression we restore them. */
+/** Extra bytes of an infimum record */
+static const byte infimum_extra[] = {
+ 0x01, /* info_bits=0, n_owned=1 */
+ 0x00, 0x02 /* heap_no=0, status=2 */
+ /* ?, ? */ /* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record */
+static const byte infimum_data[] = {
+ 0x69, 0x6e, 0x66, 0x69,
+ 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record */
+static const byte supremum_extra_data[] = {
+ /* 0x0?, */ /* info_bits=0, n_owned=1..8 */
+ 0x00, 0x0b, /* heap_no=1, status=3 */
+ 0x00, 0x00, /* next=0 */
+ 0x73, 0x75, 0x70, 0x72,
+ 0x65, 0x6d, 0x75, 0x6d /* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+Compare at most sizeof(field_ref_zero) bytes.
+@param b in: memory block
+@param s in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) \
+ ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero)))
+/** Assert that a BLOB pointer is filled with zero bytes.
+@param b in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) \
+ ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero))
+
+/* Enable some extra debugging output. This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+__attribute__((format (printf, 1, 2)))
+/**********************************************************************//**
+Report a failure to decompress or compress.
+@return number of characters printed */
+static
+int
+page_zip_fail_func(
+/*===============*/
+ const char* fmt, /*!< in: printf(3) format string */
+ ...) /*!< in: arguments corresponding to fmt */
+{
+ int res;
+ va_list ap;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ va_start(ap, fmt);
+ res = vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ return(res);
+}
+/** Wrapper for page_zip_fail_func()
+@param fmt_args in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func()
+@param fmt_args ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+{
+ lint size = zip_size
+ /* subtract the page header and the longest
+ uncompressed data needed for one record */
+ - (PAGE_DATA
+ + PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ + 1/* encoded heap_no==2 in page_zip_write_rec() */
+ + 1/* end of modification log */
+ - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+ /* subtract the space for page_zip_fields_encode() */
+ - compressBound(static_cast<uLong>(2 * (n_fields + 1)));
+ return(size > 0 ? (ulint) size : 0);
+}
+#endif /* !UNIV_HOTBACKUP */
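+
+/* Illustrative use (a sketch, not part of this file): a caller can
+check whether a record of rec_size bytes could ever fit on an empty
+compressed page, where rec_size, n_fields and zip_size are assumed
+to be supplied by the caller:
+
+	if (rec_size > page_zip_empty_size(n_fields, zip_size)) {
+		the record can never fit; some columns must be
+		stored externally
+	}
+*/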
+
+/*************************************************************//**
+Gets the number of elements in the dense page directory,
+including deleted records (the free list).
+@return number of elements in the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_elems(
+/*===============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ return(page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW);
+}
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
+}
+
+/*************************************************************//**
+Gets an offset to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return offset of the dense page directory */
+UNIV_INLINE
+ulint
+page_zip_dir_start_offs(
+/*====================*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint n_dense) /*!< in: directory size */
+{
+ ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
+
+ return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+}
+
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@param[in] n_dense number of entries in the directory
+@return pointer to the dense page directory */
+#define page_zip_dir_start_low(page_zip, n_dense) \
+ ((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
+/*************************************************************//**
+Gets a pointer to the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@param[in] page_zip compressed page
+@return pointer to the dense page directory */
+#define page_zip_dir_start(page_zip) \
+ page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+only including user records (excluding the free list).
+@return length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page_zip->data);
+ ut_ad(size <= page_zip_dir_size(page_zip));
+ return(size);
+}
+
+/*************************************************************//**
+Find the slot of the given record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+ byte* slot, /*!< in: start of records */
+ byte* end, /*!< in: end of records */
+ ulint offset) /*!< in: offset of user record */
+{
+ ut_ad(slot <= end);
+
+ for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
+ if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
+ == offset) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
+ end,
+ offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+ end - page_zip_dir_user_size(page_zip),
+ offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint slot) /*!< in: slot
+ (0=first user record) */
+{
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+ return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
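+
+/* Example (a sketch; page, page_zip and i are assumed to be given):
+translate the i-th dense directory entry into a record pointer on
+the uncompressed page and test the flag bits:
+
+	ulint		offs = page_zip_dir_get(page_zip, i);
+	const rec_t*	rec = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+
+	if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
+		the record is deleted (on the free list)
+	}
+	if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+		the record owns a slot in the sparse directory
+	}
+*/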
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of compressing an index page. */
+static
+void
+page_zip_compress_write_log(
+/*========================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ byte* log_ptr;
+ ulint trailer_size;
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 2);
+
+ if (!log_ptr) {
+
+ return;
+ }
+
+ /* Read the number of user records. */
+ trailer_size = page_dir_get_n_heap(page_zip->data)
+ - PAGE_HEAP_NO_USER_LOW;
+	/* Multiply by the uncompressed size stored per record */
+ if (!page_is_leaf(page)) {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ } else if (dict_index_is_clust(index)) {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ /* Add the space occupied by BLOB pointers. */
+ trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(page_zip->m_end > PAGE_DATA);
+#if FIL_PAGE_DATA > PAGE_DATA
+# error "FIL_PAGE_DATA > PAGE_DATA"
+#endif
+ ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+ log_ptr = mlog_write_initial_log_record_fast((page_t*) page,
+ MLOG_ZIP_PAGE_COMPRESS,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE);
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, trailer_size);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ /* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4);
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4);
+ /* Write most of the page header, the compressed stream and
+ the modification log. */
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE,
+ page_zip->m_end - FIL_PAGE_TYPE);
+ /* Write the uncompressed trailer of the compressed page. */
+ mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip)
+ - trailer_size, trailer_size);
+}
+#endif /* !UNIV_HOTBACKUP */
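+
+/* For reference, the MLOG_ZIP_PAGE_COMPRESS record written by
+page_zip_compress_write_log() consists of (a sketch derived from the
+code above, not a normative specification):
+	initial log record			(up to 11 bytes)
+	m_end - FIL_PAGE_TYPE			(2 bytes)
+	trailer_size				(2 bytes)
+	FIL_PAGE_PREV and FIL_PAGE_NEXT		(4 + 4 bytes)
+	page header from FIL_PAGE_TYPE onwards, the compressed
+	stream and the modification log		(m_end - FIL_PAGE_TYPE bytes)
+	uncompressed trailer			(trailer_size bytes) */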
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec.
+@return number of externally stored columns */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ const rec_t* rec, /*!< in: compact physical record
+ on a B-tree leaf page */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ const page_t* page = page_align(rec);
+ ulint n_ext = 0;
+ ulint i;
+ ulint left;
+ ulint heap_no;
+ ulint n_recs = page_get_n_recs(page_zip->data);
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_is_comp(page));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ left = heap_no - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(!left)) {
+ return(0);
+ }
+
+ for (i = 0; i < n_recs; i++) {
+ const rec_t* r = page + (page_zip_dir_get(page_zip, i)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+
+ if (rec_get_heap_no_new(r) < heap_no) {
+ n_ext += rec_get_n_extern_new(r, index,
+ ULINT_UNDEFINED);
+ if (!--left) {
+ break;
+ }
+ }
+ }
+
+ return(n_ext);
+}
+
+/**********************************************************************//**
+Encode the length of a fixed-length column.
+@return buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+ byte* buf, /*!< in: pointer to buffer where to write */
+ ulint val) /*!< in: value to write */
+{
+ ut_ad(val >= 2);
+
+ if (UNIV_LIKELY(val < 126)) {
+ /*
+ 0 = nullable variable field of at most 255 bytes length;
+ 1 = not null variable field of at most 255 bytes length;
+ 126 = nullable variable field with maximum length >255;
+ 127 = not null variable field with maximum length >255
+ */
+ *buf++ = (byte) val;
+ } else {
+ *buf++ = (byte) (0x80 | val >> 8);
+ *buf++ = (byte) val;
+ }
+
+ return(buf);
+}
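+
+/* Worked example (for illustration only): a NOT NULL fixed-length
+column of 4 bytes is passed in as val = 4 << 1 | 1 = 9 and encoded
+as the single byte 0x09; a run of NOT NULL fixed-length columns
+totalling 300 bytes yields val = 300 << 1 | 1 = 601 = 0x259, which
+is encoded in the two bytes 0x82 0x59. */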
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+static
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields to compress */
+ dict_index_t* index, /*!< in: index comprising at least n fields */
+ ulint trx_id_pos,/*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */
+{
+ const byte* buf_start = buf;
+ ulint i;
+ ulint col;
+ ulint trx_id_col = 0;
+ /* sum of lengths of preceding non-nullable fixed fields, or 0 */
+ ulint fixed_sum = 0;
+
+ ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+ for (i = col = 0; i < n; i++) {
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+ ulint val;
+
+ if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+ val = 1; /* set the "not nullable" flag */
+ } else {
+ val = 0; /* nullable field */
+ }
+
+ if (!field->fixed_len) {
+ /* variable-length field */
+ const dict_col_t* column
+ = dict_field_get_col(field);
+
+ if (UNIV_UNLIKELY(column->len > 255)
+ || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) {
+ val |= 0x7e; /* max > 255 bytes */
+ }
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ *buf++ = (byte) val;
+ col++;
+ } else if (val) {
+ /* fixed-length non-nullable field */
+
+ if (fixed_sum && UNIV_UNLIKELY
+ (fixed_sum + field->fixed_len
+ > DICT_MAX_FIXED_COL_LEN)) {
+ /* Write out the length of the
+ preceding non-nullable fields,
+ to avoid exceeding the maximum
+ length of a fixed-length column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+ if (fixed_sum) {
+ /* Write out the length of any
+ preceding non-nullable fields,
+ and start a new trx_id column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ col++;
+ }
+
+ trx_id_col = col;
+ fixed_sum = field->fixed_len;
+ } else {
+ /* add to the sum */
+ fixed_sum += field->fixed_len;
+ }
+ } else {
+ /* fixed-length nullable field */
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ buf = page_zip_fixed_field_encode(
+ buf, field->fixed_len << 1);
+ col++;
+ }
+ }
+
+ if (fixed_sum) {
+		/* Write out the length of the trailing fixed-length columns. */
+ buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+ }
+
+ if (trx_id_pos != ULINT_UNDEFINED) {
+ /* Write out the position of the trx_id column */
+ i = trx_id_col;
+ } else {
+ /* Write out the number of nullable fields */
+ i = index->n_nullable;
+ }
+
+ if (i < 128) {
+ *buf++ = (byte) i;
+ } else {
+ *buf++ = (byte) (0x80 | i >> 8);
+ *buf++ = (byte) i;
+ }
+
+ ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+ return((ulint) (buf - buf_start));
+}
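+
+/* Worked example (a hypothetical two-column index, for illustration):
+on a non-leaf page (trx_id_pos == ULINT_UNDEFINED) of an index on
+(VARCHAR(10) NULL, INT NOT NULL), the function would emit
+	0x00	variable-length nullable field of at most 255 bytes
+	0x09	fixed-length NOT NULL field of 4 bytes (4 << 1 | 1)
+	0x01	number of nullable fields (1)
+and return 3, the number of bytes used in buf. */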
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+ const page_t* page, /*!< in: compact page */
+ byte* buf, /*!< in: pointer to dense page directory[-1];
+ out: dense directory on compressed page */
+ const rec_t** recs) /*!< in: pointer to an array of 0, or NULL;
+ out: dense page directory sorted by ascending
+ address (and heap_no) */
+{
+ const byte* rec;
+ ulint status;
+ ulint min_mark;
+ ulint heap_no;
+ ulint i;
+ ulint n_heap;
+ ulint offs;
+
+ min_mark = 0;
+
+ if (page_is_leaf(page)) {
+ status = REC_STATUS_ORDINARY;
+ } else {
+ status = REC_STATUS_NODE_PTR;
+ if (UNIV_UNLIKELY
+ (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) {
+ min_mark = REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+ n_heap = page_dir_get_n_heap(page);
+
+ /* Traverse the list of stored records in the collation order,
+ starting from the first user record. */
+
+ rec = page + PAGE_NEW_INFIMUM;
+
+ i = 0;
+
+ for (;;) {
+ ulint info_bits;
+ offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+ break;
+ }
+ rec = page + offs;
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+ ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR);
+ ut_a(offs >= PAGE_ZIP_START);
+#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
+# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
+#endif
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1
+# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1"
+#endif
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+ offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+ }
+
+ info_bits = rec_get_info_bits(rec, TRUE);
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ info_bits &= ~REC_INFO_DELETED_FLAG;
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+ ut_a(info_bits == min_mark);
+ /* Only the smallest user record can have
+ REC_INFO_MIN_REC_FLAG set. */
+ min_mark = 0;
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ ut_a(rec_get_status(rec) == status);
+ }
+
+ offs = page_header_get_field(page, PAGE_FREE);
+
+ /* Traverse the free list (of deleted records). */
+ while (offs) {
+ ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+ rec = page + offs;
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+
+ ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+ ut_a(rec_get_status(rec) == status);
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ offs = rec_get_next_offs(rec, TRUE);
+ }
+
+	/* Ensure that each heap_no occurs at least once. */
+ ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
+
+extern "C" {
+
+/**********************************************************************//**
+Allocate memory for zlib. */
+static
+void*
+page_zip_zalloc(
+/*============*/
+ void* opaque, /*!< in/out: memory heap */
+ uInt items, /*!< in: number of items to allocate */
+ uInt size) /*!< in: size of an item in bytes */
+{
+ return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib. */
+static
+void
+page_zip_free(
+/*==========*/
+ void* opaque __attribute__((unused)), /*!< in: memory heap */
+ void* address __attribute__((unused)))/*!< in: object to free */
+{
+}
+
+} /* extern "C" */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ z_stream* strm = static_cast<z_stream*>(stream);
+
+ strm->zalloc = page_zip_zalloc;
+ strm->zfree = page_zip_free;
+ strm->opaque = heap;
+}
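+
+/* Typical usage (a sketch): bind a z_stream to a memory heap, so
+that zlib's allocations are all released in one step.  Since
+page_zip_free() is a no-op, nothing is freed individually:
+
+	mem_heap_t*	heap = mem_heap_create(...);
+	z_stream	strm;
+
+	page_zip_set_alloc(&strm, heap);
+	deflateInit2(&strm, ...);
+	... deflate() ...
+	deflateEnd(&strm);
+	mem_heap_free(heap);	releases everything zlib allocated
+*/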
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress(). */
+UNIV_INTERN ibool page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator. */
+UNIV_INTERN unsigned page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+ FILE* logfile,/*!< in: log file, or NULL */
+ z_streamp strm, /*!< in/out: compressed stream for deflate() */
+ int flush) /*!< in: deflate() flushing method */
+{
+ int status;
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ut_print_buf(stderr, strm->next_in, strm->avail_in);
+ }
+ if (UNIV_LIKELY_NULL(logfile)) {
+ fwrite(strm->next_in, 1, strm->avail_in, logfile);
+ }
+ status = deflate(strm, flush);
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, " -> %d\n", status);
+ }
+ return(status);
+}
+
+/* Redefine deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+@param strm in/out: compressed stream
+@param flush in: flushing method
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ ulint* offsets = NULL;
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Only leaf nodes may contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+
+ /* Compress the data bytes, except node_ptr. */
+ c_stream->next_in = (byte*) rec;
+ c_stream->avail_in = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in, REC_NODE_PTR_SIZE);
+ c_stream->next_in += REC_NODE_PTR_SIZE;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense) /*!< in: size of recs[] */
+{
+ int err = Z_OK;
+
+ ut_ad(n_dense > 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ /* Compress everything up to this record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (UNIV_LIKELY(c_stream->avail_in)) {
+ UNIV_MEM_ASSERT_RW(c_stream->next_in,
+ c_stream->avail_in);
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ c_stream->next_in = (byte*) rec;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ byte** externs, /*!< in/out: pointer to the next
+ available BLOB pointer */
+ ulint* n_blobs) /*!< in/out: number of
+ externally stored columns */
+{
+ int err;
+ ulint i;
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ /* Store trx_id and roll_ptr
+ in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ i++;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+ if (UNIV_LIKELY(c_stream->avail_in)) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ /* Reserve space for the data at
+ the end of the space reserved for
+ the compressed data and the page
+ modification log. */
+
+ if (UNIV_UNLIKELY
+ (c_stream->avail_out
+ <= BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* out of space */
+ return(Z_BUF_ERROR);
+ }
+
+ ut_ad(*externs == c_stream->next_out
+ + c_stream->avail_out
+ + 1/* end of modif. log */);
+
+ c_stream->next_in
+ += BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Skip deleted records. */
+ if (UNIV_LIKELY_NULL
+ (page_zip_dir_find_low(
+ storage, deleted,
+ page_offset(rec)))) {
+ continue;
+ }
+
+ (*n_blobs)++;
+ c_stream->avail_out
+ -= BTR_EXTERN_FIELD_REF_SIZE;
+ *externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Copy the BLOB pointer */
+ memcpy(*externs, c_stream->next_in
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* n_blobs, /*!< in: 0; out: number of
+ externally stored columns */
+	ulint		trx_id_col,	/*!< in: index of the trx_id column */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ ulint* offsets = NULL;
+ /* BTR_EXTERN_FIELD_REF storage */
+ byte* externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(*n_blobs == 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_n_fields(offsets)
+ == dict_index_get_n_fields(index));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Compress the data bytes. */
+
+ c_stream->next_in = (byte*) rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, store the
+ BTR_EXTERN_FIELD_REF separately. */
+ if (rec_offs_any_extern(offsets)) {
+ ut_ad(dict_index_is_clust(index));
+
+ err = page_zip_compress_clust_ext(
+ LOGFILE
+ c_stream, rec, offsets, trx_id_col,
+ deleted, storage, &externs, n_blobs);
+
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ } else {
+ ulint len;
+ const byte* src;
+
+ /* Store trx_id and roll_ptr in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = static_cast<uInt>(
+ src - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Skip also roll_ptr */
+ ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+ }
+
+ /* Compress the last bytes of the record. */
+ c_stream->avail_in = static_cast<uInt>(
+ rec + rec_offs_data_size(offsets) - c_stream->next_in);
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ } while (--n_dense);
+
+func_exit:
+ return(err);
+}
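+
+/* Sketch of the uncompressed trailer that the functions above
+maintain at the end of page_zip->data, from the end of the page
+downwards:
+	dense page directory	(PAGE_ZIP_DIR_SLOT_SIZE per record)
+	"storage"		(DB_TRX_ID and DB_ROLL_PTR of each
+				clustered-index leaf record, or the
+				node pointers of a non-leaf page)
+	"externs"		(BTR_EXTERN_FIELD_REF of each
+				externally stored column)
+The compressed stream and the modification log grow upwards from
+PAGE_DATA and must never overlap this trailer. */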
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ ulint level, /*!< in: compression level */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ z_stream c_stream;
+ int err;
+ ulint n_fields;/* number of index fields needed */
+ byte* fields; /*!< index field information */
+ byte* buf; /*!< compressed payload of the page */
+ byte* buf_end;/* end of buf */
+ ulint n_dense;
+ ulint slot_size;/* amount of uncompressed bytes per record */
+ const rec_t** recs; /*!< dense page directory, sorted by address */
+ mem_heap_t* heap;
+ ulint trx_id_col;
+ ulint n_blobs = 0;
+ byte* storage;/* storage of uncompressed columns */
+#ifndef UNIV_HOTBACKUP
+ ullint usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ FILE* logfile = NULL;
+#endif
+ /* A local copy of srv_cmp_per_index_enabled to avoid reading that
+	variable multiple times in this function, since it can be changed
+	at any time. */
+ my_bool cmp_per_index_enabled = srv_cmp_per_index_enabled;
+
+ ut_a(page_is_comp(page));
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+ /* Check the data that will be omitted. */
+ ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra));
+ ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+ infimum_data, sizeof infimum_data));
+ ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+ /* info_bits == 0, n_owned <= max */
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data));
+
+ if (page_is_empty(page)) {
+ ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+ == PAGE_NEW_SUPREMUM);
+ }
+
+ if (page_is_leaf(page)) {
+ n_fields = dict_index_get_n_fields(index);
+ } else {
+ n_fields = dict_index_get_n_unique_in_tree(index);
+ }
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, "compress %p %p %lu %lu %lu\n",
+ (void*) page_zip, (void*) page,
+ (ibool) page_is_leaf(page),
+ n_fields, n_dense);
+ }
+ if (UNIV_UNLIKELY(page_zip_compress_log)) {
+ /* Create a log file for every compression attempt. */
+ char logfilename[9];
+ ut_snprintf(logfilename, sizeof logfilename,
+ "%08x", page_zip_compress_log++);
+ logfile = fopen(logfilename, "wb");
+
+ if (logfile) {
+ /* Write the uncompressed page to the log. */
+ fwrite(page, 1, UNIV_PAGE_SIZE, logfile);
+ /* Record the compressed size as zero.
+ This will be overwritten at successful exit. */
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ }
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ page_zip_stat[page_zip->ssize - 1].compressed++;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed++;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+
+ goto err_exit;
+ }
+
+ MONITOR_INC(MONITOR_PAGE_COMPRESS);
+
+ heap = mem_heap_create(page_zip_get_size(page_zip)
+ + n_fields * (2 + sizeof(ulint))
+ + REC_OFFS_HEADER_SIZE
+ + n_dense * ((sizeof *recs)
+ - PAGE_ZIP_DIR_SLOT_SIZE)
+ + UNIV_PAGE_SIZE * 4
+ + (512 << MAX_MEM_LEVEL));
+
+ recs = static_cast<const rec_t**>(
+ mem_heap_zalloc(heap, n_dense * sizeof *recs));
+
+ fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
+
+ buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+ /* Compress the data payload. */
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, static_cast<int>(level),
+ Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT,
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+
+ c_stream.next_out = buf;
+ /* Subtract the space reserved for uncompressed data. */
+ /* Page header and the end marker of the modification log */
+ c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);
+
+ /* Dense page directory and uncompressed columns, if any */
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ trx_id_col = dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID);
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ /* Signal the absence of trx_id
+ in page_zip_fields_encode() */
+ ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+ == ULINT_UNDEFINED);
+ trx_id_col = 0;
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ } else {
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+ + 6/* sizeof(zlib header and footer) */)) {
+ goto zlib_error;
+ }
+
+ c_stream.avail_out -= static_cast<uInt>(n_dense * slot_size);
+ c_stream.avail_in = static_cast<uInt>(
+ page_zip_fields_encode(n_fields, index, trx_id_col, fields));
+ c_stream.next_in = fields;
+ if (UNIV_LIKELY(!trx_id_col)) {
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FULL_FLUSH);
+ if (err != Z_OK) {
+ goto zlib_error;
+ }
+
+ ut_ad(!c_stream.avail_in);
+
+ page_zip_dir_encode(page, buf_end, recs);
+
+ c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+ storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ /* Compress the records in heap_no order. */
+ if (UNIV_UNLIKELY(!n_dense)) {
+ } else if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ err = page_zip_compress_node_ptrs(LOGFILE
+ &c_stream, recs, n_dense,
+ index, storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ err = page_zip_compress_sec(LOGFILE
+ &c_stream, recs, n_dense);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ err = page_zip_compress_clust(LOGFILE
+ &c_stream, recs, n_dense,
+ index, &n_blobs, trx_id_col,
+ buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page),
+ storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ }
+
+ /* Finish the compression. */
+ ut_ad(!c_stream.avail_in);
+ /* Compress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list,
+	or to pick up the data of the last record, which
+	page_zip_compress_sec() does not consume. */
+ c_stream.avail_in = static_cast<uInt>(
+ page_header_get_field(page, PAGE_HEAP_TOP)
+ - (c_stream.next_in - page));
+ ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR);
+
+ UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FINISH);
+
+ if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error:
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+err_exit:
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ if (page_is_leaf(page)) {
+ dict_index_zip_failure(index);
+ }
+
+ ullint time_diff = ut_time_us(NULL) - usec;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec
+ += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed_usec
+ += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+#endif /* !UNIV_HOTBACKUP */
+ return(FALSE);
+ }
+
+ err = deflateEnd(&c_stream);
+ ut_a(err == Z_OK);
+
+ ut_ad(buf + c_stream.total_out == c_stream.next_out);
+ ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+ /* Valgrind believes that zlib does not initialize some bits
+ in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
+ UNIV_MEM_VALID(buf, c_stream.total_out);
+
+ /* Zero out the area reserved for the modification log.
+ Space for the end marker of the modification log is not
+ included in avail_out. */
+ memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start =
+#endif /* UNIV_DEBUG */
+ page_zip->m_end = PAGE_DATA + c_stream.total_out;
+ page_zip->m_nonempty = FALSE;
+ page_zip->n_blobs = n_blobs;
+ /* Copy those header fields that will not be written
+ in buf_flush_init_for_writing() */
+ memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV);
+ memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2);
+ memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA);
+ /* Copy the rest of the compressed page */
+ memcpy(page_zip->data + PAGE_DATA, buf,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+ mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ page_zip_compress_write_log(page_zip, page, index, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ /* Record the compressed size of the block. */
+ byte sz[4];
+ mach_write_to_4(sz, c_stream.total_out);
+ fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET);
+ fwrite(sz, 1, sizeof sz, logfile);
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ ullint time_diff = ut_time_us(NULL) - usec;
+ page_zip_stat[page_zip->ssize - 1].compressed_ok++;
+ page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
+ if (cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index->id].compressed_ok++;
+ page_zip_stat_per_index[index->id].compressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+
+ if (page_is_leaf(page)) {
+ dict_index_zip_success(index);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
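+
+/* Illustrative call (a sketch, not taken from this file): compress
+a B-tree page within a mini-transaction at the user-settable
+compression level:
+
+	if (!page_zip_compress(page_zip, page, index,
+			       page_zip_level, mtr)) {
+		the data did not fit; the caller must reorganize
+		or split the page
+	}
+*/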
+
+/**********************************************************************//**
+Compare two page directory entries.
+@return TRUE if rec1 > rec2 */
+UNIV_INLINE
+ibool
+page_zip_dir_cmp(
+/*=============*/
+ const rec_t* rec1, /*!< in: rec1 */
+ const rec_t* rec2) /*!< in: rec2 */
+{
+ return(rec1 > rec2);
+}
+
+/**********************************************************************//**
+Sort the dense page directory by address (heap_no). */
+static
+void
+page_zip_dir_sort(
+/*==============*/
+ rec_t** arr, /*!< in/out: dense page directory */
+ rec_t** aux_arr,/*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the sorting area, exclusive */
+{
+ UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high,
+ page_zip_dir_cmp);
+}
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+ dict_index_t* index) /*!< in: dummy index to be freed */
+{
+ if (index) {
+ dict_table_t* table = index->table;
+ os_fast_mutex_free(&index->zip_pad.mutex);
+ mem_heap_free(index->heap);
+
+ dict_mem_table_free(table);
+ }
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+ const byte* buf, /*!< in: index information */
+ const byte* end, /*!< in: end of buf */
+ ulint* trx_id_col)/*!< in: NULL for non-leaf pages;
+ for leaf pages, pointer to where to store
+ the position of the trx_id column */
+{
+ const byte* b;
+ ulint n;
+ ulint i;
+ ulint val;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ /* Determine the number of fields. */
+ for (b = buf, n = 0; b < end; n++) {
+ if (*b++ & 0x80) {
+ b++; /* skip the second byte */
+ }
+ }
+
+ n--; /* n_nullable or trx_id */
+
+ if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+ page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+ (ulong) n));
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(b > end)) {
+
+ page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+ (const void*) b, (const void*) end));
+ return(NULL);
+ }
+
+ table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
+ DICT_TF_COMPACT, 0);
+ index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+ index->table = table;
+ index->n_uniq = n;
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ /* Initialize the fields. */
+ for (b = buf, i = 0; i < n; i++) {
+ ulint mtype;
+ ulint len;
+
+ val = *b++;
+
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ /* fixed length > 62 bytes */
+ val = (val & 0x7f) << 8 | *b++;
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ } else if (UNIV_UNLIKELY(val >= 126)) {
+ /* variable length with max > 255 bytes */
+ len = 0x7fff;
+ mtype = DATA_BINARY;
+ } else if (val <= 1) {
+ /* variable length with max <= 255 bytes */
+ len = 0;
+ mtype = DATA_BINARY;
+ } else {
+			/* fixed length <= 62 bytes */
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ }
+
+ dict_mem_table_add_col(table, NULL, NULL, mtype,
+ val & 1 ? DATA_NOT_NULL : 0, len);
+ dict_index_add_col(index, table,
+ dict_table_get_nth_col(table, i), 0);
+ }
+
+ val = *b++;
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ val = (val & 0x7f) << 8 | *b++;
+ }
+
+ /* Decode the position of the trx_id column. */
+ if (trx_id_col) {
+ if (!val) {
+ val = ULINT_UNDEFINED;
+ } else if (UNIV_UNLIKELY(val >= n)) {
+ page_zip_fields_free(index);
+ index = NULL;
+ } else {
+ index->type = DICT_CLUSTERED;
+ }
+
+ *trx_id_col = val;
+ } else {
+ /* Decode the number of nullable fields. */
+ if (UNIV_UNLIKELY(index->n_nullable > val)) {
+ page_zip_fields_free(index);
+ index = NULL;
+ } else {
+ index->n_nullable = val;
+ }
+ }
+
+ ut_ad(b == end);
+
+ return(index);
+}
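+
+/* Worked example (for illustration; the inverse of the encoding
+example after page_zip_fields_encode()): decoding the bytes
+0x00 0x09 0x01 with trx_id_col == NULL yields a two-field dummy
+index whose first column is a nullable DATA_BINARY (at most 255
+bytes) and whose second column is a 4-byte NOT NULL DATA_FIXBINARY,
+with n_nullable == 1. */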
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_dir_decode(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ page_t* page, /*!< in: compact page with valid header;
+ out: trailer and sparse page directory
+ filled in */
+ rec_t** recs, /*!< out: dense page directory sorted by
+ ascending address (and heap_no) */
+ rec_t** recs_aux,/*!< in/out: scratch area */
+ ulint n_dense)/*!< in: number of user records, and
+ size of recs[] and recs_aux[] */
+{
+ ulint i;
+ ulint n_recs;
+ byte* slot;
+
+ n_recs = page_get_n_recs(page);
+
+ if (UNIV_UNLIKELY(n_recs > n_dense)) {
+ page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+ (ulong) n_recs, (ulong) n_dense));
+ return(FALSE);
+ }
+
+ /* Traverse the list of stored records in the sorting order,
+ starting from the first user record. */
+
+ slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+ UNIV_PREFETCH_RW(slot);
+
+ /* Zero out the page trailer. */
+ memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+ mach_write_to_2(slot, PAGE_NEW_INFIMUM);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+
+ /* Initialize the sparse directory and copy the dense directory. */
+ for (i = 0; i < n_recs; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+ mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+ }
+
+ if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+ < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_recs,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ {
+ const page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ page, page_dir_get_n_slots(page) - 1);
+
+ if (UNIV_UNLIKELY(slot != last_slot)) {
+ page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+ (const void*) slot,
+ (const void*) last_slot));
+ return(FALSE);
+ }
+ }
+
+ /* Copy the rest of the dense directory. */
+ for (; i < n_dense; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+ page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_dense,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + offs;
+ }
+
+ if (UNIV_LIKELY(n_dense > 1)) {
+ page_zip_dir_sort(recs, recs_aux, 0, n_dense);
+ }
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ page_t* page, /*!< in/out: uncompressed page */
+ ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+ ulint n;
+ ulint i;
+ ulint n_owned = 1;
+ ulint offs;
+ rec_t* rec;
+
+ n = page_get_n_recs(page);
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (i = 0; i < n; i++) {
+ offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
+ info_bits |= REC_INFO_DELETED_FLAG;
+ }
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+ info_bits |= n_owned;
+ n_owned = 1;
+ } else {
+ n_owned++;
+ }
+ offs &= PAGE_ZIP_DIR_SLOT_MASK;
+ if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+ + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_set_extra_bytes 1:"
+ " %u %u %lx\n",
+ (unsigned) i, (unsigned) n,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec_set_next_offs_new(rec, offs);
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+ info_bits = 0;
+ }
+
+ /* Set the next pointer of the last user record. */
+ rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+ /* Set n_owned of the supremum record. */
+ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+
+ if (i >= n) {
+ if (UNIV_LIKELY(i == n)) {
+ return(TRUE);
+ }
+
+ page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+ (unsigned) i, (unsigned) n));
+ return(FALSE);
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+
+ /* Set the extra bytes of deleted records on the free list. */
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, dst - next_out);
+ data += dst - next_out;
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += dst - next_out
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = rec_get_end(rec, offsets) - next_out;
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+@return pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+ const byte* data, /*!< in: modification log */
+ ulint size, /*!< in: maximum length of the log, in bytes */
+ rec_t** recs, /*!< in: dense page directory,
+ sorted by address (indexed by
+ heap_no - PAGE_HEAP_NO_USER_LOW) */
+ ulint n_dense,/*!< in: size of recs[] */
+ ulint trx_id_col,/*!< in: column number of trx_id in the index,
+ or ULINT_UNDEFINED if none */
+ ulint heap_status,
+ /*!< in: heap_no and status bits for
+ the next record to uncompress */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint* offsets)/*!< in/out: work area for
+ rec_get_offsets_reverse() */
+{
+ const byte* const end = data + size;
+
+ for (;;) {
+ ulint val;
+ rec_t* rec;
+ ulint len;
+ ulint hs;
+
+ val = *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ return(data - 1);
+ }
+ if (val & 0x80) {
+ val = (val & 0x7f) << 8 | *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " invalid val %x%x\n",
+ data[-2], data[-1]));
+ return(NULL);
+ }
+ }
+ if (UNIV_UNLIKELY(data >= end)) {
+ page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+ (const void*) data,
+ (const void*) end));
+ return(NULL);
+ }
+ if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+ page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+ (ulong) val, (ulong) n_dense));
+ return(NULL);
+ }
+
+ /* Determine the heap number and status bits of the record. */
+ rec = recs[(val >> 1) - 1];
+
+ hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+ hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+ /* This may either be an old record that is being
+ overwritten (updated in place, or allocated from
+		the free list), or a new record with the next
+		available heap_no. */
+ if (UNIV_UNLIKELY(hs > heap_status)) {
+ page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+ (ulong) hs, (ulong) heap_status));
+ return(NULL);
+ } else if (hs == heap_status) {
+ /* A new record was allocated from the heap. */
+ if (UNIV_UNLIKELY(val & 1)) {
+ /* Only existing records may be cleared. */
+ page_zip_fail(("page_zip_apply_log:"
+ " attempting to create"
+ " deleted rec %lu\n",
+ (ulong) hs));
+ return(NULL);
+ }
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+ if (val & 1) {
+ /* Clear the data bytes of the record. */
+ mem_heap_t* heap = NULL;
+ ulint* offs;
+ offs = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ memset(rec, 0, rec_offs_data_size(offs));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ continue;
+ }
+
+#if REC_STATUS_NODE_PTR != TRUE
+# error "REC_STATUS_NODE_PTR != TRUE"
+#endif
+ rec_get_offsets_reverse(data, index,
+ hs & REC_STATUS_NODE_PTR,
+ offsets);
+ rec_offs_make_valid(rec, index, offsets);
+
+ /* Copy the extra bytes (backwards). */
+ {
+ byte* start = rec_get_start(rec, offsets);
+ byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+ while (b != start) {
+ *--b = *data++;
+ }
+ }
+
+ /* Copy the data bytes. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ /* Non-leaf nodes should not contain any
+ externally stored columns. */
+ if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "%lu&REC_STATUS_NODE_PTR\n",
+ (ulong) hs));
+ return(NULL);
+ }
+
+ data = page_zip_apply_log_ext(
+ rec, offsets, trx_id_col, data, end);
+
+ if (UNIV_UNLIKELY(!data)) {
+ return(NULL);
+ }
+ } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ len = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ /* Copy the data bytes, except node_ptr. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "node_ptr %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(rec, data, len);
+ data += len;
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ len = rec_offs_data_size(offsets);
+
+ /* Copy all data bytes of
+ a record in a secondary index. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "sec %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(rec, data, len);
+ data += len;
+ } else {
+ /* Skip DB_TRX_ID and DB_ROLL_PTR. */
+ ulint l = rec_get_nth_field_offs(offsets,
+ trx_id_col, &len);
+ byte* b;
+
+ if (UNIV_UNLIKELY(data + l >= end)
+ || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN))) {
+ page_zip_fail(("page_zip_apply_log: "
+ "trx_id %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) l,
+ (const void*) end));
+ return(NULL);
+ }
+
+ /* Copy any preceding data bytes. */
+ memcpy(rec, data, l);
+ data += l;
+
+ /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ len = rec_get_end(rec, offsets) - b;
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "clust %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(b, data, len);
+ data += len;
+ }
+ }
+}
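+
+/* Worked example (values invented for illustration) of the record
+identifier decoded at the top of the loop in page_zip_apply_log():
+a first byte 0x81 has the 0x80 continuation flag set, so a second
+byte, say 0x8e, is read, giving
+
+	val = (0x01 << 8) | 0x8e = 398
+
+val >> 1 = 199 selects recs[198] (heap_no 200), and val & 1 == 0
+means the entry rewrites the record rather than clearing it.
+A single 0x00 byte terminates the log. */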
+
+/**********************************************************************//**
+Set the heap_no in a record, and skip the fixed-size record header
+that is not included in the d_stream.
+@return TRUE on success, FALSE if d_stream does not end at rec */
+static
+ibool
+page_zip_decompress_heap_no(
+/*========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ ulint& heap_status) /*!< in/out: heap_no and status bits */
+{
+ if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
+ /* n_dense has grown since the page was last compressed. */
+ return(FALSE);
+ }
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ return(TRUE);
+}
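+
+/* The heap_status word threaded through the decompression routines
+below keeps the record status bits (REC_STATUS_ORDINARY or
+REC_STATUS_NODE_PTR) in its low REC_HEAP_NO_SHIFT bits and the next
+heap number above them; adding 1 << REC_HEAP_NO_SHIFT, as above,
+advances the heap number by one while preserving the status bits. */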
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ ulint heap_status = REC_STATUS_NODE_PTR
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+ const byte* storage;
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(
+ n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Decompress the data bytes, except node_ptr. */
+		d_stream->avail_out = static_cast<uInt>(
+ rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ /* Clear the node pointer in case the record
+ will be deleted and the space will be reallocated
+ to a smaller record. */
+ memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+ d_stream->next_out += REC_NODE_PTR_SIZE;
+
+ ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index))
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " %lu + %lu >= %lu, %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, dict_index_is_clust(index)),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip),
+ (ulong) dict_index_is_clust(index)));
+ return(FALSE);
+ }
+
+ /* Restore the uncompressed columns in heap_no order. */
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ storage -= REC_NODE_PTR_SIZE;
+
+ memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+ storage, REC_NODE_PTR_SIZE);
+ }
+
+ return(TRUE);
+}
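+
+/* Layout sketch of the trailer read by the restore loop above
+(derived from the pointer arithmetic in the code), going downwards
+from the end of the compressed page:
+
+	dense directory slots | node_ptr(heap_no 2) | node_ptr(heap_no 3) | ...
+
+so the node pointer of heap number h lives (h - 1) * REC_NODE_PTR_SIZE
+bytes below page_zip_dir_start_low(page_zip, n_dense), matching the
+write side in page_zip_write_node_ptr(). */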
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* offsets) /*!< in/out: temporary offsets */
+{
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+
+ ut_a(!dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+	d_stream->avail_in -= static_cast<uInt>(
+ n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ /* Decompress everything up to this record. */
+		d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ if (UNIV_LIKELY(d_stream->avail_out)) {
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+ }
+
+ /* Decompress the data of the last record and any trailing garbage,
+ in case the last record was allocated from an originally longer space
+ on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col)	/*!< in: position of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ ulint* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= static_cast<uInt>(n_dense)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+		d_stream->avail_out = static_cast<uInt>(
+ rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
+
+ ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ page_zip_decompress_heap_no(
+ d_stream, rec, heap_status);
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ if (!page_zip_decompress_heap_no(
+ d_stream, rec, heap_status)) {
+ ut_ad(0);
+ }
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately. */
+
+ if (rec_offs_any_extern(offsets)) {
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_clust_ext(
+ d_stream, rec, offsets, trx_id_col))) {
+
+ goto zlib_error;
+ }
+ } else {
+ /* Skip trx_id and roll_ptr */
+ ulint len;
+ byte* dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " len = %lu\n", (ulong) len));
+ goto zlib_error;
+ }
+
+ d_stream->avail_out = static_cast<uInt>(
+ dst - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ }
+
+ /* Decompress the last bytes of the record. */
+ d_stream->avail_out = static_cast<uInt>(
+ rec_get_end(rec, offsets) - d_stream->next_out);
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = static_cast<uInt>(
+ page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out));
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ trx_id_col, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, TRUE),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ storage = page_zip_dir_start_low(page_zip, n_dense);
+
+ externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Restore the uncompressed columns in heap_no order. */
+
+ for (slot = 0; slot < n_dense; slot++) {
+ ulint i;
+ ulint len;
+ byte* dst;
+ rec_t* rec = recs[slot];
+ ibool exists = !page_zip_dir_find_free(
+ page_zip, page_offset(rec));
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ memcpy(dst, storage,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Check if there are any externally stored
+ columns in this record. For each externally
+ stored column, restore or clear the
+ BTR_EXTERN_FIELD_REF. */
+ if (!rec_offs_any_extern(offsets)) {
+ continue;
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " %lu < 20\n",
+ (ulong) len));
+ return(FALSE);
+ }
+
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_LIKELY(exists)) {
+ /* Existing record:
+ restore the BLOB pointer */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY
+ (externs < page_zip->data
+ + page_zip->m_end)) {
+ page_zip_fail(("page_zip_"
+ "decompress_clust: "
+ "%p < %p + %lu\n",
+ (const void*) externs,
+ (const void*)
+ page_zip->data,
+ (ulong)
+ page_zip->m_end));
+ return(FALSE);
+ }
+
+ memcpy(dst, externs,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ page_zip->n_blobs++;
+ } else {
+ /* Deleted record:
+ clear the BLOB pointer */
+ memset(dst, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
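+
+/* Corresponding trailer sketch for a clustered index leaf page
+(again read off the pointer arithmetic above), going downwards from
+the page end: the dense directory slots, then one DATA_TRX_ID_LEN +
+DATA_ROLL_PTR_LEN slice per heap number ("storage"), then the
+BTR_EXTERN_FIELD_REF copies ("externs") of the existing records in
+heap_no order. The comparison against page_zip->data +
+page_zip->m_end rejects pages where externs would run into the
+modification log. */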
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ z_stream d_stream;
+ dict_index_t* index = NULL;
+ rec_t** recs; /*!< dense page directory, sorted by address */
+ ulint n_dense;/* number of user records on the page */
+ ulint trx_id_col = ULINT_UNDEFINED;
+ mem_heap_t* heap;
+ ulint* offsets;
+#ifndef UNIV_HOTBACKUP
+ ullint usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+ (ulong) n_dense,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
+
+ recs = static_cast<rec_t**>(
+ mem_heap_alloc(heap, n_dense * (2 * sizeof *recs)));
+
+ if (all) {
+ /* Copy the page header. */
+ memcpy(page, page_zip->data, PAGE_DATA);
+ } else {
+ /* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(FIL_PAGE_TYPE + page,
+ FIL_PAGE_TYPE + page_zip->data,
+ PAGE_HEADER - FIL_PAGE_TYPE));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+ PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+ PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ /* Copy the mutable parts of the page header. */
+ memcpy(page, page_zip->data, FIL_PAGE_TYPE);
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data,
+ PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the page headers match after copying. */
+ ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ /* Clear the uncompressed page, except the header. */
+ memset(PAGE_DATA + page, 0x55, UNIV_PAGE_SIZE - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+ UNIV_MEM_INVALID(PAGE_DATA + page, UNIV_PAGE_SIZE - PAGE_DATA);
+
+ /* Copy the page directory. */
+ if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+ recs + n_dense, n_dense))) {
+zlib_error:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ /* Copy the infimum and supremum records. */
+ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra);
+ if (page_is_empty(page)) {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ page_zip_dir_get(page_zip, 0)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+ memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+ memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data);
+
+ page_zip_set_alloc(&d_stream, heap);
+
+ d_stream.next_in = page_zip->data + PAGE_DATA;
+ /* Subtract the space reserved for
+ the page header and the end marker of the modification log. */
+ d_stream.avail_in = static_cast<uInt>(
+ page_zip_get_size(page_zip) - (PAGE_DATA + 1));
+ d_stream.next_out = page + PAGE_ZIP_START;
+ d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START;
+
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
+ != Z_OK)) {
+ ut_error;
+ }
+
+ /* Decode the zlib header and the index information. */
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ index = page_zip_fields_decode(
+ page + PAGE_ZIP_START, d_stream.next_out,
+ page_is_leaf(page) ? &trx_id_col : NULL);
+
+ if (UNIV_UNLIKELY(!index)) {
+
+ goto zlib_error;
+ }
+
+ /* Decompress the user records. */
+ page_zip->n_blobs = 0;
+ d_stream.next_out = page + PAGE_ZIP_START;
+
+ {
+ /* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ offsets = static_cast<ulint*>(
+ mem_heap_alloc(heap, n * sizeof(ulint)));
+
+ *offsets = n;
+ }
+
+ /* Decompress the records in heap_no order. */
+ if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ ulint info_bits;
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+ recs, n_dense, index,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL
+ ? REC_INFO_MIN_REC_FLAG : 0;
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+ info_bits))) {
+ goto err_exit;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+ recs, n_dense,
+ index, offsets))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+err_exit:
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+ &d_stream, recs,
+ n_dense, index,
+ trx_id_col,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+ goto err_exit;
+ }
+ }
+
+ ut_a(page_is_comp(page));
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+#ifndef UNIV_HOTBACKUP
+ ullint time_diff = ut_time_us(NULL) - usec;
+ page_zip_stat[page_zip->ssize - 1].decompressed++;
+ page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
+
+ index_id_t index_id = btr_page_get_index_id(page);
+
+ if (srv_cmp_per_index_enabled) {
+ mutex_enter(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index[index_id].decompressed++;
+ page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
+ mutex_exit(&page_zip_stat_per_index_mutex);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Update the stat counter for LRU policy. */
+ buf_LRU_stat_inc_unzip();
+
+ MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Dump a block of memory on the standard error stream. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+ const char* name, /*!< in: name of the data structure */
+ const void* buf, /*!< in: data */
+ ulint size) /*!< in: length of the data, in bytes */
+{
+ const byte* s = static_cast<const byte*>(buf);
+ ulint addr;
+ const ulint width = 32; /* bytes per line */
+
+ fprintf(stderr, "%s:\n", name);
+
+ for (addr = 0; addr < size; addr += width) {
+ ulint i;
+
+ fprintf(stderr, "%04lx ", (ulong) addr);
+
+ i = ut_min(width, size - addr);
+
+ while (i--) {
+ fprintf(stderr, "%02x", *s++);
+ }
+
+ putc('\n', stderr);
+ }
+}
+
+/** Dump a block of memory on the standard error stream.
+@param buf in: data
+@param size in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
+
+/** Flag: make page_zip_validate() compare page headers only */
+UNIV_INTERN ibool page_zip_validate_header_only = FALSE;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index, /*!< in: index of the page, if known */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+{
+ page_zip_des_t temp_page_zip;
+ byte* temp_page_buf;
+ page_t* temp_page;
+ ibool valid;
+
+ if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV)
+ || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+ || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA)) {
+ page_zip_fail(("page_zip_validate: page header\n"));
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, UNIV_PAGE_SIZE);
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+
+ if (page_zip_validate_header_only) {
+ return(TRUE);
+ }
+
+ /* page_zip_decompress() expects the uncompressed page to be
+ UNIV_PAGE_SIZE aligned. */
+ temp_page_buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+ temp_page = static_cast<byte*>(ut_align(temp_page_buf, UNIV_PAGE_SIZE));
+
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ temp_page_zip = *page_zip;
+ valid = page_zip_decompress(&temp_page_zip, temp_page, TRUE);
+ if (!valid) {
+ fputs("page_zip_validate(): failed to decompress\n", stderr);
+ goto func_exit;
+ }
+ if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+ page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+ page_zip->n_blobs, temp_page_zip.n_blobs));
+ valid = FALSE;
+ }
+#ifdef UNIV_DEBUG
+ if (page_zip->m_start != temp_page_zip.m_start) {
+ page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+ page_zip->m_start, temp_page_zip.m_start));
+ valid = FALSE;
+ }
+#endif /* UNIV_DEBUG */
+ if (page_zip->m_end != temp_page_zip.m_end) {
+ page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+ page_zip->m_end, temp_page_zip.m_end));
+ valid = FALSE;
+ }
+ if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+ page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+ page_zip->m_nonempty,
+ temp_page_zip.m_nonempty));
+ valid = FALSE;
+ }
+ if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+ UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+ /* In crash recovery, the "minimum record" flag may be
+ set incorrectly until the mini-transaction is
+ committed. Let us tolerate that difference when we
+ are performing a sloppy validation. */
+
+ ulint* offsets;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ const rec_t* trec;
+ byte info_bits_diff;
+ ulint offset
+ = rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
+ ut_a(offset >= PAGE_NEW_SUPREMUM);
+ offset -= 5/*REC_NEW_INFO_BITS*/;
+
+ info_bits_diff = page[offset] ^ temp_page[offset];
+
+ if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+ temp_page[offset] = page[offset];
+
+ if (!memcmp(page + PAGE_HEADER,
+ temp_page + PAGE_HEADER,
+ UNIV_PAGE_SIZE - PAGE_HEADER
+ - FIL_PAGE_DATA_END)) {
+
+ /* Only the minimum record flag
+ differed. Let us ignore it. */
+ page_zip_fail(("page_zip_validate: "
+ "min_rec_flag "
+ "(%s"
+ "%lu,%lu,0x%02lx)\n",
+ sloppy ? "ignored, " : "",
+ page_get_space_id(page),
+ page_get_page_no(page),
+ (ulong) page[offset]));
+ valid = sloppy;
+ goto func_exit;
+ }
+ }
+
+ /* Compare the pointers in the PAGE_FREE list. */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+ trec = page_header_get_ptr(temp_page, PAGE_FREE);
+
+ while (rec || trec) {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate: "
+ "PAGE_FREE list: %u!=%u\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ }
+
+ /* Compare the records. */
+ heap = NULL;
+ offsets = NULL;
+ rec = page_rec_get_next_low(
+ page + PAGE_NEW_INFIMUM, TRUE);
+ trec = page_rec_get_next_low(
+ temp_page + PAGE_NEW_INFIMUM, TRUE);
+
+ do {
+ if (page_offset(rec) != page_offset(trec)) {
+ page_zip_fail(("page_zip_validate: "
+ "record list: 0x%02x!=0x%02x\n",
+ (unsigned) page_offset(rec),
+ (unsigned) page_offset(trec)));
+ valid = FALSE;
+ break;
+ }
+
+ if (index) {
+ /* Compare the data. */
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (memcmp(rec - rec_offs_extra_size(offsets),
+ trec - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets))) {
+ page_zip_fail(
+ ("page_zip_validate: "
+ "record content: 0x%02x",
+ (unsigned) page_offset(rec)));
+ valid = FALSE;
+ break;
+ }
+ }
+
+ rec = page_rec_get_next_low(rec, TRUE);
+ trec = page_rec_get_next_low(trec, TRUE);
+ } while (rec || trec);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ }
+
+func_exit:
+ if (!valid) {
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, UNIV_PAGE_SIZE);
+ page_zip_hexdump(temp_page, UNIV_PAGE_SIZE);
+ }
+ ut_free(temp_page_buf);
+ return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ const dict_index_t* index) /*!< in: index of the page, if known */
+{
+ return(page_zip_validate_low(page_zip, page, index,
+ recv_recovery_is_on()));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+@return TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const byte* page) /*!< in: uncompressed page */
+{
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns. The data must already have been written to the uncompressed page.
+@return end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ const page_t* page, /*!< in: page containing rec */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ulint create, /*!< in: nonzero=insert, zero=update */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ ulint heap_no, /*!< in: heap number of rec */
+ byte* storage, /*!< in: end of dense page directory */
+ byte* data) /*!< in: end of modification log */
+{
+ const byte* start = rec;
+ ulint i;
+ ulint len;
+ byte* externs = storage;
+ ulint n_ext = rec_offs_n_extern(offsets);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+ /* Note that this will not take into account
+ the BLOB columns of rec if create==TRUE. */
+ ut_ad(data + rec_offs_data_size(offsets)
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ - n_ext * BTR_EXTERN_FIELD_REF_SIZE
+ < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs);
+
+ {
+ ulint blob_no = page_zip_get_n_prev_extern(
+ page_zip, rec, index);
+ byte* ext_end = externs - page_zip->n_blobs
+ * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(blob_no <= page_zip->n_blobs);
+ externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (create) {
+ page_zip->n_blobs += static_cast<unsigned>(n_ext);
+ ASSERT_ZERO_BLOB(ext_end - n_ext
+ * BTR_EXTERN_FIELD_REF_SIZE);
+ memmove(ext_end - n_ext
+ * BTR_EXTERN_FIELD_REF_SIZE,
+ ext_end,
+ externs - ext_end);
+ }
+
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i));
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i + 1));
+ /* Locate trx_id and roll_ptr. */
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, src - start);
+ data += src - start;
+ start = src + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Store trx_id and roll_ptr. */
+ memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (heap_no - 1),
+ src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ i++; /* skip also roll_ptr */
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, src - start);
+ data += src - start;
+ start = src + BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Store the BLOB pointer. */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(data < externs);
+ memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets) - (start - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, start, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Write an entire record on the compressed page. The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint create) /*!< in: nonzero=insert, zero=update */
+{
+ const page_t* page;
+ byte* data;
+ byte* storage;
+ ulint heap_no;
+ byte* slot;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+
+ page = page_align(rec);
+
+ ut_ad(page_zip_header_cmp(page_zip, page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ /* Copy the delete mark. */
+ if (rec_get_deleted_flag(rec, TRUE)) {
+ *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ }
+
+ ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+ ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE
+ - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+ * page_dir_get_n_slots(page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+ ut_ad(heap_no < page_dir_get_n_heap(page));
+
+ /* Append to the modification log. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+
+ /* Identify the record by writing its heap number - 1.
+ 0 is reserved to indicate the end of the modification log. */
+
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1);
+ ut_ad(!*data);
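+	/* Example (values invented): heap_no = 200 gives
+	heap_no - 1 = 199 >= 64, so the two bytes
+	0x80 | (199 >> 7) = 0x81 and (byte) (199 << 1) = 0x8e
+	are written; page_zip_apply_log() reassembles them as
+	((0x01 << 8) | 0x8e) >> 1 = 199. */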
+
+ {
+ const byte* start = rec - rec_offs_extra_size(offsets);
+ const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+
+ /* Write the extra bytes backwards, so that
+ rec_offs_extra_size() can be easily computed in
+ page_zip_apply_log() by invoking
+ rec_get_offsets_reverse(). */
+
+ while (b != start) {
+ *data++ = *--b;
+ ut_ad(!*data);
+ }
+ }
+
+ /* Write the data bytes. Store the uncompressed bytes separately. */
+ storage = page_zip_dir_start(page_zip);
+
+ if (page_is_leaf(page)) {
+ ulint len;
+
+ if (dict_index_is_clust(index)) {
+ ulint trx_id_col;
+
+ trx_id_col = dict_index_get_sys_col_pos(index,
+ DATA_TRX_ID);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ /* Store separately trx_id, roll_ptr and
+ the BTR_EXTERN_FIELD_REF of each BLOB column. */
+ if (rec_offs_any_extern(offsets)) {
+ data = page_zip_write_rec_ext(
+ page_zip, page,
+ rec, index, offsets, create,
+ trx_id_col, heap_no, storage, data);
+ } else {
+ /* Locate trx_id and roll_ptr. */
+ const byte* src
+ = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - rec);
+ memcpy(data, rec, src - rec);
+ data += src - rec;
+
+ /* Store trx_id and roll_ptr. */
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (heap_no - 1),
+ src,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets)
+ - (src - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, src, len);
+ data += len;
+ }
+ } else {
+ /* Leaf page of a secondary index:
+ no externally stored columns */
+ ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+ == ULINT_UNDEFINED);
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Log the entire record. */
+ len = rec_offs_data_size(offsets);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+ }
+ } else {
+ /* This is a node pointer page. */
+ ulint len;
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Copy the data bytes, except node_ptr. */
+ len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+ ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+
+ /* Copy the node pointer to the uncompressed area. */
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (heap_no - 1),
+ rec + len,
+ REC_NODE_PTR_SIZE);
+ }
+
+ ut_a(!*data);
+ ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+ page_zip->m_end = data - page_zip->data;
+ page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec), index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint z_offset;
+
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY
+ (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ z_offset = mach_read_from_2(ptr + 2);
+
+ if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+ || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(!page_is_leaf(page))) {
+
+ goto corrupt;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+
+ memcpy(page + offset,
+ ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+ memcpy(page_zip->data + z_offset,
+ ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE));
+}
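+
+/* Body of the MLOG_ZIP_WRITE_BLOB_PTR record parsed above, as
+implied by the reads in the function (the initial log record header
+precedes it):
+
+	2 bytes		page offset of the BTR_EXTERN_FIELD_REF
+	2 bytes		offset of its copy within page_zip->data
+	20 bytes	the BTR_EXTERN_FIELD_REF itself
+*/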
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in: mini-transaction handle,
+ or NULL if no logging is needed */
+{
+ const byte* field;
+ byte* externs;
+ const page_t* page = page_align(rec);
+ ulint blob_no;
+ ulint len;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(rec_offs_nth_extern(offsets, n));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+ + rec_get_n_extern_new(rec, index, n);
+ ut_a(blob_no < page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ field = rec_get_nth_field(rec, offsets, n, &len);
+
+ externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+ field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ byte* log_ptr = mlog_open(
+ mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(field));
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, externs - page_zip->data);
+ log_ptr += 2;
+ memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE);
+ log_ptr += BTR_EXTERN_FIELD_REF_SIZE;
+ mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint z_offset;
+
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ z_offset = mach_read_from_2(ptr + 2);
+
+ if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+ || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (page) {
+ byte* storage_end;
+ byte* field;
+ byte* storage;
+ ulint heap_no;
+
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(page_is_leaf(page))) {
+
+ goto corrupt;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+
+ field = page + offset;
+ storage = page_zip->data + z_offset;
+
+ storage_end = page_zip_dir_start(page_zip);
+
+ heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE;
+
+ if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE)
+ || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW)
+ || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) {
+
+ goto corrupt;
+ }
+
+ memcpy(field, ptr + 4, REC_NODE_PTR_SIZE);
+ memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + (2 + 2 + REC_NODE_PTR_SIZE));
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ byte* field;
+ byte* storage;
+#ifdef UNIV_DEBUG
+ page_t* page = page_align(rec);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(page_rec_is_comp(rec));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(!page_is_leaf(page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, size);
+
+ storage = page_zip_dir_start(page_zip)
+ - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+ field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if REC_NODE_PTR_SIZE != 4
+# error "REC_NODE_PTR_SIZE != 4"
+#endif
+ mach_write_to_4(field, ptr);
+ memcpy(storage, field, REC_NODE_PTR_SIZE);
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ byte* log_ptr = mlog_open(mtr,
+ 11 + 2 + 2 + REC_NODE_PTR_SIZE);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(field));
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, storage - page_zip->data);
+ log_ptr += 2;
+ memcpy(log_ptr, field, REC_NODE_PTR_SIZE);
+ log_ptr += REC_NODE_PTR_SIZE;
+ mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint trx_id_col,/*!< in: column number of TRX_ID in rec */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ roll_ptr_t roll_ptr)/*!< in: roll_ptr */
+{
+ byte* field;
+ byte* storage;
+#ifdef UNIV_DEBUG
+ page_t* page = page_align(rec);
+#endif /* UNIV_DEBUG */
+ ulint len;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ storage = page_zip_dir_start(page_zip)
+ - (rec_get_heap_no_new(rec) - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(field + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ mach_write_to_6(field, trx_id);
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+ memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+}
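+
+/* Addressing sketch for the uncompressed-column area used above:
+for heap number h, the DB_TRX_ID and DB_ROLL_PTR bytes live at
+
+	page_zip_dir_start(page_zip)
+	- (h - 1) * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+
+which mirrors the read side in page_zip_decompress_clust(). */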
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page.
+Do not clear the data payload, as that would grow the modification log. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets) /*!< in: rec_get_offsets(rec, index) */
+{
+ ulint heap_no;
+ page_t* page = page_align(rec);
+ byte* storage;
+ byte* field;
+ ulint len;
+ /* page_zip_validate() would fail here if a record
+ containing externally stored columns is being deleted. */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+ ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ if (!page_is_leaf(page)) {
+ /* Clear node_ptr. On the compressed page,
+ there is an array of node_ptr immediately before the
+ dense page directory, at the very end of the page. */
+ storage = page_zip_dir_start(page_zip);
+ ut_ad(dict_index_get_n_unique_in_tree(index) ==
+ rec_offs_n_fields(offsets) - 1);
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1,
+ &len);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+
+ ut_ad(!rec_offs_any_extern(offsets));
+ memset(field, 0, REC_NODE_PTR_SIZE);
+ memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
+ 0, REC_NODE_PTR_SIZE);
+ } else if (dict_index_is_clust(index)) {
+ /* Clear trx_id and roll_ptr. On the compressed page,
+ there is an array of these fields immediately before the
+ dense page directory, at the very end of the page. */
+ const ulint trx_id_pos
+ = dict_col_get_clust_pos(
+ dict_table_get_sys_col(
+ index->table, DATA_TRX_ID), index);
+ storage = page_zip_dir_start(page_zip);
+ field = rec_get_nth_field(rec, offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ memset(storage - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ /* Clear all BLOB pointers in order to make
+ page_zip_validate() pass. */
+ if (rec_offs_nth_extern(offsets, i)) {
+ field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ ut_ad(len
+ == BTR_EXTERN_FIELD_REF_SIZE);
+ memset(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+ } else {
+ ut_ad(!rec_offs_any_extern(offsets));
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+{
+ byte* slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ if (flag) {
+ *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec), NULL));
+#endif /* UNIV_ZIP_DEBUG */
+}
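+
+/* Illustration: each dense directory entry is a 2-byte big-endian
+value, and page_zip_dir_find() returns a pointer to its first (most
+significant) byte, so the 16-bit flag masks are shifted right by 8
+before being applied to *slot.  A minimal sketch, assuming the usual
+mask value PAGE_ZIP_DIR_SLOT_DEL == 0x8000 and a record at offset
+0x32:
+
+	mach_write_to_2(slot, 0x32 | PAGE_ZIP_DIR_SLOT_DEL);
+	ut_ad(*slot & (PAGE_ZIP_DIR_SLOT_DEL >> 8));
+*/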
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+{
+ byte* slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ if (flag) {
+ *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ }
+}
+
+/**********************************************************************//**
+Insert a record into the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* prev_rec,/*!< in: record after which to insert */
+ const byte* free_rec,/*!< in: record from which rec was
+ allocated, or NULL */
+ byte* rec) /*!< in: record to insert */
+{
+ ulint n_dense;
+ byte* slot_rec;
+ byte* slot_free;
+
+ ut_ad(prev_rec != rec);
+ ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec);
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (page_rec_is_infimum(prev_rec)) {
+ /* Use the first slot. */
+ slot_rec = page_zip->data + page_zip_get_size(page_zip);
+ } else {
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+ byte* start = end - page_zip_dir_user_size(page_zip);
+
+ if (UNIV_LIKELY(!free_rec)) {
+ /* PAGE_N_RECS was already incremented
+ in page_cur_insert_rec_zip(), but the
+ dense directory slot at that position
+ contains garbage. Skip it. */
+ start += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ slot_rec = page_zip_dir_find_low(start, end,
+ page_offset(prev_rec));
+ ut_a(slot_rec);
+ }
+
+ /* Read the old n_dense (n_heap may have been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1);
+
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* The record was allocated from the free list.
+ Shift the dense directory only up to that slot.
+ Note that in this case, n_dense is actually
+ off by one, because page_cur_insert_rec_zip()
+ did not increment n_heap. */
+ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ + PAGE_HEAP_NO_USER_LOW);
+ ut_ad(rec >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, page_offset(free_rec));
+ ut_ad(slot_free);
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ } else {
+ /* The record was allocated from the heap.
+ Shift the entire dense directory. */
+ ut_ad(rec_get_heap_no_new(rec) == n_dense
+ + PAGE_HEAP_NO_USER_LOW);
+
+ /* Shift to the end of the dense page directory. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ }
+
+	/* Shift the dense directory to make room for rec. */
+ memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+ slot_rec - slot_free);
+
+ /* Write the entry for the inserted record.
+ The "owned" and "deleted" flags must be zero. */
+ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
+}
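+
+/* Illustration: the dense directory keeps its entries in record-list
+order, with the entry for the record following the infimum at the very
+end of the page and later entries at lower addresses.  The memmove()
+above shifts the entries in [slot_free, slot_rec) down by one
+PAGE_ZIP_DIR_SLOT_SIZE (2 bytes); for example, with three entries
+between the two pointers, 6 bytes move down and the offset of rec is
+then written into the hole at slot_rec - PAGE_ZIP_DIR_SLOT_SIZE. */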
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of
+ the free list */
+{
+ byte* slot_rec;
+ byte* slot_free;
+ ulint n_ext;
+ page_t* page = page_align(rec);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot_rec = page_zip_dir_find(page_zip, page_offset(rec));
+
+ ut_a(slot_rec);
+
+	/* This could not be done before page_zip_dir_find(), which
+	relies on the old PAGE_N_RECS to size the dense directory. */
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ (ulint)(page_get_n_recs(page) - 1));
+
+ if (UNIV_UNLIKELY(!free)) {
+ /* Make the last slot the start of the free list. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE
+ * (page_dir_get_n_heap(page_zip->data)
+ - PAGE_HEAP_NO_USER_LOW);
+ } else {
+ slot_free = page_zip_dir_find_free(page_zip,
+ page_offset(free));
+ ut_a(slot_free < slot_rec);
+ /* Grow the free list by one slot by moving the start. */
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ if (UNIV_LIKELY(slot_rec > slot_free)) {
+ memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free,
+ slot_rec - slot_free);
+ }
+
+ /* Write the entry for the deleted record.
+ The "owned" and "deleted" flags will be cleared. */
+ mach_write_to_2(slot_free, page_offset(rec));
+
+ if (!page_is_leaf(page) || !dict_index_is_clust(index)) {
+ ut_ad(!rec_offs_any_extern(offsets));
+ goto skip_blobs;
+ }
+
+ n_ext = rec_offs_n_extern(offsets);
+ if (UNIV_UNLIKELY(n_ext)) {
+ /* Shift and zero fill the array of BLOB pointers. */
+ ulint blob_no;
+ byte* externs;
+ byte* ext_end;
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ext_end = externs - page_zip->n_blobs
+ * BTR_EXTERN_FIELD_REF_SIZE;
+ externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+ page_zip->n_blobs -= static_cast<unsigned>(n_ext);
+ /* Shift and zero fill the array. */
+ memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end,
+ (page_zip->n_blobs - blob_no)
+ * BTR_EXTERN_FIELD_REF_SIZE);
+ memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE);
+ }
+
+skip_blobs:
+ /* The compression algorithm expects info_bits and n_owned
+ to be 0 for deleted records. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ page_zip_clear_rec(page_zip, rec, index, offsets);
+}
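+
+/* Illustration (with assumed example numbers): the BLOB pointer array
+lies below the trx_id/roll_ptr array and also grows downward,
+BTR_EXTERN_FIELD_REF_SIZE (20) bytes per pointer.  If n_blobs == 4
+before the call, blob_no == 1 and n_ext == 2, then the memmove()/
+memset() pair above moves the one pointer belonging to records after
+the deleted one up by 2 * 20 bytes, n_blobs drops to 2, and the 40
+vacated bytes at ext_end are zero-filled. */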
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+{
+ ulint n_dense;
+ byte* dir;
+ byte* stored;
+
+ ut_ad(page_is_comp(page_zip->data));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Read the old n_dense (n_heap has already been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1);
+
+ dir = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+
+ if (!page_is_leaf(page_zip->data)) {
+ ut_ad(!page_zip->n_blobs);
+ stored = dir - n_dense * REC_NODE_PTR_SIZE;
+ } else if (is_clustered) {
+ /* Move the BLOB pointer array backwards to make space for the
+ roll_ptr and trx_id columns and the dense directory slot. */
+ byte* externs;
+
+ stored = dir - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ externs = stored
+ - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ASSERT_ZERO(externs
+ - (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ externs, stored - externs);
+ } else {
+ stored = dir
+ - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE,
+ PAGE_ZIP_DIR_SLOT_SIZE);
+ }
+
+ /* Move the uncompressed area backwards to make space
+ for one directory slot. */
+ memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored);
+}
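+
+/* Illustration: from low to high addresses, the uncompressed trailer
+of a compressed page is laid out as
+
+	[BLOB pointer array]     n_blobs * BTR_EXTERN_FIELD_REF_SIZE
+	[node_ptr array]         n_dense * REC_NODE_PTR_SIZE (non-leaf)
+	  or [trx_id, roll_ptr]  n_dense * 13 (clustered leaf)
+	  or nothing             (secondary index leaf)
+	[dense page directory]   n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+	<end of the compressed page>
+
+page_zip_dir_add_slot() moves these arrays downward to open room for
+one more directory slot (and, on clustered leaf pages, one more
+trx_id/roll_ptr entry). */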
+
+/***********************************************************//**
+Parse a log record of writing to the header of a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint len;
+
+ ut_ad(ptr && end_ptr);
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) {
+
+ return(NULL);
+ }
+
+ offset = (ulint) *ptr++;
+ len = (ulint) *ptr++;
+
+ if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + len)) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)) {
+
+ goto corrupt;
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+
+ memcpy(page + offset, ptr, len);
+ memcpy(page_zip->data + offset, ptr, len);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, NULL));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+ const byte* data, /*!< in: data on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ byte* log_ptr = mlog_open(mtr, 11 + 1 + 1);
+ ulint offset = page_offset(data);
+
+ ut_ad(offset < PAGE_DATA);
+ ut_ad(offset + length < PAGE_DATA);
+#if PAGE_DATA > 255
+# error "PAGE_DATA > 255"
+#endif
+ ut_ad(length < 256);
+
+ /* If no logging is requested, we may return now */
+ if (UNIV_UNLIKELY(!log_ptr)) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr);
+ *log_ptr++ = (byte) offset;
+ *log_ptr++ = (byte) length;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, data, length);
+}
+#endif /* !UNIV_HOTBACKUP */
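+
+/* Illustration: the MLOG_ZIP_WRITE_HEADER record written above and
+consumed by page_zip_parse_write_header() is laid out as
+
+	[initial log record (type, space id, page no): up to 11 bytes]
+	[1 byte: offset of the data within the page header]
+	[1 byte: length of the data]
+	[length bytes: the new contents, via mlog_catenate_string()]
+
+which explains the mlog_open(mtr, 11 + 1 + 1) size estimate and the
+two single-byte reads in the parser. */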
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+#ifndef UNIV_HOTBACKUP
+ buf_pool_t* buf_pool = buf_pool_from_block(block);
+#endif /* !UNIV_HOTBACKUP */
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+ ulint log_mode;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_is_comp(page));
+ ut_ad(!dict_index_is_ibuf(index));
+ /* Note that page_zip_validate(page_zip, page, index) may fail here. */
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Disable logging */
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+#ifndef UNIV_HOTBACKUP
+ temp_block = buf_block_alloc(buf_pool);
+ btr_search_drop_page_hash_index(block);
+ block->check_index_page_at_flush = TRUE;
+#else /* !UNIV_HOTBACKUP */
+ ut_ad(block == back_block1);
+ temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+ temp_page = temp_block->frame;
+
+ /* Copy the old page to temporary space */
+ buf_frame_copy(temp_page, page);
+
+ btr_blob_dbg_remove(page, index, "zip_reorg");
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, TRUE);
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) {
+ /* Copy max trx id to recreated page */
+ trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
+ page_set_max_trx_id(block, NULL, max_trx_id, NULL);
+ ut_ad(max_trx_id != 0);
+ }
+
+ /* Restore logging. */
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (!page_zip_compress(page_zip, page, index, page_zip_level, mtr)) {
+
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+ return(FALSE);
+ }
+
+ lock_move_reorganize_page(block, temp_block);
+
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+ return(TRUE);
+}
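+
+/* A hypothetical caller sketch (real call sites may differ): since a
+failed call leaves page_zip intact but overwrites the uncompressed
+frame, a caller that must keep the block usable could rebuild the
+frame from the compressed copy:
+
+	if (!page_zip_reorganize(block, index, mtr)) {
+		/* assumed recovery: decompress the intact
+		compressed copy back into the frame */
+		if (!page_zip_decompress(page_zip, page, FALSE)) {
+			ut_error;
+		}
+	}
+*/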
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< out: copy of src_zip
+ (n_blobs, m_start, m_end,
+ m_nonempty, data[0..size-1]) */
+ page_t* page, /*!< out: copy of src */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_ibuf(index));
+#ifdef UNIV_ZIP_DEBUG
+ /* The B-tree operations that call this function may set
+ FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+ mismatch. A strict page_zip_validate() will be executed later
+ during the B-tree operations. */
+ ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+ if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+ ut_a(page_is_leaf(src));
+ ut_a(dict_index_is_clust(index));
+ }
+
+ /* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary
+ indexes. It does not matter on other pages. */
+ ut_a(dict_index_is_clust(index) || !page_is_leaf(src)
+ || page_get_max_trx_id(src));
+
+ UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip));
+
+ /* Copy those B-tree page header fields that are related to
+ the records stored in the page. Also copy the field
+ PAGE_MAX_TRX_ID. Skip the rest of the page header and
+ trailer. On the compressed page, there is no trailer. */
+#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END
+# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END"
+#endif
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + src,
+ PAGE_HEADER_PRIV_END);
+ memcpy(PAGE_DATA + page, PAGE_DATA + src,
+ UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+ memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data,
+ PAGE_HEADER_PRIV_END);
+ memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+
+ /* Copy all fields of src_zip to page_zip, except the pointer
+ to the compressed data page. */
+ {
+ page_zip_t* data = page_zip->data;
+ memcpy(page_zip, src_zip, sizeof *page_zip);
+ page_zip->data = data;
+ }
+ ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
+ + page_zip->m_end < page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(src)
+ && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL)
+ && UNIV_LIKELY(mach_read_from_4(page
+ + FIL_PAGE_PREV) != FIL_NULL)) {
+ /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+ ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+ rec_t* rec = page + offs;
+ ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+ & REC_INFO_MIN_REC_FLAG);
+ rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page, index));
+#endif /* UNIV_ZIP_DEBUG */
+ btr_blob_dbg_add(page, index, "page_zip_copy_recs");
+
+ page_zip_compress_write_log(page_zip, page, index, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parse a log record of compressing an index page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< out: compressed page */
+{
+ ulint size;
+ ulint trailer_size;
+
+ ut_ad(ptr && end_ptr);
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) {
+
+ return(NULL);
+ }
+
+ size = mach_read_from_2(ptr);
+ ptr += 2;
+ trailer_size = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4);
+ memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4);
+ memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size);
+ memset(page_zip->data + FIL_PAGE_TYPE + size, 0,
+ page_zip_get_size(page_zip) - trailer_size
+ - (FIL_PAGE_TYPE + size));
+ memcpy(page_zip->data + page_zip_get_size(page_zip)
+ - trailer_size, ptr + 8 + size, trailer_size);
+
+ if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page,
+ TRUE))) {
+
+ goto corrupt;
+ }
+ }
+
+ return(ptr + 8 + size + trailer_size);
+}
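+
+/* Illustration: the MLOG_ZIP_PAGE_COMPRESS record parsed above is
+laid out as
+
+	[2 bytes: size of the compressed stream]
+	[2 bytes: trailer_size, the length of the uncompressed trailer]
+	[4 bytes: FIL_PAGE_PREV]
+	[4 bytes: FIL_PAGE_NEXT]
+	[size bytes: compressed stream, starting at FIL_PAGE_TYPE]
+	[trailer_size bytes: the uncompressed trailer]
+
+The gap between the stream and the trailer carries no information and
+is zero-filled before page_zip_decompress() is invoked. */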
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size, /*!< in: size of compressed page */
+ srv_checksum_algorithm_t algo) /*!< in: algorithm to use */
+{
+ uLong adler;
+ ib_uint32_t crc32;
+ const Bytef* s = static_cast<const byte*>(data);
+
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+
+ switch (algo) {
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ crc32 = ut_crc32(s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET)
+ ^ ut_crc32(s + FIL_PAGE_TYPE, 2)
+ ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return((ulint) crc32);
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(
+ adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ static_cast<uInt>(size)
+ - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return((ulint) adler);
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(BUF_NO_CHECKSUM_MAGIC);
+	/* no default so that the compiler will emit a warning if a
+	new enum value is added and not handled here */
+ }
+
+ ut_error;
+ return(0);
+}
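+
+/* A minimal usage sketch: stamping a compressed page before it is
+written out, mirroring the cast of srv_checksum_algorithm that
+page_zip_verify_checksum() performs below:
+
+	mach_write_to_4(page_zip->data + FIL_PAGE_SPACE_OR_CHKSUM,
+			page_zip_calc_checksum(
+				page_zip->data,
+				page_zip_get_size(page_zip),
+				static_cast<srv_checksum_algorithm_t>(
+					srv_checksum_algorithm)));
+*/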
+
+/**********************************************************************//**
+Verify a compressed page's checksum.
+@return TRUE if the stored checksum is valid according to the value of
+innodb_checksum_algorithm */
+UNIV_INTERN
+ibool
+page_zip_verify_checksum(
+/*=====================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+{
+ ib_uint32_t stored;
+ ib_uint32_t calc;
+ ib_uint32_t crc32 = 0 /* silence bogus warning */;
+ ib_uint32_t innodb = 0 /* silence bogus warning */;
+
+ stored = static_cast<ib_uint32_t>(mach_read_from_4(
+ static_cast<const unsigned char*>(data) + FIL_PAGE_SPACE_OR_CHKSUM));
+
+#if FIL_PAGE_LSN % 8
+#error "FIL_PAGE_LSN must be 64 bit aligned"
+#endif
+
+ /* Check if page is empty */
+ if (stored == 0
+ && *reinterpret_cast<const ib_uint64_t*>(static_cast<const char*>(
+ data)
+ + FIL_PAGE_LSN) == 0) {
+ /* make sure that the page is really empty */
+ ulint i;
+ for (i = 0; i < size; i++) {
+ if (*((const char*) data + i) != 0) {
+ return(FALSE);
+ }
+ }
+ /* Empty page */
+ return(TRUE);
+ }
+
+ calc = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, static_cast<srv_checksum_algorithm_t>(
+ srv_checksum_algorithm)));
+
+ if (stored == calc) {
+ return(TRUE);
+ }
+
+ switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
+ return(stored == calc);
+ case SRV_CHECKSUM_ALGORITHM_CRC32:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = calc;
+ innodb = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_INNODB));
+ break;
+ case SRV_CHECKSUM_ALGORITHM_INNODB:
+ if (stored == BUF_NO_CHECKSUM_MAGIC) {
+ return(TRUE);
+ }
+ crc32 = static_cast<ib_uint32_t>(page_zip_calc_checksum(
+ data, size, SRV_CHECKSUM_ALGORITHM_CRC32));
+ innodb = calc;
+ break;
+ case SRV_CHECKSUM_ALGORITHM_NONE:
+ return(TRUE);
+	/* no default so that the compiler will emit a warning if a
+	new enum value is added and not handled here */
+ }
+
+ return(stored == crc32 || stored == innodb);
+}
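+
+/* Summary of the acceptance rules implemented above:
+
+	innodb_checksum_algorithm   stored values accepted
+	crc32                       crc32, BUF_NO_CHECKSUM_MAGIC, innodb
+	innodb                      innodb, BUF_NO_CHECKSUM_MAGIC, crc32
+	none                        any value
+	strict_*                    only the chosen algorithm's value
+
+Pages written under one non-strict setting therefore remain readable
+after innodb_checksum_algorithm is changed. */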
diff --git a/storage/innobase/pars/lexyy.cc b/storage/innobase/pars/lexyy.cc
new file mode 100644
index 00000000000..1c01becd9ed
--- /dev/null
+++ b/storage/innobase/pars/lexyy.cc
@@ -0,0 +1,3130 @@
+#include "univ.i"
+#line 2 "lexyy.cc"
+
+#line 4 "lexyy.cc"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 35
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+/* C99 requires __STDC__ to be defined as 1. */
+#if defined (__STDC__)
+
+#define YY_USE_CONST
+
+#endif /* defined (__STDC__) */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index. If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN (yy_start) = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START (((yy_start) - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart(yyin )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
+#define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
+#endif
+
+/* The state buf must be large enough to hold one state per character in the main buffer.
+ */
+#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type))
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef size_t yy_size_t;
+#endif
+
+extern yy_size_t yyleng;
+
+extern FILE *yyin, *yyout;
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = (yy_hold_char); \
+ YY_RESTORE_YY_MORE_OFFSET \
+ (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+
+#define unput(c) yyunput( c, (yytext_ptr) )
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ yy_size_t yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via yyrestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* Stack of input buffers. */
+static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */
+static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */
+static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \
+ ? (yy_buffer_stack)[(yy_buffer_stack_top)] \
+ : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)]
+
+/* yy_hold_char holds the character lost when yytext is formed. */
+static char yy_hold_char;
+static yy_size_t yy_n_chars; /* number of characters read into yy_ch_buf */
+yy_size_t yyleng;
+
+/* Points to current character in buffer. */
+static char *yy_c_buf_p = (char *) 0;
+static int yy_init = 0; /* whether we need to initialize */
+static int yy_start = 0; /* start state number */
+
+/* Flag which is used to allow yywrap()'s to do buffer switches
+ * instead of setting up a fresh yyin. A bit of a hack ...
+ */
+static int yy_did_buffer_switch_on_eof;
+
+void yyrestart (FILE *input_file );
+__attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer );
+static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size );
+void yy_delete_buffer (YY_BUFFER_STATE b );
+void yy_flush_buffer (YY_BUFFER_STATE b );
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer );
+void yypop_buffer_state (void );
+
+static void yyensure_buffer_stack (void );
+static void yy_load_buffer_state (void );
+static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file );
+
+#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER )
+
+YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size );
+YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str );
+YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,yy_size_t len );
+
+void *yyalloc (yy_size_t );
+void *yyrealloc (void *,yy_size_t );
+void yyfree (void * );
+
+#define yy_new_buffer yy_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer(yyin,YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer(yyin,YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define yywrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
+
+typedef int yy_state_type;
+
+extern int yylineno;
+
+int yylineno = 1;
+
+extern char *yytext;
+#define yytext_ptr yytext
+
+static yy_state_type yy_get_previous_state (void );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state );
+static int yy_get_next_buffer (void );
+static void yy_fatal_error (yyconst char msg[] );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ (yytext_ptr) = yy_bp; \
+ yyleng = (size_t) (yy_cp - yy_bp); \
+ (yy_hold_char) = *yy_cp; \
+ *yy_cp = '\0'; \
+ (yy_c_buf_p) = yy_cp;
+
+#define YY_NUM_RULES 124
+#define YY_END_OF_BUFFER 125
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static yyconst flex_int16_t yy_accept[425] =
+ { 0,
+ 0, 0, 119, 119, 0, 0, 0, 0, 125, 123,
+ 122, 122, 8, 123, 114, 5, 103, 109, 112, 110,
+ 107, 111, 123, 113, 1, 123, 108, 106, 104, 105,
+ 117, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 115, 116, 119, 120, 6, 7, 9, 10, 122, 4,
+ 98, 118, 2, 1, 3, 99, 100, 102, 101, 0,
+ 96, 0, 96, 96, 96, 96, 96, 44, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 28, 17, 25, 96, 96, 96,
+
+ 96, 96, 96, 54, 63, 96, 14, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 119, 120, 120, 121, 6,
+ 7, 9, 10, 2, 0, 97, 13, 45, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 96, 96, 27, 96, 96,
+ 96, 41, 96, 96, 96, 96, 21, 96, 96, 96,
+ 96, 96, 15, 96, 96, 96, 18, 96, 96, 96,
+ 96, 96, 82, 96, 96, 96, 51, 96, 12, 96,
+ 36, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+
+ 96, 96, 0, 97, 96, 96, 96, 96, 20, 96,
+ 24, 96, 96, 96, 96, 96, 96, 96, 96, 96,
+ 96, 96, 46, 96, 96, 30, 96, 89, 96, 96,
+ 39, 96, 96, 96, 96, 96, 48, 96, 94, 91,
+ 32, 93, 96, 11, 66, 96, 96, 96, 42, 96,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 29,
+ 96, 96, 96, 96, 96, 96, 96, 96, 96, 87,
+ 0, 96, 26, 96, 96, 96, 68, 96, 96, 96,
+ 96, 37, 96, 96, 96, 96, 96, 96, 96, 31,
+ 67, 23, 96, 59, 96, 77, 96, 96, 96, 43,
+
+ 96, 96, 96, 96, 96, 96, 96, 96, 92, 96,
+ 96, 56, 96, 96, 96, 96, 96, 96, 96, 40,
+ 33, 0, 81, 95, 19, 96, 96, 85, 96, 76,
+ 55, 96, 65, 96, 52, 96, 96, 96, 47, 96,
+ 78, 96, 80, 96, 96, 34, 96, 96, 96, 35,
+ 74, 96, 96, 96, 96, 60, 96, 50, 49, 96,
+ 96, 96, 57, 53, 64, 96, 96, 96, 22, 96,
+ 96, 75, 83, 96, 96, 79, 96, 70, 96, 96,
+ 96, 96, 96, 38, 96, 90, 69, 96, 86, 96,
+ 96, 96, 88, 96, 96, 61, 96, 16, 96, 72,
+
+ 71, 96, 58, 96, 84, 96, 96, 96, 96, 96,
+ 96, 96, 96, 96, 96, 73, 96, 96, 96, 96,
+ 96, 96, 62, 0
+ } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 2, 1, 4, 5, 6, 7, 1, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 1, 1, 1, 1, 51, 1, 34, 34, 34, 34,
+
+ 34, 34, 34, 34, 34, 34, 34, 52, 34, 34,
+ 34, 34, 53, 34, 54, 34, 34, 34, 34, 34,
+ 34, 34, 55, 1, 56, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static yyconst flex_int32_t yy_meta[57] =
+ { 0,
+ 1, 1, 1, 2, 3, 1, 1, 4, 1, 1,
+ 5, 1, 1, 1, 1, 6, 7, 1, 1, 1,
+ 8, 1, 1, 6, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 1, 1
+ } ;
+
+static yyconst flex_int16_t yy_base[438] =
+ { 0,
+ 0, 0, 293, 287, 284, 281, 272, 256, 254, 1357,
+ 55, 57, 1357, 0, 1357, 1357, 1357, 1357, 1357, 1357,
+ 1357, 1357, 238, 227, 46, 205, 1357, 43, 1357, 203,
+ 1357, 46, 50, 56, 52, 66, 64, 51, 81, 92,
+ 91, 94, 96, 111, 113, 116, 130, 134, 53, 143,
+ 1357, 1357, 0, 106, 0, 212, 0, 210, 141, 0,
+ 1357, 1357, 192, 56, 173, 1357, 1357, 1357, 1357, 168,
+ 140, 150, 152, 154, 155, 161, 167, 171, 177, 172,
+ 184, 174, 188, 189, 191, 194, 203, 212, 215, 217,
+ 219, 221, 226, 228, 231, 240, 233, 235, 246, 251,
+
+ 258, 253, 255, 256, 269, 271, 278, 272, 285, 283,
+ 287, 289, 296, 305, 298, 315, 319, 321, 322, 326,
+ 332, 333, 342, 339, 343, 0, 112, 173, 1357, 0,
+ 155, 0, 156, 132, 93, 0, 355, 357, 358, 360,
+ 364, 367, 374, 370, 379, 380, 389, 383, 390, 392,
+ 395, 408, 411, 409, 415, 418, 425, 427, 429, 436,
+ 431, 441, 446, 448, 450, 452, 453, 462, 471, 464,
+ 473, 474, 478, 485, 488, 490, 491, 494, 500, 501,
+ 504, 506, 507, 517, 518, 519, 520, 521, 522, 523,
+ 533, 536, 538, 543, 549, 554, 555, 561, 556, 566,
+
+ 567, 576, 60, 0, 573, 578, 580, 582, 583, 593,
+ 589, 596, 598, 603, 605, 607, 610, 617, 619, 621,
+ 622, 628, 633, 634, 635, 639, 640, 649, 650, 652,
+ 653, 655, 659, 664, 668, 669, 665, 671, 674, 678,
+ 681, 685, 687, 688, 692, 697, 698, 701, 703, 704,
+ 707, 708, 717, 713, 728, 730, 724, 740, 734, 745,
+ 746, 750, 751, 756, 757, 760, 761, 762, 771, 773,
+ 42, 778, 782, 783, 787, 789, 792, 794, 793, 804,
+ 805, 808, 809, 810, 819, 823, 826, 828, 829, 830,
+ 835, 840, 844, 846, 847, 856, 857, 858, 859, 860,
+
+ 863, 872, 873, 878, 879, 882, 885, 889, 894, 895,
+ 896, 898, 905, 910, 908, 912, 914, 915, 926, 930,
+ 931, 73, 932, 933, 935, 937, 942, 944, 946, 947,
+ 948, 949, 951, 958, 961, 965, 967, 972, 978, 979,
+ 981, 984, 983, 985, 994, 988, 999, 1000, 1001, 1004,
+ 1013, 1015, 1022, 1016, 1019, 1026, 1032, 1033, 1035, 1036,
+ 1038, 1039, 1048, 1049, 1050, 1051, 1053, 1054, 1060, 1063,
+ 1065, 1066, 1069, 1070, 1072, 1082, 1084, 1085, 1087, 1096,
+ 1097, 1098, 1099, 1101, 1113, 1114, 1115, 1116, 1117, 1118,
+ 1119, 1128, 1130, 1131, 1134, 1133, 1135, 1137, 1150, 1151,
+
+ 1153, 1155, 1157, 1162, 1160, 1167, 1172, 1173, 1174, 1176,
+ 1185, 1190, 1183, 1187, 1189, 1199, 1204, 1206, 1208, 1210,
+ 1215, 1220, 1222, 1357, 1269, 1278, 1287, 1290, 1293, 1297,
+ 1306, 1315, 1324, 1333, 1340, 1344, 1347
+ } ;
+
+static yyconst flex_int16_t yy_def[438] =
+ { 0,
+ 424, 1, 425, 425, 426, 426, 427, 427, 424, 424,
+ 424, 424, 424, 428, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 429, 424, 424, 424, 424,
+ 424, 430, 430, 430, 430, 430, 34, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 424, 424, 431, 432, 433, 424, 434, 424, 424, 428,
+ 424, 424, 424, 424, 429, 424, 424, 424, 424, 435,
+ 430, 436, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 431, 432, 432, 424, 433,
+ 424, 434, 424, 424, 424, 437, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+
+ 430, 430, 424, 437, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 424, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 424, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 430, 430, 430, 430, 430, 430, 430,
+ 430, 430, 430, 0, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424
+ } ;
+
+static yyconst flex_int16_t yy_nxt[1414] =
+ { 0,
+ 10, 11, 12, 13, 10, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 10, 32, 33, 34, 35, 36, 37,
+ 38, 38, 39, 38, 38, 40, 41, 42, 43, 44,
+ 38, 45, 46, 47, 48, 49, 50, 38, 38, 38,
+ 38, 38, 38, 38, 51, 52, 59, 59, 59, 59,
+ 63, 70, 64, 67, 68, 70, 70, 70, 70, 72,
+ 63, 70, 64, 72, 72, 72, 72, 123, 75, 72,
+ 84, 70, 76, 73, 85, 77, 136, 79, 74, 72,
+ 86, 80, 90, 322, 81, 71, 70, 82, 78, 91,
+
+ 83, 87, 92, 88, 72, 93, 70, 70, 94, 70,
+ 95, 70, 271, 89, 72, 72, 128, 72, 96, 72,
+ 98, 129, 424, 97, 99, 104, 70, 424, 70, 101,
+ 100, 70, 102, 105, 72, 106, 72, 107, 103, 72,
+ 108, 110, 59, 59, 113, 70, 203, 114, 134, 70,
+ 111, 112, 109, 72, 118, 70, 115, 72, 70, 133,
+ 116, 119, 131, 72, 117, 70, 72, 70, 120, 70,
+ 70, 121, 135, 122, 124, 72, 70, 72, 72, 137,
+ 138, 125, 70, 128, 72, 140, 70, 70, 129, 70,
+ 72, 141, 70, 424, 72, 72, 139, 72, 142, 70,
+
+ 72, 144, 150, 70, 70, 143, 70, 72, 134, 70,
+ 145, 72, 72, 133, 72, 152, 146, 72, 70, 131,
+ 147, 148, 156, 69, 153, 66, 72, 70, 149, 151,
+ 70, 154, 70, 155, 70, 72, 70, 62, 72, 158,
+ 72, 70, 72, 70, 72, 157, 70, 159, 70, 72,
+ 70, 72, 61, 424, 72, 70, 72, 161, 72, 58,
+ 160, 70, 162, 72, 163, 164, 70, 165, 70, 72,
+ 70, 70, 168, 70, 72, 58, 72, 170, 72, 72,
+ 169, 72, 166, 167, 70, 172, 70, 70, 56, 171,
+ 174, 56, 72, 70, 72, 72, 173, 54, 70, 175,
+
+ 70, 72, 70, 54, 70, 176, 72, 180, 72, 424,
+ 72, 70, 72, 70, 183, 177, 424, 178, 424, 72,
+ 70, 72, 181, 179, 184, 424, 182, 424, 72, 188,
+ 70, 186, 424, 189, 70, 185, 70, 70, 72, 187,
+ 190, 70, 72, 424, 72, 72, 193, 70, 70, 72,
+ 194, 191, 424, 424, 70, 72, 72, 70, 70, 424,
+ 198, 192, 72, 424, 196, 72, 72, 200, 424, 424,
+ 70, 201, 70, 70, 197, 70, 195, 199, 72, 70,
+ 72, 72, 70, 72, 202, 70, 205, 72, 424, 70,
+ 72, 208, 206, 72, 70, 70, 207, 72, 70, 209,
+
+ 210, 424, 72, 72, 70, 70, 72, 70, 424, 216,
+ 70, 211, 72, 72, 424, 72, 218, 424, 72, 424,
+ 424, 212, 213, 70, 70, 214, 70, 217, 215, 424,
+ 70, 72, 72, 70, 72, 223, 219, 220, 72, 222,
+ 70, 72, 70, 221, 70, 424, 70, 424, 72, 424,
+ 72, 70, 72, 226, 72, 230, 70, 227, 224, 72,
+ 225, 70, 229, 70, 72, 70, 424, 70, 70, 72,
+ 424, 72, 228, 72, 232, 72, 72, 70, 233, 70,
+ 234, 236, 231, 424, 424, 72, 70, 72, 70, 70,
+ 424, 237, 238, 70, 72, 235, 72, 72, 240, 239,
+
+ 70, 72, 242, 70, 424, 70, 70, 243, 72, 70,
+ 424, 72, 241, 72, 72, 70, 70, 72, 246, 70,
+ 244, 70, 70, 72, 72, 245, 248, 72, 249, 72,
+ 72, 247, 70, 70, 70, 70, 70, 70, 70, 250,
+ 72, 72, 72, 72, 72, 72, 72, 255, 70, 424,
+ 251, 70, 253, 70, 424, 424, 72, 252, 70, 72,
+ 424, 72, 256, 258, 70, 257, 72, 424, 254, 70,
+ 70, 70, 72, 259, 261, 262, 70, 72, 72, 72,
+ 260, 70, 70, 424, 72, 266, 263, 265, 70, 72,
+ 72, 70, 424, 70, 264, 70, 72, 70, 70, 72,
+
+ 267, 72, 269, 72, 70, 72, 72, 268, 70, 424,
+ 270, 70, 72, 70, 272, 273, 72, 274, 70, 72,
+ 70, 72, 70, 275, 277, 70, 72, 276, 72, 280,
+ 72, 281, 70, 72, 70, 279, 70, 70, 424, 424,
+ 72, 278, 72, 70, 72, 72, 286, 284, 70, 70,
+ 70, 72, 424, 282, 70, 70, 72, 72, 72, 285,
+ 283, 424, 72, 72, 70, 70, 288, 70, 70, 290,
+ 70, 287, 72, 72, 70, 72, 72, 424, 72, 70,
+ 70, 291, 72, 70, 70, 289, 70, 72, 72, 70,
+ 424, 72, 72, 70, 72, 292, 70, 72, 293, 297,
+
+ 70, 72, 70, 70, 72, 295, 294, 70, 72, 296,
+ 72, 72, 70, 70, 298, 72, 70, 424, 70, 70,
+ 72, 72, 70, 70, 72, 299, 72, 72, 70, 302,
+ 72, 72, 70, 424, 424, 424, 72, 424, 300, 70,
+ 72, 301, 306, 70, 424, 70, 303, 72, 304, 70,
+ 305, 72, 307, 72, 308, 70, 424, 72, 309, 424,
+ 70, 70, 312, 72, 311, 70, 70, 310, 72, 72,
+ 424, 70, 70, 72, 72, 70, 70, 70, 313, 72,
+ 72, 314, 424, 72, 72, 72, 70, 317, 70, 319,
+ 320, 424, 424, 70, 72, 315, 72, 70, 70, 321,
+
+ 316, 72, 70, 318, 70, 72, 72, 70, 70, 70,
+ 72, 424, 72, 424, 424, 72, 72, 72, 424, 70,
+ 70, 323, 327, 70, 70, 70, 324, 72, 72, 424,
+ 329, 72, 72, 72, 70, 325, 328, 331, 70, 326,
+ 424, 70, 72, 70, 70, 70, 72, 332, 330, 72,
+ 70, 72, 72, 72, 335, 70, 424, 424, 72, 70,
+ 333, 70, 70, 72, 334, 336, 337, 72, 424, 72,
+ 72, 70, 70, 70, 70, 70, 338, 424, 70, 72,
+ 72, 72, 72, 72, 424, 340, 72, 70, 70, 341,
+ 339, 424, 343, 70, 70, 72, 72, 70, 424, 344,
+
+ 70, 72, 72, 342, 70, 72, 348, 424, 72, 70,
+ 70, 70, 72, 70, 424, 346, 345, 72, 72, 72,
+ 70, 72, 347, 70, 424, 70, 349, 70, 72, 70,
+ 70, 72, 350, 72, 354, 72, 351, 72, 72, 352,
+ 356, 70, 353, 358, 355, 70, 70, 70, 70, 72,
+ 70, 357, 70, 72, 72, 72, 72, 70, 72, 70,
+ 72, 70, 70, 70, 70, 72, 70, 72, 359, 72,
+ 72, 72, 72, 70, 72, 424, 70, 424, 424, 361,
+ 70, 72, 70, 362, 72, 360, 365, 70, 72, 363,
+ 72, 366, 364, 70, 70, 72, 70, 424, 70, 70,
+
+ 70, 72, 72, 70, 72, 367, 72, 72, 72, 70,
+ 368, 72, 424, 424, 70, 70, 70, 72, 424, 70,
+ 369, 370, 72, 72, 72, 424, 374, 72, 70, 371,
+ 70, 70, 424, 375, 70, 372, 72, 70, 72, 72,
+ 373, 70, 72, 376, 379, 72, 377, 70, 70, 72,
+ 70, 70, 424, 70, 70, 72, 72, 378, 72, 72,
+ 380, 72, 72, 70, 70, 70, 70, 383, 70, 70,
+ 382, 72, 72, 72, 72, 70, 72, 72, 70, 381,
+ 70, 70, 424, 72, 70, 70, 72, 70, 72, 72,
+ 387, 386, 72, 72, 384, 72, 385, 70, 424, 70,
+
+ 70, 424, 70, 424, 389, 72, 388, 72, 72, 390,
+ 72, 70, 70, 70, 70, 392, 70, 424, 424, 72,
+ 72, 72, 72, 393, 72, 391, 396, 424, 70, 70,
+ 70, 70, 70, 70, 70, 394, 72, 72, 72, 72,
+ 72, 72, 72, 70, 398, 70, 70, 395, 70, 70,
+ 70, 72, 70, 72, 72, 424, 72, 72, 72, 424,
+ 72, 399, 403, 397, 404, 70, 70, 400, 70, 401,
+ 70, 424, 70, 72, 72, 70, 72, 70, 72, 405,
+ 72, 402, 70, 72, 424, 72, 424, 70, 70, 70,
+ 72, 70, 406, 424, 407, 72, 72, 72, 70, 72,
+
+ 70, 412, 70, 424, 70, 70, 72, 424, 72, 410,
+ 72, 408, 72, 72, 70, 409, 424, 413, 414, 70,
+ 415, 70, 72, 70, 411, 70, 424, 72, 416, 72,
+ 70, 72, 424, 72, 419, 70, 424, 70, 72, 417,
+ 418, 424, 424, 72, 420, 72, 424, 424, 421, 424,
+ 424, 424, 424, 424, 424, 424, 422, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 423, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 55, 55,
+ 55, 55, 55, 55, 55, 55, 55, 57, 57, 57,
+ 57, 57, 57, 57, 57, 57, 60, 424, 60, 65,
+
+ 65, 65, 71, 71, 424, 71, 126, 126, 126, 126,
+ 424, 126, 126, 126, 126, 127, 127, 127, 127, 127,
+ 127, 127, 127, 127, 130, 130, 130, 424, 130, 130,
+ 130, 130, 130, 132, 424, 132, 132, 132, 132, 132,
+ 132, 132, 136, 424, 424, 424, 424, 424, 136, 72,
+ 72, 424, 72, 204, 424, 204, 9, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424
+ } ;
+
+static yyconst flex_int16_t yy_chk[1414] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 11, 11, 12, 12,
+ 25, 32, 25, 28, 28, 33, 38, 35, 49, 32,
+ 64, 34, 64, 33, 38, 35, 49, 49, 33, 34,
+ 35, 36, 33, 32, 35, 33, 322, 34, 32, 36,
+ 35, 34, 37, 271, 34, 37, 39, 34, 33, 37,
+
+ 34, 36, 37, 36, 39, 37, 41, 40, 37, 42,
+ 39, 43, 203, 36, 41, 40, 54, 42, 39, 43,
+ 40, 54, 127, 39, 40, 43, 44, 127, 45, 41,
+ 40, 46, 42, 43, 44, 43, 45, 43, 42, 46,
+ 43, 45, 59, 59, 46, 47, 135, 46, 134, 48,
+ 45, 45, 44, 47, 47, 71, 46, 48, 50, 133,
+ 46, 47, 131, 71, 46, 72, 50, 73, 47, 74,
+ 75, 48, 70, 48, 50, 73, 76, 74, 75, 73,
+ 74, 50, 77, 128, 76, 75, 78, 80, 128, 82,
+ 77, 76, 79, 65, 78, 80, 74, 82, 76, 81,
+
+ 79, 79, 82, 83, 84, 77, 85, 81, 63, 86,
+ 80, 83, 84, 58, 85, 84, 80, 86, 87, 56,
+ 81, 81, 86, 30, 84, 26, 87, 88, 81, 83,
+ 89, 84, 90, 85, 91, 88, 92, 24, 89, 88,
+ 90, 93, 91, 94, 92, 87, 95, 89, 97, 93,
+ 98, 94, 23, 9, 95, 96, 97, 91, 98, 8,
+ 90, 99, 92, 96, 93, 94, 100, 96, 102, 99,
+ 103, 104, 98, 101, 100, 7, 102, 100, 103, 104,
+ 99, 101, 96, 96, 105, 101, 106, 108, 6, 100,
+ 103, 5, 105, 107, 106, 108, 102, 4, 110, 106,
+
+ 109, 107, 111, 3, 112, 107, 110, 110, 109, 0,
+ 111, 113, 112, 115, 111, 108, 0, 109, 0, 113,
+ 114, 115, 110, 109, 112, 0, 110, 0, 114, 114,
+ 116, 113, 0, 115, 117, 112, 118, 119, 116, 113,
+ 116, 120, 117, 0, 118, 119, 118, 121, 122, 120,
+ 119, 116, 0, 0, 124, 121, 122, 123, 125, 0,
+ 122, 117, 124, 0, 121, 123, 125, 124, 0, 0,
+ 137, 124, 138, 139, 121, 140, 120, 123, 137, 141,
+ 138, 139, 142, 140, 125, 144, 139, 141, 0, 143,
+ 142, 142, 140, 144, 145, 146, 141, 143, 148, 143,
+
+ 143, 0, 145, 146, 147, 149, 148, 150, 0, 148,
+ 151, 144, 147, 149, 0, 150, 150, 0, 151, 0,
+ 0, 145, 146, 152, 154, 147, 153, 149, 147, 0,
+ 155, 152, 154, 156, 153, 154, 151, 151, 155, 153,
+ 157, 156, 158, 152, 159, 0, 161, 0, 157, 0,
+ 158, 160, 159, 157, 161, 161, 162, 157, 155, 160,
+ 156, 163, 160, 164, 162, 165, 0, 166, 167, 163,
+ 0, 164, 159, 165, 164, 166, 167, 168, 165, 170,
+ 166, 167, 163, 0, 0, 168, 169, 170, 171, 172,
+ 0, 167, 168, 173, 169, 166, 171, 172, 170, 169,
+
+ 174, 173, 172, 175, 0, 176, 177, 173, 174, 178,
+ 0, 175, 171, 176, 177, 179, 180, 178, 176, 181,
+ 174, 182, 183, 179, 180, 175, 179, 181, 180, 182,
+ 183, 178, 184, 185, 186, 187, 188, 189, 190, 181,
+ 184, 185, 186, 187, 188, 189, 190, 186, 191, 0,
+ 182, 192, 184, 193, 0, 0, 191, 183, 194, 192,
+ 0, 193, 188, 192, 195, 190, 194, 0, 185, 196,
+ 197, 199, 195, 193, 195, 195, 198, 196, 197, 199,
+ 194, 200, 201, 0, 198, 198, 195, 197, 205, 200,
+ 201, 202, 0, 206, 196, 207, 205, 208, 209, 202,
+
+ 199, 206, 201, 207, 211, 208, 209, 200, 210, 0,
+ 202, 212, 211, 213, 205, 206, 210, 207, 214, 212,
+ 215, 213, 216, 208, 212, 217, 214, 210, 215, 215,
+ 216, 216, 218, 217, 219, 214, 220, 221, 0, 0,
+ 218, 213, 219, 222, 220, 221, 221, 219, 223, 224,
+ 225, 222, 0, 217, 226, 227, 223, 224, 225, 220,
+ 218, 0, 226, 227, 228, 229, 224, 230, 231, 227,
+ 232, 222, 228, 229, 233, 230, 231, 0, 232, 234,
+ 237, 229, 233, 235, 236, 225, 238, 234, 237, 239,
+ 0, 235, 236, 240, 238, 230, 241, 239, 232, 236,
+
+ 242, 240, 243, 244, 241, 234, 233, 245, 242, 235,
+ 243, 244, 246, 247, 238, 245, 248, 0, 249, 250,
+ 246, 247, 251, 252, 248, 243, 249, 250, 254, 248,
+ 251, 252, 253, 0, 0, 0, 254, 0, 246, 257,
+ 253, 247, 253, 255, 0, 256, 250, 257, 251, 259,
+ 252, 255, 254, 256, 255, 258, 0, 259, 256, 0,
+ 260, 261, 259, 258, 258, 262, 263, 257, 260, 261,
+ 0, 264, 265, 262, 263, 266, 267, 268, 261, 264,
+ 265, 262, 0, 266, 267, 268, 269, 265, 270, 267,
+ 268, 0, 0, 272, 269, 263, 270, 273, 274, 269,
+
+ 264, 272, 275, 266, 276, 273, 274, 277, 279, 278,
+ 275, 0, 276, 0, 0, 277, 279, 278, 0, 280,
+ 281, 272, 278, 282, 283, 284, 274, 280, 281, 0,
+ 280, 282, 283, 284, 285, 275, 279, 283, 286, 276,
+ 0, 287, 285, 288, 289, 290, 286, 284, 281, 287,
+ 291, 288, 289, 290, 287, 292, 0, 0, 291, 293,
+ 285, 294, 295, 292, 286, 288, 289, 293, 0, 294,
+ 295, 296, 297, 298, 299, 300, 293, 0, 301, 296,
+ 297, 298, 299, 300, 0, 297, 301, 302, 303, 298,
+ 295, 0, 301, 304, 305, 302, 303, 306, 0, 302,
+
+ 307, 304, 305, 299, 308, 306, 306, 0, 307, 309,
+ 310, 311, 308, 312, 0, 304, 303, 309, 310, 311,
+ 313, 312, 305, 315, 0, 314, 307, 316, 313, 317,
+ 318, 315, 308, 314, 314, 316, 310, 317, 318, 311,
+ 316, 319, 313, 318, 315, 320, 321, 323, 324, 319,
+ 325, 317, 326, 320, 321, 323, 324, 327, 325, 328,
+ 326, 329, 330, 331, 332, 327, 333, 328, 319, 329,
+ 330, 331, 332, 334, 333, 0, 335, 0, 0, 326,
+ 336, 334, 337, 327, 335, 325, 334, 338, 336, 329,
+ 337, 336, 332, 339, 340, 338, 341, 0, 343, 342,
+
+ 344, 339, 340, 346, 341, 337, 343, 342, 344, 345,
+ 338, 346, 0, 0, 347, 348, 349, 345, 0, 350,
+ 340, 342, 347, 348, 349, 0, 348, 350, 351, 344,
+ 352, 354, 0, 349, 355, 345, 351, 353, 352, 354,
+ 347, 356, 355, 352, 355, 353, 353, 357, 358, 356,
+ 359, 360, 0, 361, 362, 357, 358, 354, 359, 360,
+ 357, 361, 362, 363, 364, 365, 366, 362, 367, 368,
+ 361, 363, 364, 365, 366, 369, 367, 368, 370, 360,
+ 371, 372, 0, 369, 373, 374, 370, 375, 371, 372,
+ 370, 368, 373, 374, 366, 375, 367, 376, 0, 377,
+
+ 378, 0, 379, 0, 374, 376, 371, 377, 378, 375,
+ 379, 380, 381, 382, 383, 379, 384, 0, 0, 380,
+ 381, 382, 383, 380, 384, 377, 383, 0, 385, 386,
+ 387, 388, 389, 390, 391, 381, 385, 386, 387, 388,
+ 389, 390, 391, 392, 388, 393, 394, 382, 396, 395,
+ 397, 392, 398, 393, 394, 0, 396, 395, 397, 0,
+ 398, 390, 395, 385, 397, 399, 400, 391, 401, 392,
+ 402, 0, 403, 399, 400, 405, 401, 404, 402, 399,
+ 403, 394, 406, 405, 0, 404, 0, 407, 408, 409,
+ 406, 410, 402, 0, 404, 407, 408, 409, 413, 410,
+
+ 411, 410, 414, 0, 415, 412, 413, 0, 411, 408,
+ 414, 406, 415, 412, 416, 407, 0, 411, 412, 417,
+ 413, 418, 416, 419, 409, 420, 0, 417, 414, 418,
+ 421, 419, 0, 420, 418, 422, 0, 423, 421, 415,
+ 417, 0, 0, 422, 419, 423, 0, 0, 420, 0,
+ 0, 0, 0, 0, 0, 0, 421, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 422, 425,
+ 425, 425, 425, 425, 425, 425, 425, 425, 426, 426,
+ 426, 426, 426, 426, 426, 426, 426, 427, 427, 427,
+ 427, 427, 427, 427, 427, 427, 428, 0, 428, 429,
+
+ 429, 429, 430, 430, 0, 430, 431, 431, 431, 431,
+ 0, 431, 431, 431, 431, 432, 432, 432, 432, 432,
+ 432, 432, 432, 432, 433, 433, 433, 0, 433, 433,
+ 433, 433, 433, 434, 0, 434, 434, 434, 434, 434,
+ 434, 434, 435, 0, 0, 0, 0, 0, 435, 436,
+ 436, 0, 436, 437, 0, 437, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+
+ 424, 424, 424, 424, 424, 424, 424, 424, 424, 424,
+ 424, 424, 424
+ } ;
+
+static yy_state_type yy_last_accepting_state;
+static char *yy_last_accepting_cpos;
+
+extern int yy_flex_debug;
+int yy_flex_debug = 0;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+char *yytext;
+#line 1 "pars0lex.l"
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
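+/* For example (illustrative), from within storage/innobase/pars:
+
+	./make_flex.sh		(rewrites lexyy.cc from pars0lex.l)
+	./make_bison.sh		(rewrites pars0grm.cc and ../include/pars0grm.h)
+*/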
+#define YY_NO_INPUT 1
+#define YY_NO_UNISTD_H 1
+#line 53 "pars0lex.l"
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A) ut_malloc(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+/* Note: "result" is assigned the number of characters read, as
+returned by pars_get_lex_chars() */
+#define YY_INPUT(buf, result, max_size) \
+ (result = pars_get_lex_chars(buf, max_size))
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = static_cast<char*>(malloc(1));
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+
+ stringbuf = static_cast<char*>(
+ realloc(stringbuf, stringbuf_len_alloc));
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
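+
+/* Illustrative sketch (not part of the generated scanner): how the
+doubling policy above grows the buffer. From the initial 1-byte
+allocation, appending 3 bytes doubles the capacity 1 -> 2 -> 4; a
+further 2-byte append (3 + 2 > 4) doubles it to 8, so growth is
+amortized O(1) per appended byte. */
+#if 0
+	string_append("abc", 3);	/* stringbuf_len = 3, alloc = 4 */
+	string_append("de", 2);		/* stringbuf_len = 5, alloc = 8 */
+#endif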
+
+
+
+
+#line 1006 "lexyy.cc"
+
+#define INITIAL 0
+#define comment 1
+#define quoted 2
+#define id 3
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+static int yy_init_globals (void );
+
+/* Accessor methods to globals.
+ These are made visible to non-reentrant scanners for convenience. */
+
+__attribute__((unused)) static int yylex_destroy (void );
+
+int yyget_debug (void );
+
+void yyset_debug (int debug_flag );
+
+YY_EXTRA_TYPE yyget_extra (void );
+
+void yyset_extra (YY_EXTRA_TYPE user_defined );
+
+FILE *yyget_in (void );
+
+void yyset_in (FILE * in_str );
+
+FILE *yyget_out (void );
+
+void yyset_out (FILE * out_str );
+
+yy_size_t yyget_leng (void );
+
+char *yyget_text (void );
+
+int yyget_lineno (void );
+
+void yyset_lineno (int line_number );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap (void );
+#else
+extern int yywrap (void );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * );
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (void );
+#else
+static int input (void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0)
+#endif
+
+/* Gets input and stuffs it into "buf". The number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ size_t n; \
+ for ( n = 0; n < max_size && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
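+
+/* Note: pars0lex.l defines YY_INPUT above in terms of
+ * pars_get_lex_chars(), so this stdio-based default is compiled out
+ * for the InnoDB scanner. */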
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (void);
+
+#define YY_DECL int yylex (void)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp, *yy_bp;
+ register int yy_act;
+
+#line 112 "pars0lex.l"
+
+
+#line 1197 "lexyy.cc"
+
+ if ( !(yy_init) )
+ {
+ (yy_init) = 1;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! (yy_start) )
+ (yy_start) = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer(yyin,YY_BUF_SIZE );
+ }
+
+ yy_load_buffer_state( );
+ }
+
+ while ( 1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = (yy_c_buf_p);
+
+ /* Support of yytext. */
+ *yy_cp = (yy_hold_char);
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = (yy_start);
+yy_match:
+ do
+ {
+ register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 425 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 424 );
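+	/* The loop above walks flex's compressed tables: yy_base[]
+	gives each state's offset into yy_nxt[]/yy_chk[], yy_chk[]
+	verifies that a packed entry really belongs to the current
+	state, and yy_def[] supplies the fallback state when it does
+	not. State 424 is the "jam" state: no further transition is
+	possible, so the scanner backs up to the last accept below. */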
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = (yy_hold_char);
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 114 "pars0lex.l"
+{
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 120 "pars0lex.l"
+{
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 126 "pars0lex.l"
+{
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 135 "pars0lex.l"
+{
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 142 "pars0lex.l"
+{
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 151 "pars0lex.l"
+{
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 156 "pars0lex.l"
+{
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
+ YY_BREAK
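+/* Worked example (illustrative): for the input 'It''s', rule 5 eats
+the opening quote and empties the buffer; rule 6 appends "It"; rule 7
+sees the run "''" (yyleng = 2) and appends one quote, staying in
+'quoted' because the count is even; rule 6 appends "s"; finally rule 7
+sees the single closing "'" (yyleng = 1), appends nothing and, the
+count being odd, returns PARS_STR_LIT with the buffer "It's". */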
+case 8:
+YY_RULE_SETUP
+#line 180 "pars0lex.l"
+{
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 9:
+/* rule 9 can match eol */
+YY_RULE_SETUP
+#line 189 "pars0lex.l"
+{
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 10:
+YY_RULE_SETUP
+#line 194 "pars0lex.l"
+{
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+ YY_BREAK
+case 11:
+YY_RULE_SETUP
+#line 219 "pars0lex.l"
+{
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+ YY_BREAK
+case 12:
+YY_RULE_SETUP
+#line 225 "pars0lex.l"
+{
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+ YY_BREAK
+case 13:
+YY_RULE_SETUP
+#line 232 "pars0lex.l"
+{
+ return(PARS_AND_TOKEN);
+}
+ YY_BREAK
+case 14:
+YY_RULE_SETUP
+#line 236 "pars0lex.l"
+{
+ return(PARS_OR_TOKEN);
+}
+ YY_BREAK
+case 15:
+YY_RULE_SETUP
+#line 240 "pars0lex.l"
+{
+ return(PARS_NOT_TOKEN);
+}
+ YY_BREAK
+case 16:
+YY_RULE_SETUP
+#line 244 "pars0lex.l"
+{
+ return(PARS_PROCEDURE_TOKEN);
+}
+ YY_BREAK
+case 17:
+YY_RULE_SETUP
+#line 248 "pars0lex.l"
+{
+ return(PARS_IN_TOKEN);
+}
+ YY_BREAK
+case 18:
+YY_RULE_SETUP
+#line 252 "pars0lex.l"
+{
+ return(PARS_OUT_TOKEN);
+}
+ YY_BREAK
+case 19:
+YY_RULE_SETUP
+#line 256 "pars0lex.l"
+{
+ return(PARS_BINARY_TOKEN);
+}
+ YY_BREAK
+case 20:
+YY_RULE_SETUP
+#line 260 "pars0lex.l"
+{
+ return(PARS_BLOB_TOKEN);
+}
+ YY_BREAK
+case 21:
+YY_RULE_SETUP
+#line 264 "pars0lex.l"
+{
+ return(PARS_INT_TOKEN);
+}
+ YY_BREAK
+case 22:
+YY_RULE_SETUP
+#line 268 "pars0lex.l"
+{
+ return(PARS_INT_TOKEN);
+}
+ YY_BREAK
+case 23:
+YY_RULE_SETUP
+#line 272 "pars0lex.l"
+{
+ return(PARS_FLOAT_TOKEN);
+}
+ YY_BREAK
+case 24:
+YY_RULE_SETUP
+#line 276 "pars0lex.l"
+{
+ return(PARS_CHAR_TOKEN);
+}
+ YY_BREAK
+case 25:
+YY_RULE_SETUP
+#line 280 "pars0lex.l"
+{
+ return(PARS_IS_TOKEN);
+}
+ YY_BREAK
+case 26:
+YY_RULE_SETUP
+#line 284 "pars0lex.l"
+{
+ return(PARS_BEGIN_TOKEN);
+}
+ YY_BREAK
+case 27:
+YY_RULE_SETUP
+#line 288 "pars0lex.l"
+{
+ return(PARS_END_TOKEN);
+}
+ YY_BREAK
+case 28:
+YY_RULE_SETUP
+#line 292 "pars0lex.l"
+{
+ return(PARS_IF_TOKEN);
+}
+ YY_BREAK
+case 29:
+YY_RULE_SETUP
+#line 296 "pars0lex.l"
+{
+ return(PARS_THEN_TOKEN);
+}
+ YY_BREAK
+case 30:
+YY_RULE_SETUP
+#line 300 "pars0lex.l"
+{
+ return(PARS_ELSE_TOKEN);
+}
+ YY_BREAK
+case 31:
+YY_RULE_SETUP
+#line 304 "pars0lex.l"
+{
+ return(PARS_ELSIF_TOKEN);
+}
+ YY_BREAK
+case 32:
+YY_RULE_SETUP
+#line 308 "pars0lex.l"
+{
+ return(PARS_LOOP_TOKEN);
+}
+ YY_BREAK
+case 33:
+YY_RULE_SETUP
+#line 312 "pars0lex.l"
+{
+ return(PARS_WHILE_TOKEN);
+}
+ YY_BREAK
+case 34:
+YY_RULE_SETUP
+#line 316 "pars0lex.l"
+{
+ return(PARS_RETURN_TOKEN);
+}
+ YY_BREAK
+case 35:
+YY_RULE_SETUP
+#line 320 "pars0lex.l"
+{
+ return(PARS_SELECT_TOKEN);
+}
+ YY_BREAK
+case 36:
+YY_RULE_SETUP
+#line 324 "pars0lex.l"
+{
+ return(PARS_SUM_TOKEN);
+}
+ YY_BREAK
+case 37:
+YY_RULE_SETUP
+#line 328 "pars0lex.l"
+{
+ return(PARS_COUNT_TOKEN);
+}
+ YY_BREAK
+case 38:
+YY_RULE_SETUP
+#line 332 "pars0lex.l"
+{
+ return(PARS_DISTINCT_TOKEN);
+}
+ YY_BREAK
+case 39:
+YY_RULE_SETUP
+#line 336 "pars0lex.l"
+{
+ return(PARS_FROM_TOKEN);
+}
+ YY_BREAK
+case 40:
+YY_RULE_SETUP
+#line 340 "pars0lex.l"
+{
+ return(PARS_WHERE_TOKEN);
+}
+ YY_BREAK
+case 41:
+YY_RULE_SETUP
+#line 344 "pars0lex.l"
+{
+ return(PARS_FOR_TOKEN);
+}
+ YY_BREAK
+case 42:
+YY_RULE_SETUP
+#line 348 "pars0lex.l"
+{
+ return(PARS_READ_TOKEN);
+}
+ YY_BREAK
+case 43:
+YY_RULE_SETUP
+#line 352 "pars0lex.l"
+{
+ return(PARS_ORDER_TOKEN);
+}
+ YY_BREAK
+case 44:
+YY_RULE_SETUP
+#line 356 "pars0lex.l"
+{
+ return(PARS_BY_TOKEN);
+}
+ YY_BREAK
+case 45:
+YY_RULE_SETUP
+#line 360 "pars0lex.l"
+{
+ return(PARS_ASC_TOKEN);
+}
+ YY_BREAK
+case 46:
+YY_RULE_SETUP
+#line 364 "pars0lex.l"
+{
+ return(PARS_DESC_TOKEN);
+}
+ YY_BREAK
+case 47:
+YY_RULE_SETUP
+#line 368 "pars0lex.l"
+{
+ return(PARS_INSERT_TOKEN);
+}
+ YY_BREAK
+case 48:
+YY_RULE_SETUP
+#line 372 "pars0lex.l"
+{
+ return(PARS_INTO_TOKEN);
+}
+ YY_BREAK
+case 49:
+YY_RULE_SETUP
+#line 376 "pars0lex.l"
+{
+ return(PARS_VALUES_TOKEN);
+}
+ YY_BREAK
+case 50:
+YY_RULE_SETUP
+#line 380 "pars0lex.l"
+{
+ return(PARS_UPDATE_TOKEN);
+}
+ YY_BREAK
+case 51:
+YY_RULE_SETUP
+#line 384 "pars0lex.l"
+{
+ return(PARS_SET_TOKEN);
+}
+ YY_BREAK
+case 52:
+YY_RULE_SETUP
+#line 388 "pars0lex.l"
+{
+ return(PARS_DELETE_TOKEN);
+}
+ YY_BREAK
+case 53:
+YY_RULE_SETUP
+#line 392 "pars0lex.l"
+{
+ return(PARS_CURRENT_TOKEN);
+}
+ YY_BREAK
+case 54:
+YY_RULE_SETUP
+#line 396 "pars0lex.l"
+{
+ return(PARS_OF_TOKEN);
+}
+ YY_BREAK
+case 55:
+YY_RULE_SETUP
+#line 400 "pars0lex.l"
+{
+ return(PARS_CREATE_TOKEN);
+}
+ YY_BREAK
+case 56:
+YY_RULE_SETUP
+#line 404 "pars0lex.l"
+{
+ return(PARS_TABLE_TOKEN);
+}
+ YY_BREAK
+case 57:
+YY_RULE_SETUP
+#line 408 "pars0lex.l"
+{
+ return(PARS_COMPACT_TOKEN);
+}
+ YY_BREAK
+case 58:
+YY_RULE_SETUP
+#line 412 "pars0lex.l"
+{
+ return(PARS_BLOCK_SIZE_TOKEN);
+}
+ YY_BREAK
+case 59:
+YY_RULE_SETUP
+#line 416 "pars0lex.l"
+{
+ return(PARS_INDEX_TOKEN);
+}
+ YY_BREAK
+case 60:
+YY_RULE_SETUP
+#line 420 "pars0lex.l"
+{
+ return(PARS_UNIQUE_TOKEN);
+}
+ YY_BREAK
+case 61:
+YY_RULE_SETUP
+#line 424 "pars0lex.l"
+{
+ return(PARS_CLUSTERED_TOKEN);
+}
+ YY_BREAK
+case 62:
+YY_RULE_SETUP
+#line 428 "pars0lex.l"
+{
+ return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+ YY_BREAK
+case 63:
+YY_RULE_SETUP
+#line 432 "pars0lex.l"
+{
+ return(PARS_ON_TOKEN);
+}
+ YY_BREAK
+case 64:
+YY_RULE_SETUP
+#line 436 "pars0lex.l"
+{
+ return(PARS_DECLARE_TOKEN);
+}
+ YY_BREAK
+case 65:
+YY_RULE_SETUP
+#line 440 "pars0lex.l"
+{
+ return(PARS_CURSOR_TOKEN);
+}
+ YY_BREAK
+case 66:
+YY_RULE_SETUP
+#line 444 "pars0lex.l"
+{
+ return(PARS_OPEN_TOKEN);
+}
+ YY_BREAK
+case 67:
+YY_RULE_SETUP
+#line 448 "pars0lex.l"
+{
+ return(PARS_FETCH_TOKEN);
+}
+ YY_BREAK
+case 68:
+YY_RULE_SETUP
+#line 452 "pars0lex.l"
+{
+ return(PARS_CLOSE_TOKEN);
+}
+ YY_BREAK
+case 69:
+YY_RULE_SETUP
+#line 456 "pars0lex.l"
+{
+ return(PARS_NOTFOUND_TOKEN);
+}
+ YY_BREAK
+case 70:
+YY_RULE_SETUP
+#line 460 "pars0lex.l"
+{
+ return(PARS_TO_CHAR_TOKEN);
+}
+ YY_BREAK
+case 71:
+YY_RULE_SETUP
+#line 464 "pars0lex.l"
+{
+ return(PARS_TO_NUMBER_TOKEN);
+}
+ YY_BREAK
+case 72:
+YY_RULE_SETUP
+#line 468 "pars0lex.l"
+{
+ return(PARS_TO_BINARY_TOKEN);
+}
+ YY_BREAK
+case 73:
+YY_RULE_SETUP
+#line 472 "pars0lex.l"
+{
+ return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+ YY_BREAK
+case 74:
+YY_RULE_SETUP
+#line 476 "pars0lex.l"
+{
+ return(PARS_SUBSTR_TOKEN);
+}
+ YY_BREAK
+case 75:
+YY_RULE_SETUP
+#line 480 "pars0lex.l"
+{
+ return(PARS_REPLSTR_TOKEN);
+}
+ YY_BREAK
+case 76:
+YY_RULE_SETUP
+#line 484 "pars0lex.l"
+{
+ return(PARS_CONCAT_TOKEN);
+}
+ YY_BREAK
+case 77:
+YY_RULE_SETUP
+#line 488 "pars0lex.l"
+{
+ return(PARS_INSTR_TOKEN);
+}
+ YY_BREAK
+case 78:
+YY_RULE_SETUP
+#line 492 "pars0lex.l"
+{
+ return(PARS_LENGTH_TOKEN);
+}
+ YY_BREAK
+case 79:
+YY_RULE_SETUP
+#line 496 "pars0lex.l"
+{
+ return(PARS_SYSDATE_TOKEN);
+}
+ YY_BREAK
+case 80:
+YY_RULE_SETUP
+#line 500 "pars0lex.l"
+{
+ return(PARS_PRINTF_TOKEN);
+}
+ YY_BREAK
+case 81:
+YY_RULE_SETUP
+#line 504 "pars0lex.l"
+{
+ return(PARS_ASSERT_TOKEN);
+}
+ YY_BREAK
+case 82:
+YY_RULE_SETUP
+#line 508 "pars0lex.l"
+{
+ return(PARS_RND_TOKEN);
+}
+ YY_BREAK
+case 83:
+YY_RULE_SETUP
+#line 512 "pars0lex.l"
+{
+ return(PARS_RND_STR_TOKEN);
+}
+ YY_BREAK
+case 84:
+YY_RULE_SETUP
+#line 516 "pars0lex.l"
+{
+ return(PARS_ROW_PRINTF_TOKEN);
+}
+ YY_BREAK
+case 85:
+YY_RULE_SETUP
+#line 520 "pars0lex.l"
+{
+ return(PARS_COMMIT_TOKEN);
+}
+ YY_BREAK
+case 86:
+YY_RULE_SETUP
+#line 524 "pars0lex.l"
+{
+ return(PARS_ROLLBACK_TOKEN);
+}
+ YY_BREAK
+case 87:
+YY_RULE_SETUP
+#line 528 "pars0lex.l"
+{
+ return(PARS_WORK_TOKEN);
+}
+ YY_BREAK
+case 88:
+YY_RULE_SETUP
+#line 532 "pars0lex.l"
+{
+ return(PARS_UNSIGNED_TOKEN);
+}
+ YY_BREAK
+case 89:
+YY_RULE_SETUP
+#line 536 "pars0lex.l"
+{
+ return(PARS_EXIT_TOKEN);
+}
+ YY_BREAK
+case 90:
+YY_RULE_SETUP
+#line 540 "pars0lex.l"
+{
+ return(PARS_FUNCTION_TOKEN);
+}
+ YY_BREAK
+case 91:
+YY_RULE_SETUP
+#line 544 "pars0lex.l"
+{
+ return(PARS_LOCK_TOKEN);
+}
+ YY_BREAK
+case 92:
+YY_RULE_SETUP
+#line 548 "pars0lex.l"
+{
+ return(PARS_SHARE_TOKEN);
+}
+ YY_BREAK
+case 93:
+YY_RULE_SETUP
+#line 552 "pars0lex.l"
+{
+ return(PARS_MODE_TOKEN);
+}
+ YY_BREAK
+case 94:
+YY_RULE_SETUP
+#line 556 "pars0lex.l"
+{
+ return(PARS_LIKE_TOKEN);
+}
+ YY_BREAK
+case 95:
+YY_RULE_SETUP
+#line 560 "pars0lex.l"
+{
+ return(PARS_BIGINT_TOKEN);
+}
+ YY_BREAK
+case 96:
+YY_RULE_SETUP
+#line 564 "pars0lex.l"
+{
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ ut_strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 97:
+YY_RULE_SETUP
+#line 571 "pars0lex.l"
+{
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ ut_strlen(yytext));
+ return(PARS_TABLE_NAME_TOKEN);
+}
+ YY_BREAK
+case 98:
+YY_RULE_SETUP
+#line 578 "pars0lex.l"
+{
+ return(PARS_DDOT_TOKEN);
+}
+ YY_BREAK
+case 99:
+YY_RULE_SETUP
+#line 582 "pars0lex.l"
+{
+ return(PARS_ASSIGN_TOKEN);
+}
+ YY_BREAK
+case 100:
+YY_RULE_SETUP
+#line 586 "pars0lex.l"
+{
+ return(PARS_LE_TOKEN);
+}
+ YY_BREAK
+case 101:
+YY_RULE_SETUP
+#line 590 "pars0lex.l"
+{
+ return(PARS_GE_TOKEN);
+}
+ YY_BREAK
+case 102:
+YY_RULE_SETUP
+#line 594 "pars0lex.l"
+{
+ return(PARS_NE_TOKEN);
+}
+ YY_BREAK
+case 103:
+YY_RULE_SETUP
+#line 598 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 104:
+YY_RULE_SETUP
+#line 603 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 105:
+YY_RULE_SETUP
+#line 608 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 106:
+YY_RULE_SETUP
+#line 613 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 107:
+YY_RULE_SETUP
+#line 618 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 108:
+YY_RULE_SETUP
+#line 623 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 109:
+YY_RULE_SETUP
+#line 628 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 110:
+YY_RULE_SETUP
+#line 633 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 111:
+YY_RULE_SETUP
+#line 638 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 112:
+YY_RULE_SETUP
+#line 643 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 113:
+YY_RULE_SETUP
+#line 648 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 114:
+YY_RULE_SETUP
+#line 653 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 115:
+YY_RULE_SETUP
+#line 658 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 116:
+YY_RULE_SETUP
+#line 663 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 117:
+YY_RULE_SETUP
+#line 668 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 118:
+YY_RULE_SETUP
+#line 673 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+ YY_BREAK
+case 119:
+/* rule 119 can match eol */
+YY_RULE_SETUP
+#line 675 "pars0lex.l"
+
+ YY_BREAK
+case 120:
+/* rule 120 can match eol */
+YY_RULE_SETUP
+#line 676 "pars0lex.l"
+
+ YY_BREAK
+case 121:
+YY_RULE_SETUP
+#line 677 "pars0lex.l"
+BEGIN(INITIAL);
+ YY_BREAK
+case 122:
+/* rule 122 can match eol */
+YY_RULE_SETUP
+#line 679 "pars0lex.l"
+/* eat up whitespace */
+ YY_BREAK
+case 123:
+YY_RULE_SETUP
+#line 682 "pars0lex.l"
+{
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+ YY_BREAK
+case 124:
+YY_RULE_SETUP
+#line 691 "pars0lex.l"
+YY_FATAL_ERROR( "flex scanner jammed" );
+ YY_BREAK
+#line 2237 "lexyy.cc"
+case YY_STATE_EOF(INITIAL):
+case YY_STATE_EOF(comment):
+case YY_STATE_EOF(quoted):
+case YY_STATE_EOF(id):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = (yy_hold_char);
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * yylex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state );
+
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++(yy_c_buf_p);
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ (yy_did_buffer_switch_on_eof) = 0;
+
+ if ( yywrap( ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) =
+ (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ (yy_c_buf_p) =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+} /* end of yylex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - an EOF is pending; the text matched so far must be processed first
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (void)
+{
+ register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ register char *source = (yytext_ptr);
+ register int number_to_move, i;
+ int ret_val;
+
+ if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1;
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0;
+
+ else
+ {
+ int num_to_read = static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1);
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+ int yy_c_buf_p_offset =
+ (int) ((yy_c_buf_p) - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
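+				/* Double the buffer size; if doubling
+				overflows int (new_size <= 0), fall back
+				to growing by one eighth. */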
+ int new_size = static_cast<int>(b->yy_buf_size * 2);
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+ /* Include room in for 2 EOB chars. */
+ yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = 0;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = static_cast<int>(
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size
+ - number_to_move - 1);
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ (yy_n_chars), (size_t) num_to_read );
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ if ( (yy_n_chars) == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ yyrestart(yyin );
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) {
+ /* Extend the array by 50%, plus the number we really need. */
+ yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1);
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size );
+ if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" );
+ }
+
+ (yy_n_chars) += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
+
+ (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ yy_state_type yy_get_previous_state (void)
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp;
+
+ yy_current_state = (yy_start);
+
+ for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
+ {
+ register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 425 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state )
+{
+ register int yy_is_jam;
+ register char *yy_cp = (yy_c_buf_p);
+
+ register YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 425 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ yy_is_jam = (yy_current_state == 424);
+
+ return yy_is_jam ? 0 : yy_current_state;
+}
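+
+/* Note: the literal 1 above is the yy_ec[] equivalence class that flex
+reserves for the NUL character; yy_get_previous_state() maps a '\0' in
+the buffer to the same class. */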
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (void)
+#else
+ static int input (void)
+#endif
+
+{
+ int c;
+
+ *(yy_c_buf_p) = (yy_hold_char);
+
+ if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ /* This was really a NUL. */
+ *(yy_c_buf_p) = '\0';
+
+ else
+ { /* need more input */
+ int offset = (int)((yy_c_buf_p) - (yytext_ptr));
+ ++(yy_c_buf_p);
+
+ switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+ /* This happens because yy_g_n_b()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ yyrestart(yyin );
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( yywrap( ) )
+ return EOF;
+
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput();
+#else
+ return input();
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) = (yytext_ptr) + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */
+ *(yy_c_buf_p) = '\0'; /* preserve yytext */
+ (yy_hold_char) = *++(yy_c_buf_p);
+
+ return c;
+}
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ *
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+ void yyrestart (FILE * input_file )
+{
+
+ if ( ! YY_CURRENT_BUFFER ){
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer(yyin,YY_BUF_SIZE );
+ }
+
+ yy_init_buffer(YY_CURRENT_BUFFER,input_file );
+ yy_load_buffer_state( );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ *
+ */
+ __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer )
+{
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * yypop_buffer_state();
+ * yypush_buffer_state(new_buffer);
+ */
+ yyensure_buffer_stack ();
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ yy_load_buffer_state( );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (yywrap()) processing, but the only time this flag
+ * is looked at is after yywrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+static void yy_load_buffer_state (void)
+{
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ (yy_hold_char) = *(yy_c_buf_p);
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ *
+ * @return the allocated buffer state.
+ */
+ static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size )
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ yy_init_buffer(b,file );
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ *
+ */
+ void yy_delete_buffer (YY_BUFFER_STATE b )
+{
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ yyfree((void *) b->yy_ch_buf );
+
+ yyfree((void *) b );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+ static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file )
+
+{
+ int oerrno = errno;
+
+ yy_flush_buffer(b );
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then yy_init_buffer was _probably_
+ * called from yyrestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ *
+ */
+ void yy_flush_buffer (YY_BUFFER_STATE b )
+{
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ yy_load_buffer_state( );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ *
+ */
+void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+{
+ if (new_buffer == NULL)
+ return;
+
+ yyensure_buffer_stack();
+
+ /* This block is copied from yy_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ (yy_buffer_stack_top)++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from yy_switch_to_buffer. */
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ *
+ */
+void yypop_buffer_state (void)
+{
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ yy_delete_buffer(YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if ((yy_buffer_stack_top) > 0)
+ --(yy_buffer_stack_top);
+
+ if (YY_CURRENT_BUFFER) {
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+ }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void yyensure_buffer_stack (void)
+{
+ int num_to_alloc;
+
+ if (!(yy_buffer_stack)) {
+
+		/* The first allocation is for a single element, since we
+		 * don't know if this scanner will even need a stack; the
+		 * grow path below reallocates before any later push.
+		 */
+ num_to_alloc = 1;
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+ if ( ! (yy_buffer_stack) )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ (yy_buffer_stack_max) = num_to_alloc;
+ (yy_buffer_stack_top) = 0;
+ return;
+ }
+
+ if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ int grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = static_cast<int>(
+ (yy_buffer_stack_max) + grow_size);
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+ ((yy_buffer_stack),
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+ if ( ! (yy_buffer_stack) )
+ YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+ /* zero only the new slots.*/
+ memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
+ (yy_buffer_stack_max) = num_to_alloc;
+ }
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg )
+{
+ (void) fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = (yy_hold_char); \
+ (yy_c_buf_p) = yytext + yyless_macro_arg; \
+ (yy_hold_char) = *(yy_c_buf_p); \
+ *(yy_c_buf_p) = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the current line number.
+ *
+ */
+int yyget_lineno (void)
+{
+
+ return yylineno;
+}
+
+/** Get the input stream.
+ *
+ */
+FILE *yyget_in (void)
+{
+ return yyin;
+}
+
+/** Get the output stream.
+ *
+ */
+FILE *yyget_out (void)
+{
+ return yyout;
+}
+
+/** Get the length of the current token.
+ *
+ */
+yy_size_t yyget_leng (void)
+{
+ return yyleng;
+}
+
+/** Get the current token.
+ *
+ */
+
+char *yyget_text (void)
+{
+ return yytext;
+}
+
+/** Set the current line number.
+ * @param line_number
+ *
+ */
+void yyset_lineno (int line_number )
+{
+
+ yylineno = line_number;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ *
+ * @see yy_switch_to_buffer
+ */
+void yyset_in (FILE * in_str )
+{
+ yyin = in_str ;
+}
+
+void yyset_out (FILE * out_str )
+{
+ yyout = out_str ;
+}
+
+int yyget_debug (void)
+{
+ return yy_flex_debug;
+}
+
+void yyset_debug (int bdebug )
+{
+ yy_flex_debug = bdebug ;
+}
+
+static int yy_init_globals (void)
+{
+ /* Initialization is the same as for the non-reentrant scanner.
+ * This function is called from yylex_destroy(), so don't allocate here.
+ */
+
+ (yy_buffer_stack) = 0;
+ (yy_buffer_stack_top) = 0;
+ (yy_buffer_stack_max) = 0;
+ (yy_c_buf_p) = (char *) 0;
+ (yy_init) = 0;
+ (yy_start) = 0;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+ yyin = stdin;
+ yyout = stdout;
+#else
+ yyin = (FILE *) 0;
+ yyout = (FILE *) 0;
+#endif
+
+ /* For future reference: Set errno on error, since we are called by
+ * yylex_init()
+ */
+ return 0;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. */
+__attribute__((unused)) static int yylex_destroy (void)
+{
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ yy_delete_buffer(YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ yypop_buffer_state();
+ }
+
+ /* Destroy the stack itself. */
+ yyfree((yy_buffer_stack) );
+ (yy_buffer_stack) = NULL;
+
+ /* Reset the globals. This is important in a non-reentrant scanner so the next time
+ * yylex() is called, initialization will occur. */
+ yy_init_globals( );
+
+ return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
+{
+ register int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s )
+{
+ register int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+void *yyalloc (yy_size_t size )
+{
+ return (void *) malloc( size );
+}
+
+void *yyrealloc (void * ptr, yy_size_t size )
+{
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return (void *) realloc( (char *) ptr, size );
+}
+
+void yyfree (void * ptr )
+{
+ free( (char*) ptr ); /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 691 "pars0lex.l"
+
+
+
+/**********************************************************************
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ yylex_destroy();
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
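+
+/* Usage note (illustrative): call this only when no further yylex()
+calls are expected; yylex_destroy() resets the scanner globals via
+yy_init_globals(), so a later yylex() call would reinitialize from
+scratch. */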
+
diff --git a/storage/innobase/pars/make_bison.sh b/storage/innobase/pars/make_bison.sh
new file mode 100755
index 00000000000..2618be102bc
--- /dev/null
+++ b/storage/innobase/pars/make_bison.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+#
+# generate parser files from bison input files.
+
+set -eu
+TMPFILE=pars0grm.tab.c
+OUTFILE=pars0grm.cc
+
+bison -d pars0grm.y
+mv pars0grm.tab.h ../include/pars0grm.h
+
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/;
+s/\(\(YYSTYPE\|int\) yy\(lval\|parse\)\)/UNIV_INTERN \1/;
+' < "$TMPFILE" > "$OUTFILE"
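+
+# The rules above give bison's yychar/yynerrs definitions static
+# linkage and mark yylval/yyparse UNIV_INTERN, so only the intended
+# entry points keep external linkage.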
+
+rm "$TMPFILE"
diff --git a/storage/innobase/pars/make_flex.sh b/storage/innobase/pars/make_flex.sh
new file mode 100755
index 00000000000..581fc2342aa
--- /dev/null
+++ b/storage/innobase/pars/make_flex.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+#
+# generate lexer files from flex input files.
+
+set -eu
+
+TMPFILE=_flex_tmp.cc
+OUTFILE=lexyy.cc
+
+flex -o $TMPFILE pars0lex.l
+
+# AIX needs its includes done in a certain order, so include "univ.i" first
+# to be sure we get it right.
+echo '#include "univ.i"' > $OUTFILE
+
+# flex assigns a pointer to an int in one place without a cast, resulting in
+# a warning on Win64. Add the cast. Also define some symbols as static.
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/;
+s/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/;
+s/\(void yy_switch_to_buffer\)/__attribute__((unused)) static \1/;
+s/\(void yy\(push\|pop\)_buffer_state\)/__attribute__((unused)) static \1/;
+s/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/;
+s/\(\(int\|void\) yy[gs]et_\)/__attribute__((unused)) static \1/;
+s/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/;
+s/\(extern \)\?\(int yy\(leng\|lineno\|_flex_debug\)\)/static \2/;
+s/\(int yylex_destroy\)/__attribute__((unused)) static \1/;
+s/\(extern \)\?\(int yylex \)/UNIV_INTERN \2/;
+s/^\(\(FILE\|char\) *\* *yyget\)/__attribute__((unused)) static \1/;
+s/^\(extern \)\?\(\(FILE\|char\) *\* *yy\)/static \2/;
+' < $TMPFILE >> $OUTFILE
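+
+# For example, the yy_switch_to_buffer rule above turns flex's
+#   void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer )
+# into
+#   __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer )
+# as can be seen in the generated lexyy.cc.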
+
+rm $TMPFILE
diff --git a/storage/innobase/pars/pars0grm.cc b/storage/innobase/pars/pars0grm.cc
new file mode 100644
index 00000000000..b360f36e597
--- /dev/null
+++ b/storage/innobase/pars/pars0grm.cc
@@ -0,0 +1,3034 @@
+/* A Bison parser, made by GNU Bison 2.3. */
+
+/* Skeleton implementation for Bison's Yacc-like parsers in C
+
+ Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA. */
+
+/* As a special exception, you may create a larger work that contains
+ part or all of the Bison parser skeleton and distribute that work
+ under terms of your choice, so long as that work isn't itself a
+ parser generator using the skeleton or a modified version thereof
+ as a parser skeleton. Alternatively, if you modify or redistribute
+ the parser skeleton itself, you may (at your option) remove this
+ special exception, which will cause the skeleton and the resulting
+ Bison output files to be licensed under the GNU General Public
+ License without this special exception.
+
+ This special exception was added by the Free Software Foundation in
+ version 2.2 of Bison. */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+ simplifying the original so-called "semantic" parser. */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+ infringing on user name space. This should be done even for local
+ variables, as they might otherwise be expanded by user macros.
+ There are some unavoidable exceptions within include files to
+ define necessary library symbols; they are noted "INFRINGES ON
+ USER NAME SPACE" below. */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Bison version. */
+#define YYBISON_VERSION "2.3"
+
+/* Skeleton name. */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers. */
+#define YYPURE 0
+
+/* Using locations. */
+#define YYLSP_NEEDED 0
+
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_FIXBINARY_LIT = 261,
+ PARS_BLOB_LIT = 262,
+ PARS_NULL_LIT = 263,
+ PARS_ID_TOKEN = 264,
+ PARS_AND_TOKEN = 265,
+ PARS_OR_TOKEN = 266,
+ PARS_NOT_TOKEN = 267,
+ PARS_GE_TOKEN = 268,
+ PARS_LE_TOKEN = 269,
+ PARS_NE_TOKEN = 270,
+ PARS_PROCEDURE_TOKEN = 271,
+ PARS_IN_TOKEN = 272,
+ PARS_OUT_TOKEN = 273,
+ PARS_BINARY_TOKEN = 274,
+ PARS_BLOB_TOKEN = 275,
+ PARS_INT_TOKEN = 276,
+ PARS_INTEGER_TOKEN = 277,
+ PARS_FLOAT_TOKEN = 278,
+ PARS_CHAR_TOKEN = 279,
+ PARS_IS_TOKEN = 280,
+ PARS_BEGIN_TOKEN = 281,
+ PARS_END_TOKEN = 282,
+ PARS_IF_TOKEN = 283,
+ PARS_THEN_TOKEN = 284,
+ PARS_ELSE_TOKEN = 285,
+ PARS_ELSIF_TOKEN = 286,
+ PARS_LOOP_TOKEN = 287,
+ PARS_WHILE_TOKEN = 288,
+ PARS_RETURN_TOKEN = 289,
+ PARS_SELECT_TOKEN = 290,
+ PARS_SUM_TOKEN = 291,
+ PARS_COUNT_TOKEN = 292,
+ PARS_DISTINCT_TOKEN = 293,
+ PARS_FROM_TOKEN = 294,
+ PARS_WHERE_TOKEN = 295,
+ PARS_FOR_TOKEN = 296,
+ PARS_DDOT_TOKEN = 297,
+ PARS_READ_TOKEN = 298,
+ PARS_ORDER_TOKEN = 299,
+ PARS_BY_TOKEN = 300,
+ PARS_ASC_TOKEN = 301,
+ PARS_DESC_TOKEN = 302,
+ PARS_INSERT_TOKEN = 303,
+ PARS_INTO_TOKEN = 304,
+ PARS_VALUES_TOKEN = 305,
+ PARS_UPDATE_TOKEN = 306,
+ PARS_SET_TOKEN = 307,
+ PARS_DELETE_TOKEN = 308,
+ PARS_CURRENT_TOKEN = 309,
+ PARS_OF_TOKEN = 310,
+ PARS_CREATE_TOKEN = 311,
+ PARS_TABLE_TOKEN = 312,
+ PARS_INDEX_TOKEN = 313,
+ PARS_UNIQUE_TOKEN = 314,
+ PARS_CLUSTERED_TOKEN = 315,
+ PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+ PARS_ON_TOKEN = 317,
+ PARS_ASSIGN_TOKEN = 318,
+ PARS_DECLARE_TOKEN = 319,
+ PARS_CURSOR_TOKEN = 320,
+ PARS_SQL_TOKEN = 321,
+ PARS_OPEN_TOKEN = 322,
+ PARS_FETCH_TOKEN = 323,
+ PARS_CLOSE_TOKEN = 324,
+ PARS_NOTFOUND_TOKEN = 325,
+ PARS_TO_CHAR_TOKEN = 326,
+ PARS_TO_NUMBER_TOKEN = 327,
+ PARS_TO_BINARY_TOKEN = 328,
+ PARS_BINARY_TO_NUMBER_TOKEN = 329,
+ PARS_SUBSTR_TOKEN = 330,
+ PARS_REPLSTR_TOKEN = 331,
+ PARS_CONCAT_TOKEN = 332,
+ PARS_INSTR_TOKEN = 333,
+ PARS_LENGTH_TOKEN = 334,
+ PARS_SYSDATE_TOKEN = 335,
+ PARS_PRINTF_TOKEN = 336,
+ PARS_ASSERT_TOKEN = 337,
+ PARS_RND_TOKEN = 338,
+ PARS_RND_STR_TOKEN = 339,
+ PARS_ROW_PRINTF_TOKEN = 340,
+ PARS_COMMIT_TOKEN = 341,
+ PARS_ROLLBACK_TOKEN = 342,
+ PARS_WORK_TOKEN = 343,
+ PARS_UNSIGNED_TOKEN = 344,
+ PARS_EXIT_TOKEN = 345,
+ PARS_FUNCTION_TOKEN = 346,
+ PARS_LOCK_TOKEN = 347,
+ PARS_SHARE_TOKEN = 348,
+ PARS_MODE_TOKEN = 349,
+ PARS_LIKE_TOKEN = 350,
+ PARS_LIKE_TOKEN_EXACT = 351,
+ PARS_LIKE_TOKEN_PREFIX = 352,
+ PARS_LIKE_TOKEN_SUFFIX = 353,
+ PARS_LIKE_TOKEN_SUBSTR = 354,
+ PARS_TABLE_NAME_TOKEN = 355,
+ PARS_COMPACT_TOKEN = 356,
+ PARS_BLOCK_SIZE_TOKEN = 357,
+ PARS_BIGINT_TOKEN = 358,
+ NEG = 359
+ };
+#endif
+/* Tokens. */
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define PARS_LIKE_TOKEN 350
+#define PARS_LIKE_TOKEN_EXACT 351
+#define PARS_LIKE_TOKEN_PREFIX 352
+#define PARS_LIKE_TOKEN_SUFFIX 353
+#define PARS_LIKE_TOKEN_SUBSTR 354
+#define PARS_TABLE_NAME_TOKEN 355
+#define PARS_COMPACT_TOKEN 356
+#define PARS_BLOCK_SIZE_TOKEN 357
+#define PARS_BIGINT_TOKEN 358
+#define NEG 359
+
+
+
+
+/* Copy the first part of user declarations. */
+#line 28 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node,
+que_node_t */
+
+#include "univ.i"
+#include <math.h> /* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
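+
+/* Because YYSTYPE is que_node_t*, every $$ and $n in the grammar actions
+below is a pointer into the query graph. A minimal sketch of what an
+action for a rule such as exp: exp '+' exp builds, using pars_op() from
+pars0pars.h (left_exp and right_exp stand in for the $1 and $3 operand
+nodes):
+
+	que_node_t*	sum_node = pars_op('+', left_exp, right_exp);
+*/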
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table. */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations. */
+
+
+/* Line 216 of yacc.c. */
+#line 334 "pars0grm.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+# define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+# define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# else
+# define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if defined YYENABLE_NLS && YYENABLE_NLS
+# if ENABLE_NLS
+# include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+# define YY_(msgid) dgettext ("bison-runtime", msgid)
+# endif
+# endif
+# ifndef YY_
+# define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E. */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions. */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int i)
+#else
+static int
+YYID (i)
+ int i;
+#endif
+{
+ return i;
+}
+#endif
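+
+/* Usage sketch: wrapping the constant in YYID keeps compilers from
+   warning about the constant condition in the do/while(0) idiom used
+   by the macros below, e.g. a hypothetical cleanup macro:
+
+	#define RELEASE(p) do { free (p); } while (YYID (0))
+*/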
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# elif defined __BUILTIN_VA_ARG_INCR
+# include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+# elif defined _AIX
+# define YYSTACK_ALLOC __alloca
+# elif defined _MSC_VER
+# include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+# define alloca _alloca
+# else
+# define YYSTACK_ALLOC alloca
+# if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef _STDLIB_H
+# define _STDLIB_H 1
+# endif
+# endif
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's `empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+# ifndef YYSTACK_ALLOC_MAXIMUM
+ /* The OS might guarantee only one guard page at the bottom of the stack,
+ and a page size can be as small as 4096 bytes. So we cannot safely
+ invoke alloca (N) if N exceeds 4096. Use a slightly smaller number
+ to allow for a few compiler-allocated temporary stack slots. */
+# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+# endif
+# else
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# ifndef YYSTACK_ALLOC_MAXIMUM
+# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+# endif
+# if (defined __cplusplus && ! defined _STDLIB_H \
+ && ! ((defined YYMALLOC || defined malloc) \
+ && (defined YYFREE || defined free)))
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# ifndef _STDLIB_H
+# define _STDLIB_H 1
+# endif
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# ifndef YYFREE
+# define YYFREE free
+# if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+void free (void*); /* INFRINGES ON USER NAME SPACE */
+# endif
+# endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
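+
+/* Net effect of the block above: YYSTACK_ALLOC/YYSTACK_FREE resolve to
+   alloca (with a no-op free) when that is known to be safe, and
+   otherwise to YYMALLOC/YYFREE, which default to malloc/free. */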
+
+
+#if (! defined yyoverflow \
+ && (! defined __cplusplus \
+ || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ yytype_int16 yyss;
+ YYSTYPE yyvs;
+};
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements. */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
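+
+/* Worked example (the sizes are platform assumptions, not guarantees):
+   on an LP64 build, yytype_int16 is 2 bytes and YYSTYPE (que_node_t*)
+   is 8 bytes, so sizeof (union yyalloc) is 8 and
+   YYSTACK_BYTES (YYINITDEPTH) = 200 * (2 + 8) + 7 = 2007 bytes for the
+   initial stacks (YYINITDEPTH is defined as 200 below). */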
+
+/* Copy COUNT objects from FROM to TO. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined __GNUC__ && 1 < __GNUC__
+# define YYCOPY(To, From, Count) \
+ __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+# else
+# define YYCOPY(To, From, Count) \
+ do \
+ { \
+ YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (To)[yyi] = (From)[yyi]; \
+ } \
+ while (YYID (0))
+# endif
+# endif
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack, Stack, yysize); \
+ Stack = &yyptr->Stack; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (YYID (0))
+
+#endif
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 5
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 816
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 120
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 73
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 183
+/* YYNRULES -- Number of states. */
+#define YYNSTATES 350
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 359
+
+#define YYTRANSLATE(YYX) \
+ ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */
+static const yytype_uint8 yytranslate[] =
+{
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 112, 2, 2,
+ 114, 115, 109, 108, 117, 107, 2, 110, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 113,
+ 105, 104, 106, 116, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 118, 2, 119, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99, 100, 101, 102, 103, 111
+};
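+
+/* Worked example of the mapping above: yylex() returns external token
+   numbers, e.g. PARS_INT_LIT == 258, and YYTRANSLATE (258) yields
+   yytranslate[258] == 3, the internal symbol number whose yytname[]
+   entry below is "PARS_INT_LIT"; for single-character tokens,
+   YYTRANSLATE (';') == yytranslate[59] == 113. */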
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+ YYRHS. */
+static const yytype_uint16 yyprhs[] =
+{
+ 0, 0, 3, 6, 8, 11, 14, 17, 20, 23,
+ 26, 29, 32, 35, 38, 41, 44, 47, 50, 53,
+ 56, 59, 62, 65, 68, 71, 73, 76, 78, 83,
+ 85, 87, 89, 91, 93, 95, 97, 101, 105, 109,
+ 113, 116, 120, 124, 128, 132, 136, 140, 144, 148,
+ 152, 156, 159, 163, 167, 169, 171, 173, 175, 177,
+ 179, 181, 183, 185, 187, 189, 190, 192, 196, 203,
+ 208, 210, 212, 214, 218, 220, 224, 225, 227, 231,
+ 232, 234, 238, 240, 245, 251, 256, 257, 259, 263,
+ 265, 269, 271, 272, 275, 276, 279, 280, 285, 286,
+ 288, 290, 291, 296, 305, 309, 315, 318, 322, 324,
+ 328, 333, 338, 341, 344, 348, 351, 354, 357, 361,
+ 366, 368, 371, 372, 375, 377, 385, 392, 403, 405,
+ 407, 410, 413, 418, 423, 429, 431, 435, 436, 440,
+ 441, 443, 444, 447, 448, 450, 451, 453, 454, 458,
+ 468, 470, 474, 475, 477, 478, 480, 491, 493, 495,
+ 498, 501, 503, 505, 507, 509, 511, 513, 517, 521,
+ 522, 524, 528, 532, 533, 535, 538, 545, 550, 552,
+ 554, 555, 557, 560
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS. */
+static const yytype_int16 yyrhs[] =
+{
+ 121, 0, -1, 192, 113, -1, 127, -1, 128, 113,
+ -1, 160, 113, -1, 161, 113, -1, 162, 113, -1,
+ 159, 113, -1, 163, 113, -1, 155, 113, -1, 142,
+ 113, -1, 144, 113, -1, 154, 113, -1, 152, 113,
+ -1, 153, 113, -1, 149, 113, -1, 150, 113, -1,
+ 164, 113, -1, 166, 113, -1, 165, 113, -1, 181,
+ 113, -1, 182, 113, -1, 175, 113, -1, 179, 113,
+ -1, 122, -1, 123, 122, -1, 9, -1, 125, 114,
+ 133, 115, -1, 3, -1, 4, -1, 5, -1, 6,
+ -1, 7, -1, 8, -1, 66, -1, 124, 108, 124,
+ -1, 124, 107, 124, -1, 124, 109, 124, -1, 124,
+ 110, 124, -1, 107, 124, -1, 114, 124, 115, -1,
+ 124, 104, 124, -1, 124, 95, 5, -1, 124, 105,
+ 124, -1, 124, 106, 124, -1, 124, 13, 124, -1,
+ 124, 14, 124, -1, 124, 15, 124, -1, 124, 10,
+ 124, -1, 124, 11, 124, -1, 12, 124, -1, 9,
+ 112, 70, -1, 66, 112, 70, -1, 71, -1, 72,
+ -1, 73, -1, 74, -1, 75, -1, 77, -1, 78,
+ -1, 79, -1, 80, -1, 83, -1, 84, -1, -1,
+ 116, -1, 126, 117, 116, -1, 118, 9, 114, 126,
+ 115, 119, -1, 129, 114, 133, 115, -1, 76, -1,
+ 81, -1, 82, -1, 9, 114, 115, -1, 180, -1,
+ 131, 117, 180, -1, -1, 9, -1, 132, 117, 9,
+ -1, -1, 124, -1, 133, 117, 124, -1, 124, -1,
+ 37, 114, 109, 115, -1, 37, 114, 38, 9, 115,
+ -1, 36, 114, 124, 115, -1, -1, 134, -1, 135,
+ 117, 134, -1, 109, -1, 135, 49, 132, -1, 135,
+ -1, -1, 40, 124, -1, -1, 41, 51, -1, -1,
+ 92, 17, 93, 94, -1, -1, 46, -1, 47, -1,
+ -1, 44, 45, 9, 140, -1, 35, 136, 39, 131,
+ 137, 138, 139, 141, -1, 48, 49, 180, -1, 143,
+ 50, 114, 133, 115, -1, 143, 142, -1, 9, 104,
+ 124, -1, 145, -1, 146, 117, 145, -1, 40, 54,
+ 55, 9, -1, 51, 180, 52, 146, -1, 148, 137,
+ -1, 148, 147, -1, 53, 39, 180, -1, 151, 137,
+ -1, 151, 147, -1, 85, 142, -1, 9, 63, 124,
+ -1, 31, 124, 29, 123, -1, 156, -1, 157, 156,
+ -1, -1, 30, 123, -1, 157, -1, 28, 124, 29,
+ 123, 158, 27, 28, -1, 33, 124, 32, 123, 27,
+ 32, -1, 41, 9, 17, 124, 42, 124, 32, 123,
+ 27, 32, -1, 90, -1, 34, -1, 67, 9, -1,
+ 69, 9, -1, 68, 9, 49, 132, -1, 68, 9,
+ 49, 130, -1, 9, 183, 169, 170, 171, -1, 167,
+ -1, 168, 117, 167, -1, -1, 114, 3, 115, -1,
+ -1, 89, -1, -1, 12, 8, -1, -1, 61, -1,
+ -1, 101, -1, -1, 102, 104, 3, -1, 56, 57,
+ 180, 114, 168, 115, 172, 173, 174, -1, 9, -1,
+ 176, 117, 9, -1, -1, 59, -1, -1, 60, -1,
+ 56, 177, 178, 58, 9, 62, 180, 114, 176, 115,
+ -1, 9, -1, 100, -1, 86, 88, -1, 87, 88,
+ -1, 21, -1, 22, -1, 103, -1, 24, -1, 19,
+ -1, 20, -1, 9, 17, 183, -1, 9, 18, 183,
+ -1, -1, 184, -1, 185, 117, 184, -1, 9, 183,
+ 113, -1, -1, 186, -1, 187, 186, -1, 64, 65,
+ 9, 25, 142, 113, -1, 64, 91, 9, 113, -1,
+ 188, -1, 189, -1, -1, 190, -1, 191, 190, -1,
+ 16, 9, 114, 185, 115, 25, 187, 191, 26, 123,
+ 27, -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
+static const yytype_uint16 yyrline[] =
+{
+ 0, 162, 162, 165, 166, 167, 168, 169, 170, 171,
+ 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 190, 191, 196, 197, 199,
+ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 225, 230, 231, 232, 233, 235, 236,
+ 237, 238, 239, 240, 241, 244, 246, 247, 251, 257,
+ 262, 263, 264, 268, 272, 273, 278, 279, 280, 285,
+ 286, 287, 291, 292, 297, 303, 310, 311, 312, 317,
+ 319, 322, 326, 327, 331, 332, 337, 338, 343, 344,
+ 345, 349, 350, 357, 372, 377, 380, 388, 394, 395,
+ 400, 406, 415, 423, 431, 438, 446, 454, 460, 467,
+ 473, 474, 479, 480, 482, 486, 493, 499, 509, 513,
+ 517, 524, 531, 535, 543, 552, 553, 558, 559, 564,
+ 565, 571, 572, 578, 579, 585, 586, 591, 592, 597,
+ 608, 609, 614, 615, 619, 620, 624, 638, 639, 643,
+ 648, 653, 654, 655, 656, 657, 658, 662, 667, 675,
+ 676, 677, 682, 688, 690, 691, 695, 703, 709, 710,
+ 713, 715, 716, 720
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+ First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+ "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+ "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
+ "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
+ "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
+ "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
+ "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
+ "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
+ "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
+ "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
+ "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
+ "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
+ "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+ "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
+ "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
+ "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
+ "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
+ "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
+ "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
+ "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
+ "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+ "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+ "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+ "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
+ "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
+ "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
+ "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
+ "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
+ "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
+ "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+ "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN",
+ "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX",
+ "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR",
+ "PARS_TABLE_NAME_TOKEN", "PARS_COMPACT_TOKEN", "PARS_BLOCK_SIZE_TOKEN",
+ "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'", "'+'", "'*'", "'/'",
+ "NEG", "'%'", "';'", "'('", "')'", "'?'", "','", "'{'", "'}'", "$accept",
+ "top_statement", "statement", "statement_list", "exp", "function_name",
+ "question_mark_list", "stored_procedure_call",
+ "predefined_procedure_call", "predefined_procedure_name",
+ "user_function_call", "table_list", "variable_list", "exp_list",
+ "select_item", "select_item_list", "select_list", "search_condition",
+ "for_update_clause", "lock_shared_clause", "order_direction",
+ "order_by_clause", "select_statement", "insert_statement_start",
+ "insert_statement", "column_assignment", "column_assignment_list",
+ "cursor_positioned", "update_statement_start",
+ "update_statement_searched", "update_statement_positioned",
+ "delete_statement_start", "delete_statement_searched",
+ "delete_statement_positioned", "row_printf_statement",
+ "assignment_statement", "elsif_element", "elsif_list", "else_part",
+ "if_statement", "while_statement", "for_statement", "exit_statement",
+ "return_statement", "open_cursor_statement", "close_cursor_statement",
+ "fetch_statement", "column_def", "column_def_list", "opt_column_len",
+ "opt_unsigned", "opt_not_null", "not_fit_in_memory", "compact",
+ "block_size", "create_table", "column_list", "unique_def",
+ "clustered_def", "create_index", "table_name", "commit_statement",
+ "rollback_statement", "type_name", "parameter_declaration",
+ "parameter_declaration_list", "variable_declaration",
+ "variable_declaration_list", "cursor_declaration",
+ "function_declaration", "declaration", "declaration_list",
+ "procedure_definition", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+ token YYLEX-NUM. */
+static const yytype_uint16 yytoknum[] =
+{
+ 0, 256, 257, 258, 259, 260, 261, 262, 263, 264,
+ 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
+ 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
+ 295, 296, 297, 298, 299, 300, 301, 302, 303, 304,
+ 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
+ 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354,
+ 355, 356, 357, 358, 61, 60, 62, 45, 43, 42,
+ 47, 359, 37, 59, 40, 41, 63, 44, 123, 125
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of the symbol that rule YYN derives. */
+static const yytype_uint8 yyr1[] =
+{
+ 0, 120, 121, 122, 122, 122, 122, 122, 122, 122,
+ 122, 122, 122, 122, 122, 122, 122, 122, 122, 122,
+ 122, 122, 122, 122, 122, 123, 123, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124, 124, 124, 124, 125, 125, 125, 125, 125, 125,
+ 125, 125, 125, 125, 125, 126, 126, 126, 127, 128,
+ 129, 129, 129, 130, 131, 131, 132, 132, 132, 133,
+ 133, 133, 134, 134, 134, 134, 135, 135, 135, 136,
+ 136, 136, 137, 137, 138, 138, 139, 139, 140, 140,
+ 140, 141, 141, 142, 143, 144, 144, 145, 146, 146,
+ 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
+ 157, 157, 158, 158, 158, 159, 160, 161, 162, 163,
+ 164, 165, 166, 166, 167, 168, 168, 169, 169, 170,
+ 170, 171, 171, 172, 172, 173, 173, 174, 174, 175,
+ 176, 176, 177, 177, 178, 178, 179, 180, 180, 181,
+ 182, 183, 183, 183, 183, 183, 183, 184, 184, 185,
+ 185, 185, 186, 187, 187, 187, 188, 189, 190, 190,
+ 191, 191, 191, 192
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */
+static const yytype_uint8 yyr2[] =
+{
+ 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 2, 1, 4, 1,
+ 1, 1, 1, 1, 1, 1, 3, 3, 3, 3,
+ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 2, 3, 3, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 0, 1, 3, 6, 4,
+ 1, 1, 1, 3, 1, 3, 0, 1, 3, 0,
+ 1, 3, 1, 4, 5, 4, 0, 1, 3, 1,
+ 3, 1, 0, 2, 0, 2, 0, 4, 0, 1,
+ 1, 0, 4, 8, 3, 5, 2, 3, 1, 3,
+ 4, 4, 2, 2, 3, 2, 2, 2, 3, 4,
+ 1, 2, 0, 2, 1, 7, 6, 10, 1, 1,
+ 2, 2, 4, 4, 5, 1, 3, 0, 3, 0,
+ 1, 0, 2, 0, 1, 0, 1, 0, 3, 9,
+ 1, 3, 0, 1, 0, 1, 10, 1, 1, 2,
+ 2, 1, 1, 1, 1, 1, 1, 3, 3, 0,
+ 1, 3, 3, 0, 1, 2, 6, 4, 1, 1,
+ 0, 1, 2, 11
+};
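+
+/* Example reading of these two tables: for rule 36 (the grammar rule at
+   pars0grm.y line 206, evidently exp: exp '+' exp), yyr2[36] == 3
+   symbols are popped on reduction and yyr1[36] == 124 ("exp") is the
+   nonterminal pushed in their place; the matching semantic action is
+   case 36 in yyparse() below. */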
+
+/* YYDEFACT[STATE-NUM] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do. Zero
+   means the default is an error. */
+static const yytype_uint8 yydefact[] =
+{
+ 0, 0, 0, 0, 0, 1, 2, 169, 0, 170,
+ 0, 0, 0, 0, 0, 165, 166, 161, 162, 164,
+ 163, 167, 168, 173, 171, 0, 174, 180, 0, 0,
+ 175, 178, 179, 181, 0, 172, 0, 0, 0, 182,
+ 0, 0, 0, 0, 0, 129, 86, 0, 0, 0,
+ 0, 152, 0, 0, 0, 70, 71, 72, 0, 0,
+ 0, 128, 0, 25, 0, 3, 0, 0, 0, 0,
+ 0, 92, 0, 0, 92, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 177, 0, 29, 30, 31, 32, 33, 34,
+ 27, 0, 35, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 0, 0, 0, 0, 0, 0,
+ 0, 89, 82, 87, 91, 0, 0, 0, 157, 158,
+ 0, 0, 0, 153, 154, 130, 0, 131, 117, 159,
+ 160, 0, 183, 26, 4, 79, 11, 0, 106, 12,
+ 0, 112, 113, 16, 17, 115, 116, 14, 15, 13,
+ 10, 8, 5, 6, 7, 9, 18, 20, 19, 23,
+ 24, 21, 22, 0, 118, 0, 51, 0, 40, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 79, 0, 0, 0, 76, 0,
+ 0, 0, 104, 0, 114, 0, 155, 0, 76, 65,
+ 80, 0, 79, 0, 93, 176, 52, 53, 41, 49,
+ 50, 46, 47, 48, 122, 43, 42, 44, 45, 37,
+ 36, 38, 39, 0, 0, 0, 0, 0, 77, 90,
+ 88, 92, 74, 0, 0, 108, 111, 0, 0, 77,
+ 133, 132, 66, 0, 69, 0, 0, 0, 0, 0,
+ 120, 124, 0, 28, 0, 85, 0, 83, 0, 0,
+ 0, 94, 0, 0, 0, 0, 135, 0, 0, 0,
+ 0, 0, 81, 105, 110, 123, 0, 121, 0, 126,
+ 84, 78, 75, 0, 96, 0, 107, 109, 137, 143,
+ 0, 0, 73, 68, 67, 0, 125, 95, 0, 101,
+ 0, 0, 139, 144, 145, 136, 0, 119, 0, 0,
+ 103, 0, 0, 140, 141, 146, 147, 0, 0, 0,
+ 0, 138, 0, 134, 0, 149, 150, 0, 97, 98,
+ 127, 142, 0, 156, 0, 99, 100, 102, 148, 151
+};
+
+/* YYDEFGOTO[NTERM-NUM]. */
+static const yytype_int16 yydefgoto[] =
+{
+ -1, 2, 63, 64, 210, 117, 253, 65, 66, 67,
+ 250, 241, 239, 211, 123, 124, 125, 151, 294, 309,
+ 347, 320, 68, 69, 70, 245, 246, 152, 71, 72,
+ 73, 74, 75, 76, 77, 78, 260, 261, 262, 79,
+ 80, 81, 82, 83, 84, 85, 86, 276, 277, 312,
+ 324, 333, 314, 326, 335, 87, 337, 134, 207, 88,
+ 130, 89, 90, 21, 9, 10, 26, 27, 31, 32,
+ 33, 34, 3
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+ STATE-NUM. */
+#define YYPACT_NINF -179
+static const yytype_int16 yypact[] =
+{
+ 24, 36, 58, -48, -25, -179, -179, 57, 31, -179,
+ -74, 14, 14, 50, 57, -179, -179, -179, -179, -179,
+ -179, -179, -179, 72, -179, 14, -179, 3, -26, -28,
+ -179, -179, -179, -179, 4, -179, 91, 95, 589, -179,
+ 80, -6, 43, 285, 285, -179, 19, 99, 69, -5,
+ 81, -13, 110, 112, 114, -179, -179, -179, 89, 37,
+ 41, -179, 122, -179, 406, -179, 25, 40, 44, -3,
+ 46, 116, 49, 51, 116, 52, 53, 54, 55, 56,
+ 59, 61, 62, 70, 73, 74, 75, 76, 77, 78,
+ 79, 89, -179, 285, -179, -179, -179, -179, -179, -179,
+ 82, 285, 83, -179, -179, -179, -179, -179, -179, -179,
+ -179, -179, -179, -179, 285, 285, 577, 92, 618, 94,
+ 97, -179, 706, -179, -33, 124, 153, -5, -179, -179,
+ 141, -5, -5, -179, 136, -179, 148, -179, -179, -179,
+ -179, 98, -179, -179, -179, 285, -179, 101, -179, -179,
+ 195, -179, -179, -179, -179, -179, -179, -179, -179, -179,
+ -179, -179, -179, -179, -179, -179, -179, -179, -179, -179,
+ -179, -179, -179, 100, 706, 135, 6, 154, -7, 206,
+ 285, 285, 285, 285, 285, 589, 218, 285, 285, 285,
+ 285, 285, 285, 285, 285, 589, 285, -27, 216, 173,
+ -5, 285, -179, 217, -179, 113, -179, 171, 221, 119,
+ 706, -56, 285, 185, 706, -179, -179, -179, -179, 6,
+ 6, 27, 27, 706, 345, -179, 27, 27, 27, 35,
+ 35, -7, -7, -53, 467, 223, 232, 127, -179, 126,
+ -179, -31, -179, 638, 151, -179, 142, 251, 253, 150,
+ -179, 126, -179, -46, -179, 285, -45, 256, 589, 285,
+ -179, 240, 249, -179, 245, -179, 166, -179, 273, 285,
+ -5, 242, 285, 285, 217, 14, -179, -39, 222, 170,
+ 167, 179, 706, -179, -179, 589, 679, -179, 268, -179,
+ -179, -179, -179, 247, 207, 686, 706, -179, 186, 243,
+ 251, -5, -179, -179, -179, 589, -179, -179, 286, 261,
+ 589, 303, 219, -179, 224, -179, 193, 589, 226, 272,
+ -179, 528, 205, -179, 310, -179, 233, 314, 230, 317,
+ 302, -179, 328, -179, 235, -179, -179, -38, -179, 7,
+ -179, -179, 334, -179, 331, -179, -179, -179, -179, -179
+};
+
+/* YYPGOTO[NTERM-NUM]. */
+static const yytype_int16 yypgoto[] =
+{
+ -179, -179, -63, -178, -41, -179, -179, -179, -179, -179,
+ -179, -179, 133, -155, 143, -179, -179, -68, -179, -179,
+ -179, -179, -40, -179, -179, 71, -179, 269, -179, -179,
+ -179, -179, -179, -179, -179, -179, 85, -179, -179, -179,
+ -179, -179, -179, -179, -179, -179, -179, 47, -179, -179,
+ -179, -179, -179, -179, -179, -179, -179, -179, -179, -179,
+ -117, -179, -179, -12, 330, -179, 321, -179, -179, -179,
+ 315, -179, -179
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If
+   positive, shift that token. If negative, reduce the rule whose
+   number is the opposite. If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error. */
+#define YYTABLE_NINF -1
+static const yytype_uint16 yytable[] =
+{
+ 22, 143, 116, 118, 128, 122, 155, 224, 184, 269,
+ 202, 236, 25, 28, 204, 205, 198, 234, 138, 182,
+ 183, 184, 94, 95, 96, 97, 98, 99, 100, 148,
+ 38, 101, 46, 15, 16, 17, 18, 36, 19, 233,
+ 1, 13, 184, 14, 132, 4, 133, 147, 11, 12,
+ 184, 173, 174, 345, 346, 119, 120, 256, 5, 254,
+ 176, 255, 263, 37, 255, 6, 8, 29, 29, 280,
+ 283, 281, 255, 178, 179, 23, 299, 343, 300, 344,
+ 285, 25, 237, 242, 199, 102, 270, 35, 186, 7,
+ 103, 104, 105, 106, 107, 129, 108, 109, 110, 111,
+ 40, 186, 112, 113, 41, 91, 93, 92, 126, 214,
+ 187, 188, 189, 190, 191, 192, 193, 20, 127, 135,
+ 131, 136, 186, 137, 46, 139, 114, 317, 121, 140,
+ 186, 141, 321, 115, 190, 191, 192, 193, 144, 219,
+ 220, 221, 222, 223, 192, 193, 226, 227, 228, 229,
+ 230, 231, 232, 292, 145, 235, 150, 146, 122, 149,
+ 243, 143, 153, 200, 154, 157, 158, 159, 160, 161,
+ 201, 143, 162, 271, 163, 164, 94, 95, 96, 97,
+ 98, 99, 100, 165, 316, 101, 166, 167, 168, 169,
+ 170, 171, 172, 203, 175, 177, 206, 208, 94, 95,
+ 96, 97, 98, 99, 100, 216, 194, 101, 196, 119,
+ 120, 197, 209, 215, 282, 212, 180, 181, 286, 182,
+ 183, 184, 143, 225, 217, 238, 244, 247, 214, 248,
+ 249, 295, 296, 180, 181, 252, 182, 183, 184, 102,
+ 257, 266, 267, 268, 103, 104, 105, 106, 107, 213,
+ 108, 109, 110, 111, 143, 273, 112, 113, 143, 274,
+ 275, 102, 278, 298, 279, 284, 103, 104, 105, 106,
+ 107, 259, 108, 109, 110, 111, 288, 289, 112, 113,
+ 114, 290, 291, 293, 301, 302, 303, 115, 94, 95,
+ 96, 97, 98, 99, 100, 304, 306, 101, 307, 308,
+ 311, 186, 114, 318, 313, 319, 322, 327, 323, 115,
+ 187, 188, 189, 190, 191, 192, 193, 329, 186, 328,
+ 331, 218, 332, 336, 338, 325, 339, 187, 188, 189,
+ 190, 191, 192, 193, 340, 334, 341, 348, 265, 342,
+ 349, 251, 240, 156, 24, 297, 287, 315, 30, 39,
+ 0, 102, 0, 0, 42, 0, 103, 104, 105, 106,
+ 107, 0, 108, 109, 110, 111, 0, 0, 112, 113,
+ 0, 0, 0, 43, 0, 258, 259, 0, 44, 45,
+ 46, 0, 0, 0, 0, 0, 47, 0, 0, 0,
+ 0, 0, 114, 48, 0, 0, 49, 0, 50, 115,
+ 0, 51, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 52, 53, 54, 42, 0, 0, 0, 0,
+ 0, 55, 0, 0, 0, 0, 56, 57, 0, 0,
+ 58, 59, 60, 142, 43, 61, 0, 0, 0, 44,
+ 45, 46, 0, 0, 0, 0, 0, 47, 0, 0,
+ 0, 0, 0, 0, 48, 0, 0, 49, 0, 50,
+ 0, 0, 51, 62, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 52, 53, 54, 42, 0, 0, 0,
+ 0, 0, 55, 0, 0, 0, 0, 56, 57, 0,
+ 0, 58, 59, 60, 264, 43, 61, 0, 0, 0,
+ 44, 45, 46, 0, 0, 0, 0, 0, 47, 0,
+ 0, 0, 0, 0, 0, 48, 0, 0, 49, 0,
+ 50, 0, 0, 51, 62, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 52, 53, 54, 42, 0, 0,
+ 0, 0, 0, 55, 0, 0, 0, 0, 56, 57,
+ 0, 0, 58, 59, 60, 330, 43, 61, 0, 0,
+ 0, 44, 45, 46, 0, 0, 0, 0, 0, 47,
+ 0, 0, 0, 0, 0, 0, 48, 0, 0, 49,
+ 0, 50, 0, 0, 51, 62, 0, 180, 181, 0,
+ 182, 183, 184, 0, 0, 52, 53, 54, 42, 0,
+ 0, 0, 0, 0, 55, 0, 185, 0, 0, 56,
+ 57, 0, 0, 58, 59, 60, 0, 43, 61, 0,
+ 0, 0, 44, 45, 46, 0, 0, 0, 180, 181,
+ 47, 182, 183, 184, 0, 0, 0, 48, 0, 0,
+ 49, 0, 50, 0, 0, 51, 62, 0, 180, 181,
+ 195, 182, 183, 184, 0, 0, 52, 53, 54, 0,
+ 0, 0, 0, 0, 0, 55, 0, 0, 0, 0,
+ 56, 57, 186, 0, 58, 59, 60, 0, 0, 61,
+ 272, 187, 188, 189, 190, 191, 192, 193, 0, 180,
+ 181, 0, 182, 183, 184, 0, 180, 181, 0, 182,
+ 183, 184, 0, 0, 0, 0, 0, 62, 305, 0,
+ 0, 0, 0, 186, 0, 0, 180, 181, 310, 182,
+ 183, 184, 187, 188, 189, 190, 191, 192, 193, 0,
+ 0, 0, 0, 186, 0, 0, 0, 0, 0, 0,
+ 0, 0, 187, 188, 189, 190, 191, 192, 193, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 186, 0, 0, 0, 0, 0,
+ 0, 186, 0, 187, 188, 189, 190, 191, 192, 193,
+ 187, 188, 189, 190, 191, 192, 193, 0, 0, 0,
+ 0, 186, 0, 0, 0, 0, 0, 0, 0, 0,
+ 187, 188, 189, 190, 191, 192, 193
+};
+
+static const yytype_int16 yycheck[] =
+{
+ 12, 64, 43, 44, 9, 46, 74, 185, 15, 40,
+ 127, 38, 9, 25, 131, 132, 49, 195, 58, 13,
+ 14, 15, 3, 4, 5, 6, 7, 8, 9, 69,
+ 26, 12, 35, 19, 20, 21, 22, 65, 24, 194,
+ 16, 115, 15, 117, 57, 9, 59, 50, 17, 18,
+ 15, 91, 93, 46, 47, 36, 37, 212, 0, 115,
+ 101, 117, 115, 91, 117, 113, 9, 64, 64, 115,
+ 115, 117, 117, 114, 115, 25, 115, 115, 117, 117,
+ 258, 9, 109, 200, 117, 66, 117, 113, 95, 114,
+ 71, 72, 73, 74, 75, 100, 77, 78, 79, 80,
+ 9, 95, 83, 84, 9, 25, 63, 113, 9, 150,
+ 104, 105, 106, 107, 108, 109, 110, 103, 49, 9,
+ 39, 9, 95, 9, 35, 88, 107, 305, 109, 88,
+ 95, 9, 310, 114, 107, 108, 109, 110, 113, 180,
+ 181, 182, 183, 184, 109, 110, 187, 188, 189, 190,
+ 191, 192, 193, 270, 114, 196, 40, 113, 199, 113,
+ 201, 224, 113, 39, 113, 113, 113, 113, 113, 113,
+ 17, 234, 113, 241, 113, 113, 3, 4, 5, 6,
+ 7, 8, 9, 113, 301, 12, 113, 113, 113, 113,
+ 113, 113, 113, 52, 112, 112, 60, 49, 3, 4,
+ 5, 6, 7, 8, 9, 70, 114, 12, 114, 36,
+ 37, 114, 114, 113, 255, 114, 10, 11, 259, 13,
+ 14, 15, 285, 5, 70, 9, 9, 114, 269, 58,
+ 9, 272, 273, 10, 11, 116, 13, 14, 15, 66,
+ 55, 9, 115, 117, 71, 72, 73, 74, 75, 54,
+ 77, 78, 79, 80, 317, 104, 83, 84, 321, 117,
+ 9, 66, 9, 275, 114, 9, 71, 72, 73, 74,
+ 75, 31, 77, 78, 79, 80, 27, 32, 83, 84,
+ 107, 115, 9, 41, 62, 115, 119, 114, 3, 4,
+ 5, 6, 7, 8, 9, 116, 28, 12, 51, 92,
+ 114, 95, 107, 17, 61, 44, 3, 114, 89, 114,
+ 104, 105, 106, 107, 108, 109, 110, 45, 95, 93,
+ 115, 115, 12, 9, 94, 101, 9, 104, 105, 106,
+ 107, 108, 109, 110, 32, 102, 8, 3, 115, 104,
+ 9, 208, 199, 74, 14, 274, 261, 300, 27, 34,
+ -1, 66, -1, -1, 9, -1, 71, 72, 73, 74,
+ 75, -1, 77, 78, 79, 80, -1, -1, 83, 84,
+ -1, -1, -1, 28, -1, 30, 31, -1, 33, 34,
+ 35, -1, -1, -1, -1, -1, 41, -1, -1, -1,
+ -1, -1, 107, 48, -1, -1, 51, -1, 53, 114,
+ -1, 56, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, 67, 68, 69, 9, -1, -1, -1, -1,
+ -1, 76, -1, -1, -1, -1, 81, 82, -1, -1,
+ 85, 86, 87, 27, 28, 90, -1, -1, -1, 33,
+ 34, 35, -1, -1, -1, -1, -1, 41, -1, -1,
+ -1, -1, -1, -1, 48, -1, -1, 51, -1, 53,
+ -1, -1, 56, 118, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, 67, 68, 69, 9, -1, -1, -1,
+ -1, -1, 76, -1, -1, -1, -1, 81, 82, -1,
+ -1, 85, 86, 87, 27, 28, 90, -1, -1, -1,
+ 33, 34, 35, -1, -1, -1, -1, -1, 41, -1,
+ -1, -1, -1, -1, -1, 48, -1, -1, 51, -1,
+ 53, -1, -1, 56, 118, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 67, 68, 69, 9, -1, -1,
+ -1, -1, -1, 76, -1, -1, -1, -1, 81, 82,
+ -1, -1, 85, 86, 87, 27, 28, 90, -1, -1,
+ -1, 33, 34, 35, -1, -1, -1, -1, -1, 41,
+ -1, -1, -1, -1, -1, -1, 48, -1, -1, 51,
+ -1, 53, -1, -1, 56, 118, -1, 10, 11, -1,
+ 13, 14, 15, -1, -1, 67, 68, 69, 9, -1,
+ -1, -1, -1, -1, 76, -1, 29, -1, -1, 81,
+ 82, -1, -1, 85, 86, 87, -1, 28, 90, -1,
+ -1, -1, 33, 34, 35, -1, -1, -1, 10, 11,
+ 41, 13, 14, 15, -1, -1, -1, 48, -1, -1,
+ 51, -1, 53, -1, -1, 56, 118, -1, 10, 11,
+ 32, 13, 14, 15, -1, -1, 67, 68, 69, -1,
+ -1, -1, -1, -1, -1, 76, -1, -1, -1, -1,
+ 81, 82, 95, -1, 85, 86, 87, -1, -1, 90,
+ 42, 104, 105, 106, 107, 108, 109, 110, -1, 10,
+ 11, -1, 13, 14, 15, -1, 10, 11, -1, 13,
+ 14, 15, -1, -1, -1, -1, -1, 118, 29, -1,
+ -1, -1, -1, 95, -1, -1, 10, 11, 32, 13,
+ 14, 15, 104, 105, 106, 107, 108, 109, 110, -1,
+ -1, -1, -1, 95, -1, -1, -1, -1, -1, -1,
+ -1, -1, 104, 105, 106, 107, 108, 109, 110, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 95, -1, -1, -1, -1, -1,
+ -1, 95, -1, 104, 105, 106, 107, 108, 109, 110,
+ 104, 105, 106, 107, 108, 109, 110, -1, -1, -1,
+ -1, 95, -1, -1, -1, -1, -1, -1, -1, -1,
+ 104, 105, 106, 107, 108, 109, 110
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+ symbol of state STATE-NUM. */
+static const yytype_uint8 yystos[] =
+{
+ 0, 16, 121, 192, 9, 0, 113, 114, 9, 184,
+ 185, 17, 18, 115, 117, 19, 20, 21, 22, 24,
+ 103, 183, 183, 25, 184, 9, 186, 187, 183, 64,
+ 186, 188, 189, 190, 191, 113, 65, 91, 26, 190,
+ 9, 9, 9, 28, 33, 34, 35, 41, 48, 51,
+ 53, 56, 67, 68, 69, 76, 81, 82, 85, 86,
+ 87, 90, 118, 122, 123, 127, 128, 129, 142, 143,
+ 144, 148, 149, 150, 151, 152, 153, 154, 155, 159,
+ 160, 161, 162, 163, 164, 165, 166, 175, 179, 181,
+ 182, 25, 113, 63, 3, 4, 5, 6, 7, 8,
+ 9, 12, 66, 71, 72, 73, 74, 75, 77, 78,
+ 79, 80, 83, 84, 107, 114, 124, 125, 124, 36,
+ 37, 109, 124, 134, 135, 136, 9, 49, 9, 100,
+ 180, 39, 57, 59, 177, 9, 9, 9, 142, 88,
+ 88, 9, 27, 122, 113, 114, 113, 50, 142, 113,
+ 40, 137, 147, 113, 113, 137, 147, 113, 113, 113,
+ 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+ 113, 113, 113, 142, 124, 112, 124, 112, 124, 124,
+ 10, 11, 13, 14, 15, 29, 95, 104, 105, 106,
+ 107, 108, 109, 110, 114, 32, 114, 114, 49, 117,
+ 39, 17, 180, 52, 180, 180, 60, 178, 49, 114,
+ 124, 133, 114, 54, 124, 113, 70, 70, 115, 124,
+ 124, 124, 124, 124, 123, 5, 124, 124, 124, 124,
+ 124, 124, 124, 133, 123, 124, 38, 109, 9, 132,
+ 134, 131, 180, 124, 9, 145, 146, 114, 58, 9,
+ 130, 132, 116, 126, 115, 117, 133, 55, 30, 31,
+ 156, 157, 158, 115, 27, 115, 9, 115, 117, 40,
+ 117, 137, 42, 104, 117, 9, 167, 168, 9, 114,
+ 115, 117, 124, 115, 9, 123, 124, 156, 27, 32,
+ 115, 9, 180, 41, 138, 124, 124, 145, 183, 115,
+ 117, 62, 115, 119, 116, 29, 28, 51, 92, 139,
+ 32, 114, 169, 61, 172, 167, 180, 123, 17, 44,
+ 141, 123, 3, 89, 170, 101, 173, 114, 93, 45,
+ 27, 115, 12, 171, 102, 174, 9, 176, 94, 9,
+ 32, 8, 104, 115, 117, 46, 47, 140, 3, 9
+};
+
+#define yyerrok (yyerrstatus = 0)
+#define yyclearin (yychar = YYEMPTY)
+#define YYEMPTY (-2)
+#define YYEOF 0
+
+#define YYACCEPT goto yyacceptlab
+#define YYABORT goto yyabortlab
+#define YYERROR goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror. This remains here temporarily
+ to ease the transition to the new meaning of YYERROR, for GCC.
+ Once GCC version 2 has supplanted version 1, this can go. */
+
+#define YYFAIL goto yyerrlab
+
+#define YYRECOVERING() (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value) \
+do \
+ if (yychar == YYEMPTY && yylen == 1) \
+ { \
+ yychar = (Token); \
+ yylval = (Value); \
+ yytoken = YYTRANSLATE (yychar); \
+ YYPOPSTACK (1); \
+ goto yybackup; \
+ } \
+ else \
+ { \
+ yyerror (YY_("syntax error: cannot back up")); \
+ YYERROR; \
+ } \
+while (YYID (0))
+
+
+#define YYTERROR 1
+#define YYERRCODE 256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+ If N is 0, then set CURRENT to the empty location which ends
+ the previous symbol: RHS[0] (always defined). */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N) \
+ do \
+ if (YYID (N)) \
+ { \
+ (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \
+ (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \
+ (Current).last_line = YYRHSLOC (Rhs, N).last_line; \
+ (Current).last_column = YYRHSLOC (Rhs, N).last_column; \
+ } \
+ else \
+ { \
+ (Current).first_line = (Current).last_line = \
+ YYRHSLOC (Rhs, 0).last_line; \
+ (Current).first_column = (Current).last_column = \
+ YYRHSLOC (Rhs, 0).last_column; \
+ } \
+ while (YYID (0))
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define it only if we know
+   we won't break user code, i.e. when these are the locations we know. */
+
+#ifndef YY_LOCATION_PRINT
+# if defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL
+# define YY_LOCATION_PRINT(File, Loc) \
+ fprintf (File, "%d.%d-%d.%d", \
+ (Loc).first_line, (Loc).first_column, \
+ (Loc).last_line, (Loc).last_column)
+# else
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments. */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (YYLEX_PARAM)
+#else
+# define YYLEX yylex ()
+#endif
+
+/* Enable debugging if requested. */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+# include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+# define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args) \
+do { \
+ if (yydebug) \
+ YYFPRINTF Args; \
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+do { \
+ if (yydebug) \
+ { \
+ YYFPRINTF (stderr, "%s ", Title); \
+ yy_symbol_print (stderr, \
+ Type, Value); \
+ YYFPRINTF (stderr, "\n"); \
+ } \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (!yyvaluep)
+ return;
+# ifdef YYPRINT
+ if (yytype < YYNTOKENS)
+ YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+ YYUSE (yyoutput);
+# endif
+ switch (yytype)
+ {
+ default:
+ break;
+ }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE const * const yyvaluep;
+#endif
+{
+ if (yytype < YYNTOKENS)
+ YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+ else
+ YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+ yy_symbol_value_print (yyoutput, yytype, yyvaluep);
+ YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included). |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *bottom, yytype_int16 *top)
+#else
+static void
+yy_stack_print (bottom, top)
+ yytype_int16 *bottom;
+ yytype_int16 *top;
+#endif
+{
+ YYFPRINTF (stderr, "Stack now");
+ for (; bottom <= top; ++bottom)
+ YYFPRINTF (stderr, " %d", *bottom);
+ YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top) \
+do { \
+ if (yydebug) \
+ yy_stack_print ((Bottom), (Top)); \
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced. |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, int yyrule)
+#else
+static void
+yy_reduce_print (yyvsp, yyrule)
+ YYSTYPE *yyvsp;
+ int yyrule;
+#endif
+{
+ int yynrhs = yyr2[yyrule];
+ int yyi;
+ unsigned long int yylno = yyrline[yyrule];
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+ yyrule - 1, yylno);
+ /* The symbols being reduced. */
+ for (yyi = 0; yyi < yynrhs; yyi++)
+ {
+ fprintf (stderr, " $%d = ", yyi + 1);
+ yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+ &(yyvsp[(yyi + 1) - (yynrhs)])
+ );
+ fprintf (stderr, "\n");
+ }
+}
+
+# define YY_REDUCE_PRINT(Rule) \
+do { \
+ if (yydebug) \
+ yy_reduce_print (yyvsp, Rule); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+ multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+ if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   when evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
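+
+/* Growth sketch: yyparse() below starts each stack at YYINITDEPTH (200)
+   entries and, when YYSTACK_RELOCATE is available, doubles the size on
+   each overflow (200, 400, 800, ...), clamping the last step at
+   YYMAXDEPTH (10000) before failing with "memory exhausted". */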
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+# if defined __GLIBC__ && defined _STRING_H
+# define yystrlen strlen
+# else
+/* Return the length of YYSTR. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+ const char *yystr;
+#endif
+{
+ YYSIZE_T yylen;
+ for (yylen = 0; yystr[yylen]; yylen++)
+ continue;
+ return yylen;
+}
+# endif
+# endif
+
+# ifndef yystpcpy
+# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+# define yystpcpy stpcpy
+# else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+ YYDEST. */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+ char *yydest;
+ const char *yysrc;
+#endif
+{
+ char *yyd = yydest;
+ const char *yys = yysrc;
+
+ while ((*yyd++ = *yys++) != '\0')
+ continue;
+
+ return yyd - 1;
+}
+# endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+ quotes and backslashes, so that it's suitable for yyerror. The
+ heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or a backslash (other than
+ backslash-backslash). YYSTR is taken from yytname. If YYRES is
+ null, do not copy; instead, return the length of what the result
+ would have been. */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+ if (*yystr == '"')
+ {
+ YYSIZE_T yyn = 0;
+ char const *yyp = yystr;
+
+ for (;;)
+ switch (*++yyp)
+ {
+ case '\'':
+ case ',':
+ goto do_not_strip_quotes;
+
+ case '\\':
+ if (*++yyp != '\\')
+ goto do_not_strip_quotes;
+ /* Fall through. */
+ default:
+ if (yyres)
+ yyres[yyn] = *yyp;
+ yyn++;
+ break;
+
+ case '"':
+ if (yyres)
+ yyres[yyn] = '\0';
+ return yyn;
+ }
+ do_not_strip_quotes: ;
+ }
+
+ if (! yyres)
+ return yystrlen (yystr);
+
+ return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into YYRESULT an error message about the unexpected token
+ YYCHAR while in state YYSTATE. Return the number of bytes copied,
+ including the terminating null byte. If YYRESULT is null, do not
+ copy anything; just return the number of bytes that would be
+ copied. As a special case, return 0 if an ordinary "syntax error"
+ message will do. Return YYSIZE_MAXIMUM if overflow occurs during
+ size calculation. */
+static YYSIZE_T
+yysyntax_error (char *yyresult, int yystate, int yychar)
+{
+ int yyn = yypact[yystate];
+
+ if (! (YYPACT_NINF < yyn && yyn <= YYLAST))
+ return 0;
+ else
+ {
+ int yytype = YYTRANSLATE (yychar);
+ YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]);
+ YYSIZE_T yysize = yysize0;
+ YYSIZE_T yysize1;
+ int yysize_overflow = 0;
+ enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+ char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+ int yyx;
+
+# if 0
+ /* This is so xgettext sees the translatable formats that are
+ constructed on the fly. */
+ YY_("syntax error, unexpected %s");
+ YY_("syntax error, unexpected %s, expecting %s");
+ YY_("syntax error, unexpected %s, expecting %s or %s");
+ YY_("syntax error, unexpected %s, expecting %s or %s or %s");
+ YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s");
+# endif
+ char *yyfmt;
+ char const *yyf;
+ static char const yyunexpected[] = "syntax error, unexpected %s";
+ static char const yyexpecting[] = ", expecting %s";
+ static char const yyor[] = " or %s";
+ char yyformat[sizeof yyunexpected
+ + sizeof yyexpecting - 1
+ + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2)
+ * (sizeof yyor - 1))];
+ char const *yyprefix = yyexpecting;
+
+ /* Start YYX at -YYN if negative to avoid negative indexes in
+ YYCHECK. */
+ int yyxbegin = yyn < 0 ? -yyn : 0;
+
+ /* Stay within bounds of both yycheck and yytname. */
+ int yychecklim = YYLAST - yyn + 1;
+ int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+ int yycount = 1;
+
+ yyarg[0] = yytname[yytype];
+ yyfmt = yystpcpy (yyformat, yyunexpected);
+
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+ {
+ if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+ {
+ yycount = 1;
+ yysize = yysize0;
+ yyformat[sizeof yyunexpected - 1] = '\0';
+ break;
+ }
+ yyarg[yycount++] = yytname[yyx];
+ yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+ yysize_overflow |= (yysize1 < yysize);
+ yysize = yysize1;
+ yyfmt = yystpcpy (yyfmt, yyprefix);
+ yyprefix = yyor;
+ }
+
+ yyf = YY_(yyformat);
+ yysize1 = yysize + yystrlen (yyf);
+ yysize_overflow |= (yysize1 < yysize);
+ yysize = yysize1;
+
+ if (yysize_overflow)
+ return YYSIZE_MAXIMUM;
+
+ if (yyresult)
+ {
+ /* Avoid sprintf, as that infringes on the user's name space.
+ Don't have undefined behavior even if the translation
+ produced a string with the wrong number of "%s"s. */
+ char *yyp = yyresult;
+ int yyi = 0;
+ while ((*yyp = *yyf) != '\0')
+ {
+ if (*yyp == '%' && yyf[1] == 's' && yyi < yycount)
+ {
+ yyp += yytnamerr (yyp, yyarg[yyi++]);
+ yyf += 2;
+ }
+ else
+ {
+ yyp++;
+ yyf++;
+ }
+ }
+ }
+ return yysize;
+ }
+}
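+
+/* Illustrative output (not an actual trace): for an unexpected
+   PARS_ID_TOKEN where only ';' could follow, the message built here is
+   "syntax error, unexpected PARS_ID_TOKEN, expecting ';'". */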
+#endif /* YYERROR_VERBOSE */
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol. |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+ const char *yymsg;
+ int yytype;
+ YYSTYPE *yyvaluep;
+#endif
+{
+ YYUSE (yyvaluep);
+
+ if (!yymsg)
+ yymsg = "Deleting";
+ YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes. */
+
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+/* The look-ahead symbol. */
+int yychar;
+
+/* The semantic value of the look-ahead symbol. */
+YYSTYPE yylval;
+
+/* Number of syntax errors so far. */
+int yynerrs;
+
+
+
+/*----------.
+| yyparse. |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+ void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+ || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+
+ int yystate;
+ int yyn;
+ int yyresult;
+ /* Number of tokens to shift before error messages enabled. */
+ int yyerrstatus;
+ /* Look-ahead token as an internal (translated) token number. */
+ int yytoken = 0;
+#if YYERROR_VERBOSE
+ /* Buffer for error messages, and its allocated size. */
+ char yymsgbuf[128];
+ char *yymsg = yymsgbuf;
+ YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+ /* Three stacks and their tools:
+ `yyss': related to states,
+ `yyvs': related to semantic values,
+ `yyls': related to locations.
+
+     Refer to the stacks through separate pointers, to allow yyoverflow
+ to reallocate them elsewhere. */
+
+ /* The state stack. */
+ yytype_int16 yyssa[YYINITDEPTH];
+ yytype_int16 *yyss = yyssa;
+ yytype_int16 *yyssp;
+
+ /* The semantic value stack. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs = yyvsa;
+ YYSTYPE *yyvsp;
+
+
+
+#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
+
+ YYSIZE_T yystacksize = YYINITDEPTH;
+
+ /* The variables used to return semantic value and location from the
+ action routines. */
+ YYSTYPE yyval;
+
+
+ /* The number of symbols on the RHS of the reduced rule.
+ Keep to zero when no symbol should be popped. */
+ int yylen = 0;
+
+ YYDPRINTF ((stderr, "Starting parse\n"));
+
+ yystate = 0;
+ yyerrstatus = 0;
+ yynerrs = 0;
+ yychar = YYEMPTY; /* Cause a token to be read. */
+
+ /* Initialize stack pointers.
+ Waste one element of value and location stack
+ so that they stay on the same level as the state stack.
+ The wasted elements are never initialized. */
+
+ yyssp = yyss;
+ yyvsp = yyvs;
+
+ goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate. |
+`------------------------------------------------------------*/
+ yynewstate:
+ /* In all cases, when you get here, the value and location stacks
+ have just been pushed. So pushing a state here evens the stacks. */
+ yyssp++;
+
+ yysetstate:
+ *yyssp = yystate;
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ {
+ /* Get the current used size of the three stacks, in elements. */
+ YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+ {
+ /* Give user a chance to reallocate the stack. Use copies of
+ these so that the &'s don't force the real ones into
+ memory. */
+ YYSTYPE *yyvs1 = yyvs;
+ yytype_int16 *yyss1 = yyss;
+
+
+ /* Each stack pointer address is followed by the size of the
+ data in use in that stack, in bytes. This used to be a
+ conditional around just the two extra args, but that might
+ be undefined if yyoverflow is a macro. */
+ yyoverflow (YY_("memory exhausted"),
+ &yyss1, yysize * sizeof (*yyssp),
+ &yyvs1, yysize * sizeof (*yyvsp),
+
+ &yystacksize);
+
+ yyss = yyss1;
+ yyvs = yyvs1;
+ }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+ goto yyexhaustedlab;
+# else
+ /* Extend the stack our own way. */
+ if (YYMAXDEPTH <= yystacksize)
+ goto yyexhaustedlab;
+ yystacksize *= 2;
+ if (YYMAXDEPTH < yystacksize)
+ yystacksize = YYMAXDEPTH;
+
+ {
+ yytype_int16 *yyss1 = yyss;
+ union yyalloc *yyptr =
+ (union yyalloc*) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ if (! yyptr)
+ goto yyexhaustedlab;
+ YYSTACK_RELOCATE (yyss);
+ YYSTACK_RELOCATE (yyvs);
+
+# undef YYSTACK_RELOCATE
+ if (yyss1 != yyssa)
+ YYSTACK_FREE (yyss1);
+ }
+# endif
+#endif /* no yyoverflow */
+
+ yyssp = yyss + yysize - 1;
+ yyvsp = yyvs + yysize - 1;
+
+
+ YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+ (unsigned long int) yystacksize));
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ YYABORT;
+ }
+
+ YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+ goto yybackup;
+
+/*-----------.
+| yybackup. |
+`-----------*/
+yybackup:
+
+ /* Do appropriate processing given the current state. Read a
+ look-ahead token if we need one and don't already have one. */
+
+ /* First try to decide what to do without reference to look-ahead token. */
+ yyn = yypact[yystate];
+ if (yyn == YYPACT_NINF)
+ goto yydefault;
+
+ /* Not known => get a look-ahead token if don't already have one. */
+
+ /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol. */
+ if (yychar == YYEMPTY)
+ {
+ YYDPRINTF ((stderr, "Reading a token: "));
+ yychar = YYLEX;
+ }
+
+ if (yychar <= YYEOF)
+ {
+ yychar = yytoken = YYEOF;
+ YYDPRINTF ((stderr, "Now at end of input.\n"));
+ }
+ else
+ {
+ yytoken = YYTRANSLATE (yychar);
+ YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+ }
+
+ /* If the proper action on seeing token YYTOKEN is to reduce or to
+ detect an error, take that action. */
+ yyn += yytoken;
+ if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+ goto yydefault;
+ yyn = yytable[yyn];
+ if (yyn <= 0)
+ {
+ if (yyn == 0 || yyn == YYTABLE_NINF)
+ goto yyerrlab;
+ yyn = -yyn;
+ goto yyreduce;
+ }
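+
+  /* Worked example, traced through the tables above: in the initial
+     state 0, yypact[0] == 24; with look-ahead PARS_PROCEDURE_TOKEN
+     (external 271, internal yytoken == 16), yyn becomes 24 + 16 == 40,
+     yycheck[40] == 16 confirms the entry, and yytable[40] == 1 is
+     positive, so the parser shifts and enters state 1 (whose accessing
+     symbol yystos[1] is indeed 16). */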
+
+ if (yyn == YYFINAL)
+ YYACCEPT;
+
+ /* Count tokens shifted since error; after three, turn off error
+ status. */
+ if (yyerrstatus)
+ yyerrstatus--;
+
+ /* Shift the look-ahead token. */
+ YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+ /* Discard the shifted token unless it is eof. */
+ if (yychar != YYEOF)
+ yychar = YYEMPTY;
+
+ yystate = yyn;
+ *++yyvsp = yylval;
+
+ goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state. |
+`-----------------------------------------------------------*/
+yydefault:
+ yyn = yydefact[yystate];
+ if (yyn == 0)
+ goto yyerrlab;
+ goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction. |
+`-----------------------------*/
+yyreduce:
+ /* yyn is the number of a rule to reduce with. */
+ yylen = yyr2[yyn];
+
+ /* If YYLEN is nonzero, implement the default value of the action:
+ `$$ = $1'.
+
+ Otherwise, the following line sets YYVAL to garbage.
+ This behavior is undocumented and Bison
+ users should not rely upon it. Assigning to YYVAL
+ unconditionally makes the parser a bit smaller, and it avoids a
+ GCC warning that YYVAL may be used uninitialized. */
+ yyval = yyvsp[1-yylen];
+
+
+ YY_REDUCE_PRINT (yyn);
+ switch (yyn)
+ {
+ case 25:
+#line 190 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
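+
+  /* Indexing convention used by all these actions: for a rule whose
+     RHS has yylen symbols, $k in pars0grm.y compiles to
+     yyvsp[(k) - (yylen)]; in case 26 below, (yyvsp[(1) - (2)]) is $1
+     and (yyvsp[(2) - (2)]) is $2 of statement_list: statement_list
+     statement. */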
+
+ case 26:
+#line 192 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+ break;
+
+ case 27:
+#line 196 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 28:
+#line 198 "pars0grm.y"
+ { (yyval) = pars_func((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+ break;
+
+ case 29:
+#line 199 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 30:
+#line 200 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 31:
+#line 201 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 32:
+#line 202 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 33:
+#line 203 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 34:
+#line 204 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 35:
+#line 205 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]);;}
+ break;
+
+ case 36:
+#line 206 "pars0grm.y"
+ { (yyval) = pars_op('+', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 37:
+#line 207 "pars0grm.y"
+ { (yyval) = pars_op('-', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 38:
+#line 208 "pars0grm.y"
+ { (yyval) = pars_op('*', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 39:
+#line 209 "pars0grm.y"
+ { (yyval) = pars_op('/', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 40:
+#line 210 "pars0grm.y"
+ { (yyval) = pars_op('-', (yyvsp[(2) - (2)]), NULL); ;}
+ break;
+
+ case 41:
+#line 211 "pars0grm.y"
+ { (yyval) = (yyvsp[(2) - (3)]); ;}
+ break;
+
+ case 42:
+#line 212 "pars0grm.y"
+ { (yyval) = pars_op('=', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 43:
+#line 214 "pars0grm.y"
+ { (yyval) = pars_op(PARS_LIKE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 44:
+#line 215 "pars0grm.y"
+ { (yyval) = pars_op('<', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 45:
+#line 216 "pars0grm.y"
+ { (yyval) = pars_op('>', (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 46:
+#line 217 "pars0grm.y"
+ { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 47:
+#line 218 "pars0grm.y"
+ { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 48:
+#line 219 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 49:
+#line 220 "pars0grm.y"
+ { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 50:
+#line 221 "pars0grm.y"
+ { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 51:
+#line 222 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[(2) - (2)]), NULL); ;}
+ break;
+
+ case 52:
+#line 224 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+ break;
+
+ case 53:
+#line 226 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[(1) - (3)]), NULL); ;}
+ break;
+
+ case 54:
+#line 230 "pars0grm.y"
+ { (yyval) = &pars_to_char_token; ;}
+ break;
+
+ case 55:
+#line 231 "pars0grm.y"
+ { (yyval) = &pars_to_number_token; ;}
+ break;
+
+ case 56:
+#line 232 "pars0grm.y"
+ { (yyval) = &pars_to_binary_token; ;}
+ break;
+
+ case 57:
+#line 234 "pars0grm.y"
+ { (yyval) = &pars_binary_to_number_token; ;}
+ break;
+
+ case 58:
+#line 235 "pars0grm.y"
+ { (yyval) = &pars_substr_token; ;}
+ break;
+
+ case 59:
+#line 236 "pars0grm.y"
+ { (yyval) = &pars_concat_token; ;}
+ break;
+
+ case 60:
+#line 237 "pars0grm.y"
+ { (yyval) = &pars_instr_token; ;}
+ break;
+
+ case 61:
+#line 238 "pars0grm.y"
+ { (yyval) = &pars_length_token; ;}
+ break;
+
+ case 62:
+#line 239 "pars0grm.y"
+ { (yyval) = &pars_sysdate_token; ;}
+ break;
+
+ case 63:
+#line 240 "pars0grm.y"
+ { (yyval) = &pars_rnd_token; ;}
+ break;
+
+ case 64:
+#line 241 "pars0grm.y"
+ { (yyval) = &pars_rnd_str_token; ;}
+ break;
+
+ case 68:
+#line 252 "pars0grm.y"
+ { (yyval) = pars_stored_procedure_call(
+ static_cast<sym_node_t*>((yyvsp[(2) - (6)]))); ;}
+ break;
+
+ case 69:
+#line 258 "pars0grm.y"
+ { (yyval) = pars_procedure_call((yyvsp[(1) - (4)]), (yyvsp[(3) - (4)])); ;}
+ break;
+
+ case 70:
+#line 262 "pars0grm.y"
+ { (yyval) = &pars_replstr_token; ;}
+ break;
+
+ case 71:
+#line 263 "pars0grm.y"
+ { (yyval) = &pars_printf_token; ;}
+ break;
+
+ case 72:
+#line 264 "pars0grm.y"
+ { (yyval) = &pars_assert_token; ;}
+ break;
+
+ case 73:
+#line 268 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (3)]); ;}
+ break;
+
+ case 74:
+#line 272 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 75:
+#line 274 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 76:
+#line 278 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 77:
+#line 279 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 78:
+#line 281 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 79:
+#line 285 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 80:
+#line 286 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)]));;}
+ break;
+
+ case 81:
+#line 287 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 82:
+#line 291 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]); ;}
+ break;
+
+ case 83:
+#line 293 "pars0grm.y"
+ { (yyval) = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); ;}
+ break;
+
+ case 84:
+#line 298 "pars0grm.y"
+ { (yyval) = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ pars_func(&pars_distinct_token,
+ que_node_list_add_last(
+ NULL, (yyvsp[(4) - (5)]))))); ;}
+ break;
+
+ case 85:
+#line 304 "pars0grm.y"
+ { (yyval) = pars_func(&pars_sum_token,
+ que_node_list_add_last(NULL,
+ (yyvsp[(3) - (4)]))); ;}
+ break;
+
+ case 86:
+#line 310 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 87:
+#line 311 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 88:
+#line 313 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 89:
+#line 317 "pars0grm.y"
+ { (yyval) = pars_select_list(&pars_star_denoter,
+ NULL); ;}
+ break;
+
+ case 90:
+#line 320 "pars0grm.y"
+ { (yyval) = pars_select_list(
+ (yyvsp[(1) - (3)]), static_cast<sym_node_t*>((yyvsp[(3) - (3)]))); ;}
+ break;
+
+ case 91:
+#line 322 "pars0grm.y"
+ { (yyval) = pars_select_list((yyvsp[(1) - (1)]), NULL); ;}
+ break;
+
+ case 92:
+#line 326 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 93:
+#line 327 "pars0grm.y"
+ { (yyval) = (yyvsp[(2) - (2)]); ;}
+ break;
+
+ case 94:
+#line 331 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 95:
+#line 333 "pars0grm.y"
+ { (yyval) = &pars_update_token; ;}
+ break;
+
+ case 96:
+#line 337 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 97:
+#line 339 "pars0grm.y"
+ { (yyval) = &pars_share_token; ;}
+ break;
+
+ case 98:
+#line 343 "pars0grm.y"
+ { (yyval) = &pars_asc_token; ;}
+ break;
+
+ case 99:
+#line 344 "pars0grm.y"
+ { (yyval) = &pars_asc_token; ;}
+ break;
+
+ case 100:
+#line 345 "pars0grm.y"
+ { (yyval) = &pars_desc_token; ;}
+ break;
+
+ case 101:
+#line 349 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 102:
+#line 351 "pars0grm.y"
+ { (yyval) = pars_order_by(
+ static_cast<sym_node_t*>((yyvsp[(3) - (4)])),
+ static_cast<pars_res_word_t*>((yyvsp[(4) - (4)]))); ;}
+ break;
+
+ case 103:
+#line 362 "pars0grm.y"
+ { (yyval) = pars_select_statement(
+ static_cast<sel_node_t*>((yyvsp[(2) - (8)])),
+ static_cast<sym_node_t*>((yyvsp[(4) - (8)])),
+ static_cast<que_node_t*>((yyvsp[(5) - (8)])),
+ static_cast<pars_res_word_t*>((yyvsp[(6) - (8)])),
+ static_cast<pars_res_word_t*>((yyvsp[(7) - (8)])),
+ static_cast<order_node_t*>((yyvsp[(8) - (8)]))); ;}
+ break;
+
+ case 104:
+#line 373 "pars0grm.y"
+ { (yyval) = (yyvsp[(3) - (3)]); ;}
+ break;
+
+ case 105:
+#line 378 "pars0grm.y"
+ { (yyval) = pars_insert_statement(
+ static_cast<sym_node_t*>((yyvsp[(1) - (5)])), (yyvsp[(4) - (5)]), NULL); ;}
+ break;
+
+ case 106:
+#line 381 "pars0grm.y"
+ { (yyval) = pars_insert_statement(
+ static_cast<sym_node_t*>((yyvsp[(1) - (2)])),
+ NULL,
+ static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 107:
+#line 388 "pars0grm.y"
+ { (yyval) = pars_column_assignment(
+ static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+ static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+ break;
+
+ case 108:
+#line 394 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 109:
+#line 396 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 110:
+#line 402 "pars0grm.y"
+ { (yyval) = (yyvsp[(4) - (4)]); ;}
+ break;
+
+ case 111:
+#line 408 "pars0grm.y"
+ { (yyval) = pars_update_statement_start(
+ FALSE,
+ static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+ static_cast<col_assign_node_t*>((yyvsp[(4) - (4)]))); ;}
+ break;
+
+ case 112:
+#line 416 "pars0grm.y"
+ { (yyval) = pars_update_statement(
+ static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+ NULL,
+ static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 113:
+#line 424 "pars0grm.y"
+ { (yyval) = pars_update_statement(
+ static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+ static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+ NULL); ;}
+ break;
+
+ case 114:
+#line 432 "pars0grm.y"
+ { (yyval) = pars_update_statement_start(
+ TRUE,
+ static_cast<sym_node_t*>((yyvsp[(3) - (3)])), NULL); ;}
+ break;
+
+ case 115:
+#line 439 "pars0grm.y"
+ { (yyval) = pars_update_statement(
+ static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+ NULL,
+ static_cast<que_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 116:
+#line 447 "pars0grm.y"
+ { (yyval) = pars_update_statement(
+ static_cast<upd_node_t*>((yyvsp[(1) - (2)])),
+ static_cast<sym_node_t*>((yyvsp[(2) - (2)])),
+ NULL); ;}
+ break;
+
+ case 117:
+#line 455 "pars0grm.y"
+ { (yyval) = pars_row_printf_statement(
+ static_cast<sel_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 118:
+#line 461 "pars0grm.y"
+ { (yyval) = pars_assignment_statement(
+ static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+ static_cast<que_node_t*>((yyvsp[(3) - (3)]))); ;}
+ break;
+
+ case 119:
+#line 469 "pars0grm.y"
+ { (yyval) = pars_elsif_element((yyvsp[(2) - (4)]), (yyvsp[(4) - (4)])); ;}
+ break;
+
+ case 120:
+#line 473 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 121:
+#line 475 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (2)]), (yyvsp[(2) - (2)])); ;}
+ break;
+
+ case 122:
+#line 479 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 123:
+#line 481 "pars0grm.y"
+ { (yyval) = (yyvsp[(2) - (2)]); ;}
+ break;
+
+ case 124:
+#line 482 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]); ;}
+ break;
+
+ case 125:
+#line 489 "pars0grm.y"
+ { (yyval) = pars_if_statement((yyvsp[(2) - (7)]), (yyvsp[(4) - (7)]), (yyvsp[(5) - (7)])); ;}
+ break;
+
+ case 126:
+#line 495 "pars0grm.y"
+ { (yyval) = pars_while_statement((yyvsp[(2) - (6)]), (yyvsp[(4) - (6)])); ;}
+ break;
+
+ case 127:
+#line 503 "pars0grm.y"
+ { (yyval) = pars_for_statement(
+ static_cast<sym_node_t*>((yyvsp[(2) - (10)])),
+ (yyvsp[(4) - (10)]), (yyvsp[(6) - (10)]), (yyvsp[(8) - (10)])); ;}
+ break;
+
+ case 128:
+#line 509 "pars0grm.y"
+ { (yyval) = pars_exit_statement(); ;}
+ break;
+
+ case 129:
+#line 513 "pars0grm.y"
+ { (yyval) = pars_return_statement(); ;}
+ break;
+
+ case 130:
+#line 518 "pars0grm.y"
+ { (yyval) = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR,
+ static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 131:
+#line 525 "pars0grm.y"
+ { (yyval) = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR,
+ static_cast<sym_node_t*>((yyvsp[(2) - (2)]))); ;}
+ break;
+
+ case 132:
+#line 532 "pars0grm.y"
+ { (yyval) = pars_fetch_statement(
+ static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+ static_cast<sym_node_t*>((yyvsp[(4) - (4)])), NULL); ;}
+ break;
+
+ case 133:
+#line 536 "pars0grm.y"
+ { (yyval) = pars_fetch_statement(
+ static_cast<sym_node_t*>((yyvsp[(2) - (4)])),
+ NULL,
+ static_cast<sym_node_t*>((yyvsp[(4) - (4)]))); ;}
+ break;
+
+ case 134:
+#line 544 "pars0grm.y"
+ { (yyval) = pars_column_def(
+ static_cast<sym_node_t*>((yyvsp[(1) - (5)])),
+ static_cast<pars_res_word_t*>((yyvsp[(2) - (5)])),
+ static_cast<sym_node_t*>((yyvsp[(3) - (5)])),
+ (yyvsp[(4) - (5)]), (yyvsp[(5) - (5)])); ;}
+ break;
+
+ case 135:
+#line 552 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 136:
+#line 554 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 137:
+#line 558 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 138:
+#line 560 "pars0grm.y"
+ { (yyval) = (yyvsp[(2) - (3)]); ;}
+ break;
+
+ case 139:
+#line 564 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 140:
+#line 566 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 141:
+#line 571 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 142:
+#line 573 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 143:
+#line 578 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 144:
+#line 580 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 145:
+#line 585 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 146:
+#line 586 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 147:
+#line 591 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 148:
+#line 593 "pars0grm.y"
+ { (yyval) = (yyvsp[(3) - (3)]); ;}
+ break;
+
+ case 149:
+#line 600 "pars0grm.y"
+ { (yyval) = pars_create_table(
+ static_cast<sym_node_t*>((yyvsp[(3) - (9)])),
+ static_cast<sym_node_t*>((yyvsp[(5) - (9)])),
+ static_cast<sym_node_t*>((yyvsp[(8) - (9)])),
+ static_cast<sym_node_t*>((yyvsp[(9) - (9)])), (yyvsp[(7) - (9)])); ;}
+ break;
+
+ case 150:
+#line 608 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 151:
+#line 610 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 152:
+#line 614 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 153:
+#line 615 "pars0grm.y"
+ { (yyval) = &pars_unique_token; ;}
+ break;
+
+ case 154:
+#line 619 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 155:
+#line 620 "pars0grm.y"
+ { (yyval) = &pars_clustered_token; ;}
+ break;
+
+ case 156:
+#line 629 "pars0grm.y"
+ { (yyval) = pars_create_index(
+ static_cast<pars_res_word_t*>((yyvsp[(2) - (10)])),
+ static_cast<pars_res_word_t*>((yyvsp[(3) - (10)])),
+ static_cast<sym_node_t*>((yyvsp[(5) - (10)])),
+ static_cast<sym_node_t*>((yyvsp[(7) - (10)])),
+ static_cast<sym_node_t*>((yyvsp[(9) - (10)]))); ;}
+ break;
+
+ case 157:
+#line 638 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]); ;}
+ break;
+
+ case 158:
+#line 639 "pars0grm.y"
+ { (yyval) = (yyvsp[(1) - (1)]); ;}
+ break;
+
+ case 159:
+#line 644 "pars0grm.y"
+ { (yyval) = pars_commit_statement(); ;}
+ break;
+
+ case 160:
+#line 649 "pars0grm.y"
+ { (yyval) = pars_rollback_statement(); ;}
+ break;
+
+ case 161:
+#line 653 "pars0grm.y"
+ { (yyval) = &pars_int_token; ;}
+ break;
+
+ case 162:
+#line 654 "pars0grm.y"
+ { (yyval) = &pars_int_token; ;}
+ break;
+
+ case 163:
+#line 655 "pars0grm.y"
+ { (yyval) = &pars_bigint_token; ;}
+ break;
+
+ case 164:
+#line 656 "pars0grm.y"
+ { (yyval) = &pars_char_token; ;}
+ break;
+
+ case 165:
+#line 657 "pars0grm.y"
+ { (yyval) = &pars_binary_token; ;}
+ break;
+
+ case 166:
+#line 658 "pars0grm.y"
+ { (yyval) = &pars_blob_token; ;}
+ break;
+
+ case 167:
+#line 663 "pars0grm.y"
+ { (yyval) = pars_parameter_declaration(
+ static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+ PARS_INPUT,
+ static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+ break;
+
+ case 168:
+#line 668 "pars0grm.y"
+ { (yyval) = pars_parameter_declaration(
+ static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+ PARS_OUTPUT,
+ static_cast<pars_res_word_t*>((yyvsp[(3) - (3)]))); ;}
+ break;
+
+ case 169:
+#line 675 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 170:
+#line 676 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[(1) - (1)])); ;}
+ break;
+
+ case 171:
+#line 678 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[(1) - (3)]), (yyvsp[(3) - (3)])); ;}
+ break;
+
+ case 172:
+#line 683 "pars0grm.y"
+ { (yyval) = pars_variable_declaration(
+ static_cast<sym_node_t*>((yyvsp[(1) - (3)])),
+ static_cast<pars_res_word_t*>((yyvsp[(2) - (3)]))); ;}
+ break;
+
+ case 176:
+#line 697 "pars0grm.y"
+ { (yyval) = pars_cursor_declaration(
+ static_cast<sym_node_t*>((yyvsp[(3) - (6)])),
+ static_cast<sel_node_t*>((yyvsp[(5) - (6)]))); ;}
+ break;
+
+ case 177:
+#line 704 "pars0grm.y"
+ { (yyval) = pars_function_declaration(
+ static_cast<sym_node_t*>((yyvsp[(3) - (4)]))); ;}
+ break;
+
+ case 183:
+#line 726 "pars0grm.y"
+ { (yyval) = pars_procedure_definition(
+ static_cast<sym_node_t*>((yyvsp[(2) - (11)])),
+ static_cast<sym_node_t*>((yyvsp[(4) - (11)])),
+ (yyvsp[(10) - (11)])); ;}
+ break;
+
+
+/* Line 1267 of yacc.c. */
+#line 2826 "pars0grm.cc"
+ default: break;
+ }
+ YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+
+ *++yyvsp = yyval;
+
+
+ /* Now `shift' the result of the reduction. Determine what state
+ that goes to, based on the state we popped back to and the rule
+ number reduced by. */
+
+ yyn = yyr1[yyn];
+
+ yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+ if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+ yystate = yytable[yystate];
+ else
+ yystate = yydefgoto[yyn - YYNTOKENS];
+
+ goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+ /* If not already recovering from an error, report this error. */
+ if (!yyerrstatus)
+ {
+ ++yynerrs;
+#if ! YYERROR_VERBOSE
+ yyerror (YY_("syntax error"));
+#else
+ {
+ YYSIZE_T yysize = yysyntax_error (0, yystate, yychar);
+ if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM)
+ {
+ YYSIZE_T yyalloc = 2 * yysize;
+ if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM))
+ yyalloc = YYSTACK_ALLOC_MAXIMUM;
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+ yymsg = (char*) YYSTACK_ALLOC (yyalloc);
+ if (yymsg)
+ yymsg_alloc = yyalloc;
+ else
+ {
+ yymsg = yymsgbuf;
+ yymsg_alloc = sizeof yymsgbuf;
+ }
+ }
+
+ if (0 < yysize && yysize <= yymsg_alloc)
+ {
+ (void) yysyntax_error (yymsg, yystate, yychar);
+ yyerror (yymsg);
+ }
+ else
+ {
+ yyerror (YY_("syntax error"));
+ if (yysize != 0)
+ goto yyexhaustedlab;
+ }
+ }
+#endif
+ }
+
+
+
+ if (yyerrstatus == 3)
+ {
+ /* If just tried and failed to reuse look-ahead token after an
+ error, discard it. */
+
+ if (yychar <= YYEOF)
+ {
+ /* Return failure if at end of input. */
+ if (yychar == YYEOF)
+ YYABORT;
+ }
+ else
+ {
+ yydestruct ("Error: discarding",
+ yytoken, &yylval);
+ yychar = YYEMPTY;
+ }
+ }
+
+ /* Else will try to reuse look-ahead token after shifting the error
+ token. */
+ goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR. |
+`---------------------------------------------------*/
+yyerrorlab:
+
+ /* Pacify compilers like GCC when the user code never invokes
+ YYERROR and the label yyerrorlab therefore never appears in user
+ code. */
+ if (/*CONSTCOND*/ 0)
+ goto yyerrorlab;
+
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYERROR. */
+ YYPOPSTACK (yylen);
+ yylen = 0;
+ YY_STACK_PRINT (yyss, yyssp);
+ yystate = *yyssp;
+ goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR. |
+`-------------------------------------------------------------*/
+yyerrlab1:
+ yyerrstatus = 3; /* Each real token shifted decrements this. */
+
+ for (;;)
+ {
+ yyn = yypact[yystate];
+ if (yyn != YYPACT_NINF)
+ {
+ yyn += YYTERROR;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ {
+ yyn = yytable[yyn];
+ if (0 < yyn)
+ break;
+ }
+ }
+
+ /* Pop the current state because it cannot handle the error token. */
+ if (yyssp == yyss)
+ YYABORT;
+
+
+ yydestruct ("Error: popping",
+ yystos[yystate], yyvsp);
+ YYPOPSTACK (1);
+ yystate = *yyssp;
+ YY_STACK_PRINT (yyss, yyssp);
+ }
+
+ if (yyn == YYFINAL)
+ YYACCEPT;
+
+ *++yyvsp = yylval;
+
+
+ /* Shift the error token. */
+ YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here. |
+`-------------------------------------*/
+yyacceptlab:
+ yyresult = 0;
+ goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here. |
+`-----------------------------------*/
+yyabortlab:
+ yyresult = 1;
+ goto yyreturn;
+
+#ifndef yyoverflow
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here. |
+`-------------------------------------------------*/
+yyexhaustedlab:
+ yyerror (YY_("memory exhausted"));
+ yyresult = 2;
+ /* Fall through. */
+#endif
+
+yyreturn:
+ if (yychar != YYEOF && yychar != YYEMPTY)
+ yydestruct ("Cleanup: discarding lookahead",
+ yytoken, &yylval);
+ /* Do not reclaim the symbols of the rule which action triggered
+ this YYABORT or YYACCEPT. */
+ YYPOPSTACK (yylen);
+ YY_STACK_PRINT (yyss, yyssp);
+ while (yyssp != yyss)
+ {
+ yydestruct ("Cleanup: popping",
+ yystos[*yyssp], yyvsp);
+ YYPOPSTACK (1);
+ }
+#ifndef yyoverflow
+ if (yyss != yyssa)
+ YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+ if (yymsg != yymsgbuf)
+ YYSTACK_FREE (yymsg);
+#endif
+ /* Make sure YYID is used. */
+ return YYID (yyresult);
+}
+
+
+#line 732 "pars0grm.y"
+
+
diff --git a/storage/innobase/pars/pars0grm.y b/storage/innobase/pars/pars0grm.y
new file mode 100644
index 00000000000..60913287cc4
--- /dev/null
+++ b/storage/innobase/pars/pars0grm.y
@@ -0,0 +1,732 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser: input file for the GNU Bison parser generator
+
+See pars0lex.l for instructions on how to generate the C files for
+the InnoDB parser.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%{
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h> /* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+%}
+
+%token PARS_INT_LIT
+%token PARS_FLOAT_LIT
+%token PARS_STR_LIT
+%token PARS_FIXBINARY_LIT
+%token PARS_BLOB_LIT
+%token PARS_NULL_LIT
+%token PARS_ID_TOKEN
+%token PARS_AND_TOKEN
+%token PARS_OR_TOKEN
+%token PARS_NOT_TOKEN
+%token PARS_GE_TOKEN
+%token PARS_LE_TOKEN
+%token PARS_NE_TOKEN
+%token PARS_PROCEDURE_TOKEN
+%token PARS_IN_TOKEN
+%token PARS_OUT_TOKEN
+%token PARS_BINARY_TOKEN
+%token PARS_BLOB_TOKEN
+%token PARS_INT_TOKEN
+%token PARS_INTEGER_TOKEN
+%token PARS_FLOAT_TOKEN
+%token PARS_CHAR_TOKEN
+%token PARS_IS_TOKEN
+%token PARS_BEGIN_TOKEN
+%token PARS_END_TOKEN
+%token PARS_IF_TOKEN
+%token PARS_THEN_TOKEN
+%token PARS_ELSE_TOKEN
+%token PARS_ELSIF_TOKEN
+%token PARS_LOOP_TOKEN
+%token PARS_WHILE_TOKEN
+%token PARS_RETURN_TOKEN
+%token PARS_SELECT_TOKEN
+%token PARS_SUM_TOKEN
+%token PARS_COUNT_TOKEN
+%token PARS_DISTINCT_TOKEN
+%token PARS_FROM_TOKEN
+%token PARS_WHERE_TOKEN
+%token PARS_FOR_TOKEN
+%token PARS_DDOT_TOKEN
+%token PARS_READ_TOKEN
+%token PARS_ORDER_TOKEN
+%token PARS_BY_TOKEN
+%token PARS_ASC_TOKEN
+%token PARS_DESC_TOKEN
+%token PARS_INSERT_TOKEN
+%token PARS_INTO_TOKEN
+%token PARS_VALUES_TOKEN
+%token PARS_UPDATE_TOKEN
+%token PARS_SET_TOKEN
+%token PARS_DELETE_TOKEN
+%token PARS_CURRENT_TOKEN
+%token PARS_OF_TOKEN
+%token PARS_CREATE_TOKEN
+%token PARS_TABLE_TOKEN
+%token PARS_INDEX_TOKEN
+%token PARS_UNIQUE_TOKEN
+%token PARS_CLUSTERED_TOKEN
+%token PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+%token PARS_ON_TOKEN
+%token PARS_ASSIGN_TOKEN
+%token PARS_DECLARE_TOKEN
+%token PARS_CURSOR_TOKEN
+%token PARS_SQL_TOKEN
+%token PARS_OPEN_TOKEN
+%token PARS_FETCH_TOKEN
+%token PARS_CLOSE_TOKEN
+%token PARS_NOTFOUND_TOKEN
+%token PARS_TO_CHAR_TOKEN
+%token PARS_TO_NUMBER_TOKEN
+%token PARS_TO_BINARY_TOKEN
+%token PARS_BINARY_TO_NUMBER_TOKEN
+%token PARS_SUBSTR_TOKEN
+%token PARS_REPLSTR_TOKEN
+%token PARS_CONCAT_TOKEN
+%token PARS_INSTR_TOKEN
+%token PARS_LENGTH_TOKEN
+%token PARS_SYSDATE_TOKEN
+%token PARS_PRINTF_TOKEN
+%token PARS_ASSERT_TOKEN
+%token PARS_RND_TOKEN
+%token PARS_RND_STR_TOKEN
+%token PARS_ROW_PRINTF_TOKEN
+%token PARS_COMMIT_TOKEN
+%token PARS_ROLLBACK_TOKEN
+%token PARS_WORK_TOKEN
+%token PARS_UNSIGNED_TOKEN
+%token PARS_EXIT_TOKEN
+%token PARS_FUNCTION_TOKEN
+%token PARS_LOCK_TOKEN
+%token PARS_SHARE_TOKEN
+%token PARS_MODE_TOKEN
+%token PARS_LIKE_TOKEN
+%token PARS_LIKE_TOKEN_EXACT
+%token PARS_LIKE_TOKEN_PREFIX
+%token PARS_LIKE_TOKEN_SUFFIX
+%token PARS_LIKE_TOKEN_SUBSTR
+%token PARS_TABLE_NAME_TOKEN
+%token PARS_COMPACT_TOKEN
+%token PARS_BLOCK_SIZE_TOKEN
+%token PARS_BIGINT_TOKEN
+
+%left PARS_AND_TOKEN PARS_OR_TOKEN
+%left PARS_NOT_TOKEN
+%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN
+%left '-' '+'
+%left '*' '/'
+%left NEG /* negation--unary minus */
+%left '%'
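+
+/* Illustrative examples of these precedences: A + B * C parses as
+A + (B * C), and NOT A AND B parses as (NOT A) AND B, since tokens
+declared on later %left lines bind more tightly. */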
+
+/* Grammar follows */
+%%
+
+top_statement:
+ procedure_definition ';'
+
+statement:
+ stored_procedure_call
+ | predefined_procedure_call ';'
+ | while_statement ';'
+ | for_statement ';'
+ | exit_statement ';'
+ | if_statement ';'
+ | return_statement ';'
+ | assignment_statement ';'
+ | select_statement ';'
+ | insert_statement ';'
+ | row_printf_statement ';'
+ | delete_statement_searched ';'
+ | delete_statement_positioned ';'
+ | update_statement_searched ';'
+ | update_statement_positioned ';'
+ | open_cursor_statement ';'
+ | fetch_statement ';'
+ | close_cursor_statement ';'
+ | commit_statement ';'
+ | rollback_statement ';'
+ | create_table ';'
+ | create_index ';'
+;
+
+statement_list:
+ statement { $$ = que_node_list_add_last(NULL, $1); }
+ | statement_list statement
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+exp:
+ PARS_ID_TOKEN { $$ = $1;}
+ | function_name '(' exp_list ')'
+ { $$ = pars_func($1, $3); }
+ | PARS_INT_LIT { $$ = $1;}
+ | PARS_FLOAT_LIT { $$ = $1;}
+ | PARS_STR_LIT { $$ = $1;}
+ | PARS_FIXBINARY_LIT { $$ = $1;}
+ | PARS_BLOB_LIT { $$ = $1;}
+ | PARS_NULL_LIT { $$ = $1;}
+ | PARS_SQL_TOKEN { $$ = $1;}
+ | exp '+' exp { $$ = pars_op('+', $1, $3); }
+ | exp '-' exp { $$ = pars_op('-', $1, $3); }
+ | exp '*' exp { $$ = pars_op('*', $1, $3); }
+ | exp '/' exp { $$ = pars_op('/', $1, $3); }
+ | '-' exp %prec NEG { $$ = pars_op('-', $2, NULL); }
+ | '(' exp ')' { $$ = $2; }
+ | exp '=' exp { $$ = pars_op('=', $1, $3); }
+ | exp PARS_LIKE_TOKEN PARS_STR_LIT
+ { $$ = pars_op(PARS_LIKE_TOKEN, $1, $3); }
+ | exp '<' exp { $$ = pars_op('<', $1, $3); }
+ | exp '>' exp { $$ = pars_op('>', $1, $3); }
+ | exp PARS_GE_TOKEN exp { $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
+ | exp PARS_LE_TOKEN exp { $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
+ | exp PARS_NE_TOKEN exp { $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
+ | exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); }
+ | exp PARS_OR_TOKEN exp { $$ = pars_op(PARS_OR_TOKEN, $1, $3); }
+ | PARS_NOT_TOKEN exp { $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); }
+ | PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+ | PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+;
+
+function_name:
+ PARS_TO_CHAR_TOKEN { $$ = &pars_to_char_token; }
+ | PARS_TO_NUMBER_TOKEN { $$ = &pars_to_number_token; }
+ | PARS_TO_BINARY_TOKEN { $$ = &pars_to_binary_token; }
+ | PARS_BINARY_TO_NUMBER_TOKEN
+ { $$ = &pars_binary_to_number_token; }
+ | PARS_SUBSTR_TOKEN { $$ = &pars_substr_token; }
+ | PARS_CONCAT_TOKEN { $$ = &pars_concat_token; }
+ | PARS_INSTR_TOKEN { $$ = &pars_instr_token; }
+ | PARS_LENGTH_TOKEN { $$ = &pars_length_token; }
+ | PARS_SYSDATE_TOKEN { $$ = &pars_sysdate_token; }
+ | PARS_RND_TOKEN { $$ = &pars_rnd_token; }
+ | PARS_RND_STR_TOKEN { $$ = &pars_rnd_str_token; }
+;
+
+question_mark_list:
+ /* Nothing */
+ | '?'
+ | question_mark_list ',' '?'
+;
+
+stored_procedure_call:
+ '{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
+ { $$ = pars_stored_procedure_call(
+ static_cast<sym_node_t*>($2)); }
+;
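+
+/* Illustrative example: { P ( ? , ? ) } calls stored procedure P
+with two bound '?' placeholders. */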
+
+predefined_procedure_call:
+ predefined_procedure_name '(' exp_list ')'
+ { $$ = pars_procedure_call($1, $3); }
+;
+
+predefined_procedure_name:
+ PARS_REPLSTR_TOKEN { $$ = &pars_replstr_token; }
+ | PARS_PRINTF_TOKEN { $$ = &pars_printf_token; }
+ | PARS_ASSERT_TOKEN { $$ = &pars_assert_token; }
+;
+
+user_function_call:
+ PARS_ID_TOKEN '(' ')' { $$ = $1; }
+;
+
+table_list:
+ table_name { $$ = que_node_list_add_last(NULL, $1); }
+ | table_list ',' table_name
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_list:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | variable_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+exp_list:
+ /* Nothing */ { $$ = NULL; }
+ | exp { $$ = que_node_list_add_last(NULL, $1);}
+ | exp_list ',' exp { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_item:
+ exp { $$ = $1; }
+ | PARS_COUNT_TOKEN '(' '*' ')'
+ { $$ = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); }
+ | PARS_COUNT_TOKEN '(' PARS_DISTINCT_TOKEN PARS_ID_TOKEN ')'
+ { $$ = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ pars_func(&pars_distinct_token,
+ que_node_list_add_last(
+ NULL, $4)))); }
+ | PARS_SUM_TOKEN '(' exp ')'
+ { $$ = pars_func(&pars_sum_token,
+ que_node_list_add_last(NULL,
+ $3)); }
+;
+
+select_item_list:
+ /* Nothing */ { $$ = NULL; }
+ | select_item { $$ = que_node_list_add_last(NULL, $1); }
+ | select_item_list ',' select_item
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_list:
+ '*' { $$ = pars_select_list(&pars_star_denoter,
+ NULL); }
+ | select_item_list PARS_INTO_TOKEN variable_list
+ { $$ = pars_select_list(
+ $1, static_cast<sym_node_t*>($3)); }
+ | select_item_list { $$ = pars_select_list($1, NULL); }
+;
+
+search_condition:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_WHERE_TOKEN exp { $$ = $2; }
+;
+
+for_update_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_FOR_TOKEN PARS_UPDATE_TOKEN
+ { $$ = &pars_update_token; }
+;
+
+lock_shared_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN
+ { $$ = &pars_share_token; }
+;
+
+order_direction:
+ /* Nothing */ { $$ = &pars_asc_token; }
+ | PARS_ASC_TOKEN { $$ = &pars_asc_token; }
+ | PARS_DESC_TOKEN { $$ = &pars_desc_token; }
+;
+
+order_by_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
+ { $$ = pars_order_by(
+ static_cast<sym_node_t*>($3),
+ static_cast<pars_res_word_t*>($4)); }
+;
+
+select_statement:
+ PARS_SELECT_TOKEN select_list
+ PARS_FROM_TOKEN table_list
+ search_condition
+ for_update_clause
+ lock_shared_clause
+ order_by_clause { $$ = pars_select_statement(
+ static_cast<sel_node_t*>($2),
+ static_cast<sym_node_t*>($4),
+ static_cast<que_node_t*>($5),
+ static_cast<pars_res_word_t*>($6),
+ static_cast<pars_res_word_t*>($7),
+ static_cast<order_node_t*>($8)); }
+;
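+
+/* Illustrative examples of this rule:
+	SELECT COUNT(*) INTO N FROM SYS_TABLES
+	SELECT C FROM T WHERE C > 5 FOR UPDATE ORDER BY C ASC */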
+
+insert_statement_start:
+ PARS_INSERT_TOKEN PARS_INTO_TOKEN
+ table_name { $$ = $3; }
+;
+
+insert_statement:
+ insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
+ { $$ = pars_insert_statement(
+ static_cast<sym_node_t*>($1), $4, NULL); }
+ | insert_statement_start select_statement
+ { $$ = pars_insert_statement(
+ static_cast<sym_node_t*>($1),
+ NULL,
+ static_cast<sel_node_t*>($2)); }
+;
+
+column_assignment:
+ PARS_ID_TOKEN '=' exp { $$ = pars_column_assignment(
+ static_cast<sym_node_t*>($1),
+ static_cast<que_node_t*>($3)); }
+;
+
+column_assignment_list:
+ column_assignment { $$ = que_node_list_add_last(NULL, $1); }
+ | column_assignment_list ',' column_assignment
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+cursor_positioned:
+ PARS_WHERE_TOKEN
+ PARS_CURRENT_TOKEN PARS_OF_TOKEN
+ PARS_ID_TOKEN { $$ = $4; }
+;
+
+update_statement_start:
+ PARS_UPDATE_TOKEN table_name
+ PARS_SET_TOKEN
+ column_assignment_list { $$ = pars_update_statement_start(
+ FALSE,
+ static_cast<sym_node_t*>($2),
+ static_cast<col_assign_node_t*>($4)); }
+;
+
+update_statement_searched:
+ update_statement_start
+ search_condition { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ NULL,
+ static_cast<que_node_t*>($2)); }
+;
+
+update_statement_positioned:
+ update_statement_start
+ cursor_positioned { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ static_cast<sym_node_t*>($2),
+ NULL); }
+;
+
+delete_statement_start:
+ PARS_DELETE_TOKEN PARS_FROM_TOKEN
+ table_name { $$ = pars_update_statement_start(
+ TRUE,
+ static_cast<sym_node_t*>($3), NULL); }
+;
+
+delete_statement_searched:
+ delete_statement_start
+ search_condition { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ NULL,
+ static_cast<que_node_t*>($2)); }
+;
+
+delete_statement_positioned:
+ delete_statement_start
+ cursor_positioned { $$ = pars_update_statement(
+ static_cast<upd_node_t*>($1),
+ static_cast<sym_node_t*>($2),
+ NULL); }
+;
+
+row_printf_statement:
+ PARS_ROW_PRINTF_TOKEN select_statement
+ { $$ = pars_row_printf_statement(
+ static_cast<sel_node_t*>($2)); }
+;
+
+assignment_statement:
+ PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
+ { $$ = pars_assignment_statement(
+ static_cast<sym_node_t*>($1),
+ static_cast<que_node_t*>($3)); }
+;
+
+elsif_element:
+ PARS_ELSIF_TOKEN
+ exp PARS_THEN_TOKEN statement_list
+ { $$ = pars_elsif_element($2, $4); }
+;
+
+elsif_list:
+ elsif_element { $$ = que_node_list_add_last(NULL, $1); }
+ | elsif_list elsif_element
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+else_part:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ELSE_TOKEN statement_list
+ { $$ = $2; }
+ | elsif_list { $$ = $1; }
+;
+
+if_statement:
+ PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list
+ else_part
+ PARS_END_TOKEN PARS_IF_TOKEN
+ { $$ = pars_if_statement($2, $4, $5); }
+;
+
+while_statement:
+ PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_while_statement($2, $4); }
+;
+
+for_statement:
+ PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN
+ exp PARS_DDOT_TOKEN exp
+ PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_for_statement(
+ static_cast<sym_node_t*>($2),
+ $4, $6, $8); }
+;
+
+exit_statement:
+ PARS_EXIT_TOKEN { $$ = pars_exit_statement(); }
+;
+
+return_statement:
+ PARS_RETURN_TOKEN { $$ = pars_return_statement(); }
+;
+
+open_cursor_statement:
+ PARS_OPEN_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR,
+ static_cast<sym_node_t*>($2)); }
+;
+
+close_cursor_statement:
+ PARS_CLOSE_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR,
+ static_cast<sym_node_t*>($2)); }
+;
+
+fetch_statement:
+ PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
+ { $$ = pars_fetch_statement(
+ static_cast<sym_node_t*>($2),
+ static_cast<sym_node_t*>($4), NULL); }
+ | PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
+ { $$ = pars_fetch_statement(
+ static_cast<sym_node_t*>($2),
+ NULL,
+ static_cast<sym_node_t*>($4)); }
+;
+
+column_def:
+ PARS_ID_TOKEN type_name opt_column_len opt_unsigned opt_not_null
+ { $$ = pars_column_def(
+ static_cast<sym_node_t*>($1),
+ static_cast<pars_res_word_t*>($2),
+ static_cast<sym_node_t*>($3),
+ $4, $5); }
+;
+
+column_def_list:
+ column_def { $$ = que_node_list_add_last(NULL, $1); }
+ | column_def_list ',' column_def
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+opt_column_len:
+ /* Nothing */ { $$ = NULL; }
+ | '(' PARS_INT_LIT ')'
+ { $$ = $2; }
+;
+
+opt_unsigned:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_UNSIGNED_TOKEN
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+opt_not_null:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_NOT_TOKEN PARS_NULL_LIT
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+not_fit_in_memory:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+compact:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_COMPACT_TOKEN { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+block_size:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_BLOCK_SIZE_TOKEN '=' PARS_INT_LIT
+ { $$ = $3; }
+;
+
+create_table:
+ PARS_CREATE_TOKEN PARS_TABLE_TOKEN
+ table_name '(' column_def_list ')'
+ not_fit_in_memory compact block_size
+ { $$ = pars_create_table(
+ static_cast<sym_node_t*>($3),
+ static_cast<sym_node_t*>($5),
+ static_cast<sym_node_t*>($8),
+ static_cast<sym_node_t*>($9), $7); }
+;
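+
+/* Illustrative example: CREATE TABLE T (C INT NOT NULL)
+DOES_NOT_FIT_IN_MEMORY COMPACT BLOCK_SIZE = 8192; each of the
+three trailing clauses may also be omitted. */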
+
+column_list:
+ PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | column_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+unique_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_UNIQUE_TOKEN { $$ = &pars_unique_token; }
+;
+
+clustered_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_CLUSTERED_TOKEN { $$ = &pars_clustered_token; }
+;
+
+create_index:
+ PARS_CREATE_TOKEN unique_def
+ clustered_def
+ PARS_INDEX_TOKEN
+ PARS_ID_TOKEN PARS_ON_TOKEN
+ table_name
+ '(' column_list ')' { $$ = pars_create_index(
+ static_cast<pars_res_word_t*>($2),
+ static_cast<pars_res_word_t*>($3),
+ static_cast<sym_node_t*>($5),
+ static_cast<sym_node_t*>($7),
+ static_cast<sym_node_t*>($9)); }
+;
+
+table_name:
+ PARS_ID_TOKEN { $$ = $1; }
+ | PARS_TABLE_NAME_TOKEN { $$ = $1; }
+;
+
+commit_statement:
+ PARS_COMMIT_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_commit_statement(); }
+;
+
+rollback_statement:
+ PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_rollback_statement(); }
+;
+
+type_name:
+ PARS_INT_TOKEN { $$ = &pars_int_token; }
+ | PARS_INTEGER_TOKEN { $$ = &pars_int_token; }
+ | PARS_BIGINT_TOKEN { $$ = &pars_bigint_token; }
+ | PARS_CHAR_TOKEN { $$ = &pars_char_token; }
+ | PARS_BINARY_TOKEN { $$ = &pars_binary_token; }
+ | PARS_BLOB_TOKEN { $$ = &pars_blob_token; }
+;
+
+parameter_declaration:
+ PARS_ID_TOKEN PARS_IN_TOKEN type_name
+ { $$ = pars_parameter_declaration(
+ static_cast<sym_node_t*>($1),
+ PARS_INPUT,
+ static_cast<pars_res_word_t*>($3)); }
+ | PARS_ID_TOKEN PARS_OUT_TOKEN type_name
+ { $$ = pars_parameter_declaration(
+ static_cast<sym_node_t*>($1),
+ PARS_OUTPUT,
+ static_cast<pars_res_word_t*>($3)); }
+;
+
+parameter_declaration_list:
+ /* Nothing */ { $$ = NULL; }
+ | parameter_declaration { $$ = que_node_list_add_last(NULL, $1); }
+ | parameter_declaration_list ',' parameter_declaration
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_declaration:
+ PARS_ID_TOKEN type_name ';'
+ { $$ = pars_variable_declaration(
+ static_cast<sym_node_t*>($1),
+ static_cast<pars_res_word_t*>($2)); }
+;
+
+variable_declaration_list:
+ /* Nothing */
+ | variable_declaration
+ | variable_declaration_list variable_declaration
+;
+
+cursor_declaration:
+ PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
+ PARS_IS_TOKEN select_statement ';'
+ { $$ = pars_cursor_declaration(
+ static_cast<sym_node_t*>($3),
+ static_cast<sel_node_t*>($5)); }
+;
+
+function_declaration:
+ PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
+ { $$ = pars_function_declaration(
+ static_cast<sym_node_t*>($3)); }
+;
+
+declaration:
+ cursor_declaration
+ | function_declaration
+;
+
+declaration_list:
+ /* Nothing */
+ | declaration
+ | declaration_list declaration
+;
+
+procedure_definition:
+ PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' parameter_declaration_list ')'
+ PARS_IS_TOKEN
+ variable_declaration_list
+ declaration_list
+ PARS_BEGIN_TOKEN
+ statement_list
+ PARS_END_TOKEN { $$ = pars_procedure_definition(
+ static_cast<sym_node_t*>($2),
+ static_cast<sym_node_t*>($4),
+ $10); }
+;
+
+%%
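+
+/* Illustrative example of a complete procedure accepted by this
+grammar, of the kind InnoDB constructs internally:
+
+PROCEDURE P () IS
+DECLARE CURSOR C_T IS SELECT ID FROM SYS_TABLES;
+BEGIN
+	OPEN C_T;
+	CLOSE C_T;
+END;
+*/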
diff --git a/storage/innobase/pars/pars0lex.l b/storage/innobase/pars/pars0lex.l
new file mode 100644
index 00000000000..83c3af4b6c5
--- /dev/null
+++ b/storage/innobase/pars/pars0lex.l
@@ -0,0 +1,704 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%option nostdinit
+%option 8bit
+%option warn
+%option pointer
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyy_scan_buffer
+%option noyy_scan_bytes
+%option noyy_scan_string
+%option nounistd
+
+%{
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A) ut_malloc(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+/* Note: We cast &result to int* from yysize_t* */
+#define YY_INPUT(buf, result, max_size) \
+ pars_get_lex_chars(buf, (int*) &result, max_size)
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = static_cast<char*>(malloc(1));
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
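+		/* Grow the allocation by doubling until the
+		appended data fits. */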
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+
+ stringbuf = static_cast<char*>(
+ realloc(stringbuf, stringbuf_len_alloc));
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
+
+%}
+
+DIGIT [0-9]
+ID [a-z_A-Z][a-z_A-Z0-9]*
+TABLE_NAME [a-z_A-Z][@a-z_A-Z0-9]*\/(#sql-|[a-z_A-Z])[a-z_A-Z0-9]*
+BOUND_LIT \:[a-z_A-Z0-9]+
+BOUND_ID \$[a-z_A-Z0-9]+
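+
+/* Illustrative matches: TABLE_NAME matches test/t1 or test/#sql-ib21,
+BOUND_LIT matches :name, and BOUND_ID matches $name. */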
+
+%x comment
+%x quoted
+%x id
+%%
+
+{DIGIT}+ {
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+
+{DIGIT}+"."{DIGIT}* {
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+
+{BOUND_LIT} {
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+
+{BOUND_ID} {
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+
+"'" {
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
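+For example (illustrative), the literal 'don''t' scans to the five
+characters don't, since a doubled quote stands for a single one.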
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+<quoted>[^\']+ {
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<quoted>"'"+ {
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
+
+\" {
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+<id>[^\"]+ {
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<id>\"+ {
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+
+"NULL" {
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+
+"SQL" {
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+
+"AND" {
+ return(PARS_AND_TOKEN);
+}
+
+"OR" {
+ return(PARS_OR_TOKEN);
+}
+
+"NOT" {
+ return(PARS_NOT_TOKEN);
+}
+
+"PROCEDURE" {
+ return(PARS_PROCEDURE_TOKEN);
+}
+
+"IN" {
+ return(PARS_IN_TOKEN);
+}
+
+"OUT" {
+ return(PARS_OUT_TOKEN);
+}
+
+"BINARY" {
+ return(PARS_BINARY_TOKEN);
+}
+
+"BLOB" {
+ return(PARS_BLOB_TOKEN);
+}
+
+"INT" {
+ return(PARS_INT_TOKEN);
+}
+
+"INTEGER" {
+ return(PARS_INT_TOKEN);
+}
+
+"FLOAT" {
+ return(PARS_FLOAT_TOKEN);
+}
+
+"CHAR" {
+ return(PARS_CHAR_TOKEN);
+}
+
+"IS" {
+ return(PARS_IS_TOKEN);
+}
+
+"BEGIN" {
+ return(PARS_BEGIN_TOKEN);
+}
+
+"END" {
+ return(PARS_END_TOKEN);
+}
+
+"IF" {
+ return(PARS_IF_TOKEN);
+}
+
+"THEN" {
+ return(PARS_THEN_TOKEN);
+}
+
+"ELSE" {
+ return(PARS_ELSE_TOKEN);
+}
+
+"ELSIF" {
+ return(PARS_ELSIF_TOKEN);
+}
+
+"LOOP" {
+ return(PARS_LOOP_TOKEN);
+}
+
+"WHILE" {
+ return(PARS_WHILE_TOKEN);
+}
+
+"RETURN" {
+ return(PARS_RETURN_TOKEN);
+}
+
+"SELECT" {
+ return(PARS_SELECT_TOKEN);
+}
+
+"SUM" {
+ return(PARS_SUM_TOKEN);
+}
+
+"COUNT" {
+ return(PARS_COUNT_TOKEN);
+}
+
+"DISTINCT" {
+ return(PARS_DISTINCT_TOKEN);
+}
+
+"FROM" {
+ return(PARS_FROM_TOKEN);
+}
+
+"WHERE" {
+ return(PARS_WHERE_TOKEN);
+}
+
+"FOR" {
+ return(PARS_FOR_TOKEN);
+}
+
+"READ" {
+ return(PARS_READ_TOKEN);
+}
+
+"ORDER" {
+ return(PARS_ORDER_TOKEN);
+}
+
+"BY" {
+ return(PARS_BY_TOKEN);
+}
+
+"ASC" {
+ return(PARS_ASC_TOKEN);
+}
+
+"DESC" {
+ return(PARS_DESC_TOKEN);
+}
+
+"INSERT" {
+ return(PARS_INSERT_TOKEN);
+}
+
+"INTO" {
+ return(PARS_INTO_TOKEN);
+}
+
+"VALUES" {
+ return(PARS_VALUES_TOKEN);
+}
+
+"UPDATE" {
+ return(PARS_UPDATE_TOKEN);
+}
+
+"SET" {
+ return(PARS_SET_TOKEN);
+}
+
+"DELETE" {
+ return(PARS_DELETE_TOKEN);
+}
+
+"CURRENT" {
+ return(PARS_CURRENT_TOKEN);
+}
+
+"OF" {
+ return(PARS_OF_TOKEN);
+}
+
+"CREATE" {
+ return(PARS_CREATE_TOKEN);
+}
+
+"TABLE" {
+ return(PARS_TABLE_TOKEN);
+}
+
+"COMPACT" {
+ return(PARS_COMPACT_TOKEN);
+}
+
+"BLOCK_SIZE" {
+ return(PARS_BLOCK_SIZE_TOKEN);
+}
+
+"INDEX" {
+ return(PARS_INDEX_TOKEN);
+}
+
+"UNIQUE" {
+ return(PARS_UNIQUE_TOKEN);
+}
+
+"CLUSTERED" {
+ return(PARS_CLUSTERED_TOKEN);
+}
+
+"DOES_NOT_FIT_IN_MEMORY" {
+ return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+
+"ON" {
+ return(PARS_ON_TOKEN);
+}
+
+"DECLARE" {
+ return(PARS_DECLARE_TOKEN);
+}
+
+"CURSOR" {
+ return(PARS_CURSOR_TOKEN);
+}
+
+"OPEN" {
+ return(PARS_OPEN_TOKEN);
+}
+
+"FETCH" {
+ return(PARS_FETCH_TOKEN);
+}
+
+"CLOSE" {
+ return(PARS_CLOSE_TOKEN);
+}
+
+"NOTFOUND" {
+ return(PARS_NOTFOUND_TOKEN);
+}
+
+"TO_CHAR" {
+ return(PARS_TO_CHAR_TOKEN);
+}
+
+"TO_NUMBER" {
+ return(PARS_TO_NUMBER_TOKEN);
+}
+
+"TO_BINARY" {
+ return(PARS_TO_BINARY_TOKEN);
+}
+
+"BINARY_TO_NUMBER" {
+ return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+
+"SUBSTR" {
+ return(PARS_SUBSTR_TOKEN);
+}
+
+"REPLSTR" {
+ return(PARS_REPLSTR_TOKEN);
+}
+
+"CONCAT" {
+ return(PARS_CONCAT_TOKEN);
+}
+
+"INSTR" {
+ return(PARS_INSTR_TOKEN);
+}
+
+"LENGTH" {
+ return(PARS_LENGTH_TOKEN);
+}
+
+"SYSDATE" {
+ return(PARS_SYSDATE_TOKEN);
+}
+
+"PRINTF" {
+ return(PARS_PRINTF_TOKEN);
+}
+
+"ASSERT" {
+ return(PARS_ASSERT_TOKEN);
+}
+
+"RND" {
+ return(PARS_RND_TOKEN);
+}
+
+"RND_STR" {
+ return(PARS_RND_STR_TOKEN);
+}
+
+"ROW_PRINTF" {
+ return(PARS_ROW_PRINTF_TOKEN);
+}
+
+"COMMIT" {
+ return(PARS_COMMIT_TOKEN);
+}
+
+"ROLLBACK" {
+ return(PARS_ROLLBACK_TOKEN);
+}
+
+"WORK" {
+ return(PARS_WORK_TOKEN);
+}
+
+"UNSIGNED" {
+ return(PARS_UNSIGNED_TOKEN);
+}
+
+"EXIT" {
+ return(PARS_EXIT_TOKEN);
+}
+
+"FUNCTION" {
+ return(PARS_FUNCTION_TOKEN);
+}
+
+"LOCK" {
+ return(PARS_LOCK_TOKEN);
+}
+
+"SHARE" {
+ return(PARS_SHARE_TOKEN);
+}
+
+"MODE" {
+ return(PARS_MODE_TOKEN);
+}
+
+"LIKE" {
+ return(PARS_LIKE_TOKEN);
+}
+
+"BIGINT" {
+ return(PARS_BIGINT_TOKEN);
+}
+
+{ID} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ ut_strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+
+{TABLE_NAME} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) yytext,
+ ut_strlen(yytext));
+ return(PARS_TABLE_NAME_TOKEN);
+}
+
+".." {
+ return(PARS_DDOT_TOKEN);
+}
+
+":=" {
+ return(PARS_ASSIGN_TOKEN);
+}
+
+"<=" {
+ return(PARS_LE_TOKEN);
+}
+
+">=" {
+ return(PARS_GE_TOKEN);
+}
+
+"<>" {
+ return(PARS_NE_TOKEN);
+}
+
+"(" {
+
+ return((int)(*yytext));
+}
+
+"=" {
+
+ return((int)(*yytext));
+}
+
+">" {
+
+ return((int)(*yytext));
+}
+
+"<" {
+
+ return((int)(*yytext));
+}
+
+"," {
+
+ return((int)(*yytext));
+}
+
+";" {
+
+ return((int)(*yytext));
+}
+
+")" {
+
+ return((int)(*yytext));
+}
+
+"+" {
+
+ return((int)(*yytext));
+}
+
+"-" {
+
+ return((int)(*yytext));
+}
+
+"*" {
+
+ return((int)(*yytext));
+}
+
+"/" {
+
+ return((int)(*yytext));
+}
+
+"%" {
+
+ return((int)(*yytext));
+}
+
+"{" {
+
+ return((int)(*yytext));
+}
+
+"}" {
+
+ return((int)(*yytext));
+}
+
+"?" {
+
+ return((int)(*yytext));
+}
+
+"/*" BEGIN(comment); /* eat up comment */
+
+<comment>[^*]*
+<comment>"*"+[^*/]*
+<comment>"*"+"/" BEGIN(INITIAL);
+
+[ \t\n]+ /* eat up whitespace */
+
+
+. {
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+
+%%
+
+/**********************************************************************
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ yylex_destroy();
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc
new file mode 100644
index 00000000000..cbed2b39eeb
--- /dev/null
+++ b/storage/innobase/pars/pars0opt.cc
@@ -0,0 +1,1259 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0opt.cc
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0opt.h"
+
+#ifdef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "lock0lock.h"
+
+#define OPT_EQUAL 1 /* comparison by = */
+#define OPT_COMPARISON 2 /* comparison by <, >, <=, or >= */
+
+#define OPT_NOT_COND 1
+#define OPT_END_COND 2
+#define OPT_TEST_COND 3
+#define OPT_SCROLL_COND 4
+
+
+/*******************************************************************//**
+Inverts a comparison operator.
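+For example (illustrative), 5 < col can be treated as col > 5, so '<'
+maps to '>'.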
+@return the equivalent operator when the order of the arguments is switched */
+static
+int
+opt_invert_cmp_op(
+/*==============*/
+ int op) /*!< in: operator */
+{
+ if (op == '<') {
+ return('>');
+ } else if (op == '>') {
+ return('<');
+ } else if (op == '=') {
+ return('=');
+ } else if (op == PARS_LE_TOKEN) {
+ return(PARS_GE_TOKEN);
+ } else if (op == PARS_GE_TOKEN) {
+ return(PARS_LE_TOKEN);
+ } else {
+ /* TODO: LIKE operator */
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Checks if the value of an expression can be calculated BEFORE the nth table
+in a join is accessed. If this is the case, it can possibly be used in an
+index search for the nth table.
+@return TRUE if already determined */
+static
+ibool
+opt_check_exp_determined_before(
+/*============================*/
+ que_node_t* exp, /*!< in: expression */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table) /*!< in: nth table will be accessed */
+{
+ func_node_t* func_node;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* arg;
+ ulint i;
+
+ ut_ad(exp && sel_node);
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp);
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (!opt_check_exp_determined_before(arg, sel_node,
+ nth_table)) {
+ return(FALSE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(TRUE);
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp);
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return(TRUE);
+ }
+
+ for (i = 0; i < nth_table; i++) {
+
+ table = sel_node_get_nth_plan(sel_node, i)->table;
+
+ if (sym_node->table == table) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Looks in a comparison condition if a column value is already restricted by
+it BEFORE the nth table is accessed.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_comparison_before(
+/*==================================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: comparison condition */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ); this is inverted
+ if the column appears on the right
+ side */
+{
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* exp;
+ que_node_t* arg;
+
+ ut_ad(search_cond);
+
+ ut_a((search_cond->func == '<')
+ || (search_cond->func == '>')
+ || (search_cond->func == '=')
+ || (search_cond->func == PARS_GE_TOKEN)
+ || (search_cond->func == PARS_LE_TOKEN)
+ || (search_cond->func == PARS_LIKE_TOKEN_EXACT)
+ || (search_cond->func == PARS_LIKE_TOKEN_PREFIX)
+ || (search_cond->func == PARS_LIKE_TOKEN_SUFFIX)
+ || (search_cond->func == PARS_LIKE_TOKEN_SUBSTR));
+
+ table = sel_node_get_nth_plan(sel_node, nth_table)->table;
+
+ if ((cmp_type == OPT_EQUAL)
+ && (search_cond->func != '=')
+ && (search_cond->func != PARS_LIKE_TOKEN_EXACT)
+ && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)) {
+
+ return(NULL);
+
+ } else if ((cmp_type == OPT_COMPARISON)
+ && (search_cond->func != '<')
+ && (search_cond->func != '>')
+ && (search_cond->func != PARS_GE_TOKEN)
+ && (search_cond->func != PARS_LE_TOKEN)
+ && (search_cond->func != PARS_LIKE_TOKEN_PREFIX)
+ && (search_cond->func != PARS_LIKE_TOKEN_SUFFIX)) {
+
+ return(NULL);
+ }
+
+ arg = search_cond->args;
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = static_cast<sym_node_t*>(arg);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ /* sym_node contains the desired column id */
+
+ /* Check if the expression on the right side of the
+ operator is already determined */
+
+ exp = que_node_get_next(arg);
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = search_cond->func;
+
+ return(exp);
+ }
+ }
+ }
+
+ exp = search_cond->args;
+ arg = que_node_get_next(arg);
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = static_cast<sym_node_t*>(arg);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = opt_invert_cmp_op(search_cond->func);
+
+ return(exp);
+ }
+ }
+ }
+
+ return(NULL);
+}
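+
+/* Illustrative example (hypothetical column t.a): for "t.a > 7" the
+literal 7 is returned with *op = '>'; for the equivalent "7 < t.a",
+where the column is on the right-hand side, the same literal is returned
+but the operator is inverted with opt_invert_cmp_op(), again giving
+*op = '>'. */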
+
+/*******************************************************************//**
+Looks in a search condition if a column value is already restricted by the
+search condition BEFORE the nth table is accessed. Takes into account that
+if we will fetch in an ascending order, we cannot utilize an upper limit for
+a column value; in a descending order, respectively, a lower limit.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_cond_before(
+/*============================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: search condition or NULL */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ) */
+{
+ func_node_t* new_cond;
+ que_node_t* exp;
+
+ if (search_cond == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+ ut_a(search_cond->func != PARS_OR_TOKEN);
+ ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(search_cond->args);
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ if (exp) {
+
+ return(exp);
+ }
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ return(exp);
+ }
+
+ exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+ search_cond, sel_node,
+ nth_table, op);
+ if (exp == NULL) {
+
+ return(NULL);
+ }
+
+ /* If we will fetch in an ascending order, we cannot utilize an upper
+ limit for a column value; in a descending order, respectively, a lower
+ limit */
+
+ if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+ return(NULL);
+
+ } else if (!sel_node->asc
+ && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+ return(NULL);
+ }
+
+ return(exp);
+}
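+
+/* Illustrative example (hypothetical column t.a): in an ascending fetch
+"t.a > 0" can position the start of the range and is returned, whereas
+"t.a < 100" only bounds the range from above and yields NULL here; in a
+descending fetch the roles of the two conditions are reversed. */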
+
+/*******************************************************************//**
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of first fields in the index whose values
+we already know exactly in the query. If we have a comparison condition
+for an additional field, 2 points are added. If the index is unique, and
+we know all its unique fields, we add 1024 points, and 1024 more if it
+is the clustered index. For a clustered index we add 1 further point.
+@return goodness */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint nth_table, /*!< in: nth table in a join */
+ que_node_t** index_plan, /*!< in/out: comparison expressions for
+ this index */
+ ulint* last_op) /*!< out: last comparison operator, if
+ goodness > 1 */
+{
+ que_node_t* exp;
+ ulint goodness;
+ ulint n_fields;
+ ulint col_no;
+ ulint op;
+ ulint j;
+
+ /* At least for now we don't support using FTS indexes for queries
+ done through InnoDB's own SQL parser. */
+ if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
+ return(0);
+ }
+
+ goodness = 0;
+
+ /* Note that as higher level node pointers in the B-tree contain
+ page addresses as the last field, we must not put more fields in
+ the search tuple than dict_index_get_n_unique_in_tree(index); see
+ the note in btr_cur_search_to_nth_level. */
+
+ n_fields = dict_index_get_n_unique_in_tree(index);
+
+ for (j = 0; j < n_fields; j++) {
+
+ col_no = dict_index_get_nth_col_no(index, j);
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_EQUAL, col_no,
+ static_cast<func_node_t*>(sel_node->search_cond),
+ sel_node, nth_table, &op);
+ if (exp) {
+ /* The value for this column is exactly known already
+ at this stage of the join */
+
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 4;
+ } else {
+ /* Look for non-equality comparisons */
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_COMPARISON, col_no,
+ static_cast<func_node_t*>(
+ sel_node->search_cond),
+ sel_node, nth_table, &op);
+ if (exp) {
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 2;
+ }
+
+ break;
+ }
+ }
+
+ if (goodness >= 4 * dict_index_get_n_unique(index)) {
+ goodness += 1024;
+
+ if (dict_index_is_clust(index)) {
+
+ goodness += 1024;
+ }
+ }
+
+ /* We have to test for goodness here, as last_op may not be set */
+ if (goodness && dict_index_is_clust(index)) {
+
+ goodness++;
+ }
+
+ return(goodness);
+}
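+
+/* Worked example (hypothetical index on (a, b, c)): for the condition
+"a = 5 AND b > 7 AND c = 2", field a is matched exactly (goodness += 4),
+field b only by a comparison (goodness += 2, and the loop stops, so c = 2
+contributes nothing), giving goodness 6 (7 for a clustered index). A
+fully matched unique clustered index on (a) alone would score
+4 + 1024 + 1024 + 1 = 2053. */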
+
+/*******************************************************************//**
+Calculates the number of matched fields based on an index goodness.
+@return number of exactly or partially matched fields */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+ ulint goodness) /*!< in: goodness */
+{
+ return(((goodness % 1024) + 2) / 4);
+}
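+
+/* This inverts the goodness formula above: every exactly matched field
+contributed 4 points and a trailing comparison field 2, so adding 2
+before the integer division by 4 counts a possible comparison field in,
+while % 1024 strips the uniqueness and clustered-index bonuses; e.g.
+goodness 6 maps back to (6 + 2) / 4 = 2 fields. */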
+
+/*******************************************************************//**
+Converts a comparison operator to the corresponding search mode PAGE_CUR_GE,
+...
+@return search mode */
+UNIV_INLINE
+ulint
+opt_op_to_search_mode(
+/*==================*/
+ ibool asc, /*!< in: TRUE if the rows should be fetched in an
+ ascending order */
+ ulint op) /*!< in: operator '=', PARS_GE_TOKEN, ... */
+{
+ if (op == '='
+ || op == PARS_LIKE_TOKEN_EXACT
+ || op == PARS_LIKE_TOKEN_PREFIX
+ || op == PARS_LIKE_TOKEN_SUFFIX
+ || op == PARS_LIKE_TOKEN_SUBSTR) {
+
+ if (asc) {
+ return(PAGE_CUR_GE);
+ } else {
+ return(PAGE_CUR_LE);
+ }
+ } else if (op == '<') {
+ ut_a(!asc);
+ return(PAGE_CUR_L);
+ } else if (op == '>') {
+ ut_a(asc);
+ return(PAGE_CUR_G);
+ } else if (op == PARS_GE_TOKEN) {
+ ut_a(asc);
+ return(PAGE_CUR_GE);
+ } else if (op == PARS_LE_TOKEN) {
+ ut_a(!asc);
+ return(PAGE_CUR_LE);
+ } else {
+ ut_error;
+ }
+
+ return(0);
+}
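+
+/* Illustrative mapping: an equality or LIKE on the last search tuple
+field positions the cursor on the first record >= the tuple
+(PAGE_CUR_GE) in an ascending scan and on the last record <= the tuple
+(PAGE_CUR_LE) in a descending one; a strict '>' can only start an
+ascending scan (PAGE_CUR_G) and a strict '<' a descending one
+(PAGE_CUR_L), as the assertions above enforce. */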
+
+/*******************************************************************//**
+Determines if a node is an argument node of a function node.
+@return TRUE if is an argument */
+static
+ibool
+opt_is_arg(
+/*=======*/
+ que_node_t* arg_node, /*!< in: possible argument node */
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (arg == arg_node) {
+
+ return(TRUE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Decides if the fetching of rows should be made in a descending order, and
+also checks that the chosen query plan produces a result which satisfies
+the order-by. */
+static
+void
+opt_check_order_by(
+/*===============*/
+ sel_node_t* sel_node) /*!< in: select node; asserts an error
+ if the plan does not agree with the
+ order-by */
+{
+ order_node_t* order_node;
+ dict_table_t* order_table;
+ ulint order_col_no;
+ plan_t* plan;
+ ulint i;
+
+ if (!sel_node->order_by) {
+
+ return;
+ }
+
+ order_node = sel_node->order_by;
+ order_col_no = order_node->column->col_no;
+ order_table = order_node->column->table;
+
+ /* If there is an order-by clause, the first non-exactly matched field
+ in the index used for the last table in the table list should be the
+ column defined in the order-by clause, and for all the other tables
+we should get at most a single row; otherwise we cannot presently
+ calculate the order-by, as we have no sort utility */
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (i < sel_node->n_tables - 1) {
+ ut_a(dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match);
+ } else {
+ ut_a(plan->table == order_table);
+
+ ut_a((dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match)
+ || (dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match)
+ == order_col_no));
+ }
+ }
+}
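+
+/* Example of a plan these assertions accept (hypothetical tables):
+"SELECT ... FROM t1, t2 WHERE t1.id = :x AND ... ORDER BY t2.b" passes
+if the plan for t1 matches all fields of a unique index (at most one
+row) and, in the index chosen for t2, the first field without an exact
+match is b. */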
+
+/*******************************************************************//**
+Optimizes the access to one table in a select. Decides which index of the
+table to use. The tables are accessed in the order that they appear in
+the FROM clause of the select statement. */
+static
+void
+opt_search_plan_for_table(
+/*======================*/
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint i, /*!< in: this is the ith table */
+ dict_table_t* table) /*!< in: table */
+{
+ plan_t* plan;
+ dict_index_t* index;
+ dict_index_t* best_index;
+ ulint n_fields;
+ ulint goodness;
+ ulint last_op = 75946965; /* Eliminate a Purify
+ warning */
+ ulint best_goodness;
+ ulint best_last_op = 0; /* remove warning */
+ que_node_t* index_plan[256];
+ que_node_t* best_index_plan[256];
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ plan->table = table;
+ plan->asc = sel_node->asc;
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+
+ /* Calculate goodness for each index of the table */
+
+ index = dict_table_get_first_index(table);
+ best_index = index; /* Eliminate compiler warning */
+ best_goodness = 0;
+
+ /* Should this be a do ... while loop? -- comment by Jani */
+ while (index) {
+ goodness = opt_calc_index_goodness(index, sel_node, i,
+ index_plan, &last_op);
+ if (goodness > best_goodness) {
+
+ best_index = index;
+ best_goodness = goodness;
+ n_fields = opt_calc_n_fields_from_goodness(goodness);
+
+ ut_memcpy(best_index_plan, index_plan,
+ n_fields * sizeof(void*));
+ best_last_op = last_op;
+ }
+
+ dict_table_next_uncorrupted_index(index);
+ }
+
+ plan->index = best_index;
+
+ n_fields = opt_calc_n_fields_from_goodness(best_goodness);
+
+ if (n_fields == 0) {
+ plan->tuple = NULL;
+ plan->n_exact_match = 0;
+ } else {
+ plan->tuple = dtuple_create(pars_sym_tab_global->heap,
+ n_fields);
+ dict_index_copy_types(plan->tuple, plan->index, n_fields);
+
+ plan->tuple_exps = static_cast<que_node_t**>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap,
+ n_fields * sizeof(void*)));
+
+ ut_memcpy(plan->tuple_exps, best_index_plan,
+ n_fields * sizeof(void*));
+ if (best_last_op == '='
+ || best_last_op == PARS_LIKE_TOKEN_EXACT
+ || best_last_op == PARS_LIKE_TOKEN_PREFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUFFIX
+ || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
+ plan->n_exact_match = n_fields;
+ } else {
+ plan->n_exact_match = n_fields - 1;
+ }
+
+ plan->mode = opt_op_to_search_mode(sel_node->asc,
+ best_last_op);
+ }
+
+ if (dict_index_is_clust(best_index)
+ && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
+
+ plan->unique_search = TRUE;
+ } else {
+ plan->unique_search = FALSE;
+ }
+
+ plan->old_vers_heap = NULL;
+
+ btr_pcur_init(&(plan->pcur));
+ btr_pcur_init(&(plan->clust_pcur));
+}
+
+/*******************************************************************//**
+Looks at a comparison condition and decides if it can, and needs to, be
+tested for a table AFTER the table has been accessed.
+@return OPT_NOT_COND if not for this table, else OPT_END_COND,
+OPT_TEST_COND, or OPT_SCROLL_COND, where the last means that the
+condition need not be tested, except when scroll cursors are used */
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: comparison condition */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint op;
+ ulint j;
+
+ ut_ad(cond && sel_node);
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* Check if the condition is determined after the ith table has been
+ accessed, but not already after the (i - 1)th table */
+
+ if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ /* If the condition is an exact match condition used in constructing
+ the search tuple, it is classified as OPT_END_COND */
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ for (j = 0; j < plan->n_exact_match; j++) {
+
+ if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* If the condition is a non-exact match condition used in
+ constructing the search tuple, it is classified as OPT_SCROLL_COND.
+ When the cursor is positioned, and if a non-scroll cursor is used,
+ there is no need to test this condition; if a scroll cursor is used
+ the testing is necessary when the cursor is reversed. */
+
+ if ((n_fields > plan->n_exact_match)
+ && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) {
+
+ return(OPT_SCROLL_COND);
+ }
+
+ /* If the condition is a non-exact match condition on the first field
+in the index for which there is no exact match, and it limits the search
+ range from the opposite side of the search tuple already BEFORE we
+ access the table, it is classified as OPT_END_COND */
+
+ if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match)
+ && opt_look_for_col_in_comparison_before(
+ OPT_COMPARISON,
+ dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match),
+ cond, sel_node, i, &op)) {
+
+ if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+
+ if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* Otherwise, cond is classified as OPT_TEST_COND */
+
+ return(OPT_TEST_COND);
+}
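+
+/* Illustrative example (hypothetical index on (a, b), ascending scan):
+with the tuple built from "a = 5", that conjunct is an OPT_END_COND;
+"b < 10" limits the first non-exactly-matched field from the far end of
+the range and is also an OPT_END_COND; a conjunct mentioning a table
+accessed later is OPT_NOT_COND here and gets classified when that table
+is reached. */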
+
+/*******************************************************************//**
+Recursively looks for test conditions for a table in a join. */
+static
+void
+opt_find_test_conds(
+/*================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: conjunction of search
+ conditions or NULL */
+{
+ func_node_t* new_cond;
+ ulint fclass;
+ plan_t* plan;
+
+ if (cond == NULL) {
+
+ return;
+ }
+
+ if (cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(cond->args);
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ return;
+ }
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ fclass = opt_classify_comparison(sel_node, i, cond);
+
+ if (fclass == OPT_END_COND) {
+ UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond);
+
+ } else if (fclass == OPT_TEST_COND) {
+ UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond);
+
+ }
+}
+
+/*******************************************************************//**
+Normalizes a list of comparison conditions so that a column of the table
+appears on the left side of the comparison if possible. This is accomplished
+by switching the arguments of the operator. */
+static
+void
+opt_normalize_cmp_conds(
+/*====================*/
+ func_node_t* cond, /*!< in: first in a list of comparison
+ conditions, or NULL */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ sym_node_t* sym_node;
+
+ while (cond) {
+ arg1 = cond->args;
+ arg2 = que_node_get_next(arg1);
+
+ if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) {
+
+ sym_node = static_cast<sym_node_t*>(arg2);
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)) {
+
+ /* Switch the order of the arguments */
+
+ cond->args = arg2;
+ que_node_list_add_last(NULL, arg2);
+ que_node_list_add_last(arg2, arg1);
+
+ /* Invert the operator */
+ cond->func = opt_invert_cmp_op(cond->func);
+ }
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+}
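+
+/* Illustrative example (hypothetical column t.a): the conjunct
+"10 >= t.a" is rewritten in place as "t.a <= 10"; the argument list is
+relinked with the column first and PARS_GE_TOKEN is inverted to
+PARS_LE_TOKEN with opt_invert_cmp_op(). */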
+
+/*******************************************************************//**
+Finds out the search condition conjuncts we can, and need to, test as the
+ith table in a join is accessed. The search tuple can eliminate the need
+to test some conjuncts. */
+static
+void
+opt_determine_and_normalize_test_conds(
+/*===================================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ UT_LIST_INIT(plan->end_conds);
+ UT_LIST_INIT(plan->other_conds);
+
+ /* Recursively go through the conjuncts and classify them */
+
+ opt_find_test_conds(
+ sel_node,
+ i,
+ static_cast<func_node_t*>(sel_node->search_cond));
+
+ opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
+ plan->table);
+
+ ut_a(UT_LIST_GET_LEN(plan->end_conds) >= plan->n_exact_match);
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does
+not already exist in the list. If the column is already in the list, sets
+a value indirection to point at the earlier occurrence, except when the
+occurrence we are looking at is itself the one in the list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index of the table to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp) /*!< in: expression or condition or
+ NULL */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* col_node;
+ ulint col_pos;
+
+ if (exp == NULL) {
+
+ return;
+ }
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp);
+
+ for (arg = func_node->args;
+ arg != 0;
+ arg = que_node_get_next(arg)) {
+
+ opt_find_all_cols(
+ copy_val, index, col_list, plan, arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp);
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return;
+ }
+
+ if (sym_node->table != index->table) {
+
+ return;
+ }
+
+ /* Look for an occurrence of the same column in the plan column
+ list */
+
+ col_node = UT_LIST_GET_FIRST(*col_list);
+
+ while (col_node) {
+ if (col_node->col_no == sym_node->col_no) {
+
+ if (col_node == sym_node) {
+ /* sym_node was already in a list: do
+ nothing */
+
+ return;
+ }
+
+ /* Put an indirection */
+ sym_node->indirection = col_node;
+ sym_node->alias = col_node;
+
+ return;
+ }
+
+ col_node = UT_LIST_GET_NEXT(col_var_list, col_node);
+ }
+
+ /* The same column did not occur in the list: add it */
+
+ UT_LIST_ADD_LAST(col_var_list, *col_list, sym_node);
+
+ sym_node->copy_val = copy_val;
+
+ /* Fill in the field_no fields in sym_node */
+
+ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+ dict_table_get_first_index(index->table), sym_node->col_no);
+ if (!dict_index_is_clust(index)) {
+
+ ut_a(plan);
+
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no);
+
+ if (col_pos == ULINT_UNDEFINED) {
+
+ plan->must_get_clust = TRUE;
+ }
+
+ sym_node->field_nos[SYM_SEC_FIELD_NO] = col_pos;
+ }
+}
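+
+/* Illustrative example (hypothetical column t.a): if t.a occurs both in
+the select list and in a test condition, only the first occurrence is
+linked into the column list; the later sym_node merely gets its
+indirection and alias pointers set to the first one, so both share a
+single fetched value. */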
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in conditions which are
+not yet determined AFTER the join operation has fetched a row in the ith
+table. The values of these columns must be copied to dynamic memory for
+later use. */
+static
+void
+opt_find_copy_cols(
+/*===============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ func_node_t* new_cond;
+ plan_t* plan;
+
+ if (search_cond == NULL) {
+
+ return;
+ }
+
+ ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = static_cast<func_node_t*>(search_cond->args);
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ new_cond = static_cast<func_node_t*>(
+ que_node_get_next(new_cond));
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ return;
+ }
+
+ if (!opt_check_exp_determined_before(search_cond, sel_node, i + 1)) {
+
+ /* Any ith table columns occurring in search_cond should be
+ copied, as this condition cannot be tested already on the
+ fetch from the ith table */
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+ search_cond);
+ }
+}
+
+/*******************************************************************//**
+Classifies the table columns according to whether we use the column only while
+holding the latch on the page, or whether we have to copy the column value to
+dynamic memory. Puts the first occurrence of a column into one of the two
+lists in the plan node, and puts indirections to later occurrences of the
+column. */
+static
+void
+opt_classify_cols(
+/*==============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+ que_node_t* exp;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* The final value of the following field will depend on the
+ environment of the select statement: */
+
+ plan->must_get_clust = FALSE;
+
+ UT_LIST_INIT(plan->columns);
+
+ /* All select list columns should be copied: therefore TRUE as the
+ first argument */
+
+ for (exp = sel_node->select_list;
+ exp != 0;
+ exp = que_node_get_next(exp)) {
+
+ opt_find_all_cols(
+ TRUE, plan->index, &(plan->columns), plan, exp);
+ }
+
+ opt_find_copy_cols(
+ sel_node, i, static_cast<func_node_t*>(sel_node->search_cond));
+
+ /* All remaining columns in the search condition are temporary
+ columns: therefore FALSE */
+
+ opt_find_all_cols(
+ FALSE, plan->index, &plan->columns, plan,
+ static_cast<func_node_t*>(sel_node->search_cond));
+}
+
+/*******************************************************************//**
+Fills in the info in plan which is used in accessing a clustered index
+record. The columns must already be classified for the plan node. */
+static
+void
+opt_clust_access(
+/*=============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint n) /*!< in: nth table in select */
+{
+ plan_t* plan;
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dict_index_t* index;
+ mem_heap_t* heap;
+ ulint n_fields;
+ ulint pos;
+ ulint i;
+
+ plan = sel_node_get_nth_plan(sel_node, n);
+
+ index = plan->index;
+
+ /* The final value of the following field depends on the environment
+ of the select statement: */
+
+ plan->no_prefetch = FALSE;
+
+ if (dict_index_is_clust(index)) {
+ plan->clust_map = NULL;
+ plan->clust_ref = NULL;
+
+ return;
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ heap = pars_sym_tab_global->heap;
+
+ plan->clust_ref = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
+
+ plan->clust_map = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_fields * sizeof(ulint)));
+
+ for (i = 0; i < n_fields; i++) {
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ /* We optimize here only queries to InnoDB's internal system
+ tables, and they should not contain column prefix indexes. */
+
+ if (dict_index_get_nth_field(index, pos)->prefix_len != 0
+ || dict_index_get_nth_field(clust_index, i)
+ ->prefix_len != 0) {
+ fprintf(stderr,
+ "InnoDB: Error in pars0opt.cc:"
+ " table %s has prefix_len != 0\n",
+ index->table_name);
+ }
+
+ *(plan->clust_map + i) = pos;
+
+ ut_ad(pos != ULINT_UNDEFINED);
+ }
+}
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The
+tables are accessed in the order that they appear in the FROM clause of
+the select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node) /*!< in: parsed select node */
+{
+ sym_node_t* table_node;
+ dict_table_t* table;
+ order_node_t* order_by;
+ ulint i;
+
+ sel_node->plans = static_cast<plan_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap,
+ sel_node->n_tables * sizeof(plan_t)));
+
+ /* Analyze the search condition to find out what we know at each
+ join stage about the conditions that the columns of a table should
+ satisfy */
+
+ table_node = sel_node->table_list;
+
+ if (sel_node->order_by == NULL) {
+ sel_node->asc = TRUE;
+ } else {
+ order_by = sel_node->order_by;
+
+ sel_node->asc = order_by->asc;
+ }
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ table = table_node->table;
+
+ /* Choose index through which to access the table */
+
+ opt_search_plan_for_table(sel_node, i, table);
+
+ /* Determine the search condition conjuncts we can test at
+ this table; normalize the end conditions */
+
+ opt_determine_and_normalize_test_conds(sel_node, i);
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+
+ table_node = sel_node->table_list;
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ /* Classify the table columns into those we only need to access
+ but not copy, and into those we must copy to dynamic memory */
+
+ opt_classify_cols(sel_node, i);
+
+ /* Calculate possible info for accessing the clustered index
+ record */
+
+ opt_clust_access(sel_node, i);
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+
+ /* Check that the plan obeys a possible order-by clause: if not,
+ an assertion error occurs */
+
+ opt_check_order_by(sel_node);
+
+#ifdef UNIV_SQL_DEBUG
+ opt_print_query_plan(sel_node);
+#endif
+}
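+
+/* A sketch of the whole pass for a hypothetical single-table query
+"SELECT c FROM t WHERE a = 5": one plan_t is allocated; the index with
+the best goodness is chosen and "a = 5" becomes its search tuple and an
+end condition; column c is put on the list of columns to copy; and
+clustered index access info is filled in, in case a secondary index was
+chosen. */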
+
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node) /*!< in: select node */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint i;
+
+ fputs("QUERY PLAN FOR A SELECT NODE\n", stderr);
+
+ fputs(sel_node->asc ? "Asc. search; " : "Desc. search; ", stderr);
+
+ if (sel_node->set_x_locks) {
+ fputs("sets row x-locks; ", stderr);
+ ut_a(sel_node->row_lock_mode == LOCK_X);
+ ut_a(!sel_node->consistent_read);
+ } else if (sel_node->consistent_read) {
+ fputs("consistent read; ", stderr);
+ } else {
+ ut_a(sel_node->row_lock_mode == LOCK_S);
+ fputs("sets row s-locks; ", stderr);
+ }
+
+ putc('\n', stderr);
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ fputs("Table ", stderr);
+ dict_index_name_print(stderr, NULL, plan->index);
+ fprintf(stderr, "; exact m. %lu, match %lu, end conds %lu\n",
+ (unsigned long) plan->n_exact_match,
+ (unsigned long) n_fields,
+ (unsigned long) UT_LIST_GET_LEN(plan->end_conds));
+ }
+}
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
new file mode 100644
index 00000000000..655e5ba1324
--- /dev/null
+++ b/storage/innobase/pars/pars0pars.cc
@@ -0,0 +1,2668 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St,
+Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0pars.cc
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+/* Historical note: Innobase executed its first SQL string (CREATE TABLE)
+on 1/27/1998 */
+
+#include "pars0pars.h"
+
+#ifdef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0opt.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+UNIV_INTERN ibool pars_print_lexed = FALSE;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+UNIV_INTERN sym_tab_t* pars_sym_tab_global;
+
+/* Global variables used to denote certain reserved words, used in
+constructing the parsing tree */
+
+UNIV_INTERN pars_res_word_t pars_to_char_token = {PARS_TO_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_to_number_token = {PARS_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t pars_to_binary_token = {PARS_TO_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t pars_binary_to_number_token = {PARS_BINARY_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t pars_substr_token = {PARS_SUBSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_replstr_token = {PARS_REPLSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_concat_token = {PARS_CONCAT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_instr_token = {PARS_INSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_length_token = {PARS_LENGTH_TOKEN};
+UNIV_INTERN pars_res_word_t pars_sysdate_token = {PARS_SYSDATE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_printf_token = {PARS_PRINTF_TOKEN};
+UNIV_INTERN pars_res_word_t pars_assert_token = {PARS_ASSERT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_rnd_token = {PARS_RND_TOKEN};
+UNIV_INTERN pars_res_word_t pars_rnd_str_token = {PARS_RND_STR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_count_token = {PARS_COUNT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_sum_token = {PARS_SUM_TOKEN};
+UNIV_INTERN pars_res_word_t pars_distinct_token = {PARS_DISTINCT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_binary_token = {PARS_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t pars_blob_token = {PARS_BLOB_TOKEN};
+UNIV_INTERN pars_res_word_t pars_int_token = {PARS_INT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_bigint_token = {PARS_BIGINT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_char_token = {PARS_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_float_token = {PARS_FLOAT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_update_token = {PARS_UPDATE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_asc_token = {PARS_ASC_TOKEN};
+UNIV_INTERN pars_res_word_t pars_desc_token = {PARS_DESC_TOKEN};
+UNIV_INTERN pars_res_word_t pars_open_token = {PARS_OPEN_TOKEN};
+UNIV_INTERN pars_res_word_t pars_close_token = {PARS_CLOSE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_share_token = {PARS_SHARE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_unique_token = {PARS_UNIQUE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_clustered_token = {PARS_CLUSTERED_TOKEN};
+
+/** Global variable used to denote the '*' in SELECT * FROM.. */
+UNIV_INTERN ulint pars_star_denoter = 12345678;
+
+/********************************************************************
+Get user function with the given name.*/
+UNIV_INLINE
+pars_user_func_t*
+pars_info_lookup_user_func(
+/*=======================*/
+ /* out: user func, or NULL if not
+ found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: function name to find*/
+{
+ if (info && info->funcs) {
+ ulint i;
+ ib_vector_t* vec = info->funcs;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_user_func_t* puf;
+
+ puf = static_cast<pars_user_func_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(puf->name, name) == 0) {
+ return(puf);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************
+Get bound identifier with the given name.*/
+UNIV_INLINE
+pars_bound_id_t*
+pars_info_lookup_bound_id(
+/*======================*/
+ /* out: bound literal, or NULL if
+ not found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: bound literal name to find */
+{
+ if (info && info->bound_ids) {
+ ulint i;
+ ib_vector_t* vec = info->bound_ids;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_id_t* bid;
+
+ bid = static_cast<pars_bound_id_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(bid->name, name) == 0) {
+ return(bid);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/********************************************************************
+Get bound literal with the given name.*/
+UNIV_INLINE
+pars_bound_lit_t*
+pars_info_lookup_bound_lit(
+/*=======================*/
+ /* out: bound literal, or NULL if
+ not found */
+ pars_info_t* info, /* in: info struct */
+ const char* name) /* in: bound literal name to find */
+{
+ if (info && info->bound_lits) {
+ ulint i;
+ ib_vector_t* vec = info->bound_lits;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_lit_t* pbl;
+
+ pbl = static_cast<pars_bound_lit_t*>(
+ ib_vector_get(vec, i));
+
+ if (strcmp(pbl->name, name) == 0) {
+ return(pbl);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Determines the class of a function code.
+@return function class: PARS_FUNC_ARITH, ... */
+static
+ulint
+pars_func_get_class(
+/*================*/
+ int func) /*!< in: function code: '=', PARS_GE_TOKEN, ... */
+{
+ switch (func) {
+ case '+': case '-': case '*': case '/':
+ return(PARS_FUNC_ARITH);
+
+ case '=': case '<': case '>':
+ case PARS_GE_TOKEN: case PARS_LE_TOKEN: case PARS_NE_TOKEN:
+ return(PARS_FUNC_CMP);
+
+ case PARS_AND_TOKEN: case PARS_OR_TOKEN: case PARS_NOT_TOKEN:
+ return(PARS_FUNC_LOGICAL);
+
+ case PARS_COUNT_TOKEN: case PARS_SUM_TOKEN:
+ return(PARS_FUNC_AGGREGATE);
+
+ case PARS_TO_CHAR_TOKEN:
+ case PARS_TO_NUMBER_TOKEN:
+ case PARS_TO_BINARY_TOKEN:
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ case PARS_SYSDATE_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+ case PARS_PRINTF_TOKEN:
+ case PARS_ASSERT_TOKEN:
+ case PARS_RND_TOKEN:
+ case PARS_RND_STR_TOKEN:
+ case PARS_REPLSTR_TOKEN:
+ return(PARS_FUNC_PREDEFINED);
+
+ default:
+ return(PARS_FUNC_OTHER);
+ }
+}
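+
+/* For instance, pars_func_get_class('=') and pars_func_get_class('<')
+return PARS_FUNC_CMP, PARS_AND_TOKEN maps to PARS_FUNC_LOGICAL, and
+anything not listed, such as a user-defined function token, falls
+through to PARS_FUNC_OTHER. */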
+
+/*********************************************************************//**
+Parses an operator or predefined function expression.
+@return own: function node in a query tree */
+static
+func_node_t*
+pars_func_low(
+/*==========*/
+ int func, /*!< in: function token code */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ func_node_t* node;
+
+ node = static_cast<func_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t)));
+
+ node->common.type = QUE_NODE_FUNC;
+ dfield_set_data(&(node->common.val), NULL, 0);
+ node->common.val_buf_size = 0;
+
+ node->func = func;
+
+ node->fclass = pars_func_get_class(func);
+
+ node->args = arg;
+
+ UT_LIST_ADD_LAST(func_node_list, pars_sym_tab_global->func_node_list,
+ node);
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ return(pars_func_low(((pars_res_word_t*) res_word)->code, arg));
+}
+
+/*************************************************************************
+Rebinds a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string. */
+
+int
+pars_like_rebind(
+/*=============*/
+ /* out, own: function node in a query tree */
+ sym_node_t* node, /* in: The search string node.*/
+ const byte* ptr, /* in: literal to (re) bind */
+ ulint ptr_len)/* in: length of literal to (re) bind*/
+{
+ dtype_t* dtype;
+ dfield_t* dfield;
+ ib_like_t op_check;
+ sym_node_t* like_node;
+ sym_node_t* str_node = NULL;
+ ib_like_t op = IB_LIKE_EXACT;
+ int func = PARS_LIKE_TOKEN_EXACT;
+
+ /* Is this a 'STRING%' ? */
+ if (ptr[ptr_len - 1] == '%') {
+ op = IB_LIKE_PREFIX;
+ }
+
+ /* Is this a '%STRING' or a '%STRING%' ? */
+ if (*ptr == '%') {
+ op = (op == IB_LIKE_PREFIX) ? IB_LIKE_SUBSTR : IB_LIKE_SUFFIX;
+ }
+
+ if (node->like_node == NULL) {
+ /* Add the LIKE operator info node to the node list.
+ This will be used during the comparison phase to determine
+ how to match.*/
+ like_node = sym_tab_add_int_lit(node->sym_table, op);
+ que_node_list_add_last(NULL, like_node);
+ node->like_node = like_node;
+ str_node = sym_tab_add_str_lit(node->sym_table, ptr, ptr_len);
+ que_node_list_add_last(like_node, str_node);
+ } else {
+ like_node = node->like_node;
+
+ /* Change the value of the string in the existing
+ string node of the like node */
+ str_node = static_cast<sym_node_t*>(
+ que_node_list_get_last(like_node));
+
+ /* Must find the string node */
+ ut_a(str_node);
+ ut_a(str_node != like_node);
+ ut_a(str_node->token_type == SYM_LIT);
+
+ dfield = que_node_get_val(str_node);
+ dfield_set_data(dfield, ptr, ptr_len);
+ }
+
+ dfield = que_node_get_val(like_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_INT);
+ op_check = static_cast<ib_like_t>(
+ mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield))));
+
+ switch (op_check) {
+ case IB_LIKE_PREFIX:
+ case IB_LIKE_SUFFIX:
+ case IB_LIKE_SUBSTR:
+ case IB_LIKE_EXACT:
+ break;
+
+ default:
+ ut_error;
+ }
+
+ mach_write_to_4(static_cast<byte*>(dfield_get_data(dfield)), op);
+
+ dfield = que_node_get_val(node);
+
+ /* Adjust the length of the search value so the '%' is not
+ visible. Then create and add a search string node to the
+ search value node. Searching for %SUFFIX and %SUBSTR% requires
+ a full table scan and so we set the search value to ''.
+ For PREFIX% we simply remove the trailing '%'.*/
+
+ switch (op) {
+ case IB_LIKE_EXACT:
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr, ptr_len);
+ break;
+
+ case IB_LIKE_PREFIX:
+ func = PARS_LIKE_TOKEN_PREFIX;
+
+ /* Modify the original node */
+ dfield_set_len(dfield, ptr_len - 1);
+
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr, ptr_len - 1);
+ break;
+
+ case IB_LIKE_SUFFIX:
+ func = PARS_LIKE_TOKEN_SUFFIX;
+
+ /* Modify the original node */
+ /* Make it an empty string '' */
+ dfield_set_len(dfield, 0);
+
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr + 1, ptr_len - 1);
+ break;
+
+ case IB_LIKE_SUBSTR:
+ func = PARS_LIKE_TOKEN_SUBSTR;
+
+ /* Modify the original node */
+ /* Make it an empty string '' */
+ dfield_set_len(dfield, 0);
+
+ dfield = que_node_get_val(str_node);
+ dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ dfield_set_data(dfield, ptr + 1, ptr_len - 2);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(func);
+}
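+
+/* Illustrative examples of the rebinding: the pattern "abc%" yields
+IB_LIKE_PREFIX and PARS_LIKE_TOKEN_PREFIX, with the bound search value
+shortened to "abc"; "%abc" and "%abc%" yield the SUFFIX and SUBSTR
+variants, whose search value is set to the empty string, so that the
+match degenerates to a full scan. */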
+
+/*************************************************************************
+Parses a LIKE operator expression. */
+static
+int
+pars_like_op(
+/*=========*/
+ /* out, own: function node in a query tree */
+ que_node_t* arg) /* in: LIKE comparison string.*/
+{
+ char* ptr;
+ ulint ptr_len;
+ int func = PARS_LIKE_TOKEN_EXACT;
+ dfield_t* dfield = que_node_get_val(arg);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ ptr = static_cast<char*>(dfield_get_data(dfield));
+ ptr_len = strlen(ptr);
+
+ if (ptr_len) {
+
+ func = pars_like_rebind(
+ static_cast<sym_node_t*>(arg), (byte*) ptr, ptr_len);
+ }
+
+ return(func);
+}
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2) /*!< in: second argument or NULL for a unary
+ operator */
+{
+ que_node_list_add_last(NULL, arg1);
+
+ if (arg2) {
+ que_node_list_add_last(arg1, arg2);
+ }
+
+ /* We need to parse the string and determine whether it's a
+ PREFIX, SUFFIX or SUBSTRING comparison */
+ if (func == PARS_LIKE_TOKEN) {
+
+ ut_a(que_node_get_type(arg2) == QUE_NODE_SYMBOL);
+
+ func = pars_like_op(arg2);
+
+ ut_a(func == PARS_LIKE_TOKEN_EXACT
+ || func == PARS_LIKE_TOKEN_PREFIX
+ || func == PARS_LIKE_TOKEN_SUFFIX
+ || func == PARS_LIKE_TOKEN_SUBSTR);
+ }
+
+ return(pars_func_low(func, arg1));
+}
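+
+/* Illustrative usage (assuming the usual grammar actions): "a >= b" is
+reduced to pars_op(PARS_GE_TOKEN, a, b), which links b after a in the
+argument list and wraps both in a func_node_t; a generic
+PARS_LIKE_TOKEN is specialized here into one of the four LIKE variants
+before the function node is created. */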
+
+/*********************************************************************//**
+Parses an ORDER BY clause. Ordering by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc) /*!< in: &pars_asc_token or pars_desc_token */
+{
+ order_node_t* node;
+
+ node = static_cast<order_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(order_node_t)));
+
+ node->common.type = QUE_NODE_ORDER;
+
+ node->column = column;
+
+ if (asc == &pars_asc_token) {
+ node->asc = TRUE;
+ } else {
+ ut_a(asc == &pars_desc_token);
+ node->asc = FALSE;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Determine if a data type is a built-in string data type of the InnoDB
+SQL parser.
+@return TRUE if string data type */
+static
+ibool
+pars_is_string_type(
+/*================*/
+ ulint mtype) /*!< in: main data type */
+{
+ switch (mtype) {
+ case DATA_VARCHAR: case DATA_CHAR:
+ case DATA_FIXBINARY: case DATA_BINARY:
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Resolves the data type of a function in an expression. The argument data
+types must already be resolved. */
+static
+void
+pars_resolve_func_data_type(
+/*========================*/
+ func_node_t* node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ ut_a(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ arg = node->args;
+
+ switch (node->func) {
+ case PARS_SUM_TOKEN:
+ case '+': case '-': case '*': case '/':
+ /* Inherit the data type from the first argument (which must
+ not be the SQL null literal whose type is DATA_ERROR) */
+
+ dtype_copy(que_node_get_data_type(node),
+ que_node_get_data_type(arg));
+
+ ut_a(dtype_get_mtype(que_node_get_data_type(node))
+ == DATA_INT);
+ break;
+
+ case PARS_COUNT_TOKEN:
+ ut_a(arg);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_TO_CHAR_TOKEN:
+ case PARS_RND_STR_TOKEN:
+ ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT);
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ case PARS_TO_BINARY_TOKEN:
+ if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) {
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ } else {
+ dtype_set(que_node_get_data_type(node), DATA_BINARY,
+ 0, 0);
+ }
+ break;
+
+ case PARS_TO_NUMBER_TOKEN:
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_SYSDATE_TOKEN:
+ ut_a(arg == NULL);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ case '>': case '<': case '=':
+ case PARS_GE_TOKEN:
+ case PARS_LE_TOKEN:
+ case PARS_NE_TOKEN:
+ case PARS_AND_TOKEN:
+ case PARS_OR_TOKEN:
+ case PARS_NOT_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+
+ /* We currently have no boolean type: use the integer type */
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_RND_TOKEN:
+ ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_LIKE_TOKEN_EXACT:
+ case PARS_LIKE_TOKEN_PREFIX:
+ case PARS_LIKE_TOKEN_SUFFIX:
+ case PARS_LIKE_TOKEN_SUBSTR:
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression and the data types of
+functions. It is an error if some identifier cannot be resolved here. */
+static
+void
+pars_resolve_exp_variables_and_types(
+/*=================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL; if
+ this is not NULL then the variable
+ sym nodes are added to the
+ copy_variables list of select_node */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* node;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_variables_and_types(select_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ pars_resolve_func_data_type(func_node);
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp_node);
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the symbol table for a variable
+ or a cursor or a function with the same name */
+
+ node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (node) {
+ if (node->resolved
+ && ((node->token_type == SYM_VAR)
+ || (node->token_type == SYM_CURSOR)
+ || (node->token_type == SYM_FUNCTION))
+ && node->name
+ && (sym_node->name_len == node->name_len)
+ && (ut_memcmp(sym_node->name, node->name,
+ node->name_len) == 0)) {
+
+ /* Found a variable or a cursor declared with
+ the same name */
+
+ break;
+ }
+
+ node = UT_LIST_GET_NEXT(sym_list, node);
+ }
+
+ if (!node) {
+ fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n",
+ sym_node->name);
+ }
+
+ ut_a(node);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_IMPLICIT_VAR;
+ sym_node->alias = node;
+ sym_node->indirection = node;
+
+ if (select_node) {
+ UT_LIST_ADD_LAST(col_var_list, select_node->copy_variables,
+ sym_node);
+ }
+
+ dfield_set_type(que_node_get_val(sym_node),
+ que_node_get_data_type(node));
+}
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression list. It is an error if
+some identifier cannot be resolved here. Resolves also the data types of
+functions. */
+static
+void
+pars_resolve_exp_list_variables_and_types(
+/*======================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_variables_and_types(select_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Resolves the columns in an expression. */
+static
+void
+pars_resolve_exp_columns(
+/*=====================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ sym_node_t* t_node;
+ ulint n_cols;
+ ulint i;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_columns(table_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = static_cast<sym_node_t*>(exp_node);
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the table list for a column with the
+ same name */
+
+ t_node = table_node;
+
+ while (t_node) {
+ table = t_node->table;
+
+ n_cols = dict_table_get_n_cols(table);
+
+ for (i = 0; i < n_cols; i++) {
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, i);
+ const char* col_name
+ = dict_table_get_col_name(table, i);
+
+ if ((sym_node->name_len == ut_strlen(col_name))
+ && (0 == ut_memcmp(sym_node->name, col_name,
+ sym_node->name_len))) {
+ /* Found */
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_COLUMN;
+ sym_node->table = table;
+ sym_node->col_no = i;
+ sym_node->prefetch_buf = NULL;
+
+ dict_col_copy_type(
+ col,
+ dfield_get_type(&sym_node
+ ->common.val));
+
+ return;
+ }
+ }
+
+ t_node = static_cast<sym_node_t*>(que_node_get_next(t_node));
+ }
+}
+
+/*********************************************************************//**
+Resolves the meaning of columns in an expression list. */
+static
+void
+pars_resolve_exp_list_columns(
+/*==========================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_columns(table_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Retrieves the table definition for a table name id. */
+static
+void
+pars_retrieve_table_def(
+/*====================*/
+ sym_node_t* sym_node) /*!< in: table node */
+{
+ ut_a(sym_node);
+ ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ /* Open the table only if it is not already opened. */
+ if (sym_node->token_type != SYM_TABLE_REF_COUNTED) {
+
+ ut_a(sym_node->table == NULL);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_TABLE_REF_COUNTED;
+
+ sym_node->table = dict_table_open_on_name(
+ sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ ut_a(sym_node->table != NULL);
+ }
+}
+
+/*********************************************************************//**
+Retrieves the table definitions for a list of table name ids.
+@return number of tables */
+static
+ulint
+pars_retrieve_table_list_defs(
+/*==========================*/
+ sym_node_t* sym_node) /*!< in: first table node in list */
+{
+ ulint count = 0;
+
+ if (sym_node == NULL) {
+
+ return(count);
+ }
+
+ while (sym_node) {
+ pars_retrieve_table_def(sym_node);
+
+ count++;
+
+ sym_node = static_cast<sym_node_t*>(
+ que_node_get_next(sym_node));
+ }
+
+ return(count);
+}
+
+/*********************************************************************//**
+Adds all columns to the select list if the query is SELECT * FROM ... */
+static
+void
+pars_select_all_columns(
+/*====================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the table list */
+{
+ sym_node_t* col_node;
+ sym_node_t* table_node;
+ dict_table_t* table;
+ ulint i;
+
+ select_node->select_list = NULL;
+
+ table_node = select_node->table_list;
+
+ while (table_node) {
+ table = table_node->table;
+
+ for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ col_node = sym_tab_add_id(pars_sym_tab_global,
+ (byte*) col_name,
+ ut_strlen(col_name));
+
+ select_node->select_list = que_node_list_add_last(
+ select_node->select_list, col_node);
+ }
+
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node));
+ }
+}
+
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list) /*!< in: variables list or NULL */
+{
+ sel_node_t* node;
+
+ node = sel_node_create(pars_sym_tab_global->heap);
+
+ node->select_list = select_list;
+ node->into_list = into_list;
+
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Checks if the query is an aggregate query, in which case the select list must
+contain only aggregate function items. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the select list */
+{
+ que_node_t* exp_node;
+ func_node_t* func_node;
+ ulint n_nodes = 0;
+ ulint n_aggregate_nodes = 0;
+
+ exp_node = select_node->select_list;
+
+ while (exp_node) {
+
+ n_nodes++;
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+ func_node = static_cast<func_node_t*>(exp_node);
+
+ if (func_node->fclass == PARS_FUNC_AGGREGATE) {
+
+ n_aggregate_nodes++;
+ }
+ }
+
+ exp_node = que_node_get_next(exp_node);
+ }
+
+ if (n_aggregate_nodes > 0) {
+ ut_a(n_nodes == n_aggregate_nodes);
+
+ select_node->is_aggregate = TRUE;
+ } else {
+ select_node->is_aggregate = FALSE;
+ }
+}
+
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* lock_shared, /*!< in: NULL or &pars_share_token */
+ order_node_t* order_by) /*!< in: NULL or an order-by node */
+{
+ select_node->state = SEL_NODE_OPEN;
+
+ select_node->table_list = table_list;
+ select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+ if (select_node->select_list == &pars_star_denoter) {
+
+ /* SELECT * FROM ... */
+ pars_select_all_columns(select_node);
+ }
+
+ if (select_node->into_list) {
+ ut_a(que_node_list_get_len(select_node->into_list)
+ == que_node_list_get_len(select_node->select_list));
+ }
+
+ UT_LIST_INIT(select_node->copy_variables);
+
+ pars_resolve_exp_list_columns(table_list, select_node->select_list);
+ pars_resolve_exp_list_variables_and_types(select_node,
+ select_node->select_list);
+ pars_check_aggregate(select_node);
+
+ select_node->search_cond = search_cond;
+
+ if (search_cond) {
+ pars_resolve_exp_columns(table_list, search_cond);
+ pars_resolve_exp_variables_and_types(select_node, search_cond);
+ }
+
+ if (for_update) {
+ ut_a(!lock_shared);
+
+ select_node->set_x_locks = TRUE;
+ select_node->row_lock_mode = LOCK_X;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else if (lock_shared){
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else {
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = TRUE;
+ }
+
+ select_node->order_by = order_by;
+
+ if (order_by) {
+ pars_resolve_exp_columns(table_list, order_by->column);
+ }
+
+ /* The final values of the following fields depend on the environment
+ where the select statement appears: */
+
+ select_node->can_get_updated = FALSE;
+ select_node->explicit_cursor = NULL;
+
+ opt_search_plan(select_node);
+
+ return(select_node);
+}
+
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node) /*!< in: select node */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_CURSOR;
+ sym_node->cursor_def = select_node;
+
+ select_node->state = SEL_NODE_CLOSED;
+ select_node->explicit_cursor = sym_node;
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node) /*!< in: function id node in the symbol
+ table */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_FUNCTION;
+
+ /* Check that the function exists. */
+ ut_a(pars_info_lookup_user_func(
+ pars_sym_tab_global->info, sym_node->name));
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list)/*!< in: column assignment list, NULL
+ if delete */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(pars_sym_tab_global->heap);
+
+ node->is_delete = is_delete;
+
+ node->table_sym = table_sym;
+ node->col_assign_list = col_assign_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp) /*!< in: value to assign */
+{
+ col_assign_node_t* node;
+
+ node = static_cast<col_assign_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(col_assign_node_t)));
+ node->common.type = QUE_NODE_COL_ASSIGNMENT;
+
+ node->col = column;
+ node->val = exp;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Processes an update node assignment list. */
+static
+void
+pars_process_assign_list(
+/*=====================*/
+ upd_node_t* node) /*!< in: update node */
+{
+ col_assign_node_t* col_assign_list;
+ sym_node_t* table_sym;
+ col_assign_node_t* assign_node;
+ upd_field_t* upd_field;
+ dict_index_t* clust_index;
+ sym_node_t* col_sym;
+ ulint changes_ord_field;
+ ulint changes_field_size;
+ ulint n_assigns;
+ ulint i;
+
+ table_sym = node->table_sym;
+ col_assign_list = static_cast<col_assign_node_t*>(
+ node->col_assign_list);
+ clust_index = dict_table_get_first_index(node->table);
+
+ assign_node = col_assign_list;
+ n_assigns = 0;
+
+ while (assign_node) {
+ pars_resolve_exp_columns(table_sym, assign_node->col);
+ pars_resolve_exp_columns(table_sym, assign_node->val);
+ pars_resolve_exp_variables_and_types(NULL, assign_node->val);
+#if 0
+ ut_a(dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->col)))
+ == dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->val))));
+#endif
+
+		/* Add to the update node all the columns found in assignment
+		values as columns to copy: therefore, the first argument to
+		opt_find_all_cols() below is TRUE */
+
+ opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL,
+ assign_node->val);
+ n_assigns++;
+
+ assign_node = static_cast<col_assign_node_t*>(
+ que_node_get_next(assign_node));
+ }
+
+ node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
+
+ assign_node = col_assign_list;
+
+ changes_field_size = UPD_NODE_NO_SIZE_CHANGE;
+
+ for (i = 0; i < n_assigns; i++) {
+ upd_field = upd_get_nth_field(node->update, i);
+
+ col_sym = assign_node->col;
+
+ upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos(
+ clust_index, col_sym->col_no),
+ clust_index, NULL);
+ upd_field->exp = assign_node->val;
+
+ if (!dict_col_get_fixed_size(
+ dict_index_get_nth_col(clust_index,
+ upd_field->field_no),
+ dict_table_is_comp(node->table))) {
+ changes_field_size = 0;
+ }
+
+ assign_node = static_cast<col_assign_node_t*>(
+ que_node_get_next(assign_node));
+ }
+
+ /* Find out if the update can modify an ordering field in any index */
+
+ changes_ord_field = UPD_NODE_NO_ORD_CHANGE;
+
+ if (row_upd_changes_some_index_ord_field_binary(node->table,
+ node->update)) {
+ changes_ord_field = 0;
+ }
+
+ node->cmpl_info = changes_ord_field | changes_field_size;
+}
+
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ sym_node_t* table_sym;
+ sel_node_t* sel_node;
+ plan_t* plan;
+
+ table_sym = node->table_sym;
+
+ pars_retrieve_table_def(table_sym);
+ node->table = table_sym->table;
+
+ UT_LIST_INIT(node->columns);
+
+ /* Make the single table node into a list of table nodes of length 1 */
+
+ que_node_list_add_last(NULL, table_sym);
+
+ if (cursor_sym) {
+ pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+ sel_node = cursor_sym->alias->cursor_def;
+
+ node->searched_update = FALSE;
+ } else {
+ sel_node = pars_select_list(NULL, NULL);
+
+ pars_select_statement(sel_node, table_sym, search_cond, NULL,
+ &pars_share_token, NULL);
+ node->searched_update = TRUE;
+ sel_node->common.parent = node;
+ }
+
+ node->select = sel_node;
+
+ ut_a(!node->is_delete || (node->col_assign_list == NULL));
+ ut_a(node->is_delete || (node->col_assign_list != NULL));
+
+ if (node->is_delete) {
+ node->cmpl_info = 0;
+ } else {
+ pars_process_assign_list(node);
+ }
+
+ if (node->searched_update) {
+ node->has_clust_rec_x_lock = TRUE;
+ sel_node->set_x_locks = TRUE;
+ sel_node->row_lock_mode = LOCK_X;
+ } else {
+ node->has_clust_rec_x_lock = sel_node->set_x_locks;
+ }
+
+ ut_a(sel_node->n_tables == 1);
+ ut_a(sel_node->consistent_read == FALSE);
+ ut_a(sel_node->order_by == NULL);
+ ut_a(sel_node->is_aggregate == FALSE);
+
+ sel_node->can_get_updated = TRUE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ plan = sel_node_get_nth_plan(sel_node, 0);
+
+ plan->no_prefetch = TRUE;
+
+ if (!dict_index_is_clust(plan->index)) {
+
+ plan->must_get_clust = TRUE;
+
+ node->pcur = &(plan->clust_pcur);
+ } else {
+ node->pcur = &(plan->pcur);
+ }
+
+ return(node);
+}
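+
+/* Illustrative sketch, not part of the original commit: the cursor_sym
+branch above implements positioned updates, the other branch searched
+updates, e.g.
+
+	UPDATE t SET c = c + 1 WHERE id = :id;		-- searched
+	UPDATE t SET c = 0 WHERE CURRENT OF cur;	-- positioned
+*/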
+
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select) /*!< in: select condition or NULL */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ ulint ins_type;
+
+ ut_a(values_list || select);
+ ut_a(!values_list || !select);
+
+ if (values_list) {
+ ins_type = INS_VALUES;
+ } else {
+ ins_type = INS_SEARCHED;
+ }
+
+ pars_retrieve_table_def(table_sym);
+
+ node = ins_node_create(ins_type, table_sym->table,
+ pars_sym_tab_global->heap);
+
+ row = dtuple_create(pars_sym_tab_global->heap,
+ dict_table_get_n_cols(node->table));
+
+ dict_table_copy_types(row, table_sym->table);
+
+ ins_node_set_new_row(node, row);
+
+ node->select = select;
+
+ if (select) {
+ select->common.parent = node;
+
+ ut_a(que_node_list_get_len(select->select_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ node->values_list = values_list;
+
+ if (node->values_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+ ut_a(que_node_list_get_len(values_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ return(node);
+}
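+
+/* Illustrative sketch, not part of the original commit: the INS_VALUES and
+INS_SEARCHED modes distinguished above correspond to
+
+	INSERT INTO t VALUES (:a, :b);
+	INSERT INTO t SELECT a, b FROM s WHERE a > :low;
+*/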
+
+/*********************************************************************//**
+Set the type of a dfield. */
+static
+void
+pars_set_dfield_type(
+/*=================*/
+ dfield_t* dfield, /*!< in: dfield */
+ pars_res_word_t* type, /*!< in: pointer to a type
+ token */
+ ulint len, /*!< in: length, or 0 */
+ ibool is_unsigned, /*!< in: if TRUE, column is
+ UNSIGNED. */
+ ibool is_not_null) /*!< in: if TRUE, column is
+ NOT NULL. */
+{
+ ulint flags = 0;
+
+ if (is_not_null) {
+ flags |= DATA_NOT_NULL;
+ }
+
+ if (is_unsigned) {
+ flags |= DATA_UNSIGNED;
+ }
+
+ if (type == &pars_bigint_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_INT, flags, 8);
+ } else if (type == &pars_int_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
+
+ } else if (type == &pars_char_token) {
+ //ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
+ DATA_ENGLISH | flags, len);
+ } else if (type == &pars_binary_token) {
+ ut_a(len != 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_FIXBINARY,
+ DATA_BINARY_TYPE | flags, len);
+ } else if (type == &pars_blob_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_BLOB,
+ DATA_BINARY_TYPE | flags, 0);
+ } else {
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type) /*!< in: pointer to a type token */
+{
+ node->resolved = TRUE;
+ node->token_type = SYM_VAR;
+
+ node->param_type = PARS_NOT_PARAM;
+
+ pars_set_dfield_type(que_node_get_val(node), type, 0, FALSE, FALSE);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /*!< in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type) /*!< in: pointer to a type token */
+{
+ ut_a((param_type == PARS_INPUT) || (param_type == PARS_OUTPUT));
+
+ pars_variable_declaration(node, type);
+
+ node->param_type = param_type;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Sets the parent field in a query node list. */
+static
+void
+pars_set_parent_in_list(
+/*====================*/
+ que_node_t* node_list, /*!< in: first node in a list */
+ que_node_t* parent) /*!< in: parent value to set in all
+ nodes of the list */
+{
+ que_common_t* common;
+
+ common = static_cast<que_common_t*>(node_list);
+
+ while (common) {
+ common->parent = parent;
+
+ common = static_cast<que_common_t*>(que_node_get_next(common));
+ }
+}
+
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ elsif_node_t* node;
+
+ node = static_cast<elsif_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(elsif_node_t)));
+
+ node->common.type = QUE_NODE_ELSIF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part) /*!< in: else-part statement list
+ or elsif element list */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ node = static_cast<if_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(if_node_t)));
+
+ node->common.type = QUE_NODE_IF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) {
+
+ /* There is a list of elsif conditions */
+
+ node->else_part = NULL;
+ node->elsif_list = static_cast<elsif_node_t*>(else_part);
+
+ elsif_node = static_cast<elsif_node_t*>(else_part);
+
+ while (elsif_node) {
+ pars_set_parent_in_list(elsif_node->stat_list, node);
+
+ elsif_node = static_cast<elsif_node_t*>(
+ que_node_get_next(elsif_node));
+ }
+ } else {
+ node->else_part = else_part;
+ node->elsif_list = NULL;
+
+ pars_set_parent_in_list(else_part, node);
+ }
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ while_node_t* node;
+
+ node = static_cast<while_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(while_node_t)));
+
+ node->common.type = QUE_NODE_WHILE;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ for_node_t* node;
+
+ node = static_cast<for_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t)));
+
+ node->common.type = QUE_NODE_FOR;
+
+ pars_resolve_exp_variables_and_types(NULL, loop_var);
+ pars_resolve_exp_variables_and_types(NULL, loop_start_limit);
+ pars_resolve_exp_variables_and_types(NULL, loop_end_limit);
+
+ node->loop_var = loop_var->indirection;
+
+ ut_a(loop_var->indirection);
+
+ node->loop_start_limit = loop_start_limit;
+ node->loop_end_limit = loop_end_limit;
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void)
+/*=====================*/
+{
+ exit_node_t* node;
+
+ node = static_cast<exit_node_t*>(
+ mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t)));
+ node->common.type = QUE_NODE_EXIT;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void)
+/*=======================*/
+{
+ return_node_t* node;
+
+ node = static_cast<return_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(return_node_t)));
+ node->common.type = QUE_NODE_RETURN;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val) /*!< in: value to assign */
+{
+ assign_node_t* node;
+
+ node = static_cast<assign_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(assign_node_t)));
+ node->common.type = QUE_NODE_ASSIGNMENT;
+
+ node->var = var;
+ node->val = val;
+
+ pars_resolve_exp_variables_and_types(NULL, var);
+ pars_resolve_exp_variables_and_types(NULL, val);
+
+ ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var)))
+ == dtype_get_mtype(dfield_get_type(que_node_get_val(val))));
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args) /*!< in: argument list */
+{
+ func_node_t* node;
+
+ node = pars_func(res_word, args);
+
+ pars_resolve_exp_list_variables_and_types(NULL, args);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func) /*!< in: user function name, or NULL */
+{
+ sym_node_t* cursor_decl;
+ fetch_node_t* node;
+
+ /* Logical XOR. */
+ ut_a(!into_list != !user_func);
+
+ node = static_cast<fetch_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(fetch_node_t)));
+
+ node->common.type = QUE_NODE_FETCH;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ if (into_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+ node->into_list = into_list;
+ node->func = NULL;
+ } else {
+ pars_resolve_exp_variables_and_types(NULL, user_func);
+
+ node->func = pars_info_lookup_user_func(
+ pars_sym_tab_global->info, user_func->name);
+
+ ut_a(node->func);
+
+ node->into_list = NULL;
+ }
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->cursor_def = cursor_decl->cursor_def;
+
+ if (into_list) {
+ ut_a(que_node_list_get_len(into_list)
+ == que_node_list_get_len(node->cursor_def->select_list));
+ }
+
+ return(node);
+}
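+
+/* Illustrative sketch, not part of the original commit: the two mutually
+exclusive forms accepted above are
+
+	FETCH c INTO my_var1, my_var2;	-- into_list form
+	FETCH c INTO my_func();		-- user function form
+
+where my_func must have been registered via pars_info_bind_function(). */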
+
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return fetch statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor) /*!< in: cursor node */
+{
+ sym_node_t* cursor_decl;
+ open_node_t* node;
+
+ node = static_cast<open_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(open_node_t)));
+
+ node->common.type = QUE_NODE_OPEN;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->op_type = static_cast<open_node_op>(type);
+ node->cursor_def = cursor_decl->cursor_def;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node) /*!< in: select node */
+{
+ row_printf_node_t* node;
+
+ node = static_cast<row_printf_node_t*>(
+ mem_heap_alloc(
+ pars_sym_tab_global->heap, sizeof(row_printf_node_t)));
+ node->common.type = QUE_NODE_ROW_PRINTF;
+
+ node->sel_node = sel_node;
+
+ sel_node->common.parent = node;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void)
+/*=======================*/
+{
+ return(trx_commit_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void)
+/*=========================*/
+{
+ return(roll_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_unsigned, /*!< in: if not NULL, column
+ is of type UNSIGNED. */
+ void* is_not_null) /*!< in: if not NULL, column
+ is of type NOT NULL. */
+{
+ ulint len2;
+
+ if (len) {
+ len2 = eval_node_get_int_val(len);
+ } else {
+ len2 = 0;
+ }
+
+ pars_set_dfield_type(que_node_get_val(sym_node), type, len2,
+ is_unsigned != NULL, is_not_null != NULL);
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /*!< in: list of column names */
+	sym_node_t*	compact,	/*!< in: non-NULL if COMPACT table. */
+	sym_node_t*	block_size,	/*!< in: block size (can be NULL) */
+ void* not_fit_in_memory __attribute__((unused)))
+ /*!< in: a non-NULL pointer means that
+ this is a table which in simulations
+ should be simulated as not fitting
+ in memory; thread is put to sleep
+ to simulate disk accesses; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about non-NULL value if
+ it has to reload the table definition
+ from disk */
+{
+ dict_table_t* table;
+ sym_node_t* column;
+ tab_node_t* node;
+ const dtype_t* dtype;
+ ulint n_cols;
+ ulint flags = 0;
+ ulint flags2 = 0;
+
+ if (compact != NULL) {
+
+		/* System tables currently only use the REDUNDANT row
+		format; therefore the check for srv_file_per_table should
+		be safe for now. */
+
+ flags |= DICT_TF_COMPACT;
+
+ /* FIXME: Ideally this should be part of the SQL syntax
+ or use some other mechanism. We want to reduce dependency
+ on global variables. There is an inherent race here but
+ that has always existed around this variable. */
+ if (srv_file_per_table) {
+ flags2 |= DICT_TF2_USE_TABLESPACE;
+ }
+ }
+
+ if (block_size != NULL) {
+ ulint size;
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(block_size);
+
+ ut_a(dfield_get_len(dfield) == 4);
+ size = mach_read_from_4(static_cast<byte*>(
+ dfield_get_data(dfield)));
+
+
+ switch (size) {
+ case 0:
+ break;
+
+ case 1: case 2: case 4: case 8: case 16:
+ flags |= DICT_TF_COMPACT;
+ /* FTS-FIXME: needs the zip changes */
+ /* flags |= size << DICT_TF_COMPRESSED_SHIFT; */
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+	/* Set flags2 when creating or altering the table */
+ flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+
+
+ n_cols = que_node_list_get_len(column_defs);
+
+ table = dict_mem_table_create(
+ table_sym->name, 0, n_cols, flags, flags2);
+
+#ifdef UNIV_DEBUG
+ if (not_fit_in_memory != NULL) {
+ table->does_not_fit_in_memory = TRUE;
+ }
+#endif /* UNIV_DEBUG */
+ column = column_defs;
+
+ while (column) {
+ dtype = dfield_get_type(que_node_get_val(column));
+
+ dict_mem_table_add_col(table, table->heap,
+ column->name, dtype->mtype,
+ dtype->prtype, dtype->len);
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = static_cast<sym_node_t*>(que_node_get_next(column));
+ }
+
+ node = tab_create_graph_create(table, pars_sym_tab_global->heap, true);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list) /*!< in: list of column names */
+{
+ dict_index_t* index;
+ sym_node_t* column;
+ ind_node_t* node;
+ ulint n_fields;
+ ulint ind_type;
+
+ n_fields = que_node_list_get_len(column_list);
+
+ ind_type = 0;
+
+ if (unique_def) {
+ ind_type = ind_type | DICT_UNIQUE;
+ }
+
+ if (clustered_def) {
+ ind_type = ind_type | DICT_CLUSTERED;
+ }
+
+ index = dict_mem_index_create(table_sym->name, index_sym->name, 0,
+ ind_type, n_fields);
+ column = column_list;
+
+ while (column) {
+ dict_mem_index_add_field(index, column->name, 0);
+
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = static_cast<sym_node_t*>(que_node_get_next(column));
+ }
+
+ node = ind_create_graph_create(index, pars_sym_tab_global->heap, true);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ index_sym->resolved = TRUE;
+ index_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
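+
+/* Illustrative sketch, not part of the original commit: the clauses handled
+above combine in InnoDB SQL as, e.g.
+
+	CREATE UNIQUE CLUSTERED INDEX IND ON $table_name (DOC_ID);
+*/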
+
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /*!< in: parameter declaration list */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ proc_node_t* node;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ mem_heap_t* heap;
+
+ heap = pars_sym_tab_global->heap;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap);
+ fork->trx = NULL;
+
+ thr = que_thr_create(fork, heap);
+
+ node = static_cast<proc_node_t*>(
+ mem_heap_alloc(heap, sizeof(proc_node_t)));
+
+ node->common.type = QUE_NODE_PROC;
+ node->common.parent = thr;
+
+ sym_node->token_type = SYM_PROCEDURE_NAME;
+ sym_node->resolved = TRUE;
+
+ node->proc_id = sym_node;
+ node->param_list = param_list;
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ node->sym_tab = pars_sym_tab_global;
+
+ thr->child = node;
+
+ pars_sym_tab_global->query_graph = fork;
+
+ return(fork);
+}
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node __attribute__((unused)))
+ /*!< in: stored procedure name */
+{
+ ut_error;
+ return(NULL);
+}
+
+/*************************************************************//**
+Supplies characters to the lexical analyzer. */
+UNIV_INTERN
+int
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ int max_size) /*!< in: maximum number of characters which fit
+ in the buffer */
+{
+ int len;
+
+ len = static_cast<int>(
+ pars_sym_tab_global->string_len
+ - pars_sym_tab_global->next_char_pos);
+ if (len == 0) {
+#ifdef YYDEBUG
+ /* fputs("SQL string ends\n", stderr); */
+#endif
+ return(0);
+ }
+
+ if (len > max_size) {
+ len = max_size;
+ }
+
+#ifdef UNIV_SQL_DEBUG
+ if (pars_print_lexed) {
+
+ if (len >= 5) {
+ len = 5;
+ }
+
+ fwrite(pars_sym_tab_global->sql_string
+ + pars_sym_tab_global->next_char_pos,
+ 1, len, stderr);
+ }
+#endif /* UNIV_SQL_DEBUG */
+
+ ut_memcpy(buf, pars_sym_tab_global->sql_string
+ + pars_sym_tab_global->next_char_pos, len);
+
+ pars_sym_tab_global->next_char_pos += len;
+
+ return(len);
+}
+
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+ const char* s __attribute__((unused)))
+ /*!< in: error message string */
+{
+ ut_ad(s);
+
+ fputs("PARSER ERROR: Syntax error in SQL string\n", stderr);
+
+ ut_error;
+}
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str) /*!< in: SQL string */
+{
+ sym_node_t* sym_node;
+ mem_heap_t* heap;
+ que_t* graph;
+
+ ut_ad(str);
+
+ heap = mem_heap_create(16000);
+
+ /* Currently, the parser is not reentrant: */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ pars_sym_tab_global = sym_tab_create(heap);
+
+ pars_sym_tab_global->string_len = strlen(str);
+ pars_sym_tab_global->sql_string = static_cast<char*>(
+ mem_heap_dup(heap, str, pars_sym_tab_global->string_len + 1));
+ pars_sym_tab_global->next_char_pos = 0;
+ pars_sym_tab_global->info = info;
+
+ yyparse();
+
+ sym_node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (sym_node) {
+ ut_a(sym_node->resolved);
+
+ sym_node = UT_LIST_GET_NEXT(sym_list, sym_node);
+ }
+
+ graph = pars_sym_tab_global->query_graph;
+
+ graph->sym_tab = pars_sym_tab_global;
+ graph->info = info;
+
+ pars_sym_tab_global = NULL;
+
+ /* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */
+
+ return(graph);
+}
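+
+/* Illustrative sketch, not part of the original commit: a typical caller
+(holding dict_sys->mutex, as asserted above) does roughly
+
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "name", "test/t1");
+
+	que_t*	graph = pars_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_TABLES WHERE NAME = :name;\n"
+		"END;\n");
+
+In practice most callers go through the higher-level wrapper
+que_eval_sql(), which both parses and executes the string. */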
+
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ que_node_t* node, /*!< in: root node for an incomplete
+ query graph, or NULL for dummy graph */
+ trx_t* trx, /*!< in: transaction handle */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = node;
+
+ if (node) {
+ que_node_set_parent(node, thr);
+ }
+
+ trx->graph = NULL;
+
+ return(thr);
+}
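+
+/* Illustrative sketch, not part of the original commit: callers such as
+the rollback and purge code wrap a single node into a runnable graph
+roughly as follows (the que_* calls are recalled from the surrounding
+codebase):
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(
+			static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+	que_run_threads(thr);
+*/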
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void)
+/*==================*/
+{
+ pars_info_t* info;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(512);
+
+ info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info)));
+
+ info->heap = heap;
+ info->funcs = NULL;
+ info->bound_lits = NULL;
+ info->bound_ids = NULL;
+ info->graph_owns_us = TRUE;
+
+ return(info);
+}
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info) /*!< in, own: info struct */
+{
+ mem_heap_free(info->heap);
+}
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype) /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+{
+ pars_bound_lit_t* pbl;
+
+ ut_ad(!pars_info_get_bound_lit(info, name));
+
+ pbl = static_cast<pars_bound_lit_t*>(
+ mem_heap_alloc(info->heap, sizeof(*pbl)));
+
+ pbl->name = name;
+
+ pbl->address = address;
+ pbl->length = length;
+ pbl->type = type;
+ pbl->prtype = prtype;
+
+ if (!info->bound_lits) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->bound_lits = ib_vector_create(heap_alloc, sizeof(*pbl), 8);
+ }
+
+ ib_vector_push(info->bound_lits, pbl);
+}
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str) /*!< in: string */
+{
+ pars_info_add_literal(info, name, str, strlen(str),
+ DATA_VARCHAR, DATA_ENGLISH);
+}
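+
+/* Illustrative sketch, not part of the original commit: bound literals are
+referenced as :name inside the SQL string, e.g.
+
+	pars_info_add_str_literal(info, "table_name", "test/t1");
+
+paired with a statement such as
+
+	DELETE FROM SYS_TABLES WHERE NAME = :table_name;
+*/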
+
+/********************************************************************
+If the literal value already exists then it is rebound, otherwise a
+new entry is created. */
+UNIV_INTERN
+void
+pars_info_bind_literal(
+/*===================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const void*	address,	/*!< in: address */
+	ulint		length,		/*!< in: length of data */
+	ulint		type,		/*!< in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype)		/*!< in: precise type, e.g.
+					DATA_UNSIGNED */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, address, length, type, prtype);
+ } else {
+ pbl->address = address;
+ pbl->length = length;
+
+ sym_tab_rebind_lit(pbl->node, address, length);
+ }
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound, otherwise a
+new entry is created. */
+UNIV_INTERN
+void
+pars_info_bind_varchar_literal(
+/*===========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const byte* str, /*!< in: string */
+ ulint str_len) /*!< in: string length */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, str, str_len, DATA_VARCHAR, DATA_ENGLISH);
+ } else {
+
+ pbl->address = str;
+ pbl->length = str_len;
+
+ sym_tab_rebind_lit(pbl->node, str, str_len);
+ }
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ lint val) /*!< in: value */
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 4));
+
+ mach_write_to_4(buf, val);
+ pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound, otherwise a
+new entry is created. */
+UNIV_INTERN
+void
+pars_info_bind_int4_literal(
+/*========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const ib_uint32_t* val)		/*!< in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(info, name, val, 4, DATA_INT, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/********************************************************************
+If the literal value already exists then it is rebound, otherwise a
+new entry is created. */
+UNIV_INTERN
+void
+pars_info_bind_int8_literal(
+/*========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const ib_uint64_t* val)		/*!< in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, val, sizeof(*val), DATA_INT, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_ull_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ ib_uint64_t val) /*!< in: value */
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(info->heap, 8));
+
+ mach_write_to_8(buf, val);
+
+ pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+}
+
+/****************************************************************//**
+If the literal value already exists then it is rebound, otherwise a
+new entry is created. */
+UNIV_INTERN
+void
+pars_info_bind_ull_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const ib_uint64_t* val) /*!< in: value */
+{
+ pars_bound_lit_t* pbl;
+
+ pbl = pars_info_lookup_bound_lit(info, name);
+
+ if (!pbl) {
+ pars_info_add_literal(
+ info, name, val, sizeof(*val), DATA_FIXBINARY, 0);
+ } else {
+
+ pbl->address = val;
+ pbl->length = sizeof(*val);
+
+ sym_tab_rebind_lit(pbl->node, val, sizeof(*val));
+ }
+}
+
+/****************************************************************//**
+Adds a user function, rebinding it if the name already exists. */
+UNIV_INTERN
+void
+pars_info_bind_function(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg) /*!< in: user-supplied argument */
+{
+ pars_user_func_t* puf;
+
+ puf = pars_info_lookup_user_func(info, name);
+
+ if (!puf) {
+ if (!info->funcs) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->funcs = ib_vector_create(
+ heap_alloc, sizeof(*puf), 8);
+ }
+
+ /* Create a "new" element */
+ puf = static_cast<pars_user_func_t*>(
+ ib_vector_push(info->funcs, NULL));
+ puf->name = name;
+ }
+
+ puf->arg = arg;
+ puf->func = func;
+}
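+
+/* Illustrative sketch, not part of the original commit; it assumes the
+pars_user_func_cb_t signature from pars0types.h, which passes the fetched
+row and the user-supplied argument, and my_ctx_t is a hypothetical
+caller-defined type:
+
+	static ibool my_func(void* row, void* user_arg)
+	{
+		sel_node_t*	node = static_cast<sel_node_t*>(row);
+		my_ctx_t*	ctx = static_cast<my_ctx_t*>(user_arg);
+
+		... read the columns of node->select_list ...
+
+		return(TRUE);
+	}
+
+	pars_info_bind_function(info, "my_func", my_func, &ctx);
+
+The function is then invoked from SQL via FETCH c INTO my_func(); */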
+
+/********************************************************************
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_bind_id(
+/*==============*/
+ pars_info_t* info, /*!< in: info struct */
+	ibool		copy_name,	/*!< in: copy name if TRUE */
+ const char* name, /*!< in: name */
+ const char* id) /*!< in: id */
+{
+ pars_bound_id_t* bid;
+
+ bid = pars_info_lookup_bound_id(info, name);
+
+ if (!bid) {
+
+ if (!info->bound_ids) {
+ ib_alloc_t* heap_alloc;
+
+ heap_alloc = ib_heap_allocator_create(info->heap);
+
+ info->bound_ids = ib_vector_create(
+ heap_alloc, sizeof(*bid), 8);
+ }
+
+ /* Create a "new" element */
+ bid = static_cast<pars_bound_id_t*>(
+ ib_vector_push(info->bound_ids, NULL));
+
+ bid->name = (copy_name)
+ ? mem_heap_strdup(info->heap, name) : name;
+ }
+
+ bid->id = id;
+}
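+
+/* Illustrative sketch, not part of the original commit: bound ids are
+substituted for $name tokens in the SQL string, e.g.
+
+	pars_info_bind_id(info, TRUE, "table_name", "test/t1");
+
+paired with a statement such as
+
+	CREATE TABLE $table_name (DOC_ID BIGINT UNSIGNED NOT NULL);
+*/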
+
+/********************************************************************
+Get bound identifier with the given name.
+@return bound id, or NULL if not found */
+
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+	pars_info_t*	info,	/*!< in: info struct */
+	const char*	name)	/*!< in: bound id name to find */
+{
+ return(pars_info_lookup_bound_id(info, name));
+}
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name) /*!< in: bound literal name to find */
+{
+ return(pars_info_lookup_bound_lit(info, name));
+}
diff --git a/storage/innobase/pars/pars0sym.cc b/storage/innobase/pars/pars0sym.cc
new file mode 100644
index 00000000000..b01a69cb33a
--- /dev/null
+++ b/storage/innobase/pars/pars0sym.cc
@@ -0,0 +1,440 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0sym.cc
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0sym.h"
+
+#ifdef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#include "mem0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+#include "row0sel.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap where to create */
+{
+ sym_tab_t* sym_tab;
+
+ sym_tab = static_cast<sym_tab_t*>(
+ mem_heap_alloc(heap, sizeof(sym_tab_t)));
+
+ UT_LIST_INIT(sym_tab->sym_list);
+ UT_LIST_INIT(sym_tab->func_node_list);
+
+ sym_tab->heap = heap;
+
+ return(sym_tab);
+}
+
+
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER the parsing phase for
+variables etc. in the symbol table. Does not free the mem heap where the
+table was originally created. Also frees explicit SQL cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in, own: symbol table */
+{
+ sym_node_t* sym;
+ func_node_t* func;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+ sym != NULL;
+ sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+ /* Close the tables opened in pars_retrieve_table_def(). */
+
+ if (sym->token_type == SYM_TABLE_REF_COUNTED) {
+
+ dict_table_close(sym->table, TRUE, FALSE);
+
+ sym->table = NULL;
+ sym->resolved = FALSE;
+ sym->token_type = SYM_UNSET;
+ }
+
+ eval_node_free_val_buf(sym);
+
+ if (sym->prefetch_buf) {
+ sel_col_prefetch_buf_free(sym->prefetch_buf);
+ }
+
+ if (sym->cursor_def) {
+ que_graph_free_recursive(sym->cursor_def);
+ }
+ }
+
+ for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+ func != NULL;
+ func = UT_LIST_GET_NEXT(func_node_list, func)) {
+
+ eval_node_free_val_buf(func);
+ }
+}
+
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val) /*!< in: integer value */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4);
+
+ data = static_cast<byte*>(mem_heap_alloc(sym_tab->heap, 4));
+ mach_write_to_4(data, val);
+
+ dfield_set_data(&(node->common.val), data, 4);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len) /*!< in: string length */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val),
+ DATA_VARCHAR, DATA_ENGLISH, 0);
+
+ data = (len) ? static_cast<byte*>(mem_heap_dup(sym_tab->heap, str, len))
+ : NULL;
+
+ dfield_set_data(&(node->common.val), data, len);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type) /*!< out: type of literal (PARS_*_LIT) */
+{
+ sym_node_t* node;
+ pars_bound_lit_t* blit;
+ ulint len = 0;
+
+ blit = pars_info_get_bound_lit(sym_tab->info, name);
+ ut_a(blit);
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+ node->common.brother = node->common.parent = NULL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ switch (blit->type) {
+ case DATA_FIXBINARY:
+ len = blit->length;
+ *lit_type = PARS_FIXBINARY_LIT;
+ break;
+
+ case DATA_BLOB:
+ *lit_type = PARS_BLOB_LIT;
+ break;
+
+ case DATA_VARCHAR:
+ *lit_type = PARS_STR_LIT;
+ break;
+
+ case DATA_CHAR:
+ ut_a(blit->length > 0);
+
+ len = blit->length;
+ *lit_type = PARS_STR_LIT;
+ break;
+
+ case DATA_INT:
+ ut_a(blit->length > 0);
+ ut_a(blit->length <= 8);
+
+ len = blit->length;
+ *lit_type = PARS_INT_LIT;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ dtype_set(dfield_get_type(&node->common.val),
+ blit->type, blit->prtype, len);
+
+ dfield_set_data(&(node->common.val), blit->address, blit->length);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ blit->node = node;
+ node->like_node = NULL;
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/**********************************************************************//**
+Rebinds a literal to a node in the symbol table.
+@return symbol table node */
+
+sym_node_t*
+sym_tab_rebind_lit(
+/*===============*/
+	sym_node_t*	node,		/*!< in: node that is bound to
+					the literal */
+	const void*	address,	/*!< in: pointer to data */
+	ulint		length)		/*!< in: length of data */
+{
+ dfield_t* dfield = que_node_get_val(node);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ ut_a(node->token_type == SYM_LIT);
+
+ dfield_set_data(&node->common.val, address, length);
+
+ if (node->like_node) {
+
+ ut_a(dtype_get_mtype(dtype) == DATA_CHAR
+ || dtype_get_mtype(dtype) == DATA_VARCHAR);
+
+ /* Don't force [FALSE] creation of sub-nodes (for LIKE) */
+ pars_like_rebind(
+			node, static_cast<const byte*>(address), length);
+ }
+
+	/* FIXME: What's this? */
+ node->common.val_buf_size = 0;
+
+ if (node->prefetch_buf) {
+ sel_col_prefetch_buf_free(node->prefetch_buf);
+ node->prefetch_buf = NULL;
+ }
+
+ if (node->cursor_def) {
+ que_graph_free_recursive(node->cursor_def);
+ node->cursor_def = NULL;
+ }
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in: symbol table */
+{
+ sym_node_t* node;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dfield_get_type(&node->common.val)->mtype = DATA_ERROR;
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len) /*!< in: identifier length */
+{
+ sym_node_t* node;
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_zalloc(sym_tab->heap, sizeof(*node)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len);
+ node->name_len = len;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name) /*!< in: name of bound id */
+{
+ sym_node_t* node;
+ pars_bound_id_t* bid;
+
+ bid = pars_info_get_bound_id(sym_tab->info, name);
+ ut_a(bid);
+
+ node = static_cast<sym_node_t*>(
+ mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t)));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->table = NULL;
+ node->resolved = FALSE;
+ node->token_type = SYM_UNSET;
+ node->indirection = NULL;
+
+ node->name = mem_heap_strdup(sym_tab->heap, bid->id);
+ node->name_len = strlen(node->name);
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ node->like_node = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc
new file mode 100644
index 00000000000..fb185959d56
--- /dev/null
+++ b/storage/innobase/que/que0que.cc
@@ -0,0 +1,1318 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file que/que0que.cc
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+#ifdef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#include "usr0sess.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "row0undo.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0purge.h"
+#include "dict0crea.h"
+#include "log0log.h"
+#include "eval0proc.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0types.h"
+
+#define QUE_MAX_LOOPS_WITHOUT_CHECK 16
+
+#ifdef UNIV_DEBUG
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+UNIV_INTERN ibool que_trace_on = FALSE;
+#endif /* UNIV_DEBUG */
+
+/* Short introduction to query graphs
+ ==================================
+
+A query graph consists of nodes linked to each other in various ways. The
+execution starts at que_run_threads() which takes a que_thr_t parameter.
+que_thr_t contains two fields that control query graph execution: run_node
+and prev_node. run_node is the next node to execute and prev_node is the
+last node executed.
+
+Each node has a pointer to a 'next' statement, i.e., its brother, and a
+pointer to its parent node. The next pointer is NULL in the last statement
+of a block.
+
+Loop nodes contain a link to the first statement of the enclosed statement
+list. While the loop runs, que_thr_step() checks if execution to the loop
+node came from its parent or from one of the statement nodes in the loop. If
+it came from the parent of the loop node it starts executing the first
+statement node in the loop. If it came from one of the statement nodes in
+the loop, then it checks if the statement node has another statement node
+following it, and runs it if so.
+
+To signify loop ending, the loop statements (see e.g. while_step()) set
+que_thr_t->run_node to the loop node's parent node. This is noticed on the
+next call of que_thr_step() and execution proceeds to the node pointed to by
+the loop node's 'next' pointer.
+
+For example, the code:
+
+X := 1;
+WHILE X < 5 LOOP
+ X := X + 1;
+ X := X + 1;
+END LOOP;
+X := 5
+
+will result in the following node hierarchy, with the X-axis indicating
+'next' links and the Y-axis indicating parent/child links:
+
+A - W - A
+ |
+ |
+ A - A
+
+A = assign_node_t, W = while_node_t. */
+
+/* How is a stored procedure containing COMMIT or ROLLBACK commands
+executed?
+
+The commit or rollback can be seen as a subprocedure call.
+
+When the transaction starts to handle a rollback or commit,
+it builds a query graph which, when executed, will roll back
+or commit the incomplete transaction. The transaction
+is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
+If specified, the SQL cursors opened by the transaction are closed.
+When the execution of the graph completes, it is like returning
+from a subprocedure: the query thread which requested the operation
+starts running again. */
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction.
+***NOTE***: This is the only function in which such a transition is allowed
+to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr);	/*!< in: a query thread */
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ que_fork_t* fork;
+
+ ut_ad(heap);
+
+ fork = static_cast<que_fork_t*>(mem_heap_zalloc(heap, sizeof(*fork)));
+
+ fork->heap = heap;
+
+ fork->fork_type = fork_type;
+
+ fork->common.parent = parent;
+
+ fork->common.type = QUE_NODE_FORK;
+
+ fork->state = QUE_FORK_COMMAND_WAIT;
+
+ fork->graph = (graph != NULL) ? graph : fork;
+
+ return(fork);
+}
+
+/***********************************************************************//**
+Creates a query graph thread node.
+@return own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+ que_fork_t* parent, /*!< in: parent node, i.e., a fork node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ que_thr_t* thr;
+
+ ut_ad(parent && heap);
+
+ thr = static_cast<que_thr_t*>(mem_heap_zalloc(heap, sizeof(*thr)));
+
+ thr->graph = parent->graph;
+
+ thr->common.parent = parent;
+
+ thr->magic_n = QUE_THR_MAGIC_N;
+
+ thr->common.type = QUE_NODE_THR;
+
+ thr->state = QUE_THR_COMMAND_WAIT;
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ UT_LIST_ADD_LAST(thrs, parent->thrs, thr);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and may release
+a worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion.
+@return the query thread that needs to be released. */
+UNIV_INTERN
+que_thr_t*
+que_thr_end_lock_wait(
+/*==================*/
+ trx_t* trx) /*!< in: transaction with que_state in
+ QUE_THR_LOCK_WAIT */
+{
+ que_thr_t* thr;
+ ibool was_active;
+
+ ut_ad(lock_mutex_own());
+ ut_ad(trx_mutex_own(trx));
+
+ thr = trx->lock.wait_thr;
+
+ ut_ad(thr != NULL);
+
+ ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
+ /* In MySQL this is the only possible state here */
+ ut_a(thr->state == QUE_THR_LOCK_WAIT);
+
+ was_active = thr->is_active;
+
+ que_thr_move_to_run_state(thr);
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->lock.wait_thr = NULL;
+
+	/* In MySQL we let the OS thread (not just the query thread) wait
+ for the lock to be released: */
+
+ return((!was_active && thr != NULL) ? thr : NULL);
+}
+
+/**********************************************************************//**
+Inits a query thread for a command. */
+UNIV_INLINE
+void
+que_thr_init_command(
+/*=================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ que_thr_move_to_run_state(thr);
+}
+
+/**********************************************************************//**
+Round robin scheduler.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_scheduler_round_robin(
+/*===========================*/
+ que_fork_t* fork, /*!< in: a query fork */
+ que_thr_t* thr) /*!< in: current pos */
+{
+ trx_mutex_enter(fork->trx);
+
+ /* If no current, start first available. */
+ if (thr == NULL) {
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+ } else {
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ if (thr) {
+
+ fork->state = QUE_FORK_ACTIVE;
+
+ fork->last_sel_node = NULL;
+
+ switch (thr->state) {
+ case QUE_THR_COMMAND_WAIT:
+ case QUE_THR_COMPLETED:
+ ut_a(!thr->is_active);
+ que_thr_init_command(thr);
+ break;
+
+ case QUE_THR_SUSPENDED:
+ case QUE_THR_LOCK_WAIT:
+ default:
+ ut_error;
+
+ }
+ }
+
+ trx_mutex_exit(fork->trx);
+
+ return(thr);
+}
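+
+/* A hedged usage sketch (the purge coordinator in trx0purge.cc drives
+its query fork in roughly this way; the loop below is illustrative and
+not a copy of that code):
+
+	que_thr_t*	thr = NULL;
+
+	while ((thr = que_fork_scheduler_round_robin(fork, thr)) != NULL) {
+		que_run_threads(thr);
+	}
+*/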
+
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork) /*!< in: a query fork */
+{
+ que_thr_t* thr;
+ que_thr_t* suspended_thr = NULL;
+ que_thr_t* completed_thr = NULL;
+
+ fork->state = QUE_FORK_ACTIVE;
+
+ fork->last_sel_node = NULL;
+
+ suspended_thr = NULL;
+ completed_thr = NULL;
+
+ /* Choose the query thread to run: usually there is just one thread,
+ but in a parallelized select, which necessarily is non-scrollable,
+ there may be several to choose from */
+
+ /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT
+ state. Then we try to find a query thread in the QUE_THR_SUSPENDED
+ state, finally we try to find a query thread in the QUE_THR_COMPLETED
+ state */
+
+ /* We make a single pass over the thr list within which we note which
+ threads are ready to run. */
+ for (thr = UT_LIST_GET_FIRST(fork->thrs);
+ thr != NULL;
+ thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
+ switch (thr->state) {
+ case QUE_THR_COMMAND_WAIT:
+
+ /* We have to send the initial message to query thread
+ to start it */
+
+ que_thr_init_command(thr);
+
+ return(thr);
+
+ case QUE_THR_SUSPENDED:
+ /* In this case the execution of the thread was
+ suspended: no initial message is needed because
+ execution can continue from where it was left */
+ if (!suspended_thr) {
+ suspended_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_COMPLETED:
+ if (!completed_thr) {
+ completed_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_LOCK_WAIT:
+ ut_error;
+
+ }
+ }
+
+ if (suspended_thr) {
+
+ thr = suspended_thr;
+ que_thr_move_to_run_state(thr);
+
+ } else if (completed_thr) {
+
+ thr = completed_thr;
+ que_thr_init_command(thr);
+ } else {
+ ut_error;
+ }
+
+ return(thr);
+}
+
+/****************************************************************//**
+Tests if all the query threads in the same fork have a given state.
+@return TRUE if all the query threads in the same fork were in the
+given state */
+UNIV_INLINE
+ibool
+que_fork_all_thrs_in_state(
+/*=======================*/
+ que_fork_t* fork, /*!< in: query fork */
+ ulint state) /*!< in: state */
+{
+ que_thr_t* thr_node;
+
+ for (thr_node = UT_LIST_GET_FIRST(fork->thrs);
+ thr_node != NULL;
+ thr_node = UT_LIST_GET_NEXT(thrs, thr_node)) {
+
+ if (thr_node->state != state) {
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Calls que_graph_free_recursive for statements in a statement list. */
+static
+void
+que_graph_free_stat_list(
+/*=====================*/
+ que_node_t* node) /*!< in: first query graph node in the list */
+{
+ while (node) {
+ que_graph_free_recursive(node);
+
+ node = que_node_get_next(node);
+ }
+}
+
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations; they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+ undo_node_t* undo;
+ sel_node_t* sel;
+ ins_node_t* ins;
+ upd_node_t* upd;
+ tab_node_t* cre_tab;
+ ind_node_t* cre_ind;
+ purge_node_t* purge;
+
+ if (node == NULL) {
+
+ return;
+ }
+
+ switch (que_node_get_type(node)) {
+
+ case QUE_NODE_FORK:
+ fork = static_cast<que_fork_t*>(node);
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ while (thr) {
+ que_graph_free_recursive(thr);
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ break;
+ case QUE_NODE_THR:
+
+ thr = static_cast<que_thr_t*>(node);
+
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt;"
+ " magic n %lu\n",
+ (unsigned long) thr->magic_n);
+ mem_analyze_corruption(thr);
+ ut_error;
+ }
+
+ thr->magic_n = QUE_THR_MAGIC_FREED;
+
+ que_graph_free_recursive(thr->child);
+
+ break;
+ case QUE_NODE_UNDO:
+
+ undo = static_cast<undo_node_t*>(node);
+
+ mem_heap_free(undo->heap);
+
+ break;
+ case QUE_NODE_SELECT:
+
+ sel = static_cast<sel_node_t*>(node);
+
+ sel_node_free_private(sel);
+
+ break;
+ case QUE_NODE_INSERT:
+
+ ins = static_cast<ins_node_t*>(node);
+
+ que_graph_free_recursive(ins->select);
+
+ mem_heap_free(ins->entry_sys_heap);
+
+ break;
+ case QUE_NODE_PURGE:
+ purge = static_cast<purge_node_t*>(node);
+
+ mem_heap_free(purge->heap);
+
+ break;
+
+ case QUE_NODE_UPDATE:
+
+ upd = static_cast<upd_node_t*>(node);
+
+ if (upd->in_mysql_interface) {
+
+ btr_pcur_free_for_mysql(upd->pcur);
+ }
+
+ que_graph_free_recursive(upd->cascade_node);
+
+ if (upd->cascade_heap) {
+ mem_heap_free(upd->cascade_heap);
+ }
+
+ que_graph_free_recursive(upd->select);
+
+ mem_heap_free(upd->heap);
+
+ break;
+ case QUE_NODE_CREATE_TABLE:
+ cre_tab = static_cast<tab_node_t*>(node);
+
+ que_graph_free_recursive(cre_tab->tab_def);
+ que_graph_free_recursive(cre_tab->col_def);
+ que_graph_free_recursive(cre_tab->commit_node);
+
+ mem_heap_free(cre_tab->heap);
+
+ break;
+ case QUE_NODE_CREATE_INDEX:
+ cre_ind = static_cast<ind_node_t*>(node);
+
+ que_graph_free_recursive(cre_ind->ind_def);
+ que_graph_free_recursive(cre_ind->field_def);
+ que_graph_free_recursive(cre_ind->commit_node);
+
+ mem_heap_free(cre_ind->heap);
+
+ break;
+ case QUE_NODE_PROC:
+ que_graph_free_stat_list(((proc_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_IF:
+ que_graph_free_stat_list(((if_node_t*) node)->stat_list);
+ que_graph_free_stat_list(((if_node_t*) node)->else_part);
+ que_graph_free_stat_list(((if_node_t*) node)->elsif_list);
+
+ break;
+ case QUE_NODE_ELSIF:
+ que_graph_free_stat_list(((elsif_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_WHILE:
+ que_graph_free_stat_list(((while_node_t*) node)->stat_list);
+
+ break;
+ case QUE_NODE_FOR:
+ que_graph_free_stat_list(((for_node_t*) node)->stat_list);
+
+ break;
+
+ case QUE_NODE_ASSIGNMENT:
+ case QUE_NODE_EXIT:
+ case QUE_NODE_RETURN:
+ case QUE_NODE_COMMIT:
+ case QUE_NODE_ROLLBACK:
+ case QUE_NODE_LOCK:
+ case QUE_NODE_FUNC:
+ case QUE_NODE_ORDER:
+ case QUE_NODE_ROW_PRINTF:
+ case QUE_NODE_OPEN:
+ case QUE_NODE_FETCH:
+ /* No need to do anything */
+
+ break;
+ default:
+ fprintf(stderr,
+ "que_node struct appears corrupt; type %lu\n",
+ (unsigned long) que_node_get_type(node));
+ mem_analyze_corruption(node);
+ ut_error;
+ }
+}
+
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+ que_t* graph) /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+{
+ ut_ad(graph);
+
+ if (graph->sym_tab) {
+ /* The following call frees dynamic memory allocated
+ for variables etc. during execution. Frees also explicit
+ cursor definitions. */
+
+ sym_tab_free_private(graph->sym_tab);
+ }
+
+ if (graph->info && graph->info->graph_owns_us) {
+ pars_info_free(graph->info);
+ }
+
+ que_graph_free_recursive(graph);
+
+ mem_heap_free(graph->heap);
+}
+
+/****************************************************************//**
+Performs an execution step on a thr node.
+@return query thread to run next, or NULL if none */
+static
+que_thr_t*
+que_thr_node_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread where run_node must
+ be the thread node itself */
+{
+ ut_ad(thr->run_node == thr);
+
+ if (thr->prev_node == thr->common.parent) {
+ /* If control to the node came from above, it is just passed
+ on */
+
+ thr->run_node = thr->child;
+
+ return(thr);
+ }
+
+ trx_mutex_enter(thr_get_trx(thr));
+
+ if (que_thr_peek_stop(thr)) {
+
+ trx_mutex_exit(thr_get_trx(thr));
+
+ return(thr);
+ }
+
+ /* Thread execution completed */
+
+ thr->state = QUE_THR_COMPLETED;
+
+ trx_mutex_exit(thr_get_trx(thr));
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active.
+***NOTE***: This and ..._mysql are the only functions in which such a
+transition is allowed to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr)	/*!< in: a query thread */
+{
+ ut_ad(thr->state != QUE_THR_RUNNING);
+
+ if (!thr->is_active) {
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ thr->graph->n_active_thrs++;
+
+ trx->lock.n_active_thrs++;
+
+ thr->is_active = TRUE;
+ }
+
+ thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx.
+@return TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_t* graph;
+ trx_t* trx = thr_get_trx(thr);
+
+ graph = thr->graph;
+
+ ut_ad(trx_mutex_own(trx));
+
+ if (graph->state == QUE_FORK_COMMAND_WAIT) {
+
+ thr->state = QUE_THR_SUSPENDED;
+
+ } else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ trx->lock.wait_thr = thr;
+ thr->state = QUE_THR_LOCK_WAIT;
+
+ } else if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+
+ } else if (graph->fork_type == QUE_FORK_ROLLBACK) {
+
+ thr->state = QUE_THR_SUSPENDED;
+ } else {
+ ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decrements the query thread reference counts in the query graph and the
+transaction.
+*** NOTE ***:
+This and que_thr_stop_for_mysql are the only functions where the reference
+count can be decremented and this function may only be called from inside
+que_run_threads! These restrictions exist to make the rollback code easier
+to maintain. */
+static
+void
+que_thr_dec_refer_count(
+/*====================*/
+ que_thr_t* thr, /*!< in: query thread */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_t* trx;
+ que_fork_t* fork;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(thr->is_active);
+ ut_ad(trx_mutex_own(trx));
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ if (!que_thr_stop(thr)) {
+
+ ut_a(next_thr != NULL && *next_thr == NULL);
+
+ /* The reason for the thr suspension or wait was
+ already canceled before we came here: continue
+ running the thread.
+
+ This is also possible because in trx_commit_step() we
+ assume a single query thread. We set the query thread
+ state to QUE_THR_RUNNING. */
+
+ /* fprintf(stderr,
+ "Wait already ended: trx: %p\n", trx); */
+
+ /* Normally srv_suspend_mysql_thread resets
+ the state to DB_SUCCESS before waiting, but
+ in this case we have to do it here,
+ otherwise nobody does it. */
+
+ trx->error_state = DB_SUCCESS;
+
+ *next_thr = thr;
+
+ return;
+ }
+ }
+
+ fork = static_cast<que_fork_t*>(thr->common.parent);
+
+ --trx->lock.n_active_thrs;
+
+ --fork->n_active_thrs;
+
+ thr->is_active = FALSE;
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread. The
+query thread is stopped and made inactive, except in the case where
+it was put into the lock wait state in lock0lock.cc but the lock has already
+been granted, or the transaction has been chosen as a victim in deadlock
+resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ /* Can't be the purge transaction. */
+ ut_a(trx->id != 0);
+
+ trx_mutex_enter(trx);
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+ } else {
+ /* It must have been a lock wait but the lock was
+ already released, or this transaction was chosen
+ as a victim in selective deadlock resolution */
+
+ trx_mutex_exit(trx);
+
+ return;
+ }
+ }
+
+ ut_ad(thr->is_active == TRUE);
+ ut_ad(trx->lock.n_active_thrs == 1);
+ ut_ad(thr->graph->n_active_thrs == 1);
+
+ thr->is_active = FALSE;
+ thr->graph->n_active_thrs--;
+
+ trx->lock.n_active_thrs--;
+
+ trx_mutex_exit(trx);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+	que_thr_t*	thr,	/*!< in: a query thread */
+ trx_t* trx) /*!< in: transaction */
+{
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
+
+ mem_analyze_corruption(thr);
+
+ ut_error;
+ }
+
+ if (!thr->is_active) {
+
+ thr->graph->n_active_thrs++;
+
+ trx->lock.n_active_thrs++;
+
+ thr->is_active = TRUE;
+ }
+
+ thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in a MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /*!< in: query thread */
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_ad(thr_get_trx(thr)->id != 0);
+ ut_ad(thr->is_active == TRUE);
+ ut_ad(trx->lock.n_active_thrs == 1);
+ ut_ad(thr->graph->n_active_thrs == 1);
+
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
+
+ mem_analyze_corruption(thr);
+
+ ut_error;
+ }
+
+ thr->state = QUE_THR_COMPLETED;
+
+ thr->is_active = FALSE;
+ thr->graph->n_active_thrs--;
+
+ trx->lock.n_active_thrs--;
+}
+
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node) /*!< in: node */
+{
+ ut_ad(node);
+
+ for (;;) {
+ ulint type;
+
+ node = que_node_get_parent(node);
+
+ if (!node) {
+ break;
+ }
+
+ type = que_node_get_type(node);
+
+ if ((type == QUE_NODE_FOR) || (type == QUE_NODE_WHILE)) {
+ break;
+ }
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ ulint type;
+ const char* str;
+
+ type = que_node_get_type(node);
+
+ if (type == QUE_NODE_SELECT) {
+ str = "SELECT";
+ } else if (type == QUE_NODE_INSERT) {
+ str = "INSERT";
+ } else if (type == QUE_NODE_UPDATE) {
+ str = "UPDATE";
+ } else if (type == QUE_NODE_WHILE) {
+ str = "WHILE";
+ } else if (type == QUE_NODE_ASSIGNMENT) {
+ str = "ASSIGNMENT";
+ } else if (type == QUE_NODE_IF) {
+ str = "IF";
+ } else if (type == QUE_NODE_FETCH) {
+ str = "FETCH";
+ } else if (type == QUE_NODE_OPEN) {
+ str = "OPEN";
+ } else if (type == QUE_NODE_PROC) {
+ str = "STORED PROCEDURE";
+ } else if (type == QUE_NODE_FUNC) {
+ str = "FUNCTION";
+ } else if (type == QUE_NODE_LOCK) {
+ str = "LOCK";
+ } else if (type == QUE_NODE_THR) {
+ str = "QUERY THREAD";
+ } else if (type == QUE_NODE_COMMIT) {
+ str = "COMMIT";
+ } else if (type == QUE_NODE_UNDO) {
+ str = "UNDO ROW";
+ } else if (type == QUE_NODE_PURGE) {
+ str = "PURGE ROW";
+ } else if (type == QUE_NODE_ROLLBACK) {
+ str = "ROLLBACK";
+ } else if (type == QUE_NODE_CREATE_TABLE) {
+ str = "CREATE TABLE";
+ } else if (type == QUE_NODE_CREATE_INDEX) {
+ str = "CREATE INDEX";
+ } else if (type == QUE_NODE_FOR) {
+ str = "FOR LOOP";
+ } else if (type == QUE_NODE_RETURN) {
+ str = "RETURN";
+ } else if (type == QUE_NODE_EXIT) {
+ str = "EXIT";
+ } else {
+ str = "UNKNOWN NODE TYPE";
+ }
+
+ fprintf(stderr, "Node type %lu: %s, address %p\n",
+ (ulong) type, str, (void*) node);
+}
+
+/**********************************************************************//**
+Performs an execution step on a query thread.
+@return query thread to run next: it may differ from the input
+parameter if, e.g., a subprocedure call is made */
+UNIV_INLINE
+que_thr_t*
+que_thr_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_node_t* node;
+ que_thr_t* old_thr;
+ trx_t* trx;
+ ulint type;
+
+ trx = thr_get_trx(thr);
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ thr->resource++;
+
+ node = thr->run_node;
+ type = que_node_get_type(node);
+
+ old_thr = thr;
+
+#ifdef UNIV_DEBUG
+ if (que_trace_on) {
+ fputs("To execute: ", stderr);
+ que_node_print_info(node);
+ }
+#endif
+ if (type & QUE_NODE_CONTROL_STAT) {
+ if ((thr->prev_node != que_node_get_parent(node))
+ && que_node_get_next(thr->prev_node)) {
+
+ /* The control statements, like WHILE, always pass the
+ control to the next child statement if there is any
+ child left */
+
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ } else if (type == QUE_NODE_IF) {
+ if_step(thr);
+ } else if (type == QUE_NODE_FOR) {
+ for_step(thr);
+ } else if (type == QUE_NODE_PROC) {
+
+ /* We can access trx->undo_no without reserving
+ trx->undo_mutex, because there cannot be active query
+ threads doing updating or inserting at the moment! */
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ trx->last_sql_stat_start.least_undo_no
+ = trx->undo_no;
+ }
+
+ proc_step(thr);
+ } else if (type == QUE_NODE_WHILE) {
+ while_step(thr);
+ } else {
+ ut_error;
+ }
+ } else if (type == QUE_NODE_ASSIGNMENT) {
+ assign_step(thr);
+ } else if (type == QUE_NODE_SELECT) {
+ thr = row_sel_step(thr);
+ } else if (type == QUE_NODE_INSERT) {
+ thr = row_ins_step(thr);
+ } else if (type == QUE_NODE_UPDATE) {
+ thr = row_upd_step(thr);
+ } else if (type == QUE_NODE_FETCH) {
+ thr = fetch_step(thr);
+ } else if (type == QUE_NODE_OPEN) {
+ thr = open_step(thr);
+ } else if (type == QUE_NODE_FUNC) {
+ proc_eval_step(thr);
+
+ } else if (type == QUE_NODE_LOCK) {
+
+ ut_error;
+ } else if (type == QUE_NODE_THR) {
+ thr = que_thr_node_step(thr);
+ } else if (type == QUE_NODE_COMMIT) {
+ thr = trx_commit_step(thr);
+ } else if (type == QUE_NODE_UNDO) {
+ thr = row_undo_step(thr);
+ } else if (type == QUE_NODE_PURGE) {
+ thr = row_purge_step(thr);
+ } else if (type == QUE_NODE_RETURN) {
+ thr = return_step(thr);
+ } else if (type == QUE_NODE_EXIT) {
+ thr = exit_step(thr);
+ } else if (type == QUE_NODE_ROLLBACK) {
+ thr = trx_rollback_step(thr);
+ } else if (type == QUE_NODE_CREATE_TABLE) {
+ thr = dict_create_table_step(thr);
+ } else if (type == QUE_NODE_CREATE_INDEX) {
+ thr = dict_create_index_step(thr);
+ } else if (type == QUE_NODE_ROW_PRINTF) {
+ thr = row_printf_step(thr);
+ } else {
+ ut_error;
+ }
+
+ if (type == QUE_NODE_EXIT) {
+ old_thr->prev_node = que_node_get_containing_loop_node(node);
+ } else {
+ old_thr->prev_node = node;
+ }
+
+ if (thr) {
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Run a query thread until it finishes or encounters e.g. a lock wait. */
+static
+void
+que_run_threads_low(
+/*================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_thr_t* next_thr;
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
+	/* cumul_resource counts how many resources the OS thread (NOT the
+	query thread) has spent in this function */
+
+ trx = thr_get_trx(thr);
+
+ do {
+		/* Check that there is enough space in the log to accommodate
+		possible log entries by this query step; if the operation can
+		touch more than about 4 pages, checks must also be made within
+		the query step! */
+
+ log_free_check();
+
+ /* Perform the actual query step: note that the query thread
+ may change if, e.g., a subprocedure call is made */
+
+ /*-------------------------*/
+ next_thr = que_thr_step(thr);
+ /*-------------------------*/
+
+ trx_mutex_enter(trx);
+
+ ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS);
+
+ if (next_thr != thr) {
+ ut_a(next_thr == NULL);
+
+ /* This can change next_thr to a non-NULL value
+ if there was a lock wait that already completed. */
+
+ que_thr_dec_refer_count(thr, &next_thr);
+
+ if (next_thr != NULL) {
+
+ thr = next_thr;
+ }
+ }
+
+ ut_ad(trx == thr_get_trx(thr));
+
+ trx_mutex_exit(trx);
+
+ } while (next_thr != NULL);
+}
+
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(!trx_mutex_own(thr_get_trx(thr)));
+
+loop:
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+
+ que_run_threads_low(thr);
+
+ switch (thr->state) {
+
+ case QUE_THR_RUNNING:
+ /* There probably was a lock wait, but it already ended
+ before we came here: continue running thr */
+
+ goto loop;
+
+ case QUE_THR_LOCK_WAIT:
+ lock_wait_suspend_thread(thr);
+
+ trx_mutex_enter(thr_get_trx(thr));
+
+ ut_a(thr_get_trx(thr)->id != 0);
+
+ if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
+ /* thr was chosen as a deadlock victim or there was
+ a lock wait timeout */
+
+ que_thr_dec_refer_count(thr, NULL);
+ trx_mutex_exit(thr_get_trx(thr));
+ break;
+ }
+
+ trx_mutex_exit(thr_get_trx(thr));
+ goto loop;
+
+ case QUE_THR_COMPLETED:
+ case QUE_THR_COMMAND_WAIT:
+ /* Do nothing */
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Evaluate the given SQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ ibool reserve_dict_mutex,
+ /*!< in: if TRUE, acquire/release
+ dict_sys->mutex around call to pars_sql. */
+ trx_t* trx) /*!< in: trx */
+{
+ que_thr_t* thr;
+ que_t* graph;
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ if (reserve_dict_mutex) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ graph = pars_sql(info, sql);
+
+ if (reserve_dict_mutex) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ if (reserve_dict_mutex) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ que_graph_free(graph);
+
+ if (reserve_dict_mutex) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ return(trx->error_state);
+}
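+
+/* A hedged usage sketch (a hypothetical call: pars_info_create() and
+pars_info_add_int4_literal() are real helpers from pars0pars.h, but
+SYS_FOO is a made-up table name). The info struct is freed together
+with the graph, see que_graph_free() above:
+
+	dberr_t		err;
+	pars_info_t*	info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "id", 42);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE P () IS\n"
+			   "BEGIN\n"
+			   "DELETE FROM SYS_FOO WHERE ID = :id;\n"
+			   "END;\n",
+			   TRUE, trx);
+*/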
+
+/*********************************************************************//**
+Initialise the query sub-system. */
+UNIV_INTERN
+void
+que_init(void)
+/*==========*/
+{
+ /* No op */
+}
+
+/*********************************************************************//**
+Close the query sub-system. */
+UNIV_INTERN
+void
+que_close(void)
+/*===========*/
+{
+ /* No op */
+}
diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc
new file mode 100644
index 00000000000..faf4102437b
--- /dev/null
+++ b/storage/innobase/read/read0read.cc
@@ -0,0 +1,654 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file read/read0read.cc
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#include "read0read.h"
+
+#ifdef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#include "srv0srv.h"
+#include "trx0sys.h"
+
+/*
+-------------------------------------------------------------------------------
+FACT A: A cursor read view on a secondary index sees only committed versions
+-------
+of the records in the secondary index, or those versions of rows created
+by the transaction which created the cursor before the cursor was created,
+even if the transaction which created the cursor has since changed that
+clustered index page.
+
+PROOF: We must show that the read always goes to the clustered index record
+to check that the record is visible in the cursor read view. Consider e.g.
+the following table and SQL clauses:
+
+create table t1(a int not null, b int, primary key(a), index(b));
+insert into t1 values (1,1),(2,2);
+commit;
+
+Now consider that we have a cursor for the query
+
+select b from t1 where b >= 1;
+
+This query will use the secondary key on the table t1. Now, if after the
+first fetch on this cursor we do an update:
+
+update t1 set b = 5 where b = 2;
+
+then the second fetch of the cursor should not see record (2,5); instead
+it should see record (2,2).
+
+We should also show that if we execute delete from t1 where b = 5; we can
+still see record (2,2).
+
+When we access a secondary key record, the maximum transaction id is
+fetched from this record and this trx_id is compared to the up_limit_id in
+the view. If the trx_id in the record is greater than or equal to the
+up_limit_id in the view, the clustered record is accessed. Because the
+trx_id of the creating transaction was stored, when this view was created,
+in the list of trx_ids not seen by this read view, a previous version of
+the record is requested to be built. This is built using the clustered
+record. If the secondary key record is delete-marked, its corresponding
+clustered record can already have been purged only if the record's
+trx_id < low_limit_no. Purge can't remove any record deleted by a
+transaction which was active when the cursor was created. But we may
+still have a delete-marked secondary key record with no clustered record.
+This is not a problem, because this case is handled in the
+row_sel_get_clust_rec() function, which is called whenever we note that
+this read view does not see the trx_id in the record. Thus, we see the
+correct version. Q.E.D.
+
+-------------------------------------------------------------------------------
+FACT B: A cursor read view on a clustered index sees only committed versions
+-------
+of the records in the clustered index, or those versions of rows created
+by the transaction which created the cursor before the cursor was created,
+even if the transaction which created the cursor has since changed that
+clustered index page.
+
+PROOF: Consider e.g. the following table and SQL clauses:
+
+create table t1(a int not null, primary key(a));
+insert into t1 values (1),(2);
+commit;
+
+Now consider that we have a cursor for the query
+
+select a from t1 where a >= 1;
+
+This query will use the clustered key on the table t1. Now, if after the
+first fetch on this cursor we do an update:
+
+update t1 set a = 5 where a = 2;
+
+then the second fetch of the cursor should not see record (5); instead it
+should see record (2).
+
+We should also show that if we execute delete from t1 where a = 5; after
+the cursor is opened, we can still see record (2).
+
+When accessing a clustered record we always check whether this read view
+sees the trx_id stored in the clustered record. By default we don't see
+any changes if the record's trx_id >= low_limit_id, i.e. the change was
+made by a transaction which started after the transaction which created
+the cursor. If the row was changed by such a future transaction, a
+previous version of the clustered record is created. Thus we see only a
+committed version in this case. We see all changes made by committed
+transactions, i.e. record trx_id < up_limit_id; in this case we don't
+need to do anything, we already see the correct version of the record.
+We don't see any changes made by active transactions, except the creating
+transaction. We stored the trx_id of the creating transaction in the list
+of trx_ids when this view was created; thus we can easily see whether this
+record was changed by the creating transaction. Because we already have
+the clustered record, we can access its roll_ptr, and using this roll_ptr
+we can fetch the undo record. We can then check that the undo_no of the
+undo record is less than the undo_no of the transaction which created the
+view when the cursor was created. We see this clustered record only when
+the record's undo_no is less than the undo_no in the view. If this is not
+true, we build a previous version of the record based on the undo_rec.
+That record is found because purge can't remove records accessed by an
+active transaction. Thus we see the correct version. Q.E.D.
+-------------------------------------------------------------------------------
+FACT C: Purge does not remove any delete-marked row that is visible
+-------
+in any cursor read view.
+
+PROOF: We know that:
+ 1: Currently active read views in trx_sys_t::view_list are ordered by
+ read_view_t::low_limit_no in descending order, that is,
+ newest read view first.
+
+ 2: Purge clones the oldest read view and uses that to determine whether
+ there are any active transactions that can see the to-be-purged records.
+
+Therefore any joining or active transaction will not have a view older
+than the purge view, according to 1.
+
+When purge needs to remove a delete-marked row from a secondary index,
+it will first check that the DB_TRX_ID value of the corresponding
+record in the clustered index is older than the purge view. It will
+also check if there is a newer version of the row (clustered index
+record) that is not delete-marked in the secondary index. If such a
+row exists and is collation-equal to the delete-marked secondary index
+record then purge will not remove the secondary index record.
+
+Delete-marked clustered index records will be removed by
+row_purge_remove_clust_if_poss(), unless the clustered index record
+(and its DB_ROLL_PTR) has been updated. Every new version of the
+clustered index record will update DB_ROLL_PTR, pointing to a new UNDO
+log entry that allows the old version to be reconstructed. The
+DB_ROLL_PTR in the oldest remaining version in the old-version chain
+may be pointing to garbage (an undo log record discarded by purge),
+but it will never be dereferenced, because the purge view is older
+than any active transaction.
+
+For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
+
+Some additional issues:
+
+What if trx_sys->view_list is empty and some transaction T1 and the purge
+both try to open a read view at the same time? Only one can acquire
+trx_sys->mutex. In which order will the views be opened? Should it matter?
+If not, why not?
+
+The order does not matter. No new transactions can be created and no
+running transaction can commit or rollback (or free views).
+*/
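+
+/* A hedged sketch of the visibility test described above (the actual
+implementation lives in read0read.ic as read_view_sees_trx_id(); this
+simplified pseudocode is for illustration only):
+
+	if (trx_id < view->up_limit_id) {
+		visible: committed before the view was created;
+	} else if (trx_id >= view->low_limit_id) {
+		not visible: started after the view was created;
+	} else {
+		visible only if trx_id is not in view->trx_ids[], the
+		descending array of transactions that were active when
+		the view was created;
+	}
+*/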
+
+/*********************************************************************//**
+Creates a read view object.
+@return own: read view struct */
+UNIV_INLINE
+read_view_t*
+read_view_create_low(
+/*=================*/
+ ulint n, /*!< in: number of cells in the trx_ids array */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ read_view_t* view;
+
+ view = static_cast<read_view_t*>(
+ mem_heap_alloc(
+ heap, sizeof(*view) + n * sizeof(*view->trx_ids)));
+
+ view->n_trx_ids = n;
+ view->trx_ids = (trx_id_t*) &view[1];
+
+ return(view);
+}
+
+/*********************************************************************//**
+Clones a read view object. This function will allocate space for two read
+views contiguously, one identical in size and content to the given view
+(starting at the returned pointer) and another view immediately following
+the first view's trx_ids array. The second view will have space for an
+extra trx_id_t element.
+@return read view struct */
+UNIV_INLINE
+read_view_t*
+read_view_clone(
+/*============*/
+ const read_view_t* view, /*!< in: view to clone */
+ mem_heap_t* heap) /*!< in: memory heap
+ from which allocated */
+{
+ ulint sz;
+ read_view_t* clone;
+ read_view_t* new_view;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ /* Allocate space for two views. */
+
+ sz = sizeof(*view) + view->n_trx_ids * sizeof(*view->trx_ids);
+
+ /* Add an extra trx_id_t slot for the new view. */
+
+ clone = static_cast<read_view_t*>(
+ mem_heap_alloc(heap, (sz * 2) + sizeof(trx_id_t)));
+
+ /* Only the contents of the old view are important, the new view
+ will be created from this and so we don't copy that across. */
+
+ memcpy(clone, view, sz);
+
+ clone->trx_ids = (trx_id_t*) &clone[1];
+
+ new_view = (read_view_t*) &clone->trx_ids[clone->n_trx_ids];
+ new_view->trx_ids = (trx_id_t*) &new_view[1];
+ new_view->n_trx_ids = clone->n_trx_ids + 1;
+
+ ut_a(new_view->n_trx_ids == view->n_trx_ids + 1);
+
+ return(clone);
+}
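+
+/* For illustration, the contiguous memory layout produced by
+read_view_clone() (derived from the code above) is:
+
+	clone                           new_view
+	|                               |
+	v                               v
+	[read_view_t][n trx_ids]        [read_view_t][n + 1 trx_ids]
+
+in a single allocation of (sz * 2) + sizeof(trx_id_t) bytes, where
+sz covers one header plus n trx ids. */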
+
+/*********************************************************************//**
+Insert the view in the proper order into the trx_sys->view_list. The
+read view list is ordered by read_view_t::low_limit_no in descending order. */
+static
+void
+read_view_add(
+/*==========*/
+	read_view_t*	view)	/*!< in: view to add */
+{
+ read_view_t* elem;
+ read_view_t* prev_elem;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+ ut_ad(read_view_validate(view));
+
+ /* Find the correct slot for insertion. */
+ for (elem = UT_LIST_GET_FIRST(trx_sys->view_list), prev_elem = NULL;
+ elem != NULL && view->low_limit_no < elem->low_limit_no;
+ prev_elem = elem, elem = UT_LIST_GET_NEXT(view_list, elem)) {
+ /* No op */
+ }
+
+ if (prev_elem == NULL) {
+ UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+ } else {
+ UT_LIST_INSERT_AFTER(
+ view_list, trx_sys->view_list, prev_elem, view);
+ }
+
+ ut_ad(read_view_list_validate());
+}
+
+/** Functor to create the view's trx_ids array. */
+struct CreateView {
+
+ CreateView(read_view_t* view)
+ : m_view(view)
+ {
+ m_n_trx = m_view->n_trx_ids;
+ m_view->n_trx_ids = 0;
+ }
+
+ void operator()(const trx_t* trx)
+ {
+ ut_ad(mutex_own(&trx_sys->mutex));
+ ut_ad(trx->in_rw_trx_list);
+
+ /* trx->state cannot change from or to NOT_STARTED
+ while we are holding the trx_sys->mutex. It may change
+ from ACTIVE to PREPARED or COMMITTED. */
+
+ if (trx->id != m_view->creator_trx_id
+ && !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)) {
+
+ ut_ad(m_n_trx > m_view->n_trx_ids);
+
+ m_view->trx_ids[m_view->n_trx_ids++] = trx->id;
+
+ /* NOTE that a transaction whose trx number is <
+ trx_sys->max_trx_id can still be active, if it is
+ in the middle of its commit! Note that when a
+ transaction starts, we initialize trx->no to
+ TRX_ID_MAX. */
+
+ /* trx->no is protected by trx_sys->mutex, which
+ we are holding. It is assigned by trx_commit()
+ before lock_trx_release_locks() assigns
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY. */
+
+ if (m_view->low_limit_no > trx->no) {
+ m_view->low_limit_no = trx->no;
+ }
+ }
+ }
+
+ read_view_t* m_view;
+ ulint m_n_trx;
+};
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+static
+read_view_t*
+read_view_open_now_low(
+/*===================*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or 0 used in purge */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ allocated */
+{
+ read_view_t* view;
+ ulint n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ view = read_view_create_low(n_trx, heap);
+
+ view->undo_no = 0;
+ view->type = VIEW_NORMAL;
+ view->creator_trx_id = cr_trx_id;
+
+ /* No future transactions should be visible in the view */
+
+ view->low_limit_no = trx_sys->max_trx_id;
+ view->low_limit_id = view->low_limit_no;
+
+ /* No active transaction should be visible, except cr_trx */
+
+ ut_list_map(trx_sys->rw_trx_list, &trx_t::trx_list, CreateView(view));
+
+ if (view->n_trx_ids > 0) {
+ /* The last active transaction has the smallest id: */
+ view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
+ } else {
+ view->up_limit_id = view->low_limit_id;
+ }
+
+ /* Purge views are not added to the view list. */
+ if (cr_trx_id > 0) {
+ read_view_add(view);
+ }
+
+ return(view);
+}
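+
+/* A worked example of the limits set above (illustrative values,
+assuming none of the active transactions is in the middle of its
+commit): if trx_sys->max_trx_id == 10 and the rw transactions 7, 5
+and 3 are active when the view is created, then low_limit_no ==
+low_limit_id == 10, trx_ids == {7, 5, 3} (descending) and
+up_limit_id == 3. Changes by trx ids below 3 are visible, ids 10 and
+above are not, and ids in between are checked against the array. */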
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or 0 used in purge */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ allocated */
+{
+ read_view_t* view;
+
+ mutex_enter(&trx_sys->mutex);
+
+ view = read_view_open_now_low(cr_trx_id, heap);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(view);
+}
+
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, with the exception that
+the creating trx of the oldest view is also set as not visible in the
+'copied' view. Opens a new view if no views currently exist. The view
+must be closed with ..._close. This is used in purge.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_purge_open(
+/*=================*/
+ mem_heap_t* heap) /*!< in: memory heap from which
+ allocated */
+{
+ ulint i;
+ read_view_t* view;
+ read_view_t* oldest_view;
+ trx_id_t creator_trx_id;
+ ulint insert_done = 0;
+
+ mutex_enter(&trx_sys->mutex);
+
+ oldest_view = UT_LIST_GET_LAST(trx_sys->view_list);
+
+ if (oldest_view == NULL) {
+
+ view = read_view_open_now_low(0, heap);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(view);
+ }
+
+ /* Allocate space for both views, the oldest and the new purge view. */
+
+ oldest_view = read_view_clone(oldest_view, heap);
+
+ ut_ad(read_view_validate(oldest_view));
+
+ mutex_exit(&trx_sys->mutex);
+
+ ut_a(oldest_view->creator_trx_id > 0);
+ creator_trx_id = oldest_view->creator_trx_id;
+
+ view = (read_view_t*) &oldest_view->trx_ids[oldest_view->n_trx_ids];
+
+ /* Add the creator transaction id in the trx_ids array in the
+ correct slot. */
+
+ for (i = 0; i < oldest_view->n_trx_ids; ++i) {
+ trx_id_t id;
+
+ id = oldest_view->trx_ids[i - insert_done];
+
+ if (insert_done == 0 && creator_trx_id > id) {
+ id = creator_trx_id;
+ insert_done = 1;
+ }
+
+ view->trx_ids[i] = id;
+ }
+
+ if (insert_done == 0) {
+ view->trx_ids[i] = creator_trx_id;
+ } else {
+ ut_a(i > 0);
+ view->trx_ids[i] = oldest_view->trx_ids[i - 1];
+ }
+
+ view->creator_trx_id = 0;
+
+ view->low_limit_no = oldest_view->low_limit_no;
+ view->low_limit_id = oldest_view->low_limit_id;
+
+ if (view->n_trx_ids > 0) {
+ /* The last active transaction has the smallest id: */
+
+ view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
+ } else {
+ view->up_limit_id = oldest_view->up_limit_id;
+ }
+
+ return(view);
+}
+
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx) /*!< in: trx which has a read view */
+{
+ ut_a(trx->global_read_view);
+
+ read_view_remove(trx->global_read_view, false);
+
+ mem_heap_empty(trx->global_read_view_heap);
+
+ trx->read_view = NULL;
+ trx->global_read_view = NULL;
+}
+
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+ const read_view_t* view) /*!< in: read view */
+{
+ ulint n_ids;
+ ulint i;
+
+ if (view->type == VIEW_HIGH_GRANULARITY) {
+ fprintf(stderr,
+ "High-granularity read view undo_n:o " TRX_ID_FMT "\n",
+ view->undo_no);
+ } else {
+ fprintf(stderr, "Normal read view\n");
+ }
+
+ fprintf(stderr, "Read view low limit trx n:o " TRX_ID_FMT "\n",
+ view->low_limit_no);
+
+ fprintf(stderr, "Read view up limit trx id " TRX_ID_FMT "\n",
+ view->up_limit_id);
+
+ fprintf(stderr, "Read view low limit trx id " TRX_ID_FMT "\n",
+ view->low_limit_id);
+
+ fprintf(stderr, "Read view individually stored trx ids:\n");
+
+ n_ids = view->n_trx_ids;
+
+ for (i = 0; i < n_ids; i++) {
+ fprintf(stderr, "Read view trx id " TRX_ID_FMT "\n",
+ view->trx_ids[i]);
+ }
+}
+
+/*********************************************************************//**
+Create a high-granularity consistent cursor view for MySQL to be used
+in cursors. In this consistent read view, modifications done by the
+creating transaction after the cursor is created, or by future
+transactions, are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+ trx_t* cr_trx) /*!< in: trx where cursor view is created */
+{
+ read_view_t* view;
+ mem_heap_t* heap;
+ ulint n_trx;
+ cursor_view_t* curview;
+
+	/* Use a larger heap than in trx_create when creating a read_view
+	because cursors are quite long-lived. */
+
+ heap = mem_heap_create(512);
+
+ curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(*curview));
+
+ curview->heap = heap;
+
+ /* Drop cursor tables from consideration when evaluating the
+ need of auto-commit */
+
+ curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
+
+ cr_trx->n_mysql_tables_in_use = 0;
+
+ mutex_enter(&trx_sys->mutex);
+
+ n_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ curview->read_view = read_view_create_low(n_trx, curview->heap);
+
+ view = curview->read_view;
+ view->undo_no = cr_trx->undo_no;
+ view->type = VIEW_HIGH_GRANULARITY;
+ view->creator_trx_id = UINT64_UNDEFINED;
+
+ /* No future transactions should be visible in the view */
+
+ view->low_limit_no = trx_sys->max_trx_id;
+ view->low_limit_id = view->low_limit_no;
+
+ /* No active transaction should be visible */
+
+ ut_list_map(trx_sys->rw_trx_list, &trx_t::trx_list, CreateView(view));
+
+ view->creator_trx_id = cr_trx->id;
+
+ if (view->n_trx_ids > 0) {
+ /* The last active transaction has the smallest id: */
+
+ view->up_limit_id = view->trx_ids[view->n_trx_ids - 1];
+ } else {
+ view->up_limit_id = view->low_limit_id;
+ }
+
+ read_view_add(view);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(curview);
+}
+
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+ trx_t* trx, /*!< in: trx */
+ cursor_view_t* curview)/*!< in: cursor view to be closed */
+{
+ ut_a(curview);
+ ut_a(curview->read_view);
+ ut_a(curview->heap);
+
+ /* Add cursor's tables to the global count of active tables that
+ belong to this transaction */
+ trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use;
+
+ read_view_remove(curview->read_view, false);
+
+ trx->read_view = trx->global_read_view;
+
+ mem_heap_free(curview->heap);
+}
+
+/*********************************************************************//**
+This function sets a given consistent cursor view as the transaction's
+read view if the given consistent cursor view is not NULL. Otherwise, the
+function restores the transaction's global read view as its read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+ trx_t* trx, /*!< in: transaction where cursor is set */
+ cursor_view_t* curview)/*!< in: consistent cursor view to be set */
+{
+ ut_a(trx);
+
+ mutex_enter(&trx_sys->mutex);
+
+ if (UNIV_LIKELY(curview != NULL)) {
+ trx->read_view = curview->read_view;
+ } else {
+ trx->read_view = trx->global_read_view;
+ }
+
+ ut_ad(read_view_validate(trx->read_view));
+
+ mutex_exit(&trx_sys->mutex);
+}
diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc
new file mode 100644
index 00000000000..426cf9e3ac5
--- /dev/null
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -0,0 +1,1458 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file rem/rem0cmp.cc
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#include "rem0cmp.h"
+
+#ifdef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "handler0alter.h"
+#include "srv0srv.h"
+
+/* ALPHABETICAL ORDER
+ ==================
+
+The records are put into alphabetical order in the following
+way: let F be the first field where two records disagree.
+If there is a character in some position n where the
+records disagree, the order is determined by comparison of
+the characters at position n, possibly after a
+collating transformation. If there is no such character,
+but the corresponding fields have different lengths, then
+if the data type of the fields is paddable, the
+shorter field is padded with a padding character. If the
+data type is not paddable, the longer field is considered greater.
+Finally, the SQL null is smaller than any other value: see the
+comparison functions below.
+
+At present, the comparison functions return 0 in the case
+where two records disagree only in that one
+has more fields than the other. */
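+
+/* A worked example of the padding rule above (illustrative): for a
+paddable type such as CHAR, 'ab' and 'ab  ' compare equal, because the
+shorter value is padded with the padding character (here, space); for a
+non-paddable binary type, 'ab' compares less than 'ab\0', because the
+longer field is considered greater. */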
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n_cmp, /*!< in: number of fields to compare */
+ ulint* matched_fields)/*!< in/out: number of already
+ completely matched fields; when function
+ returns, contains the value for current
+ comparison */
+ __attribute__((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. The prototype here
+must be a copy of the one in ha_innobase.cc!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp(
+/*===============*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length); /*!< in: data field length,
+ not UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. The prototype here
+must be a copy of the one in ha_innobase.cc!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp_prefix(
+/*======================*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length); /*!< in: data field length,
+ not UNIV_SQL_NULL */
+/*********************************************************************//**
+Transforms the character code so that it is ordered appropriately for the
+language. This is only used for the latin1 char set. MySQL does the
+comparisons for other char sets.
+@return collation order position */
+UNIV_INLINE
+ulint
+cmp_collate(
+/*========*/
+ ulint code) /*!< in: code of a character stored in database record */
+{
+ return((ulint) srv_latin1_ordering[code]);
+}
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets)
+ /*!< in: whether to check charsets */
+{
+ if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are non-binary string types: they can be compared if
+ and only if the charset-collation is the same */
+
+ if (check_charsets) {
+ return(dtype_get_charset_coll(col1->prtype)
+ == dtype_get_charset_coll(col2->prtype));
+ } else {
+ return(TRUE);
+ }
+ }
+
+ if (dtype_is_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are binary string types: they can be compared */
+
+ return(TRUE);
+ }
+
+ if (col1->mtype != col2->mtype) {
+
+ return(FALSE);
+ }
+
+ if (col1->mtype == DATA_INT
+ && (col1->prtype & DATA_UNSIGNED)
+ != (col2->prtype & DATA_UNSIGNED)) {
+
+ /* The storage format of an unsigned integer is different
+ from a signed integer: in a signed integer we OR
+ 0x8000... to the value of positive integers. */
+
+ return(FALSE);
+ }
+
+ return(col1->mtype != DATA_INT || col1->len == col2->len);
+}
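+
+/* A worked illustration of the signed/unsigned difference noted above
+(storage format as described in the comment; byte values in hex): the
+4-byte signed integer 1 is stored as 80 00 00 01, with the sign bit
+flipped so that memcmp() order matches numeric order, while the 4-byte
+unsigned integer 1 is stored as 00 00 00 01. Comparing the two formats
+directly would therefore give wrong results. */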
+
+/*************************************************************//**
+Innobase uses this function to compare two data fields for which the data
+type is such that we must compare whole fields or call MySQL to do the
+comparison.
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+static
+int
+cmp_whole_field(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const byte* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+{
+ float f_1;
+ float f_2;
+ double d_1;
+ double d_2;
+ int swap_flag = 1;
+
+ switch (mtype) {
+
+ case DATA_DECIMAL:
+ /* Remove preceding spaces */
+ for (; a_length && *a == ' '; a++, a_length--) { }
+ for (; b_length && *b == ' '; b++, b_length--) { }
+
+ if (*a == '-') {
+ if (*b != '-') {
+ return(-1);
+ }
+
+ a++; b++;
+ a_length--;
+ b_length--;
+
+ swap_flag = -1;
+
+ } else if (*b == '-') {
+
+ return(1);
+ }
+
+ while (a_length > 0 && (*a == '+' || *a == '0')) {
+ a++; a_length--;
+ }
+
+ while (b_length > 0 && (*b == '+' || *b == '0')) {
+ b++; b_length--;
+ }
+
+ if (a_length != b_length) {
+ if (a_length < b_length) {
+ return(-swap_flag);
+ }
+
+ return(swap_flag);
+ }
+
+ while (a_length > 0 && *a == *b) {
+
+ a++; b++; a_length--;
+ }
+
+ if (a_length == 0) {
+
+ return(0);
+ }
+
+ if (*a > *b) {
+ return(swap_flag);
+ }
+
+ return(-swap_flag);
+ case DATA_DOUBLE:
+ d_1 = mach_double_read(a);
+ d_2 = mach_double_read(b);
+
+ if (d_1 > d_2) {
+ return(1);
+ } else if (d_2 > d_1) {
+ return(-1);
+ }
+
+ return(0);
+
+ case DATA_FLOAT:
+ f_1 = mach_float_read(a);
+ f_2 = mach_float_read(b);
+
+ if (f_1 > f_2) {
+ return(1);
+ } else if (f_2 > f_1) {
+ return(-1);
+ }
+
+ return(0);
+ case DATA_BLOB:
+ if (prtype & DATA_BINARY_TYPE) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: comparing a binary BLOB"
+ " with a character set sensitive\n"
+ "InnoDB: comparison!\n");
+ }
+ /* fall through */
+ case DATA_VARMYSQL:
+ case DATA_MYSQL:
+ return(innobase_mysql_cmp(
+ (int)(prtype & DATA_MYSQL_TYPE_MASK),
+ (uint) dtype_get_charset_coll(prtype),
+ a, a_length, b, b_length));
+ default:
+ fprintf(stderr,
+ "InnoDB: unknown type number %lu\n",
+ (ulong) mtype);
+ ut_error;
+ }
+
+ return(0);
+}
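+
+/* A worked example of the DATA_DECIMAL branch above (illustrative):
+comparing "123" with "95", after sign and leading-zero stripping the
+lengths differ (3 > 2), so "123" is the greater; comparing "-123" with
+"-95", swap_flag is -1, so the same length test yields -123 < -95. */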
+
+/*****************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INTERN
+int
+cmp_dfield_dfield_like_prefix(
+/*==========================*/
+	dfield_t*	dfield1,/*!< in: data field; must have type field set */
+	dfield_t*	dfield2)/*!< in: data field */
+{
+ const dtype_t* type;
+ int ret;
+
+ ut_ad(dfield_check_typed(dfield1));
+
+ type = dfield_get_type(dfield1);
+
+ if (type->mtype >= DATA_FLOAT) {
+ ret = innobase_mysql_cmp_prefix(
+ static_cast<int>(type->prtype & DATA_MYSQL_TYPE_MASK),
+ static_cast<uint>(dtype_get_charset_coll(type->prtype)),
+ static_cast<byte*>(dfield_get_data(dfield1)),
+ static_cast<uint>(dfield_get_len(dfield1)),
+ static_cast<byte*>(dfield_get_data(dfield2)),
+ static_cast<uint>(dfield_get_len(dfield2)));
+ } else {
+ ret = (cmp_data_data_like_prefix(
+ static_cast<byte*>(dfield_get_data(dfield1)),
+ dfield_get_len(dfield1),
+ static_cast<byte*>(dfield_get_data(dfield2)),
+ dfield_get_len(dfield2)));
+ }
+
+ return(ret);
+}
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /*!< in: data field length or UNIV_SQL_NULL */
+{
+ ulint data1_byte;
+ ulint data2_byte;
+ ulint cur_bytes;
+
+ if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
+
+ if (len1 == len2) {
+
+ return(0);
+ }
+
+ if (len1 == UNIV_SQL_NULL) {
+ /* We define the SQL null to be the smallest possible
+ value of a field in the alphabetical order */
+
+ return(-1);
+ }
+
+ return(1);
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ return(cmp_whole_field(mtype, prtype,
+ data1, (unsigned) len1,
+ data2, (unsigned) len2));
+ }
+
+	/* Then compare the fields */
+
+ cur_bytes = 0;
+
+ for (;;) {
+ if (len1 <= cur_bytes) {
+ if (len2 <= cur_bytes) {
+
+ return(0);
+ }
+
+ data1_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (data1_byte == ULINT_UNDEFINED) {
+
+ return(-1);
+ }
+ } else {
+ data1_byte = *data1;
+ }
+
+ if (len2 <= cur_bytes) {
+ data2_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (data2_byte == ULINT_UNDEFINED) {
+
+ return(1);
+ }
+ } else {
+ data2_byte = *data2;
+ }
+
+ if (data1_byte == data2_byte) {
+ /* If the bytes are equal, they will remain such even
+ after the collation transformation below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE))) {
+
+ data1_byte = cmp_collate(data1_byte);
+ data2_byte = cmp_collate(data2_byte);
+ }
+
+ if (data1_byte > data2_byte) {
+
+ return(1);
+ } else if (data1_byte < data2_byte) {
+
+ return(-1);
+ }
+next_byte:
+ /* Next byte */
+ cur_bytes++;
+ data1++;
+ data2++;
+ }
+
+ return(0); /* Not reached */
+}
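+
+/* For example, for a CHAR column (mtype DATA_CHAR) the pad character is
+the space 0x20, so in the loop above the fields "ab" and "ab  " compare
+as equal: the shorter field is conceptually padded with spaces until the
+lengths match. For types with no pad character (where dtype_get_pad_char()
+returns ULINT_UNDEFINED), the shorter field simply sorts first. */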
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type to be VARCHAR. */
+
+int
+cmp_data_data_slow_varchar(
+/*=======================*/
+ /* out: 1, 0, -1, if lhs is greater, equal,
+ less than rhs, respectively */
+ const byte* lhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint lhs_len,/* in: data field length or UNIV_SQL_NULL */
+ const byte* rhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint rhs_len)/* in: data field length or UNIV_SQL_NULL */
+{
+ ulint i;
+
+ ut_a(rhs_len != UNIV_SQL_NULL);
+
+ if (lhs_len == UNIV_SQL_NULL) {
+
+ /* We define the SQL null to be the smallest possible
+ value of a field in the alphabetical order */
+
+ return(-1);
+ }
+
+	/* Compare the values. */
+
+ for (i = 0; i < lhs_len && i < rhs_len; ++i, ++rhs, ++lhs) {
+ ulint lhs_byte = *lhs;
+ ulint rhs_byte = *rhs;
+
+ if (lhs_byte != rhs_byte) {
+ /* If the bytes are equal, they will remain such even
+ after the collation transformation below */
+
+ lhs_byte = cmp_collate(lhs_byte);
+ rhs_byte = cmp_collate(rhs_byte);
+
+ if (lhs_byte > rhs_byte) {
+
+ return(1);
+ } else if (lhs_byte < rhs_byte) {
+
+ return(-1);
+ }
+ }
+ }
+
+ return((i == lhs_len && i == rhs_len) ? 0 :
+ static_cast<int>(rhs_len - lhs_len));
+}
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_prefix(
+/*===========================*/
+ /* out: 1, 0, -1, if lhs is greater, equal,
+ less than rhs, respectively */
+ const byte* lhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /* in: data field length or UNIV_SQL_NULL */
+ const byte* rhs, /* in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /* in: data field length or UNIV_SQL_NULL */
+{
+ ulint i;
+
+ ut_a(len2 != UNIV_SQL_NULL);
+
+ if (len1 == UNIV_SQL_NULL) {
+
+ /* We define the SQL null to be the smallest possible
+ value of a field in the alphabetical order */
+
+ return(-1);
+ }
+
+	/* Compare the values. */
+
+ for (i = 0; i < len1 && i < len2; ++i, ++rhs, ++lhs) {
+ ulint lhs_byte = *lhs;
+ ulint rhs_byte = *rhs;
+
+ if (lhs_byte != rhs_byte) {
+ /* If the bytes are equal, they will remain such even
+ after the collation transformation below */
+
+ lhs_byte = cmp_collate(lhs_byte);
+ rhs_byte = cmp_collate(rhs_byte);
+
+ if (lhs_byte > rhs_byte) {
+
+ return(1);
+ } else if (lhs_byte < rhs_byte) {
+
+ return(-1);
+ }
+ }
+ }
+
+ return(i == len2 ? 0 : 1);
+}
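+
+/* For example, with lhs = "abc" (len1 = 3) and rhs = "ab" (len2 = 2),
+the loop above finds no differing byte and stops at i == 2 == len2, so
+0 is returned: rhs is a matching prefix of lhs, as in lhs LIKE 'ab%'. */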
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_suffix(
+/*===========================*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ /* in: data field (== a pointer to a
+ memory buffer) */
+ const byte* data1 UNIV_UNUSED,
+ /* in: data field length or UNIV_SQL_NULL */
+ ulint len1 UNIV_UNUSED,
+ /* in: data field (== a pointer to a memory
+ buffer) */
+ const byte* data2 UNIV_UNUSED,
+ /* in: data field length or UNIV_SQL_NULL */
+ ulint len2 UNIV_UNUSED)
+
+{
+ ut_error; // FIXME:
+ return(1);
+}
+
+/*****************************************************************
+This function is used to compare two data fields for which we know the
+data type. The comparison is done for the LIKE operator.*/
+
+int
+cmp_data_data_slow_like_substr(
+/*===========================*/
+ /* out: 1, 0, -1, if data1 is greater, equal,
+ less than data2, respectively */
+ /* in: data field (== a pointer to a
+ memory buffer) */
+ const byte* data1 UNIV_UNUSED,
+ /* in: data field length or UNIV_SQL_NULL */
+ ulint len1 UNIV_UNUSED,
+ /* in: data field (== a pointer to a memory
+ buffer) */
+ const byte* data2 UNIV_UNUSED,
+ /* in: data field length or UNIV_SQL_NULL */
+ ulint len2 UNIV_UNUSED)
+{
+ ut_error; // FIXME:
+ return(1);
+}
+
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields are taken into account for
+the data tuple! If we denote n = n_fields_cmp, then rec must either
+have m >= n fields, or it must differ from dtuple in some of the
+m fields it does have. If rec has an externally stored field, we do
+not compare it but return 0 if such a comparison should be made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match_low(
+/*==========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n_cmp, /*!< in: number of fields to compare */
+ ulint* matched_fields, /*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+{
+ const dfield_t* dtuple_field; /* current field in logical record */
+ ulint dtuple_f_len; /* the length of the current field
+ in the logical record */
+ const byte* dtuple_b_ptr; /* pointer to the current byte in
+ logical field data */
+ ulint dtuple_byte; /* value of current byte to be compared
+ in dtuple*/
+ ulint rec_f_len; /* length of current field in rec */
+ const byte* rec_b_ptr; /* pointer to the current byte in
+ rec field */
+ ulint rec_byte; /* value of current byte to be
+ compared in rec */
+ ulint cur_field; /* current field number */
+ ulint cur_bytes; /* number of already matched bytes
+ in current field */
+ int ret; /* return value */
+
+ ut_ad(dtuple && rec && matched_fields && matched_bytes);
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ cur_field = *matched_fields;
+ cur_bytes = *matched_bytes;
+
+ ut_ad(n_cmp > 0);
+ ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+ ut_ad(cur_field <= n_cmp);
+ ut_ad(cur_field <= rec_offs_n_fields(offsets));
+
+ if (cur_bytes == 0 && cur_field == 0) {
+ ulint rec_info = rec_get_info_bits(rec,
+ rec_offs_comp(offsets));
+ ulint tup_info = dtuple_get_info_bits(dtuple);
+
+ if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = !(tup_info & REC_INFO_MIN_REC_FLAG);
+ goto order_resolved;
+ } else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+ goto order_resolved;
+ }
+ }
+
+ /* Match fields in a loop; stop if we run out of fields in dtuple
+ or find an externally stored field */
+
+ while (cur_field < n_cmp) {
+
+ ulint mtype;
+ ulint prtype;
+
+ dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+ {
+ const dtype_t* type
+ = dfield_get_type(dtuple_field);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+ }
+
+ dtuple_f_len = dfield_get_len(dtuple_field);
+
+ rec_b_ptr = rec_get_nth_field(rec, offsets,
+ cur_field, &rec_f_len);
+
+		/* If we have matched 0 bytes so far, it may be that one or
+		both of the fields are SQL null, that the record or dtuple
+		is the predefined minimum record, or that the field is
+		externally stored */
+
+ if (UNIV_LIKELY(cur_bytes == 0)) {
+ if (rec_offs_nth_extern(offsets, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
+ if (dtuple_f_len == UNIV_SQL_NULL) {
+ if (rec_f_len == UNIV_SQL_NULL) {
+
+ goto next_field;
+ }
+
+ ret = -1;
+ goto order_resolved;
+ } else if (rec_f_len == UNIV_SQL_NULL) {
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ ret = cmp_whole_field(
+ mtype, prtype,
+ static_cast<const byte*>(
+ dfield_get_data(dtuple_field)),
+ (unsigned) dtuple_f_len,
+ rec_b_ptr, (unsigned) rec_f_len);
+
+ if (ret != 0) {
+ cur_bytes = 0;
+
+ goto order_resolved;
+ } else {
+ goto next_field;
+ }
+ }
+
+ /* Set the pointers at the current byte */
+
+ rec_b_ptr = rec_b_ptr + cur_bytes;
+ dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)
+ + cur_bytes;
+		/* Then compare the fields */
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {
+ if (dtuple_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ rec_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec_byte == ULINT_UNDEFINED) {
+ ret = 1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec_byte = *rec_b_ptr;
+ }
+
+ if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {
+ dtuple_byte = dtype_get_pad_char(mtype,
+ prtype);
+
+ if (dtuple_byte == ULINT_UNDEFINED) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ } else {
+ dtuple_byte = *dtuple_b_ptr;
+ }
+
+ if (dtuple_byte == rec_byte) {
+ /* If the bytes are equal, they will
+ remain such even after the collation
+ transformation below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && !(prtype & DATA_BINARY_TYPE))) {
+
+ rec_byte = cmp_collate(rec_byte);
+ dtuple_byte = cmp_collate(dtuple_byte);
+ }
+
+ ret = (int) (dtuple_byte - rec_byte);
+ if (UNIV_LIKELY(ret)) {
+ if (ret < 0) {
+ ret = -1;
+ goto order_resolved;
+ } else {
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+next_byte:
+ /* Next byte */
+ cur_bytes++;
+ rec_b_ptr++;
+ dtuple_b_ptr++;
+ }
+
+next_field:
+ cur_field++;
+ cur_bytes = 0;
+ }
+
+ ut_ad(cur_bytes == 0);
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ ut_ad((ret >= - 1) && (ret <= 1));
+ ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets,
+ n_cmp, matched_fields));
+ ut_ad(*matched_fields == cur_field); /* In the debug version, the
+ above cmp_debug_... sets
+ *matched_fields to a value */
+ *matched_fields = cur_field;
+ *matched_bytes = cur_bytes;
+
+ return(ret);
+}
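+
+/* A sketch of how the matched_fields/matched_bytes contract above is
+used: if an earlier call matched 2 complete fields and 3 bytes of the
+third field, the caller may pass *matched_fields = 2 and
+*matched_bytes = 3 back in, and the comparison resumes at the fourth
+byte of the third field instead of starting over from the first field. */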
+
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ &matched_fields, &matched_bytes));
+}
+
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ n_fields = dtuple_get_n_fields(dtuple);
+
+ if (n_fields > rec_offs_n_fields(offsets)) {
+
+ return(FALSE);
+ }
+
+ cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ &matched_fields, &matched_bytes);
+ if (matched_fields == n_fields) {
+
+ return(TRUE);
+ }
+
+ if (matched_fields == n_fields - 1
+ && matched_bytes == dfield_get_len(
+ dtuple_get_nth_field(dtuple, n_fields - 1))) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
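+
+/* For example (assuming a simple byte-comparable column type), if
+dtuple is ("AA", "B") and rec is ("AA", "BB"), the call above ends with
+matched_fields == 1 and matched_bytes == 1; matched_bytes equals the
+length of the last dtuple field, so TRUE is returned: the dtuple is a
+prefix of the record. */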
+
+/*************************************************************//**
+Compare two physical record fields.
+@retval 1 if rec1 field is greater than rec2
+@retval -1 if rec1 field is less than rec2
+@retval 0 if rec1 field equals to rec2 */
+static __attribute__((nonnull, warn_unused_result))
+int
+cmp_rec_rec_simple_field(
+/*=====================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ ulint n) /*!< in: field to compare */
+{
+ const byte* rec1_b_ptr;
+ const byte* rec2_b_ptr;
+ ulint rec1_f_len;
+ ulint rec2_f_len;
+ const dict_col_t* col = dict_index_get_nth_col(index, n);
+
+ ut_ad(!rec_offs_nth_extern(offsets1, n));
+ ut_ad(!rec_offs_nth_extern(offsets2, n));
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1, n, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2, n, &rec2_f_len);
+
+ if (rec1_f_len == UNIV_SQL_NULL || rec2_f_len == UNIV_SQL_NULL) {
+ if (rec1_f_len == rec2_f_len) {
+ return(0);
+ }
+ /* We define the SQL null to be the smallest possible
+ value of a field in the alphabetical order */
+ return(rec1_f_len == UNIV_SQL_NULL ? -1 : 1);
+ }
+
+ if (col->mtype >= DATA_FLOAT
+ || (col->mtype == DATA_BLOB
+ && !(col->prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(col->prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+ return(cmp_whole_field(col->mtype, col->prtype,
+ rec1_b_ptr, (unsigned) rec1_f_len,
+ rec2_b_ptr, (unsigned) rec2_f_len));
+ }
+
+ /* Compare the fields */
+ for (ulint cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
+ ulint rec1_byte;
+ ulint rec2_byte;
+
+ if (rec2_f_len <= cur_bytes) {
+ if (rec1_f_len <= cur_bytes) {
+ return(0);
+ }
+
+ rec2_byte = dtype_get_pad_char(
+ col->mtype, col->prtype);
+
+ if (rec2_byte == ULINT_UNDEFINED) {
+ return(1);
+ }
+ } else {
+ rec2_byte = *rec2_b_ptr;
+ }
+
+ if (rec1_f_len <= cur_bytes) {
+ rec1_byte = dtype_get_pad_char(
+ col->mtype, col->prtype);
+
+ if (rec1_byte == ULINT_UNDEFINED) {
+ return(-1);
+ }
+ } else {
+ rec1_byte = *rec1_b_ptr;
+ }
+
+ if (rec1_byte == rec2_byte) {
+ /* If the bytes are equal, they will remain such
+ even after the collation transformation below */
+ continue;
+ }
+
+ if (col->mtype <= DATA_CHAR
+ || (col->mtype == DATA_BLOB
+ && !(col->prtype & DATA_BINARY_TYPE))) {
+
+ rec1_byte = cmp_collate(rec1_byte);
+ rec2_byte = cmp_collate(rec2_byte);
+ }
+
+ if (rec1_byte < rec2_byte) {
+ return(-1);
+ } else if (rec1_byte > rec2_byte) {
+ return(1);
+ }
+ }
+}
+
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval 1 if rec1 (including non-ordering columns) is greater than rec2
+@retval -1 if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ struct TABLE* table) /*!< in: MySQL table, for reporting
+ duplicate key value if applicable,
+ or NULL */
+{
+ ulint n;
+ ulint n_uniq = dict_index_get_n_unique(index);
+ bool null_eq = false;
+
+ ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
+	ut_ad(rec_offs_n_fields(offsets1) == rec_offs_n_fields(offsets2));
+
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+ for (n = 0; n < n_uniq; n++) {
+ int cmp = cmp_rec_rec_simple_field(
+ rec1, rec2, offsets1, offsets2, index, n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ /* If the fields are internally equal, they must both
+ be NULL or non-NULL. */
+ ut_ad(rec_offs_nth_sql_null(offsets1, n)
+ == rec_offs_nth_sql_null(offsets2, n));
+
+ if (rec_offs_nth_sql_null(offsets1, n)) {
+ ut_ad(!(dict_index_get_nth_col(index, n)->prtype
+ & DATA_NOT_NULL));
+ null_eq = true;
+ }
+ }
+
+	/* If we ran out of fields, the ordering columns of rec1 were
+	equal to those of rec2. Issue a duplicate key error if needed. */
+
+ if (!null_eq && table && dict_index_is_unique(index)) {
+ /* Report erroneous row using new version of table. */
+ innobase_rec_to_mysql(table, rec1, index, offsets1);
+ return(0);
+ }
+
+ /* Else, keep comparing so that we have the full internal
+ order. */
+ for (; n < dict_index_get_n_fields(index); n++) {
+ int cmp = cmp_rec_rec_simple_field(
+ rec1, rec2, offsets1, offsets2, index, n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ /* If the fields are internally equal, they must both
+ be NULL or non-NULL. */
+ ut_ad(rec_offs_nth_sql_null(offsets1, n)
+ == rec_offs_nth_sql_null(offsets2, n));
+ }
+
+ /* This should never be reached. Internally, an index must
+ never contain duplicate entries. */
+ ut_ad(0);
+ return(0);
+}
+
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index, /*!< in: data dictionary index */
+ ibool nulls_unequal,
+				/*!< in: TRUE if this is for index statistics
+ cardinality estimation, and innodb_stats_method
+ is "nulls_unequal" or "nulls_ignored" */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+{
+	ulint		rec1_n_fields;	/* the number of fields in rec1 */
+	ulint		rec1_f_len;	/* length of current field in rec1 */
+	const byte*	rec1_b_ptr;	/* pointer to the current byte
+					in rec1 field */
+	ulint		rec1_byte;	/* value of current byte to be
+					compared in rec1 */
+	ulint		rec2_n_fields;	/* the number of fields in rec2 */
+	ulint		rec2_f_len;	/* length of current field in rec2 */
+	const byte*	rec2_b_ptr;	/* pointer to the current byte
+					in rec2 field */
+	ulint		rec2_byte;	/* value of current byte to be
+					compared in rec2 */
+ ulint cur_field; /* current field number */
+ ulint cur_bytes; /* number of already matched
+ bytes in current field */
+ int ret = 0; /* return value */
+ ulint comp;
+
+ ut_ad(rec1 && rec2 && index);
+ ut_ad(rec_offs_validate(rec1, index, offsets1));
+ ut_ad(rec_offs_validate(rec2, index, offsets2));
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+ comp = rec_offs_comp(offsets1);
+ rec1_n_fields = rec_offs_n_fields(offsets1);
+ rec2_n_fields = rec_offs_n_fields(offsets2);
+
+ cur_field = *matched_fields;
+ cur_bytes = *matched_bytes;
+
+ /* Match fields in a loop */
+
+ while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) {
+
+ ulint mtype;
+ ulint prtype;
+
+ if (dict_index_is_univ(index)) {
+ /* This is for the insert buffer B-tree. */
+ mtype = DATA_BINARY;
+ prtype = 0;
+ } else {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, cur_field);
+
+ mtype = col->mtype;
+ prtype = col->prtype;
+ }
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+ cur_field, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+ cur_field, &rec2_f_len);
+
+ if (cur_bytes == 0) {
+ if (cur_field == 0) {
+				/* Test if either record is the predefined
+				minimum record */
+ if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ if (!(rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+ }
+
+ goto order_resolved;
+
+ } else if (UNIV_UNLIKELY
+ (rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ ret = 1;
+
+ goto order_resolved;
+ }
+ }
+
+ if (rec_offs_nth_extern(offsets1, cur_field)
+ || rec_offs_nth_extern(offsets2, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+
+ goto order_resolved;
+ }
+
+ if (rec1_f_len == UNIV_SQL_NULL
+ || rec2_f_len == UNIV_SQL_NULL) {
+
+ if (rec1_f_len == rec2_f_len) {
+					/* This is limited to stats collection;
+					it cannot be used in a regular search */
+ if (nulls_unequal) {
+ ret = -1;
+ } else {
+ goto next_field;
+ }
+ } else if (rec2_f_len == UNIV_SQL_NULL) {
+
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ ret = 1;
+ } else {
+ ret = -1;
+ }
+
+ goto order_resolved;
+ }
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ ret = cmp_whole_field(mtype, prtype,
+ rec1_b_ptr,
+ (unsigned) rec1_f_len,
+ rec2_b_ptr,
+ (unsigned) rec2_f_len);
+ if (ret != 0) {
+ cur_bytes = 0;
+
+ goto order_resolved;
+ } else {
+ goto next_field;
+ }
+ }
+
+ /* Set the pointers at the current byte */
+ rec1_b_ptr = rec1_b_ptr + cur_bytes;
+ rec2_b_ptr = rec2_b_ptr + cur_bytes;
+
+		/* Then compare the fields */
+ for (;;) {
+ if (rec2_f_len <= cur_bytes) {
+
+ if (rec1_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ rec2_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec2_byte == ULINT_UNDEFINED) {
+ ret = 1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec2_byte = *rec2_b_ptr;
+ }
+
+ if (rec1_f_len <= cur_bytes) {
+ rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec1_byte == ULINT_UNDEFINED) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec1_byte = *rec1_b_ptr;
+ }
+
+ if (rec1_byte == rec2_byte) {
+ /* If the bytes are equal, they will remain
+ such even after the collation transformation
+ below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && !(prtype & DATA_BINARY_TYPE))) {
+
+ rec1_byte = cmp_collate(rec1_byte);
+ rec2_byte = cmp_collate(rec2_byte);
+ }
+
+ if (rec1_byte < rec2_byte) {
+ ret = -1;
+ goto order_resolved;
+ } else if (rec1_byte > rec2_byte) {
+ ret = 1;
+ goto order_resolved;
+ }
+next_byte:
+ /* Next byte */
+
+ cur_bytes++;
+ rec1_b_ptr++;
+ rec2_b_ptr++;
+ }
+
+next_field:
+ cur_field++;
+ cur_bytes = 0;
+ }
+
+ ut_ad(cur_bytes == 0);
+
+ /* If we ran out of fields, rec1 was equal to rec2 up
+ to the common fields */
+ ut_ad(ret == 0);
+order_resolved:
+
+ ut_ad((ret >= - 1) && (ret <= 1));
+
+ *matched_fields = cur_field;
+ *matched_bytes = cur_bytes;
+
+ return(ret);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields, then rec must either have m >= n fields, or it must
+differ from dtuple in some of the m fields it does have. If this function
+encounters an externally stored field, it returns 0.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n_cmp, /*!< in: number of fields to compare */
+ ulint* matched_fields) /*!< in/out: number of already
+				completely matched fields; when the function
+				returns, contains the value for the current
+				comparison */
+{
+ const dfield_t* dtuple_field; /* current field in logical record */
+ ulint dtuple_f_len; /* the length of the current field
+ in the logical record */
+ const byte* dtuple_f_data; /* pointer to the current logical
+ field data */
+ ulint rec_f_len; /* length of current field in rec */
+ const byte* rec_f_data; /* pointer to the current rec field */
+ int ret; /* return value */
+ ulint cur_field; /* current field number */
+
+ ut_ad(dtuple && rec && matched_fields);
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ ut_ad(n_cmp > 0);
+ ut_ad(n_cmp <= dtuple_get_n_fields(dtuple));
+ ut_ad(*matched_fields <= n_cmp);
+ ut_ad(*matched_fields <= rec_offs_n_fields(offsets));
+
+ cur_field = *matched_fields;
+
+ if (cur_field == 0) {
+ if (UNIV_UNLIKELY
+ (rec_get_info_bits(rec, rec_offs_comp(offsets))
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ ret = !(dtuple_get_info_bits(dtuple)
+ & REC_INFO_MIN_REC_FLAG);
+
+ goto order_resolved;
+ }
+
+ if (UNIV_UNLIKELY
+ (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ }
+
+ /* Match fields in a loop; stop if we run out of fields in dtuple */
+
+ while (cur_field < n_cmp) {
+
+ ulint mtype;
+ ulint prtype;
+
+ dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+ {
+ const dtype_t* type
+ = dfield_get_type(dtuple_field);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+ }
+
+ dtuple_f_data = static_cast<const byte*>(
+ dfield_get_data(dtuple_field));
+
+ dtuple_f_len = dfield_get_len(dtuple_field);
+
+ rec_f_data = rec_get_nth_field(rec, offsets,
+ cur_field, &rec_f_len);
+
+ if (rec_offs_nth_extern(offsets, cur_field)) {
+ /* We do not compare to an externally stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
+ ret = cmp_data_data(mtype, prtype, dtuple_f_data, dtuple_f_len,
+ rec_f_data, rec_f_len);
+ if (ret != 0) {
+ goto order_resolved;
+ }
+
+ cur_field++;
+ }
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ ut_ad((ret >= - 1) && (ret <= 1));
+
+ *matched_fields = cur_field;
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
new file mode 100644
index 00000000000..0d7b7c16785
--- /dev/null
+++ b/storage/innobase/rem/rem0rec.cc
@@ -0,0 +1,1963 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file rem/rem0rec.cc
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "rem0rec.h"
+
+#ifdef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#include "page0page.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "fts0fts.h"
+
+/* PHYSICAL RECORD (OLD STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(in the diagram below, lower addresses and more significant bits
+inside a byte appear on a higher text line):
+
+| offset of the end of the last field of data; the most significant
+  bit is set to 1 if and only if the field is SQL-null.
+  If the offset is 2-byte, then the second most significant
+  bit is set to 1 if the field is stored on another page:
+  mostly this occurs in the case of big BLOB fields |
+...
+| offset of the end of the first field of data + the SQL-null bit |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 10 bits giving the number of fields in this record |
+| 1 bit which is set to 1 if the offsets above are given in
+ one byte format, 0 if in two byte format |
+| two bytes giving an absolute pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offsets of the first fields are near the
+origin, perhaps giving a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are fewer than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset; instead, it indicates SQL-null
+if the bit is set to 1. */
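+
+/* As an illustrative example of the one-byte offset format, the
+record ("AB", SQL-NULL) would be stored as
+
+	0x82 0x02 | 6 extra bytes | 'A' 'B'
+
+where 0x02 is the end offset of the first field, 0x82 is the end
+offset of the second field with the SQL-null bit (0x80) set, and the
+6 extra bytes hold the info bits, n_owned, heap order number, field
+count, offset-size flag and next-record pointer described above. */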
+
+/* PHYSICAL RECORD (NEW STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(in the diagram below, lower addresses and more significant bits
+inside a byte appear on a higher text line):
+
+| length of the last non-null variable-length field of data:
+  if the maximum length is at most 255, one byte; otherwise,
+ 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes,
+ length=128..16383, extern storage flag) |
+...
+| length of first variable-length field of data |
+| SQL-null flags (1 bit per nullable field), padded to full bytes |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree),
+ 010=infimum, 011=supremum, 1xx=reserved |
+| two bytes giving a relative pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offsets of the first fields are near the
+origin, perhaps giving a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are fewer than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset; instead, it indicates SQL-null
+if the bit is set to 1. */
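+
+/* As an illustrative example, the record ("AB", SQL-NULL) in a table
+with two nullable variable-length columns would be preceded, going
+towards lower addresses from the origin, by the 5 fixed extra bytes,
+one null-flag byte 0x02 (bit 1 set: the second nullable field is NULL,
+so no length byte is stored for it), and one length byte 0x02 for the
+first field; the bytes 'A' 'B' follow at the origin. */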
+
+/* CANONICAL COORDINATES. A record can be seen as a single
+string of 'characters' in the following way: catenate the bytes
+in each field, in the order of fields. An SQL-null field
+is taken to be an empty sequence of bytes. Then after
+the position of each field insert in the string
+the 'character' <FIELD-END>, except that after an SQL-null field
+insert <NULL-FIELD-END>. Now the ordinal position of each
+byte in this canonical string is its canonical coordinate.
+So, for the record ("AA", SQL-NULL, "BB", ""), the canonical
+string is "AA<FIELD_END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>".
+We identify prefixes (= initial segments) of a record
+with prefixes of the canonical string. The canonical
+length of the prefix is the length of the corresponding
+prefix of the canonical string. The canonical length of
+a record is the length of its canonical string.
+
+For example, the maximal common prefix of records
+("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C")
+is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical
+length is 5.
+
+A complete-field prefix of a record is a prefix which ends at the
+end of some field (containing also <FIELD-END>).
+A record is a complete-field prefix of another record, if
+the corresponding canonical strings have the same property. */
+
+/* this is used to fool the compiler in rec_validate */
+UNIV_INTERN ulint rec_dummy;
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+{
+ const byte* nulls;
+ const byte* lens;
+ ulint null_mask;
+ ulint n_extern;
+ ulint i;
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+ ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index));
+
+ if (n == ULINT_UNDEFINED) {
+ n = dict_index_get_n_fields(index);
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ null_mask = 1;
+ n_extern = 0;
+ i = 0;
+
+ /* read the lengths of fields 0..n */
+ do {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint len;
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields. */
+ continue;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ if (len & 0x40) {
+ n_extern++;
+ }
+ lens--;
+ }
+ }
+ }
+ } while (++i < n);
+
+ return(n_extern);
+}
+
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT. This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INLINE __attribute__((nonnull))
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+ const rec_t* rec, /*!< in: physical record in
+ ROW_FORMAT=COMPACT */
+ bool temp, /*!< in: whether to use the
+ format for temporary files in
+ index creation */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+{
+ ulint i = 0;
+ ulint offs = 0;
+ ulint any_ext = 0;
+ ulint n_null = index->n_nullable;
+ const byte* nulls = temp
+ ? rec - 1
+ : rec - (1 + REC_N_NEW_EXTRA_BYTES);
+ const byte* lens = nulls - UT_BITS_IN_BYTES(n_null);
+ ulint null_mask = 1;
+
+#ifdef UNIV_DEBUG
+ /* We cannot invoke rec_offs_make_valid() here if temp=true.
+ Similarly, rec_offs_validate() will fail in that case, because
+ it invokes rec_get_status(). */
+ offsets[2] = (ulint) rec;
+ offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+ ut_ad(temp || dict_table_is_comp(index->table));
+
+ if (temp && dict_table_is_comp(index->table)) {
+		/* No need to adjust fixed_len=0 here. We only need to
+		adjust it for ROW_FORMAT=REDUNDANT. */
+ temp = false;
+ }
+
+ /* read the lengths of fields 0..n */
+ do {
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint len;
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+ ut_ad(n_null--);
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (!field->fixed_len
+ || (temp && !dict_col_get_fixed_size(col, temp))) {
+ /* Variable-length field: read the length */
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype
+ == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len <<= 8;
+ len |= *lens--;
+
+ offs += len & 0x3fff;
+ if (UNIV_UNLIKELY(len
+ & 0x4000)) {
+ ut_ad(dict_index_is_clust
+ (index));
+ any_ext = REC_OFFS_EXTERNAL;
+ len = offs
+ | REC_OFFS_EXTERNAL;
+ } else {
+ len = offs;
+ }
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets)
+ = (rec - (lens + 1)) | REC_OFFS_COMPACT | any_ext;
+}
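+
+/* To illustrate the two-byte length encoding decoded above: a field
+length of 300 bytes (0x12C) in a BLOB column is stored as the bytes
+0x81 0x2C, i.e. 0x80 | (300 >> 8) followed by 300 & 0xFF; if the field
+is stored externally, the 0x40 flag is also set in the first byte,
+giving 0xC1 0x2C. The decoder masks the two bytes with 0x3fff to
+obtain the length and tests 0x4000 for the extern flag. */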
+
+/******************************************************//**
+The following function determines the offsets to each field in the
+record. The offsets are written to a previously allocated array of
+ulint, where rec_offs_n_fields(offsets) has been initialized to the
+number of fields in the record. The rest of the array will be
+initialized by this function. rec_offs_base(offsets)[0] will be set
+to the extra size (if REC_OFFS_COMPACT is set, the record is in the
+new format; if REC_OFFS_EXTERNAL is set, the record contains externally
+stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to
+offsets past the end of fields 0..n_fields, or to the beginning of
+fields 1..n_fields+1. When the high-order bit of the offset at [i+1]
+is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second
+high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the
+field i is being stored externally. */
+static
+void
+rec_init_offsets(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+{
+ ulint i = 0;
+ ulint offs;
+
+ rec_offs_make_valid(rec, index, offsets);
+
+ if (dict_table_is_comp(index->table)) {
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint status = rec_get_status(rec);
+ ulint n_node_ptr_field = ULINT_UNDEFINED;
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* the field is 8 bytes long */
+ rec_offs_base(offsets)[0]
+ = REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT;
+ rec_offs_base(offsets)[1] = 8;
+ return;
+ case REC_STATUS_NODE_PTR:
+ n_node_ptr_field
+ = dict_index_get_n_unique_in_tree(index);
+ break;
+ case REC_STATUS_ORDINARY:
+ rec_init_offsets_comp_ordinary(
+ rec, false, index, offsets);
+ return;
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ offs = 0;
+ null_mask = 1;
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs += REC_NODE_PTR_SIZE;
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype
+ & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens--;
+ /* If the maximum length of the field
+ is up to 255 bytes, the actual length
+ is always stored in one byte. If the
+ maximum length is more than 255 bytes,
+ the actual length is stored in one
+ byte for 0..127. The length will be
+ encoded in two bytes when it is 128 or
+ more, or when the field is stored
+ externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype
+ == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+
+ len <<= 8;
+ len |= *lens--;
+
+ /* B-tree node pointers
+ must not contain externally
+ stored columns. Thus
+ the "e" flag must be 0. */
+ ut_a(!(len & 0x4000));
+ offs += len & 0x3fff;
+ len = offs;
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets)
+ = (rec - (lens + 1)) | REC_OFFS_COMPACT;
+ } else {
+ /* Old-style record: determine extra size and end offsets */
+ offs = REC_N_OLD_EXTRA_BYTES;
+ if (rec_get_1byte_offs_flag(rec)) {
+ offs += rec_offs_n_fields(offsets);
+ *rec_offs_base(offsets) = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_1_get_field_end_info(rec, i);
+ if (offs & REC_1BYTE_SQL_NULL_MASK) {
+ offs &= ~REC_1BYTE_SQL_NULL_MASK;
+ offs |= REC_OFFS_SQL_NULL;
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < rec_offs_n_fields(offsets));
+ } else {
+ offs += 2 * rec_offs_n_fields(offsets);
+ *rec_offs_base(offsets) = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_2_get_field_end_info(rec, i);
+ if (offs & REC_2BYTE_SQL_NULL_MASK) {
+ offs &= ~REC_2BYTE_SQL_NULL_MASK;
+ offs |= REC_OFFS_SQL_NULL;
+ }
+ if (offs & REC_2BYTE_EXTERN_MASK) {
+ offs &= ~REC_2BYTE_EXTERN_MASK;
+ offs |= REC_OFFS_EXTERNAL;
+ *rec_offs_base(offsets) |= REC_OFFS_EXTERNAL;
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < rec_offs_n_fields(offsets));
+ }
+ }
+}
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously returned array.
+@return the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: array consisting of
+ offsets[0] allocated elements,
+ or an array from rec_get_offsets(),
+ or NULL */
+ ulint n_fields,/*!< in: maximum number of
+ initialized fields
+ (ULINT_UNDEFINED if all fields) */
+#ifdef UNIV_DEBUG
+ const char* file, /*!< in: file name where called */
+ ulint line, /*!< in: line number where called */
+#endif /* UNIV_DEBUG */
+ mem_heap_t** heap) /*!< in/out: memory heap */
+{
+ ulint n;
+ ulint size;
+
+ ut_ad(rec);
+ ut_ad(index);
+ ut_ad(heap);
+
+ if (dict_table_is_comp(index->table)) {
+ switch (UNIV_EXPECT(rec_get_status(rec),
+ REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ n = dict_index_get_n_fields(index);
+ break;
+ case REC_STATUS_NODE_PTR:
+ /* Node pointer records consist of the
+ uniquely identifying fields of the record
+ followed by a child page number field. */
+ n = dict_index_get_n_unique_in_tree(index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record */
+ n = 1;
+ break;
+ default:
+ ut_error;
+ return(NULL);
+ }
+ } else {
+ n = rec_get_n_fields_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(n_fields < n)) {
+ n = n_fields;
+ }
+
+ /* The offsets header consists of the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes. */
+ size = n + (1 + REC_OFFS_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(!offsets)
+ || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) {
+ if (UNIV_UNLIKELY(!*heap)) {
+ *heap = mem_heap_create_at(size * sizeof(ulint),
+ file, line);
+ }
+ offsets = static_cast<ulint*>(
+ mem_heap_alloc(*heap, size * sizeof(ulint)));
+
+ rec_offs_set_n_alloc(offsets, size);
+ }
+
+ rec_offs_set_n_fields(offsets, n);
+ rec_init_offsets(rec, index, offsets);
+ return(offsets);
+}
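+
+/* A sketch of the typical caller-side idiom for the function above
+(the stack-allocated array avoids a heap allocation for ordinary
+records; a heap is created only when more space is needed):
+
+	mem_heap_t*	heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets	= offsets_;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	...
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+*/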
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ ulint* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+{
+ ulint n;
+ ulint i;
+ ulint offs;
+ ulint any_ext;
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint n_node_ptr_field;
+
+ ut_ad(extra);
+ ut_ad(index);
+ ut_ad(offsets);
+ ut_ad(dict_table_is_comp(index->table));
+
+ if (UNIV_UNLIKELY(node_ptr)) {
+ n_node_ptr_field = dict_index_get_n_unique_in_tree(index);
+ n = n_node_ptr_field + 1;
+ } else {
+ n_node_ptr_field = ULINT_UNDEFINED;
+ n = dict_index_get_n_fields(index);
+ }
+
+ ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE));
+ rec_offs_set_n_fields(offsets, n);
+
+ nulls = extra;
+ lens = nulls + UT_BITS_IN_BYTES(index->n_nullable);
+ i = offs = 0;
+ null_mask = 1;
+ any_ext = 0;
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs += REC_NODE_PTR_SIZE;
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls++;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens++;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len <<= 8;
+ len |= *lens++;
+
+ offs += len & 0x3fff;
+ if (UNIV_UNLIKELY(len & 0x4000)) {
+ any_ext = REC_OFFS_EXTERNAL;
+ len = offs | REC_OFFS_EXTERNAL;
+ } else {
+ len = offs;
+ }
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ ut_ad(lens >= extra);
+ *rec_offs_base(offsets) = (lens - extra + REC_N_NEW_EXTRA_BYTES)
+ | REC_OFFS_COMPACT | any_ext;
+}
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field;
+ UNIV_SQL_NULL if SQL null */
+{
+ ulint os;
+ ulint next_os;
+
+ ut_ad(len);
+ ut_a(rec);
+ ut_a(n < rec_get_n_fields_old(rec));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ os = rec_1_get_field_start_offs(rec, n);
+
+ next_os = rec_1_get_field_end_info(rec, n);
+
+ if (next_os & REC_1BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ os = rec_2_get_field_start_offs(rec, n);
+
+ next_os = rec_2_get_field_end_info(rec, n);
+
+ if (next_os & REC_2BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
+ | REC_2BYTE_EXTERN_MASK);
+ }
+
+ *len = next_os - os;
+
+ ut_ad(*len < UNIV_PAGE_SIZE);
+
+ return(os);
+}
+
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INLINE __attribute__((warn_unused_result, nonnull(1,2)))
+ulint
+rec_get_converted_size_comp_prefix_low(
+/*===================================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra, /*!< out: extra size */
+ bool temp) /*!< in: whether this is a
+ temporary file record */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint i;
+ ulint n_null = index->n_nullable;
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ ut_ad(!temp || extra);
+
+ extra_size = temp
+ ? UT_BITS_IN_BYTES(n_null)
+ : REC_N_NEW_EXTRA_BYTES
+ + UT_BITS_IN_BYTES(n_null);
+ data_size = 0;
+
+ if (temp && dict_table_is_comp(index->table)) {
+		/* No need to adjust fixed_len=0 here. We only need to
+		adjust it for ROW_FORMAT=REDUNDANT. */
+ temp = false;
+ }
+
+ /* read the lengths of fields 0..n */
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ ulint len;
+ ulint fixed_len;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ len = dfield_get_len(&fields[i]);
+ col = dict_field_get_col(field);
+
+ ut_ad(dict_col_type_assert_equal(col,
+ dfield_get_type(&fields[i])));
+ /* All NULLable fields must be included in the n_null count. */
+ ut_ad((col->prtype & DATA_NOT_NULL) || n_null--);
+
+ if (dfield_is_null(&fields[i])) {
+ /* No length is stored for NULL fields. */
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ }
+
+ ut_ad(len <= col->len || col->mtype == DATA_BLOB
+ || (col->len == 0 && col->mtype == DATA_VARCHAR));
+
+ fixed_len = field->fixed_len;
+ if (temp && fixed_len
+ && !dict_col_get_fixed_size(col, temp)) {
+ fixed_len = 0;
+ }
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
+
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+ ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
+ ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
+
+ ut_ad(len <= fixed_len);
+
+ ut_ad(!mbmaxlen || len >= mbminlen
+ * (fixed_len / mbmaxlen));
+
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || fixed_len == field->prefix_len);
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(&fields[i])) {
+ ut_ad(col->len >= 256 || col->mtype == DATA_BLOB);
+ extra_size += 2;
+ } else if (len < 128
+ || (col->len < 256 && col->mtype != DATA_BLOB)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+ if (extra) {
+ *extra = extra_size;
+ }
+
+ return(extra_size + data_size);
+}
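+
+/* A worked example of the size computation above: for a single
+nullable VARCHAR(10) column holding the 3-byte value "abc" in a
+non-temporary record, extra_size starts as REC_N_NEW_EXTRA_BYTES (5)
+plus one null-flag byte; the loop then adds one length byte, because
+len = 3 < 128 and col->len < 256, giving extra_size = 7. With
+data_size = 3, the function returns 10. */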
+
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ ut_ad(dict_table_is_comp(index->table));
+ return(rec_get_converted_size_comp_prefix_low(
+ index, fields, n_fields, extra, false));
+}
+
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ ulint size;
+ ut_ad(n_fields > 0);
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields == dict_index_get_n_fields(index));
+ size = 0;
+ break;
+ case REC_STATUS_NODE_PTR:
+ n_fields--;
+ ut_ad(n_fields == dict_index_get_n_unique_in_tree(index));
+ ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE);
+ size = REC_NODE_PTR_SIZE; /* child page number */
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record, 8 data bytes */
+ if (UNIV_LIKELY_NULL(extra)) {
+ *extra = REC_N_NEW_EXTRA_BYTES;
+ }
+ return(REC_N_NEW_EXTRA_BYTES + 8);
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ }
+
+ return(size + rec_get_converted_size_comp_prefix_low(
+ index, fields, n_fields, extra, false));
+}
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint i, /*!< in: ith field */
+ ibool val) /*!< in: value to set */
+{
+ ulint info;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ info = rec_1_get_field_end_info(rec, i);
+
+ if (val) {
+ info = info | REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ info = info & ~REC_1BYTE_SQL_NULL_MASK;
+ }
+
+ rec_1_set_field_end_info(rec, i, info);
+
+ return;
+ }
+
+ info = rec_2_get_field_end_info(rec, i);
+
+ if (val) {
+ info = info | REC_2BYTE_SQL_NULL_MASK;
+ } else {
+ info = info & ~REC_2BYTE_SQL_NULL_MASK;
+ }
+
+ rec_2_set_field_end_info(rec, i, info);
+}
+
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint offset;
+
+ offset = rec_get_field_start_offs(rec, n);
+
+ data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n));
+
+ rec_set_nth_field_null_bit(rec, n, TRUE);
+}
+
+/*********************************************************//**
+Builds an old-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_old(
+/*==========================*/
+ byte* buf, /*!< in: start address of the physical record */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint data_size;
+ rec_t* rec;
+ ulint end_offset;
+ ulint ored_offset;
+ ulint len;
+ ulint i;
+
+ ut_ad(buf && dtuple);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ n_fields = dtuple_get_n_fields(dtuple);
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ ut_ad(n_fields > 0);
+
+ /* Calculate the offset of the origin in the physical record */
+
+ rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext);
+#ifdef UNIV_DEBUG
+ /* Suppress Valgrind warnings of ut_ad()
+ in mach_write_to_1(), mach_write_to_2() et al. */
+ memset(buf, 0xff, rec - buf + data_size);
+#endif /* UNIV_DEBUG */
+ /* Store the number of fields */
+ rec_set_n_fields_old(rec, n_fields);
+
+ /* Set the info bits of the record */
+ rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple)
+ & REC_INFO_BITS_MASK);
+
+ /* Store the data and the offsets */
+
+ end_offset = 0;
+
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ rec_set_1byte_offs_flag(rec, TRUE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+ memcpy(rec + end_offset,
+ dfield_get_data(field), len);
+
+ end_offset += len;
+ ored_offset = end_offset;
+ }
+
+ rec_1_set_field_end_info(rec, i, ored_offset);
+ }
+ } else {
+ rec_set_1byte_offs_flag(rec, FALSE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_2BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+ memcpy(rec + end_offset,
+ dfield_get_data(field), len);
+
+ end_offset += len;
+ ored_offset = end_offset;
+
+ if (dfield_is_ext(field)) {
+ ored_offset |= REC_2BYTE_EXTERN_MASK;
+ }
+ }
+
+ rec_2_set_field_end_info(rec, i, ored_offset);
+ }
+ }
+
+ return(rec);
+}
+
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INLINE __attribute__((nonnull))
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+ rec_t* rec, /*!< in: origin of record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint status, /*!< in: status bits of the record */
+ bool temp) /*!< in: whether to use the
+ format for temporary files in
+ index creation */
+{
+ const dfield_t* field;
+ const dtype_t* type;
+ byte* end;
+ byte* nulls;
+ byte* lens;
+ ulint len;
+ ulint i;
+ ulint n_node_ptr_field;
+ ulint fixed_len;
+ ulint null_mask = 1;
+ ulint n_null;
+
+ ut_ad(temp || dict_table_is_comp(index->table));
+ ut_ad(n_fields > 0);
+
+ if (temp) {
+ ut_ad(status == REC_STATUS_ORDINARY);
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ n_node_ptr_field = ULINT_UNDEFINED;
+ nulls = rec - 1;
+ if (dict_table_is_comp(index->table)) {
+			/* No need to adjust fixed_len=0 here. We only
+			need to adjust it for ROW_FORMAT=REDUNDANT. */
+ temp = false;
+ }
+ } else {
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ n_node_ptr_field = ULINT_UNDEFINED;
+ break;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(n_fields
+ == dict_index_get_n_unique_in_tree(index) + 1);
+ n_node_ptr_field = n_fields - 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad(n_fields == 1);
+ n_node_ptr_field = ULINT_UNDEFINED;
+ break;
+ default:
+ ut_error;
+ return;
+ }
+ }
+
+ end = rec;
+ n_null = index->n_nullable;
+ lens = nulls - UT_BITS_IN_BYTES(n_null);
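+	/* The null flags and the length bytes grow downwards from the
+	record origin: e.g. with 9 nullable columns there are
+	UT_BITS_IN_BYTES(9) = 2 null-flag bytes, and the length bytes
+	are stored below them. */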
+ /* clear the SQL-null flags */
+ memset(lens + 1, 0, nulls - lens);
+
+ /* Store the data and the offsets */
+
+ for (i = 0, field = fields; i < n_fields; i++, field++) {
+ const dict_field_t* ifield;
+
+ type = dfield_get_type(field);
+ len = dfield_get_len(field);
+
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL);
+ ut_ad(len == REC_NODE_PTR_SIZE);
+ memcpy(end, dfield_get_data(field), len);
+ end += REC_NODE_PTR_SIZE;
+ break;
+ }
+
+ if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
+ /* nullable field */
+ ut_ad(n_null--);
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ ut_ad(*nulls < null_mask);
+
+ /* set the null flag if necessary */
+ if (dfield_is_null(field)) {
+ *nulls |= null_mask;
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+ /* only nullable fields can be null */
+ ut_ad(!dfield_is_null(field));
+
+ ifield = dict_index_get_nth_field(index, i);
+ fixed_len = ifield->fixed_len;
+ if (temp && fixed_len
+ && !dict_col_get_fixed_size(ifield->col, temp)) {
+ fixed_len = 0;
+ }
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
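+		/* For example, a length of 200 in a column longer than
+		255 bytes is encoded as the two bytes (200 >> 8) | 0x80
+		and (byte) 200, that is 0x80, 0xc8; for an externally
+		stored field the first byte carries the 0xc0 marker
+		instead. */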
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+ ulint mbminlen = DATA_MBMINLEN(
+ ifield->col->mbminmaxlen);
+ ulint mbmaxlen = DATA_MBMAXLEN(
+ ifield->col->mbminmaxlen);
+
+ ut_ad(len <= fixed_len);
+ ut_ad(!mbmaxlen || len >= mbminlen
+ * (fixed_len / mbmaxlen));
+ ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(field)) {
+ ut_ad(ifield->col->len >= 256
+ || ifield->col->mtype == DATA_BLOB);
+ ut_ad(len <= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ *lens-- = (byte) (len >> 8) | 0xc0;
+ *lens-- = (byte) len;
+ } else {
+ ut_ad(len <= dtype_get_len(type)
+ || dtype_get_mtype(type) == DATA_BLOB
+ || !strcmp(index->name,
+ FTS_INDEX_TABLE_IND_NAME));
+ if (len < 128
+ || (dtype_get_len(type) < 256
+ && dtype_get_mtype(type) != DATA_BLOB)) {
+
+ *lens-- = (byte) len;
+ } else {
+ ut_ad(len < 16384);
+ *lens-- = (byte) (len >> 8) | 0x80;
+ *lens-- = (byte) len;
+ }
+ }
+
+ memcpy(end, dfield_get_data(field), len);
+ end += len;
+ }
+}
+
+/*********************************************************//**
+Builds a new-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_new(
+/*==========================*/
+ byte* buf, /*!< in: start address of
+ the physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple) /*!< in: data tuple */
+{
+ ulint extra_size;
+ ulint status;
+ rec_t* rec;
+
+ status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK;
+ rec_get_converted_size_comp(
+ index, status, dtuple->fields, dtuple->n_fields, &extra_size);
+ rec = buf + extra_size;
+
+ rec_convert_dtuple_to_rec_comp(
+ rec, index, dtuple->fields, dtuple->n_fields, status, false);
+
+ /* Set the info bits of the record */
+ rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple));
+
+ return(rec);
+}
+
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+{
+ rec_t* rec;
+
+ ut_ad(buf && index && dtuple);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ if (dict_table_is_comp(index->table)) {
+ rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple);
+ } else {
+ rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ ulint i;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index,
+ offsets_, ULINT_UNDEFINED, &heap);
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(dtuple_get_n_fields(dtuple)
+ == rec_offs_n_fields(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ut_ad(!dfield_is_ext(dtuple_get_nth_field(dtuple, i))
+ == !rec_offs_nth_extern(offsets, i));
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+ return(rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Determines the size of a data tuple prefix in a temporary file.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_temp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ return(rec_get_converted_size_comp_prefix_low(
+ index, fields, n_fields, extra, true));
+}
+
+/******************************************************//**
+Determine the offset to each field in temporary file.
+@see rec_convert_dtuple_to_temp() */
+UNIV_INTERN
+void
+rec_init_offsets_temp(
+/*==================*/
+ const rec_t* rec, /*!< in: temporary file record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+{
+ rec_init_offsets_comp_ordinary(rec, true, index, offsets);
+}
+
+/*********************************************************//**
+Builds a temporary file record out of a data tuple.
+@see rec_init_offsets_temp() */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_temp(
+/*=======================*/
+ rec_t* rec, /*!< out: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields) /*!< in: number of fields */
+{
+ rec_convert_dtuple_to_rec_comp(rec, index, fields, n_fields,
+ REC_STATUS_ORDINARY, true);
+}
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple. The fields
+are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /*!< out: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap);
+
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(dtuple_check_typed(tuple));
+
+ dtuple_set_info_bits(tuple, rec_get_info_bits(
+ rec, dict_table_is_comp(index->table)));
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_t* field;
+ const byte* data;
+ ulint len;
+
+ field = dtuple_get_nth_field(tuple, i);
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ dfield_set_data(field,
+ mem_heap_dup(heap, data, len), len);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ } else {
+ dfield_set_null(field);
+ }
+ }
+}
+
+/**************************************************************//**
+Copies the first n fields of an old-style physical record
+to a new physical record in a buffer.
+@return own: copied record */
+static
+rec_t*
+rec_copy_prefix_to_buf_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint n_fields, /*!< in: number of fields to copy */
+ ulint area_end, /*!< in: end of the prefix data */
+ byte** buf, /*!< in/out: memory buffer for
+ the copied prefix, or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ rec_t* copy_rec;
+ ulint area_start;
+ ulint prefix_len;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ area_start = REC_N_OLD_EXTRA_BYTES + n_fields;
+ } else {
+ area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields;
+ }
+
+ prefix_len = area_start + area_end;
+
+ if ((*buf == NULL) || (*buf_size < prefix_len)) {
+ if (*buf != NULL) {
+ mem_free(*buf);
+ }
+
+ *buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
+ }
+
+ ut_memcpy(*buf, rec - area_start, prefix_len);
+
+ copy_rec = *buf + area_start;
+
+ rec_set_n_fields_old(copy_rec, n_fields);
+
+ return(copy_rec);
+}
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ const byte* nulls;
+ const byte* lens;
+ ulint i;
+ ulint prefix_len;
+ ulint null_mask;
+ ulint status;
+
+ UNIV_PREFETCH_RW(*buf);
+
+ if (!dict_table_is_comp(index->table)) {
+ ut_ad(rec_validate_old(rec));
+ return(rec_copy_prefix_to_buf_old(
+ rec, n_fields,
+ rec_get_field_start_offs(rec, n_fields),
+ buf, buf_size));
+ }
+
+ status = rec_get_status(rec);
+
+ switch (status) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ break;
+ case REC_STATUS_NODE_PTR:
+ /* it doesn't make sense to copy the child page number field */
+ ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index));
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record: no sense to copy anything */
+ default:
+ ut_error;
+ return(NULL);
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ UNIV_PREFETCH_R(lens);
+ prefix_len = 0;
+ null_mask = 1;
+
+	/* read the lengths of fields 0..n_fields-1 */
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+
+ if (field->fixed_len) {
+ prefix_len += field->fixed_len;
+ } else {
+ ulint len = *lens--;
+ /* If the maximum length of the column is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the column is stored externally. */
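+			/* For example, the two stored bytes 0x80, 0xc8
+			decode to ((0x80 & 0x3f) << 8) | 0xc8 = 200. */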
+ if (col->len > 255 || col->mtype == DATA_BLOB) {
+ if (len & 0x80) {
+ /* 1exxxxxx */
+ len &= 0x3f;
+ len <<= 8;
+ len |= *lens--;
+ UNIV_PREFETCH_R(lens);
+ }
+ }
+ prefix_len += len;
+ }
+ }
+
+ UNIV_PREFETCH_R(rec + prefix_len);
+
+ prefix_len += rec - (lens + 1);
+
+ if ((*buf == NULL) || (*buf_size < prefix_len)) {
+ if (*buf != NULL) {
+ mem_free(*buf);
+ }
+
+ *buf = static_cast<byte*>(mem_alloc2(prefix_len, buf_size));
+ }
+
+ memcpy(*buf, lens + 1, prefix_len);
+
+ return(*buf + (rec - (lens + 1)));
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint sum = 0;
+ ulint i;
+
+ ut_a(rec);
+ n_fields = rec_get_n_fields_old(rec);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+ (ulong) n_fields);
+ return(FALSE);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+ fprintf(stderr,
+ "InnoDB: Error: record field %lu len %lu\n",
+ (ulong) i,
+ (ulong) len);
+ return(FALSE);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ len_sum += len;
+			sum += *(data + len - 1); /* dereference the
+ end of the field to
+ cause a memory trap
+ if possible */
+ } else {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+
+ if (len_sum != rec_get_data_size_old(rec)) {
+ fprintf(stderr,
+ "InnoDB: Error: record len should be %lu, len %lu\n",
+ (ulong) len_sum,
+			(ulong) rec_get_data_size_old(rec));
+ return(FALSE);
+ }
+
+ rec_dummy = sum; /* This is here only to fool the compiler */
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* data;
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint sum = 0;
+ ulint i;
+
+ ut_a(rec);
+ n_fields = rec_offs_n_fields(offsets);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+ (ulong) n_fields);
+ return(FALSE);
+ }
+
+ ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec));
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+ fprintf(stderr,
+ "InnoDB: Error: record field %lu len %lu\n",
+ (ulong) i,
+ (ulong) len);
+ return(FALSE);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ len_sum += len;
+			sum += *(data + len - 1); /* dereference the
+ end of the field to
+ cause a memory trap
+ if possible */
+ } else if (!rec_offs_comp(offsets)) {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+
+ if (len_sum != rec_offs_data_size(offsets)) {
+ fprintf(stderr,
+ "InnoDB: Error: record len should be %lu, len %lu\n",
+ (ulong) len_sum,
+ (ulong) rec_offs_data_size(offsets));
+ return(FALSE);
+ }
+
+ rec_dummy = sum; /* This is here only to fool the compiler */
+
+ if (!rec_offs_comp(offsets)) {
+ ut_a(rec_validate_old(rec));
+ }
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n;
+ ulint i;
+
+ ut_ad(rec);
+
+ n = rec_get_n_fields_old(rec);
+
+ fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+ " %u-byte offsets; info bits %lu\n",
+ (ulong) n,
+ rec_get_1byte_offs_flag(rec) ? 1 : 2,
+ (ulong) rec_get_info_bits(rec, FALSE));
+
+ for (i = 0; i < n; i++) {
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ } else {
+ fprintf(file, " SQL NULL, size %lu ",
+ rec_get_nth_field_size(rec, i));
+ }
+
+ putc(';', file);
+ putc('\n', file);
+ }
+
+ rec_validate_old(rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ ut_print_buf(file, data, 30);
+ fprintf(file, " (total %lu bytes, external)",
+ (ulong) len);
+ ut_print_buf(file, data + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ } else {
+ fputs(" SQL NULL", file);
+ }
+ putc(';', file);
+ putc('\n', file);
+ }
+}
+
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec);
+ ut_ad(offsets);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_comp(offsets)) {
+ rec_print_old(file, rec);
+ return;
+ }
+
+ fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+ " compact format; info bits %lu\n",
+ (ulong) rec_offs_n_fields(offsets),
+ (ulong) rec_get_info_bits(rec, TRUE));
+
+ rec_print_comp(file, rec, offsets);
+ rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ rec_print_old(file, rec);
+ return;
+ } else {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ rec_print_new(file, rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+}
+
+# ifdef UNIV_DEBUG
+/************************************************************//**
+Reads the DB_TRX_ID of a clustered index record.
+@return the value of DB_TRX_ID */
+UNIV_INTERN
+trx_id_t
+rec_get_trx_id(
+/*===========*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index) /*!< in: clustered index */
+{
+ const page_t* page
+ = page_align(rec);
+ ulint trx_id_col
+ = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+ const byte* trx_id;
+ ulint len;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID)
+ == index->id);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ offsets = rec_get_offsets(rec, index, offsets, trx_id_col + 1, &heap);
+
+ trx_id = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ return(trx_read_trx_id(trx_id));
+}
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/row/row0ext.cc b/storage/innobase/row/row0ext.cc
new file mode 100644
index 00000000000..32b78391d6a
--- /dev/null
+++ b/storage/innobase/row/row0ext.cc
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.cc
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+
+#ifdef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#include "btr0cur.h"
+
+/********************************************************************//**
+Fills the column prefix cache of an externally stored column. */
+static
+void
+row_ext_cache_fill(
+/*===============*/
+ row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ const dfield_t* dfield) /*!< in: data field */
+{
+ const byte* field = static_cast<const byte*>(
+ dfield_get_data(dfield));
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * ext->max_len;
+
+ ut_ad(ext->max_len > 0);
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ if (ext->max_len == REC_VERSION_56_MAX_INDEX_COL_LEN
+ && f_len > BTR_EXTERN_FIELD_REF_SIZE) {
+			/* In this case, the field is in Barracuda
+			format or beyond (refer to the definition of
+			row_ext_t::max_len) and already contains the
+			prefix; otherwise f_len would be
+			BTR_EXTERN_FIELD_REF_SIZE. So there is no need
+			to re-read the prefix externally; just copy the
+			local prefix to buf. Note that ext->len[i] == 0
+			would mean an error, as above. */
+ memcpy(buf, field, f_len - BTR_EXTERN_FIELD_REF_SIZE);
+ ext->len[i] = f_len - BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+ /* Fetch at most ext->max_len of the column.
+ The column should be non-empty. However,
+ trx_rollback_or_clean_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, ext->max_len, zip_size, field, f_len);
+ }
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ ulint flags, /*!< in: table->flags */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ ulint i;
+ ulint zip_size = dict_tf_get_zip_size(flags);
+
+ row_ext_t* ret;
+
+ ut_ad(n_ext > 0);
+
+ ret = static_cast<row_ext_t*>(
+ mem_heap_alloc(heap,
+ (sizeof *ret) + (n_ext - 1) * sizeof ret->len));
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_ZIP_SIZE_MAX);
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->max_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags);
+
+ ret->buf = static_cast<byte*>(
+ mem_heap_alloc(heap, n_ext * ret->max_len));
+
+#ifdef UNIV_DEBUG
+ memset(ret->buf, 0xaa, n_ext * ret->max_len);
+ UNIV_MEM_ALLOC(ret->buf, n_ext * ret->max_len);
+#endif
+
+ /* Fetch the BLOB prefixes */
+ for (i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, zip_size, dfield);
+ }
+
+ return(ret);
+}
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
new file mode 100644
index 00000000000..6da430a03d6
--- /dev/null
+++ b/storage/innobase/row/row0ftsort.cc
@@ -0,0 +1,1570 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ftsort.cc
+Create Full Text Index with (parallel) merge sort
+
+Created 10/13/2010 Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h" /* dict_table_stats_lock() */
+#include "row0merge.h"
+#include "pars0pars.h"
+#include "row0ftsort.h"
+#include "row0merge.h"
+#include "row0row.h"
+#include "btr0cur.h"
+
+/** Read the next record to buffer N.
+@param N index into array of merge info structure */
+#define ROW_MERGE_READ_GET_NEXT(N) \
+ do { \
+ b[N] = row_merge_read_rec( \
+ block[N], buf[N], b[N], index, \
+ fd[N], &foffs[N], &mrec[N], offsets[N]); \
+ if (UNIV_UNLIKELY(!b[N])) { \
+ if (mrec[N]) { \
+ goto exit; \
+ } \
+ } \
+ } while (0)
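+/* Note: this macro assumes that the enclosing function defines the local
+variables b[], block[], buf[], fd[], foffs[], mrec[] and offsets[], the
+record descriptor "index", and an "exit" label to jump to at end of input. */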
+
+/** Parallel sort degree */
+UNIV_INTERN ulong fts_sort_pll_degree = 2;
+
+/*********************************************************************//**
+Create a temporary "fts sort index" used to merge sort the
+tokenized doc string. The index has three "fields":
+
+1) Tokenized word,
+2) Doc ID (depending on the number of records to sort, it can be a 4-byte
+or 8-byte integer value)
+3) Word's position in original doc.
+
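+For example, tokenizing the text "quick brown" of the document with
+Doc ID 7 produces the sort tuples ("quick", 7, 0) and ("brown", 7, 6).
+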
+@return dict_index_t structure for the fts sort index */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_fts_sort_index(
+/*============================*/
+ dict_index_t* index, /*!< in: Original FTS index
+ based on which this sort index
+ is created */
+ const dict_table_t* table, /*!< in: table that FTS index
+ is being created on */
+ ibool* opt_doc_id_size)
+ /*!< out: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+{
+ dict_index_t* new_index;
+ dict_field_t* field;
+ dict_field_t* idx_field;
+ CHARSET_INFO* charset;
+
+ // FIXME: This name shouldn't be hard coded here.
+ new_index = dict_mem_index_create(
+ index->table->name, "tmp_fts_idx", 0, DICT_FTS, 3);
+
+ new_index->id = index->id;
+ new_index->table = (dict_table_t*) table;
+ new_index->n_uniq = FTS_NUM_FIELDS_SORT;
+ new_index->n_def = FTS_NUM_FIELDS_SORT;
+ new_index->cached = TRUE;
+
+ idx_field = dict_index_get_nth_field(index, 0);
+ charset = fts_index_get_charset(index);
+
+ /* The first field is on the Tokenized Word */
+ field = dict_index_get_nth_field(new_index, 0);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->len = FTS_MAX_WORD_LEN;
+
+ if (strcmp(charset->name, "latin1_swedish_ci") == 0) {
+ field->col->mtype = DATA_VARCHAR;
+ } else {
+ field->col->mtype = DATA_VARMYSQL;
+ }
+
+ field->col->prtype = idx_field->col->prtype | DATA_NOT_NULL;
+ field->col->mbminmaxlen = idx_field->col->mbminmaxlen;
+ field->fixed_len = 0;
+
+ /* Doc ID */
+ field = dict_index_get_nth_field(new_index, 1);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+ *opt_doc_id_size = FALSE;
+
+	/* Check whether we can use a 4-byte instead of an 8-byte integer
+	field to hold the Doc ID, thus reducing the overall sort size */
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+ /* If Doc ID column is being added by this create
+ index, then just check the number of rows in the table */
+ if (dict_table_get_n_rows(table) < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ } else {
+ doc_id_t max_doc_id;
+
+		/* If the Doc ID column is supplied by the user, then
+ check the maximum Doc ID in the table */
+ max_doc_id = fts_get_max_doc_id((dict_table_t*) table);
+
+ if (max_doc_id && max_doc_id < MAX_DOC_ID_OPT_VAL) {
+ *opt_doc_id_size = TRUE;
+ }
+ }
+
+ if (*opt_doc_id_size) {
+ field->col->len = sizeof(ib_uint32_t);
+ field->fixed_len = sizeof(ib_uint32_t);
+ } else {
+ field->col->len = FTS_DOC_ID_LEN;
+ field->fixed_len = FTS_DOC_ID_LEN;
+ }
+
+ field->col->prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+
+ field->col->mbminmaxlen = 0;
+
+ /* The third field is on the word's position in the original doc */
+ field = dict_index_get_nth_field(new_index, 2);
+ field->name = NULL;
+ field->prefix_len = 0;
+ field->col = static_cast<dict_col_t*>(
+ mem_heap_alloc(new_index->heap, sizeof(dict_col_t)));
+ field->col->mtype = DATA_INT;
+	field->col->len = 4;
+ field->fixed_len = 4;
+ field->col->prtype = DATA_NOT_NULL;
+ field->col->mbminmaxlen = 0;
+
+ return(new_index);
+}
+/*********************************************************************//**
+Initialize FTS parallel sort structures.
+@return TRUE if all successful */
+UNIV_INTERN
+ibool
+row_fts_psort_info_init(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ row_merge_dup_t* dup, /*!< in,own: descriptor of
+ FTS index being created */
+ const dict_table_t* new_table,/*!< in: table on which indexes are
+ created */
+ ibool opt_doc_id_size,
+ /*!< in: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort */
+ fts_psort_t** psort, /*!< out: parallel sort info to be
+ instantiated */
+ fts_psort_t** merge) /*!< out: parallel merge info
+ to be instantiated */
+{
+ ulint i;
+ ulint j;
+ fts_psort_common_t* common_info = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ ulint block_size;
+ ibool ret = TRUE;
+
+ block_size = 3 * srv_sort_buf_size;
+
+ *psort = psort_info = static_cast<fts_psort_t*>(mem_zalloc(
+ fts_sort_pll_degree * sizeof *psort_info));
+
+ if (!psort_info) {
+ ut_free(dup);
+ return(FALSE);
+ }
+
+ /* Common Info for all sort threads */
+ common_info = static_cast<fts_psort_common_t*>(
+ mem_alloc(sizeof *common_info));
+
+ if (!common_info) {
+ ut_free(dup);
+ mem_free(psort_info);
+ return(FALSE);
+ }
+
+ common_info->dup = dup;
+ common_info->new_table = (dict_table_t*) new_table;
+ common_info->trx = trx;
+ common_info->all_info = psort_info;
+ common_info->sort_event = os_event_create();
+ common_info->merge_event = os_event_create();
+ common_info->opt_doc_id_size = opt_doc_id_size;
+
+ /* There will be FTS_NUM_AUX_INDEX number of "sort buckets" for
+ each parallel sort thread. Each "sort bucket" holds records for
+ a particular "FTS index partition" */
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+
+ UT_LIST_INIT(psort_info[j].fts_doc_list);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+
+ psort_info[j].merge_file[i] =
+ static_cast<merge_file_t*>(
+ mem_zalloc(sizeof(merge_file_t)));
+
+ if (!psort_info[j].merge_file[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ psort_info[j].merge_buf[i] = row_merge_buf_create(
+ dup->index);
+
+			if (row_merge_file_create(psort_info[j].merge_file[i])
+			    < 0) {
+				ret = FALSE;
+				goto func_exit;
+			}
+
+ /* Need to align memory for O_DIRECT write */
+ psort_info[j].block_alloc[i] =
+ static_cast<row_merge_block_t*>(ut_malloc(
+ block_size + 1024));
+
+ psort_info[j].merge_block[i] =
+ static_cast<row_merge_block_t*>(
+ ut_align(
+ psort_info[j].block_alloc[i], 1024));
+
+ if (!psort_info[j].merge_block[i]) {
+ ret = FALSE;
+ goto func_exit;
+ }
+ }
+
+ psort_info[j].child_status = 0;
+ psort_info[j].state = 0;
+ psort_info[j].psort_common = common_info;
+ psort_info[j].error = DB_SUCCESS;
+ psort_info[j].memory_used = 0;
+		mutex_create(fts_pll_tokenize_mutex_key,
+			     &psort_info[j].mutex, SYNC_FTS_TOKENIZE);
+ }
+
+	/* Initialize merge_info structures for the parallel merge and
+	insert into auxiliary FTS tables (FTS_INDEX_TABLE) */
+ *merge = merge_info = static_cast<fts_psort_t*>(
+ mem_alloc(FTS_NUM_AUX_INDEX * sizeof *merge_info));
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+
+ merge_info[j].child_status = 0;
+ merge_info[j].state = 0;
+ merge_info[j].psort_common = common_info;
+ }
+
+func_exit:
+ if (!ret) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ }
+
+ return(ret);
+}
+/*********************************************************************//**
+Clean up and deallocate FTS parallel sort structures, and close the
+merge sort files */
+UNIV_INTERN
+void
+row_fts_psort_info_destroy(
+/*=======================*/
+ fts_psort_t* psort_info, /*!< parallel sort info */
+ fts_psort_t* merge_info) /*!< parallel merge info */
+{
+ ulint i;
+ ulint j;
+
+ if (psort_info) {
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (psort_info[j].merge_file[i]) {
+ row_merge_file_destroy(
+ psort_info[j].merge_file[i]);
+ }
+
+ if (psort_info[j].block_alloc[i]) {
+ ut_free(psort_info[j].block_alloc[i]);
+ }
+ mem_free(psort_info[j].merge_file[i]);
+ }
+
+ mutex_free(&psort_info[j].mutex);
+ }
+
+ os_event_free(merge_info[0].psort_common->sort_event);
+ os_event_free(merge_info[0].psort_common->merge_event);
+ ut_free(merge_info[0].psort_common->dup);
+ mem_free(merge_info[0].psort_common);
+ mem_free(psort_info);
+ }
+
+ if (merge_info) {
+ mem_free(merge_info);
+ }
+}
+/*********************************************************************//**
+Free up merge buffers when merge sort is done */
+UNIV_INTERN
+void
+row_fts_free_pll_merge_buf(
+/*=======================*/
+ fts_psort_t* psort_info) /*!< in: parallel sort info */
+{
+ ulint j;
+ ulint i;
+
+ if (!psort_info) {
+ return;
+ }
+
+ for (j = 0; j < fts_sort_pll_degree; j++) {
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ row_merge_buf_free(psort_info[j].merge_buf[i]);
+ }
+ }
+
+ return;
+}
+
+/*********************************************************************//**
+Tokenize incoming text data and add to the sort buffer.
+@return TRUE if the doc was fully tokenized, FALSE if out of buffer space */
+static
+ibool
+row_merge_fts_doc_tokenize(
+/*=======================*/
+ row_merge_buf_t** sort_buf, /*!< in/out: sort buffer */
+ doc_id_t doc_id, /*!< in: Doc ID */
+ fts_doc_t* doc, /*!< in: Doc to be tokenized */
+ dtype_t* word_dtype, /*!< in: data structure for
+ word col */
+ merge_file_t** merge_file, /*!< in/out: merge file */
+ ibool opt_doc_id_size,/*!< in: whether to use 4 bytes
+ instead of 8 bytes integer to
+ store Doc ID during sort*/
+ fts_tokenize_ctx_t* t_ctx) /*!< in/out: tokenize context */
+{
+ ulint i;
+ ulint inc;
+ fts_string_t str;
+ ulint len;
+ row_merge_buf_t* buf;
+ dfield_t* field;
+ fts_string_t t_str;
+ ibool buf_full = FALSE;
+ byte str_buf[FTS_MAX_WORD_LEN + 1];
+ ulint data_size[FTS_NUM_AUX_INDEX];
+ ulint n_tuple[FTS_NUM_AUX_INDEX];
+
+ t_str.f_n_char = 0;
+ t_ctx->buf_used = 0;
+
+ memset(n_tuple, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+ memset(data_size, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ /* Tokenize the data and add each word string, its corresponding
+ doc id and position to sort buffer */
+ for (i = t_ctx->processed_len; i < doc->text.f_len; i += inc) {
+ ib_rbt_bound_t parent;
+ ulint idx = 0;
+ ib_uint32_t position;
+ ulint offset = 0;
+ ulint cur_len = 0;
+ doc_id_t write_doc_id;
+
+ inc = innobase_mysql_fts_get_token(
+ doc->charset, doc->text.f_str + i,
+ doc->text.f_str + doc->text.f_len, &str, &offset);
+
+ ut_a(inc > 0);
+
+		/* Ignore strings whose character count is less than
+		"fts_min_token_size" or more than "fts_max_token_size" */
+ if (str.f_n_char < fts_min_token_size
+ || str.f_n_char > fts_max_token_size) {
+
+ t_ctx->processed_len += inc;
+ continue;
+ }
+
+ t_str.f_len = innobase_fts_casedn_str(
+ doc->charset, (char*) str.f_str, str.f_len,
+ (char*) &str_buf, FTS_MAX_WORD_LEN + 1);
+
+ t_str.f_str = (byte*) &str_buf;
+
+ /* if "cached_stopword" is defined, ingore words in the
+ stopword list */
+ if (t_ctx->cached_stopword
+ && rbt_search(t_ctx->cached_stopword,
+ &parent, &t_str) == 0) {
+
+ t_ctx->processed_len += inc;
+ continue;
+ }
+
+		/* There are FTS_NUM_AUX_INDEX auxiliary tables; find
+		out which sort buffer to put this word record in */
+ t_ctx->buf_used = fts_select_index(
+ doc->charset, t_str.f_str, t_str.f_len);
+
+ buf = sort_buf[t_ctx->buf_used];
+
+ ut_a(t_ctx->buf_used < FTS_NUM_AUX_INDEX);
+ idx = t_ctx->buf_used;
+
+ mtuple_t* mtuple = &buf->tuples[buf->n_tuples + n_tuple[idx]];
+
+ field = mtuple->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap,
+ FTS_NUM_FIELDS_SORT * sizeof *field));
+
+ /* The first field is the tokenized word */
+ dfield_set_data(field, t_str.f_str, t_str.f_len);
+ len = dfield_get_len(field);
+
+ field->type.mtype = word_dtype->mtype;
+ field->type.prtype = word_dtype->prtype | DATA_NOT_NULL;
+
+ /* Variable length field, set to max size. */
+ field->type.len = FTS_MAX_WORD_LEN;
+ field->type.mbminmaxlen = word_dtype->mbminmaxlen;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+ field++;
+
+ /* The second field is the Doc ID */
+
+ ib_uint32_t doc_id_32_bit;
+
+ if (!opt_doc_id_size) {
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+ } else {
+ mach_write_to_4(
+ (byte*) &doc_id_32_bit, (ib_uint32_t) doc_id);
+
+ dfield_set_data(
+ field, &doc_id_32_bit, sizeof(doc_id_32_bit));
+ }
+
+ len = field->len;
+ ut_ad(len == FTS_DOC_ID_LEN || len == sizeof(ib_uint32_t));
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL | DATA_BINARY_TYPE;
+ field->type.len = len;
+ field->type.mbminmaxlen = 0;
+
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+ ++field;
+
+ /* The third field is the position */
+ mach_write_to_4(
+ (byte*) &position,
+ (i + offset + inc - str.f_len + t_ctx->init_pos));
+
+ dfield_set_data(field, &position, sizeof(position));
+ len = dfield_get_len(field);
+ ut_ad(len == sizeof(ib_uint32_t));
+
+ field->type.mtype = DATA_INT;
+ field->type.prtype = DATA_NOT_NULL;
+ field->type.len = len;
+ field->type.mbminmaxlen = 0;
+ cur_len += len;
+ dfield_dup(field, buf->heap);
+
+		/* One variable-length column, the word, whose length is
+		less than fts_max_token_size: add one extra size byte
+		and one extra byte */
+ cur_len += 2;
+
+ /* Reserve one byte for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size[idx] + cur_len
+ >= srv_sort_buf_size - 1) {
+
+ buf_full = TRUE;
+ break;
+ }
+
+ /* Increment the number of tuples */
+ n_tuple[idx]++;
+ t_ctx->processed_len += inc;
+ data_size[idx] += cur_len;
+ }
+
+ /* Update the data length and the number of new word tuples
+ added in this round of tokenization */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ /* The computation of total_size below assumes that no
+ delete-mark flags will be stored and that all fields
+ are NOT NULL and fixed-length. */
+
+ sort_buf[i]->total_size += data_size[i];
+
+ sort_buf[i]->n_tuples += n_tuple[i];
+
+ merge_file[i]->n_rec += n_tuple[i];
+ t_ctx->rows_added[i] += n_tuple[i];
+ }
+
+ if (!buf_full) {
+		/* we pad one byte between the text across two fields */
+ t_ctx->init_pos += doc->text.f_len + 1;
+ }
+
+ return(!buf_full);
+}
+
+/*********************************************************************//**
+Get next doc item from fts_doc_list */
+UNIV_INLINE
+void
+row_merge_fts_get_next_doc_item(
+/*============================*/
+ fts_psort_t* psort_info, /*!< in: psort_info */
+ fts_doc_item_t** doc_item) /*!< in/out: doc item */
+{
+ if (*doc_item != NULL) {
+ ut_free(*doc_item);
+ }
+
+ mutex_enter(&psort_info->mutex);
+
+ *doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
+ if (*doc_item != NULL) {
+ UT_LIST_REMOVE(doc_list, psort_info->fts_doc_list,
+ *doc_item);
+
+ ut_ad(psort_info->memory_used >= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len);
+ psort_info->memory_used -= sizeof(fts_doc_item_t)
+ + (*doc_item)->field->len;
+ }
+
+ mutex_exit(&psort_info->mutex);
+}
+
+/*********************************************************************//**
+Function performs parallel tokenization of the incoming doc strings.
+It also performs the initial in-memory sort of the parsed records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_tokenization(
+/*======================*/
+ void* arg) /*!< in: psort_info for the thread */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint i;
+ fts_doc_item_t* doc_item = NULL;
+ row_merge_buf_t** buf;
+ ibool processed = FALSE;
+ merge_file_t** merge_file;
+ row_merge_block_t** block;
+ int tmpfd[FTS_NUM_AUX_INDEX];
+ ulint mycount[FTS_NUM_AUX_INDEX];
+ ib_uint64_t total_rec = 0;
+ ulint num_doc_processed = 0;
+ doc_id_t last_doc_id = 0;
+ ulint zip_size;
+ mem_heap_t* blob_heap = NULL;
+ fts_doc_t doc;
+ dict_table_t* table = psort_info->psort_common->new_table;
+ dtype_t word_dtype;
+ dict_field_t* idx_field;
+ fts_tokenize_ctx_t t_ctx;
+ ulint retried = 0;
+ dberr_t error = DB_SUCCESS;
+
+ ut_ad(psort_info);
+
+ buf = psort_info->merge_buf;
+ merge_file = psort_info->merge_file;
+ blob_heap = mem_heap_create(512);
+ memset(&doc, 0, sizeof(doc));
+ memset(&t_ctx, 0, sizeof(t_ctx));
+	memset(mycount, 0, FTS_NUM_AUX_INDEX * sizeof(ulint));
+
+ doc.charset = fts_index_get_charset(
+ psort_info->psort_common->dup->index);
+
+ idx_field = dict_index_get_nth_field(
+ psort_info->psort_common->dup->index, 0);
+ word_dtype.prtype = idx_field->col->prtype;
+ word_dtype.mbminmaxlen = idx_field->col->mbminmaxlen;
+ word_dtype.mtype = (strcmp(doc.charset->name, "latin1_swedish_ci") == 0)
+ ? DATA_VARCHAR : DATA_VARMYSQL;
+
+ block = psort_info->merge_block;
+ zip_size = dict_table_zip_size(table);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+	t_ctx.cached_stopword =
+		table->fts->cache->stopword_info.cached_stopword;
+ processed = TRUE;
+loop:
+ while (doc_item) {
+ dfield_t* dfield = doc_item->field;
+
+ last_doc_id = doc_item->doc_id;
+
+		ut_ad(dfield->data != NULL
+		      && dfield_get_len(dfield) != UNIV_SQL_NULL);
+
+		/* If we finished processing the last item, update "doc"
+		with the strings in doc_item; otherwise continue
+		processing the item on hand */
+ if (processed) {
+ byte* data;
+ ulint data_len;
+
+ dfield = doc_item->field;
+ data = static_cast<byte*>(dfield_get_data(dfield));
+ data_len = dfield_get_len(dfield);
+
+ if (dfield_is_ext(dfield)) {
+ doc.text.f_str =
+ btr_copy_externally_stored_field(
+ &doc.text.f_len, data,
+ zip_size, data_len, blob_heap);
+ } else {
+ doc.text.f_str = data;
+ doc.text.f_len = data_len;
+ }
+
+ doc.tokens = 0;
+ t_ctx.processed_len = 0;
+ } else {
+			/* Not yet finished processing the "doc" on hand;
+			continue processing it */
+ ut_ad(doc.text.f_str);
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ }
+
+ processed = row_merge_fts_doc_tokenize(
+ buf, doc_item->doc_id, &doc,
+ &word_dtype,
+ merge_file, psort_info->psort_common->opt_doc_id_size,
+ &t_ctx);
+
+ /* Current sort buffer full, need to recycle */
+ if (!processed) {
+ ut_ad(t_ctx.processed_len < doc.text.f_len);
+ ut_ad(t_ctx.rows_added[t_ctx.buf_used]);
+ break;
+ }
+
+ num_doc_processed++;
+
+ if (fts_enable_diag_print && num_doc_processed % 10000 == 1) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "number of doc processed %d\n",
+ (int) num_doc_processed);
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "ID %d, partition %d, word "
+ "%d\n",(int) psort_info->psort_id,
+ (int) i, (int) mycount[i]);
+ }
+#endif
+ }
+
+ mem_heap_empty(blob_heap);
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item && last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+ }
+
+	/* If we run out of the current sort buffer, we need to sort
+	it and flush it to disk */
+ if (t_ctx.rows_added[t_ctx.buf_used] && !processed) {
+ row_merge_buf_sort(buf[t_ctx.buf_used], NULL);
+ row_merge_buf_write(buf[t_ctx.buf_used],
+ merge_file[t_ctx.buf_used],
+ block[t_ctx.buf_used]);
+
+ if (!row_merge_write(merge_file[t_ctx.buf_used]->fd,
+ merge_file[t_ctx.buf_used]->offset++,
+ block[t_ctx.buf_used])) {
+ error = DB_TEMP_FILE_WRITE_FAILURE;
+ goto func_exit;
+ }
+
+ UNIV_MEM_INVALID(block[t_ctx.buf_used][0], srv_sort_buf_size);
+ buf[t_ctx.buf_used] = row_merge_buf_empty(buf[t_ctx.buf_used]);
+ mycount[t_ctx.buf_used] += t_ctx.rows_added[t_ctx.buf_used];
+ t_ctx.rows_added[t_ctx.buf_used] = 0;
+
+ ut_a(doc_item);
+ goto loop;
+ }
+
+	/* The parent is done scanning; if we have finished processing
+	all the docs, exit */
+ if (psort_info->state == FTS_PARENT_COMPLETE) {
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) == 0) {
+ goto exit;
+ } else if (retried > 10000) {
+ ut_ad(!doc_item);
+			/* retried too many times and cannot get a new
+			record */
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"FTS parallel sort processed "
+				"%lu records, the sort queue has "
+				"%lu records, but the sort cannot get "
+				"the next record", num_doc_processed,
+ UT_LIST_GET_LEN(
+ psort_info->fts_doc_list));
+ goto exit;
+ }
+ } else if (psort_info->state == FTS_PARENT_EXITING) {
+ /* Parent abort */
+ goto func_exit;
+ }
+
+ if (doc_item == NULL) {
+ os_thread_yield();
+ }
+
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+
+ if (doc_item != NULL) {
+ if (last_doc_id != doc_item->doc_id) {
+ t_ctx.init_pos = 0;
+ }
+
+ retried = 0;
+ } else if (psort_info->state == FTS_PARENT_COMPLETE) {
+ retried++;
+ }
+
+ goto loop;
+
+exit:
+	/* Do a final sort of the last (or latest) batch of records
+	in block memory. Flush them to the temp file if the records
+	cannot be held in one block of memory */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (t_ctx.rows_added[i]) {
+ row_merge_buf_sort(buf[i], NULL);
+ row_merge_buf_write(
+ buf[i], merge_file[i], block[i]);
+
+ /* Write to temp file, only if records have
+ been flushed to temp file before (offset > 0):
+			The pseudo code for the sort is as follows:
+
+ while (there are rows) {
+ tokenize rows, put result in block[]
+ if (block[] runs out) {
+ sort rows;
+ write to temp file with
+ row_merge_write();
+ offset++;
+ }
+ }
+
+ # write out the last batch
+ if (offset > 0) {
+ row_merge_write();
+ offset++;
+ } else {
+ # no need to write anything
+					offset stays 0
+ }
+
+			so if merge_file[i]->offset is 0 when we get
+			here with the last batch, the rows have never
+			been flushed to the temp file and can all be
+			held in memory */
+ if (merge_file[i]->offset != 0) {
+ if (!row_merge_write(merge_file[i]->fd,
+ merge_file[i]->offset++,
+ block[i])) {
+ error = DB_TEMP_FILE_WRITE_FAILURE;
+ goto func_exit;
+ }
+
+ UNIV_MEM_INVALID(block[i][0],
+ srv_sort_buf_size);
+ }
+
+ buf[i] = row_merge_buf_empty(buf[i]);
+ t_ctx.rows_added[i] = 0;
+ }
+ }
+
+ if (fts_enable_diag_print) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: start merge sort\n");
+ }
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (!merge_file[i]->offset) {
+ continue;
+ }
+
+ tmpfd[i] = row_merge_file_create_low();
+ if (tmpfd[i] < 0) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ error = row_merge_sort(psort_info->psort_common->trx,
+ psort_info->psort_common->dup,
+ merge_file[i], block[i], &tmpfd[i]);
+ if (error != DB_SUCCESS) {
+ close(tmpfd[i]);
+ goto func_exit;
+ }
+
+ total_rec += merge_file[i]->n_rec;
+ close(tmpfd[i]);
+ }
+
+func_exit:
+ if (fts_enable_diag_print) {
+ DEBUG_FTS_SORT_PRINT(" InnoDB_FTS: complete merge sort\n");
+ }
+
+ mem_heap_free(blob_heap);
+
+ mutex_enter(&psort_info->mutex);
+ psort_info->error = error;
+ mutex_exit(&psort_info->mutex);
+
+ if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) {
+		/* The child can exit either on error or when told to
+		by the parent. */
+ ut_ad(error != DB_SUCCESS
+ || psort_info->state == FTS_PARENT_EXITING);
+ }
+
+ /* Free fts doc list in case of error. */
+ do {
+ row_merge_fts_get_next_doc_item(psort_info, &doc_item);
+ } while (doc_item != NULL);
+
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ os_event_set(psort_info->psort_common->sort_event);
+ psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+ CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Start the parallel tokenization and parallel merge sort */
+UNIV_INTERN
+void
+row_fts_start_psort(
+/*================*/
+ fts_psort_t* psort_info) /*!< parallel sort structure */
+{
+ ulint i = 0;
+ os_thread_id_t thd_id;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ psort_info[i].psort_id = i;
+ psort_info[i].thread_hdl = os_thread_create(
+ fts_parallel_tokenization,
+ (void*) &psort_info[i], &thd_id);
+ }
+}
+
+/*********************************************************************//**
+Function performs the merge and insertion of the sorted records.
+@return OS_THREAD_DUMMY_RETURN */
+UNIV_INTERN
+os_thread_ret_t
+fts_parallel_merge(
+/*===============*/
+ void* arg) /*!< in: parallel merge info */
+{
+ fts_psort_t* psort_info = (fts_psort_t*) arg;
+ ulint id;
+
+ ut_ad(psort_info);
+
+ id = psort_info->psort_id;
+
+ row_fts_merge_insert(psort_info->psort_common->dup->index,
+ psort_info->psort_common->new_table,
+ psort_info->psort_common->all_info, id);
+
+ psort_info->child_status = FTS_CHILD_COMPLETE;
+ os_event_set(psort_info->psort_common->merge_event);
+ psort_info->child_status = FTS_CHILD_EXITING;
+
+#ifdef __WIN__
+ CloseHandle(psort_info->thread_hdl);
+#endif /*__WIN__ */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+Kick off the parallel merge and insert threads */
+UNIV_INTERN
+void
+row_fts_start_parallel_merge(
+/*=========================*/
+ fts_psort_t* merge_info) /*!< in: parallel sort info */
+{
+ int i = 0;
+ os_thread_id_t thd_id;
+
+ /* Kick off merge/insert threads */
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ merge_info[i].psort_id = i;
+ merge_info[i].child_status = 0;
+
+ merge_info[i].thread_hdl = os_thread_create(
+ fts_parallel_merge, (void*) &merge_info[i], &thd_id);
+ }
+}
+
+/********************************************************************//**
+Insert processed FTS data into the auxiliary index tables.
+@return DB_SUCCESS if insertion runs fine */
+static __attribute__((nonnull))
+dberr_t
+row_merge_write_fts_word(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** ins_graph, /*!< in: Insert query graphs */
+ fts_tokenizer_word_t* word, /*!< in: sorted and tokenized
+ word */
+ fts_table_t* fts_table, /*!< in: fts aux table instance */
+ CHARSET_INFO* charset) /*!< in: charset */
+{
+ ulint selected;
+ dberr_t ret = DB_SUCCESS;
+
+ selected = fts_select_index(
+ charset, word->text.f_str, word->text.f_len);
+ fts_table->suffix = fts_get_suffix(selected);
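+
+	/* fts_select_index() partitions words by their leading
+	character, so a given word always maps to the same one of the
+	FTS_NUM_AUX_INDEX auxiliary tables; the suffix picked here
+	names that table. The exact partitioning rule depends on the
+	charset. */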
+
+	/* Pop out each fts_node in word->nodes and write it to the auxiliary table */
+ while (ib_vector_size(word->nodes) > 0) {
+ dberr_t error;
+ fts_node_t* fts_node;
+
+ fts_node = static_cast<fts_node_t*>(ib_vector_pop(word->nodes));
+
+ error = fts_write_node(
+ trx, &ins_graph[selected], fts_table, &word->text,
+ fts_node);
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr, "InnoDB: failed to write"
+ " word %s to FTS auxiliary index"
+ " table, error (%s) \n",
+ word->text.f_str, ut_strerr(error));
+ ret = error;
+ }
+
+ ut_free(fts_node->ilist);
+ fts_node->ilist = NULL;
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Read sorted FTS data files and insert the data tuples into the
+auxiliary tables. */
+UNIV_INTERN
+void
+row_fts_insert_tuple(
+/*=================*/
+ fts_psort_insert_t*
+ ins_ctx, /*!< in: insert context */
+ fts_tokenizer_word_t* word, /*!< in: last processed
+ tokenized word */
+ ib_vector_t* positions, /*!< in: word position */
+ doc_id_t* in_doc_id, /*!< in: last item doc id */
+ dtuple_t* dtuple) /*!< in: entry to insert */
+{
+ fts_node_t* fts_node = NULL;
+ dfield_t* dfield;
+ doc_id_t doc_id;
+ ulint position;
+ fts_string_t token_word;
+ ulint i;
+
+	/* Get the fts_node for the FTS auxiliary INDEX table */
+ if (ib_vector_size(word->nodes) > 0) {
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_last(word->nodes));
+ }
+
+ if (fts_node == NULL
+ || fts_node->ilist_size > FTS_ILIST_MAX_SIZE) {
+
+ fts_node = static_cast<fts_node_t*>(
+ ib_vector_push(word->nodes, NULL));
+
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
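+
+	/* Note: a very common word can therefore be split across
+	several fts_node_t entries, each holding at most
+	FTS_ILIST_MAX_SIZE bytes of doc id/position data, and each
+	becoming its own row in the auxiliary table. */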
+
+ /* If dtuple == NULL, this is the last word to be processed */
+ if (!dtuple) {
+ if (fts_node && ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id,
+ positions);
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx->trx,
+ ins_ctx->ins_graph, word,
+ &ins_ctx->fts_table,
+ ins_ctx->charset);
+
+ }
+
+ return;
+ }
+
+ /* Get the first field for the tokenized word */
+ dfield = dtuple_get_nth_field(dtuple, 0);
+
+ token_word.f_n_char = 0;
+ token_word.f_len = dfield->len;
+ token_word.f_str = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (!word->text.f_str) {
+ fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+ }
+
+	/* Compare with the last word to see if they are the same word */
+ if (innobase_fts_text_cmp(ins_ctx->charset,
+ &word->text, &token_word) != 0) {
+ ulint num_item;
+
+		/* We have a new word; flush the last position info
+		for the current word into its fts_node */
+ if (ib_vector_size(positions) > 0) {
+ fts_cache_node_add_positions(
+ NULL, fts_node, *in_doc_id, positions);
+ }
+
+ /* Write out the current word */
+ row_merge_write_fts_word(ins_ctx->trx, ins_ctx->ins_graph,
+ word, &ins_ctx->fts_table,
+ ins_ctx->charset);
+
+ /* Copy the new word */
+ fts_utf8_string_dup(&word->text, &token_word, ins_ctx->heap);
+
+ num_item = ib_vector_size(positions);
+
+ /* Clean up position queue */
+ for (i = 0; i < num_item; i++) {
+ ib_vector_pop(positions);
+ }
+
+ /* Reset Doc ID */
+ *in_doc_id = 0;
+ memset(fts_node, 0x0, sizeof(*fts_node));
+ }
+
+ /* Get the word's Doc ID */
+ dfield = dtuple_get_nth_field(dtuple, 1);
+
+ if (!ins_ctx->opt_doc_id_size) {
+ doc_id = fts_read_doc_id(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ } else {
+ doc_id = (doc_id_t) mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+ }
+
+ /* Get the word's position info */
+ dfield = dtuple_get_nth_field(dtuple, 2);
+ position = mach_read_from_4(static_cast<byte*>(dfield_get_data(dfield)));
+
+ /* If this is the same word as the last word, and they
+ have the same Doc ID, we just need to add its position
+ info. Otherwise, we will flush position info to the
+ fts_node and initiate a new position vector */
+ if (!(*in_doc_id) || *in_doc_id == doc_id) {
+ ib_vector_push(positions, &position);
+ } else {
+ ulint num_pos = ib_vector_size(positions);
+
+ fts_cache_node_add_positions(NULL, fts_node,
+ *in_doc_id, positions);
+ for (i = 0; i < num_pos; i++) {
+ ib_vector_pop(positions);
+ }
+ ib_vector_push(positions, &position);
+ }
+
+ /* record the current Doc ID */
+ *in_doc_id = doc_id;
+}
+
+/*********************************************************************//**
+Propagate a newly added record up one level in the selection tree
+@return parent node the value was propagated to */
+static
+int
+row_fts_sel_tree_propagate(
+/*=======================*/
+	int		propagated,	/*!< in: tree node propagated */
+	int*		sel_tree,	/*!< in: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in/out: FTS index */
+{
+ ulint parent;
+ int child_left;
+ int child_right;
+ int selected;
+
+	/* Find which parent this value will be propagated to */
+	parent = (propagated - 1) / 2;
+
+	/* Find out which child value is smaller; that is the one
+	to propagate */
+	child_left = sel_tree[parent * 2 + 1];
+	child_right = sel_tree[parent * 2 + 2];
+
+ if (child_left == -1 || mrec[child_left] == NULL) {
+ if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = -1;
+ } else {
+			selected = child_right;
+ }
+ } else if (child_right == -1
+ || mrec[child_right] == NULL) {
+ selected = child_left;
+ } else if (cmp_rec_rec_simple(mrec[child_left], mrec[child_right],
+ offsets[child_left],
+ offsets[child_right],
+ index, NULL) < 0) {
+ selected = child_left;
+ } else {
+ selected = child_right;
+ }
+
+ sel_tree[parent] = selected;
+
+ return(static_cast<int>(parent));
+}
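+
+/* Illustration of the implicit tree layout used above (not part of
+the algorithm itself): the tree is stored in an array where node p has
+children at 2p+1 and 2p+2. E.g. with four leaves at sel_tree[3..6],
+propagating from leaf 5 updates parent (5 - 1) / 2 = 2, and a second
+call updates parent 0, the root. */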
+
+/*********************************************************************//**
+Readjust selection tree after popping the root and read a new value
+@return the new root */
+static
+int
+row_fts_sel_tree_update(
+/*====================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		propagated,	/*!< in: node to propagate up */
+	ulint		height,		/*!< in: tree height */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint i;
+
+ for (i = 1; i <= height; i++) {
+ propagated = static_cast<ulint>(row_fts_sel_tree_propagate(
+ static_cast<int>(propagated), sel_tree, mrec, offsets, index));
+ }
+
+ return(sel_tree[0]);
+}
+
+/*********************************************************************//**
+Build selection tree at a specified level */
+static
+void
+row_fts_build_sel_tree_level(
+/*=========================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	ulint		level,		/*!< in: selection tree level */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint start;
+ int child_left;
+ int child_right;
+ ulint i;
+ ulint num_item;
+
+ start = static_cast<ulint>((1 << level) - 1);
+ num_item = static_cast<ulint>(1 << level);
+
+ for (i = 0; i < num_item; i++) {
+ child_left = sel_tree[(start + i) * 2 + 1];
+ child_right = sel_tree[(start + i) * 2 + 2];
+
+ if (child_left == -1) {
+ if (child_right == -1) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (child_right == -1) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Deal with NULL child conditions */
+ if (!mrec[child_left]) {
+ if (!mrec[child_right]) {
+ sel_tree[start + i] = -1;
+ } else {
+ sel_tree[start + i] = child_right;
+ }
+ continue;
+ } else if (!mrec[child_right]) {
+ sel_tree[start + i] = child_left;
+ continue;
+ }
+
+ /* Select the smaller one to set parent pointer */
+ int cmp = cmp_rec_rec_simple(
+ mrec[child_left], mrec[child_right],
+ offsets[child_left], offsets[child_right],
+ index, NULL);
+
+ sel_tree[start + i] = cmp < 0 ? child_left : child_right;
+ }
+}
+
+/*********************************************************************//**
+Build a selection tree for the merge. The selection tree is a binary
+tree with one leaf per parallel sort thread; with the root at level 0,
+it has about log2(fts_sort_pll_degree) levels.
+@return number of tree levels */
+static
+ulint
+row_fts_build_sel_tree(
+/*===================*/
+	int*		sel_tree,	/*!< in/out: selection tree */
+	const mrec_t**	mrec,		/*!< in: sort record */
+	ulint**		offsets,	/*!< in: record offsets */
+	dict_index_t*	index)		/*!< in: index dictionary */
+{
+ ulint treelevel = 1;
+ ulint num = 2;
+ int i = 0;
+ ulint start;
+
+	/* No need to build a selection tree if we have no more than
+	two parallel sort streams to merge */
+ if (fts_sort_pll_degree <= 2) {
+ return(0);
+ }
+
+ while (num < fts_sort_pll_degree) {
+ num = num << 1;
+ treelevel++;
+ }
+
+ start = (1 << treelevel) - 1;
+
+ for (i = 0; i < (int) fts_sort_pll_degree; i++) {
+ sel_tree[i + start] = i;
+ }
+
+ for (i = static_cast<int>(treelevel) - 1; i >= 0; i--) {
+ row_fts_build_sel_tree_level(
+ sel_tree, static_cast<ulint>(i), mrec, offsets, index);
+ }
+
+ return(treelevel);
+}
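+
+/* For example, with fts_sort_pll_degree == 4 the loop above computes
+treelevel == 2, the leaves occupy sel_tree[3..6] (start == 3), and the
+levels are then filled bottom-up. The caller locates the leaves with
+start == (1 << height) - 1, matching this layout. */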
+
+/*********************************************************************//**
+Read sorted file containing index data tuples and insert these data
+tuples to the index
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+row_fts_merge_insert(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ fts_psort_t* psort_info, /*!< parallel sort info */
+	ulint		id)		/*!< in: which auxiliary table's
+					data to insert */
+{
+ const byte** b;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* heap;
+ dberr_t error = DB_SUCCESS;
+ ulint* foffs;
+ ulint** offsets;
+ fts_tokenizer_word_t new_word;
+ ib_vector_t* positions;
+ doc_id_t last_doc_id;
+ ib_alloc_t* heap_alloc;
+ ulint n_bytes;
+ ulint i;
+ mrec_buf_t** buf;
+ int* fd;
+ byte** block;
+ const mrec_t** mrec;
+ ulint count = 0;
+ int* sel_tree;
+ ulint height;
+ ulint start;
+ fts_psort_insert_t ins_ctx;
+ ulint count_diag = 0;
+
+ ut_ad(index);
+ ut_ad(table);
+
+ /* We use the insert query graph as the dummy graph
+ needed in the row module call */
+
+ ins_ctx.trx = trx_allocate_for_background();
+
+ ins_ctx.trx->op_info = "inserting index entries";
+
+ ins_ctx.opt_doc_id_size = psort_info[0].psort_common->opt_doc_id_size;
+
+ heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+
+ b = (const byte**) mem_heap_alloc(
+ heap, sizeof (*b) * fts_sort_pll_degree);
+ foffs = (ulint*) mem_heap_alloc(
+ heap, sizeof(*foffs) * fts_sort_pll_degree);
+ offsets = (ulint**) mem_heap_alloc(
+ heap, sizeof(*offsets) * fts_sort_pll_degree);
+ buf = (mrec_buf_t**) mem_heap_alloc(
+ heap, sizeof(*buf) * fts_sort_pll_degree);
+ fd = (int*) mem_heap_alloc(heap, sizeof(*fd) * fts_sort_pll_degree);
+ block = (byte**) mem_heap_alloc(
+ heap, sizeof(*block) * fts_sort_pll_degree);
+ mrec = (const mrec_t**) mem_heap_alloc(
+ heap, sizeof(*mrec) * fts_sort_pll_degree);
+ sel_tree = (int*) mem_heap_alloc(
+ heap, sizeof(*sel_tree) * (fts_sort_pll_degree * 2));
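+
+	/* sel_tree holds the implicit binary tree built by
+	row_fts_build_sel_tree(): one leaf per sort thread plus the
+	internal nodes above them, hence roughly
+	2 * fts_sort_pll_degree slots (enough when the degree is a
+	power of two). */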
+
+ tuple_heap = mem_heap_create(1000);
+
+ ins_ctx.charset = fts_index_get_charset(index);
+ ins_ctx.heap = heap;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ ulint num;
+
+ num = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets[i] = static_cast<ulint*>(mem_heap_zalloc(
+ heap, num * sizeof *offsets[i]));
+ offsets[i][0] = num;
+ offsets[i][1] = dict_index_get_n_fields(index);
+ block[i] = psort_info[i].merge_block[id];
+ b[i] = psort_info[i].merge_block[id];
+ fd[i] = psort_info[i].merge_file[id]->fd;
+ foffs[i] = 0;
+
+ buf[i] = static_cast<unsigned char (*)[16384]>(
+ mem_heap_alloc(heap, sizeof *buf[i]));
+		count_diag += psort_info[i].merge_file[id]->n_rec;
+ }
+
+ if (fts_enable_diag_print) {
+ ut_print_timestamp(stderr);
+		fprintf(stderr, " InnoDB_FTS: to insert %lu records\n",
+ (ulong) count_diag);
+ }
+
+ /* Initialize related variables if creating FTS indexes */
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ memset(&new_word, 0, sizeof(new_word));
+
+ new_word.nodes = ib_vector_create(heap_alloc, sizeof(fts_node_t), 4);
+ positions = ib_vector_create(heap_alloc, sizeof(ulint), 32);
+ last_doc_id = 0;
+
+	/* Allocate insert query graphs for the FTS auxiliary
+	index tables; note we have FTS_NUM_AUX_INDEX such tables */
+ n_bytes = sizeof(que_t*) * (FTS_NUM_AUX_INDEX + 1);
+ ins_ctx.ins_graph = static_cast<que_t**>(mem_heap_alloc(heap, n_bytes));
+ memset(ins_ctx.ins_graph, 0x0, n_bytes);
+
+ /* We should set the flags2 with aux_table_name here,
+ in order to get the correct aux table names. */
+ index->table->flags2 |= DICT_TF2_FTS_AUX_HEX_NAME;
+ DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+ index->table->flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;);
+
+ ins_ctx.fts_table.type = FTS_INDEX_TABLE;
+ ins_ctx.fts_table.index_id = index->id;
+ ins_ctx.fts_table.table_id = table->id;
+ ins_ctx.fts_table.parent = index->table->name;
+ ins_ctx.fts_table.table = index->table;
+
+ for (i = 0; i < fts_sort_pll_degree; i++) {
+ if (psort_info[i].merge_file[id]->n_rec == 0) {
+			/* No rows to read */
+ mrec[i] = b[i] = NULL;
+ } else {
+ /* Read from temp file only if it has been
+ written to. Otherwise, block memory holds
+ all the sorted records */
+ if (psort_info[i].merge_file[id]->offset > 0
+ && (!row_merge_read(
+ fd[i], foffs[i],
+ (row_merge_block_t*) block[i]))) {
+ error = DB_CORRUPTION;
+ goto exit;
+ }
+
+ ROW_MERGE_READ_GET_NEXT(i);
+ }
+ }
+
+ height = row_fts_build_sel_tree(sel_tree, (const mrec_t **) mrec,
+ offsets, index);
+
+ start = (1 << height) - 1;
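+
+	/* start is the index of the leftmost leaf in the selection
+	tree; the leaf for sort stream i lives at sel_tree[start + i]
+	(see row_fts_build_sel_tree()). */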
+
+ /* Fetch sorted records from sort buffer and insert them into
+ corresponding FTS index auxiliary tables */
+ for (;;) {
+ dtuple_t* dtuple;
+ ulint n_ext;
+ int min_rec = 0;
+
+ if (fts_sort_pll_degree <= 2) {
+ while (!mrec[min_rec]) {
+ min_rec++;
+
+ if (min_rec >= (int) fts_sort_pll_degree) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ for (i = min_rec + 1; i < fts_sort_pll_degree; i++) {
+ if (!mrec[i]) {
+ continue;
+ }
+
+ if (cmp_rec_rec_simple(
+ mrec[i], mrec[min_rec],
+ offsets[i], offsets[min_rec],
+ index, NULL) < 0) {
+ min_rec = static_cast<int>(i);
+ }
+ }
+ } else {
+ min_rec = sel_tree[0];
+
+ if (min_rec == -1) {
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word,
+ positions, &last_doc_id,
+ NULL);
+
+ goto exit;
+ }
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec[min_rec], index, offsets[min_rec], &n_ext,
+ tuple_heap);
+
+ row_fts_insert_tuple(
+ &ins_ctx, &new_word, positions,
+ &last_doc_id, dtuple);
+
+ ROW_MERGE_READ_GET_NEXT(min_rec);
+
+ if (fts_sort_pll_degree > 2) {
+ if (!mrec[min_rec]) {
+ sel_tree[start + min_rec] = -1;
+ }
+
+ row_fts_sel_tree_update(sel_tree, start + min_rec,
+ height, mrec,
+ offsets, index);
+ }
+
+ count++;
+
+ mem_heap_empty(tuple_heap);
+ }
+
+exit:
+ fts_sql_commit(ins_ctx.trx);
+
+ ins_ctx.trx->op_info = "";
+
+ mem_heap_free(tuple_heap);
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX; i++) {
+ if (ins_ctx.ins_graph[i]) {
+ fts_que_graph_free(ins_ctx.ins_graph[i]);
+ }
+ }
+
+ trx_free_for_background(ins_ctx.trx);
+
+ mem_heap_free(heap);
+
+ if (fts_enable_diag_print) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB_FTS: inserted %lu records\n",
+ (ulong) count);
+ }
+
+ return(error);
+}
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
new file mode 100644
index 00000000000..b753574158a
--- /dev/null
+++ b/storage/innobase/row/row0import.cc
@@ -0,0 +1,3806 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0import.cc
+Import a tablespace to a running instance.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0import.h"
+
+#ifdef UNIV_NONINL
+#include "row0import.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "que0que.h"
+#include "dict0boot.h"
+#include "ibuf0ibuf.h"
+#include "pars0pars.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "row0quiesce.h"
+
+#include <vector>
+
+/** The size of the buffer to use for IO. Note: os_file_read() doesn't expect
+reads to fail. If you set the buffer size to be greater than a multiple of the
+file size then it will assert. TODO: Fix this limitation of the IO functions.
+@param n - page size of the tablespace.
+@retval number of pages */
+#define IO_BUFFER_SIZE(n) ((1024 * 1024) / n)
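+
+/* For example, with the default 16KB page size IO_BUFFER_SIZE(16384)
+evaluates to 64, i.e. the file is processed 64 pages (1MB) at a
+time. */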
+
+/** For gathering stats on records during phase I */
+struct row_stats_t {
+ ulint m_n_deleted; /*!< Number of deleted records
+ found in the index */
+
+ ulint m_n_purged; /*!< Number of records purged
+						optimistically */
+
+ ulint m_n_rows; /*!< Number of rows */
+
+ ulint m_n_purge_failed; /*!< Number of deleted rows
+ that could not be purged */
+};
+
+/** Index information required by IMPORT. */
+struct row_index_t {
+ index_id_t m_id; /*!< Index id of the table
+ in the exporting server */
+ byte* m_name; /*!< Index name */
+
+ ulint m_space; /*!< Space where it is placed */
+
+ ulint m_page_no; /*!< Root page number */
+
+ ulint m_type; /*!< Index type */
+
+ ulint m_trx_id_offset; /*!< Relevant only for clustered
+ indexes, offset of transaction
+ id system column */
+
+ ulint m_n_user_defined_cols; /*!< User defined columns */
+
+ ulint m_n_uniq; /*!< Number of columns that can
+ uniquely identify the row */
+
+ ulint m_n_nullable; /*!< Number of nullable
+ columns */
+
+ ulint m_n_fields; /*!< Total number of fields */
+
+ dict_field_t* m_fields; /*!< Index fields */
+
+ const dict_index_t*
+ m_srv_index; /*!< Index instance in the
+ importing server */
+
+ row_stats_t m_stats; /*!< Statistics gathered during
+ the import phase */
+
+};
+
+/** Meta data required by IMPORT. */
+struct row_import {
+ row_import() UNIV_NOTHROW
+ :
+ m_table(),
+ m_version(),
+ m_hostname(),
+ m_table_name(),
+ m_autoinc(),
+ m_page_size(),
+ m_flags(),
+ m_n_cols(),
+ m_cols(),
+ m_col_names(),
+ m_n_indexes(),
+ m_indexes(),
+ m_missing(true) { }
+
+ ~row_import() UNIV_NOTHROW;
+
+ /**
+	Find the index entry in the indexes array.
+ @param name - index name
+ @return instance if found else 0. */
+ row_index_t* get_index(const char* name) const UNIV_NOTHROW;
+
+ /**
+ Get the number of rows in the index.
+ @param name - index name
+ @return number of rows (doesn't include delete marked rows). */
+ ulint get_n_rows(const char* name) const UNIV_NOTHROW;
+
+ /**
+ Find the ordinal value of the column name in the cfg table columns.
+ @param name - of column to look for.
+ @return ULINT_UNDEFINED if not found. */
+ ulint find_col(const char* name) const UNIV_NOTHROW;
+
+ /**
+	Find the index field entry in the cfg indexes fields.
+	@param name - of the index to look for
+ @return instance if found else 0. */
+ const dict_field_t* find_field(
+ const row_index_t* cfg_index,
+ const char* name) const UNIV_NOTHROW;
+
+ /**
+ Get the number of rows for which purge failed during the convert phase.
+ @param name - index name
+ @return number of rows for which purge failed. */
+ ulint get_n_purge_failed(const char* name) const UNIV_NOTHROW;
+
+ /**
+ Check if the index is clean. ie. no delete-marked records
+ @param name - index name
+ @return true if index needs to be purged. */
+ bool requires_purge(const char* name) const UNIV_NOTHROW
+ {
+ return(get_n_purge_failed(name) > 0);
+ }
+
+ /**
+ Set the index root <space, pageno> using the index name */
+ void set_root_by_name() UNIV_NOTHROW;
+
+ /**
+ Set the index root <space, pageno> using a heuristic
+ @return DB_SUCCESS or error code */
+ dberr_t set_root_by_heuristic() UNIV_NOTHROW;
+
+ /** Check if the index schema that was read from the .cfg file
+ matches the in memory index definition.
+ Note: It will update row_import_t::m_srv_index to map the meta-data
+ read from the .cfg file to the server index instance.
+ @return DB_SUCCESS or error code. */
+ dberr_t match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW;
+
+ /**
+ Check if the table schema that was read from the .cfg file matches the
+ in memory table definition.
+ @param thd - MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_table_columns(
+ THD* thd) UNIV_NOTHROW;
+
+ /**
+ Check if the table (and index) schema that was read from the .cfg file
+ matches the in memory table definition.
+ @param thd - MySQL session variable
+ @return DB_SUCCESS or error code. */
+ dberr_t match_schema(
+ THD* thd) UNIV_NOTHROW;
+
+ dict_table_t* m_table; /*!< Table instance */
+
+ ulint m_version; /*!< Version of config file */
+
+ byte* m_hostname; /*!< Hostname where the
+ tablespace was exported */
+ byte* m_table_name; /*!< Exporting instance table
+ name */
+
+ ib_uint64_t m_autoinc; /*!< Next autoinc value */
+
+ ulint m_page_size; /*!< Tablespace page size */
+
+ ulint m_flags; /*!< Table flags */
+
+ ulint m_n_cols; /*!< Number of columns in the
+ meta-data file */
+
+ dict_col_t* m_cols; /*!< Column data */
+
+ byte** m_col_names; /*!< Column names, we store the
+						column names separately because
+ there is no field to store the
+ value in dict_col_t */
+
+ ulint m_n_indexes; /*!< Number of indexes,
+ including clustered index */
+
+ row_index_t* m_indexes; /*!< Index meta data */
+
+	bool		m_missing;		/*!< true if the .cfg file was
+						missing or unreadable */
+};
+
+/** Use the page cursor to iterate over records in a block. */
+class RecIterator {
+public:
+ /**
+ Default constructor */
+ RecIterator() UNIV_NOTHROW
+ {
+ memset(&m_cur, 0x0, sizeof(m_cur));
+ }
+
+ /**
+ Position the cursor on the first user record. */
+ void open(buf_block_t* block) UNIV_NOTHROW
+ {
+ page_cur_set_before_first(block, &m_cur);
+
+ if (!end()) {
+ next();
+ }
+ }
+
+ /**
+ Move to the next record. */
+ void next() UNIV_NOTHROW
+ {
+ page_cur_move_to_next(&m_cur);
+ }
+
+ /**
+ @return the current record */
+ rec_t* current() UNIV_NOTHROW
+ {
+ ut_ad(!end());
+ return(page_cur_get_rec(&m_cur));
+ }
+
+ /**
+ @return true if cursor is at the end */
+ bool end() UNIV_NOTHROW
+ {
+ return(page_cur_is_after_last(&m_cur) == TRUE);
+ }
+
+ /** Remove the current record
+ @return true on success */
+ bool remove(
+ const dict_index_t* index,
+ page_zip_des_t* page_zip,
+ ulint* offsets) UNIV_NOTHROW
+ {
+ /* We can't end up with an empty page unless it is root. */
+ if (page_get_n_recs(m_cur.block->frame) <= 1) {
+ return(false);
+ }
+
+ return(page_delete_rec(index, &m_cur, page_zip, offsets));
+ }
+
+private:
+ page_cur_t m_cur;
+};
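+
+/* Typical usage of RecIterator (a sketch, not called this way
+verbatim elsewhere in this file):
+
+	RecIterator	it;
+
+	it.open(block);
+
+	while (!it.end()) {
+		rec_t*	rec = it.current();
+		// process rec ...
+		it.next();
+	}
+*/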
+
+/** Class that purges delete marked records from indexes, both secondary
+and clustered. It does a pessimistic delete. This should only be done if we
+couldn't purge the delete marked records during Phase I. */
+class IndexPurge {
+public:
+	/** Constructor
+	@param trx - the user transaction covering the import tablespace
+	@param index - index to be imported */
+ IndexPurge(
+ trx_t* trx,
+ dict_index_t* index) UNIV_NOTHROW
+ :
+ m_trx(trx),
+ m_index(index),
+ m_n_rows(0)
+ {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Phase II - Purge records from index %s",
+ index->name);
+ }
+
+	/** Destructor */
+ ~IndexPurge() UNIV_NOTHROW { }
+
+ /** Purge delete marked records.
+ @return DB_SUCCESS or error code. */
+ dberr_t garbage_collect() UNIV_NOTHROW;
+
+ /** The number of records that are not delete marked.
+ @return total records in the index after purge */
+ ulint get_n_rows() const UNIV_NOTHROW
+ {
+ return(m_n_rows);
+ }
+
+private:
+ /**
+ Begin import, position the cursor on the first record. */
+ void open() UNIV_NOTHROW;
+
+ /**
+	Close the persistent cursor and commit the mini-transaction. */
+ void close() UNIV_NOTHROW;
+
+ /**
+ Position the cursor on the next record.
+ @return DB_SUCCESS or error code */
+ dberr_t next() UNIV_NOTHROW;
+
+ /**
+ Store the persistent cursor position and reopen the
+ B-tree cursor in BTR_MODIFY_TREE mode, because the
+ tree structure may be changed during a pessimistic delete. */
+ void purge_pessimistic_delete() UNIV_NOTHROW;
+
+ /**
+	Purge delete-marked records. */
+ void purge() UNIV_NOTHROW;
+
+protected:
+ // Disable copying
+ IndexPurge();
+ IndexPurge(const IndexPurge&);
+ IndexPurge &operator=(const IndexPurge&);
+
+private:
+ trx_t* m_trx; /*!< User transaction */
+ mtr_t m_mtr; /*!< Mini-transaction */
+ btr_pcur_t m_pcur; /*!< Persistent cursor */
+ dict_index_t* m_index; /*!< Index to be processed */
+ ulint m_n_rows; /*!< Records in index */
+};
+
+/** Functor that is called for each physical page that is read from the
+tablespace file. */
+class AbstractCallback : public PageCallback {
+public:
+ /** Constructor
+ @param trx - covering transaction */
+ AbstractCallback(trx_t* trx)
+ :
+ m_trx(trx),
+ m_space(ULINT_UNDEFINED),
+ m_xdes(),
+ m_xdes_page_no(ULINT_UNDEFINED),
+ m_space_flags(ULINT_UNDEFINED),
+ m_table_flags(ULINT_UNDEFINED) UNIV_NOTHROW { }
+
+ /**
+ Free any extent descriptor instance */
+ virtual ~AbstractCallback()
+ {
+ delete [] m_xdes;
+ }
+
+ /** Determine the page size to use for traversing the tablespace
+ @param file_size - size of the tablespace file in bytes
+ @param block - contents of the first page in the tablespace file.
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW;
+
+ /** @return true if compressed table. */
+ bool is_compressed_table() const UNIV_NOTHROW
+ {
+ return(get_zip_size() > 0);
+ }
+
+protected:
+ /**
+ Get the data page depending on the table type, compressed or not.
+ @param block - block read from disk
+ @retval the buffer frame */
+ buf_frame_t* get_frame(buf_block_t* block) const UNIV_NOTHROW
+ {
+ if (is_compressed_table()) {
+ return(block->page.zip.data);
+ }
+
+ return(buf_block_get_frame(block));
+ }
+
+ /** Check for session interrupt. If required we could
+ even flush to disk here every N pages.
+ @retval DB_SUCCESS or error code */
+ dberr_t periodic_check() UNIV_NOTHROW
+ {
+ if (trx_is_interrupted(m_trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ /**
+ Get the physical offset of the extent descriptor within the page.
+ @param page_no - page number of the extent descriptor
+ @param page - contents of the page containing the extent descriptor.
+ @return the start of the xdes array in a page */
+ const xdes_t* xdes(
+ ulint page_no,
+ const page_t* page) const UNIV_NOTHROW
+ {
+ ulint offset;
+
+ offset = xdes_calc_descriptor_index(get_zip_size(), page_no);
+
+ return(page + XDES_ARR_OFFSET + XDES_SIZE * offset);
+ }
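+
+	/* Each descriptor in the xdes array is XDES_SIZE bytes and
+	covers one extent of FSP_EXTENT_SIZE pages (64 pages with the
+	default page size), so the index computed above is simply the
+	page's extent number within this descriptor page. */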
+
+ /**
+ Set the current page directory (xdes). If the extent descriptor is
+ marked as free then free the current extent descriptor and set it to
+ 0. This implies that all pages that are covered by this extent
+ descriptor are also freed.
+
+ @param page_no - offset of page within the file
+ @param page - page contents
+ @return DB_SUCCESS or error code. */
+ dberr_t set_current_xdes(
+ ulint page_no,
+ const page_t* page) UNIV_NOTHROW
+ {
+ m_xdes_page_no = page_no;
+
+ delete[] m_xdes;
+
+ m_xdes = 0;
+
+ ulint state;
+ const xdes_t* xdesc = page + XDES_ARR_OFFSET;
+
+ state = mach_read_ulint(xdesc + XDES_STATE, MLOG_4BYTES);
+
+ if (state != XDES_FREE) {
+
+ m_xdes = new(std::nothrow) xdes_t[m_page_size];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_13",
+ delete [] m_xdes; m_xdes = 0;);
+
+ if (m_xdes == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(m_xdes, page, m_page_size);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ /**
+ @return true if it is a root page */
+ bool is_root_page(const page_t* page) const UNIV_NOTHROW
+ {
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX);
+
+ return(mach_read_from_4(page + FIL_PAGE_NEXT) == FIL_NULL
+ && mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL);
+ }
+
+ /**
+ Check if the page is marked as free in the extent descriptor.
+ @param page_no - page number to check in the extent descriptor.
+ @return true if the page is marked as free */
+ bool is_free(ulint page_no) const UNIV_NOTHROW
+ {
+ ut_a(xdes_calc_descriptor_page(get_zip_size(), page_no)
+ == m_xdes_page_no);
+
+ if (m_xdes != 0) {
+ const xdes_t* xdesc = xdes(page_no, m_xdes);
+ ulint pos = page_no % FSP_EXTENT_SIZE;
+
+ return(xdes_get_bit(xdesc, XDES_FREE_BIT, pos));
+ }
+
+ /* If the current xdes was free, the page must be free. */
+ return(true);
+ }
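+
+	/* E.g. (assuming 64-page extents) page_no == 70 falls in the
+	second extent covered by this descriptor page; pos == 70 % 64
+	== 6, and the XDES_FREE_BIT of slot 6 in that extent's bitmap
+	decides whether the page is free. */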
+
+protected:
+ /** Covering transaction. */
+ trx_t* m_trx;
+
+ /** Space id of the file being iterated over. */
+ ulint m_space;
+
+ /** Minimum page number for which the free list has not been
+ initialized: the pages >= this limit are, by definition, free;
+ note that in a single-table tablespace where size < 64 pages,
+ this number is 64, i.e., we have initialized the space about
+	the first extent, but have not physically allocated those pages
+	to the file. @see FSP_FREE_LIMIT. */
+ ulint m_free_limit;
+
+ /** Current size of the space in pages */
+ ulint m_size;
+
+ /** Current extent descriptor page */
+ xdes_t* m_xdes;
+
+ /** Physical page offset in the file of the extent descriptor */
+ ulint m_xdes_page_no;
+
+ /** Flags value read from the header page */
+ ulint m_space_flags;
+
+ /** Derived from m_space_flags and row format type, the row format
+ type is determined from the page header. */
+ ulint m_table_flags;
+};
+
+/** Determine the page size to use for traversing the tablespace
+@param file_size - size of the tablespace file in bytes
+@param block - contents of the first page in the tablespace file.
+@retval DB_SUCCESS or error code. */
+dberr_t
+AbstractCallback::init(
+ os_offset_t file_size,
+ const buf_block_t* block) UNIV_NOTHROW
+{
+ const page_t* page = block->frame;
+
+ m_space_flags = fsp_header_get_flags(page);
+
+ /* Since we don't know whether it is a compressed table
+ or not, the data is always read into the block->frame. */
+
+ dberr_t err = set_zip_size(block->frame);
+
+ if (err != DB_SUCCESS) {
+ return(DB_CORRUPTION);
+ }
+
+ /* Set the page size used to traverse the tablespace. */
+
+ m_page_size = (is_compressed_table())
+ ? get_zip_size() : fsp_flags_get_page_size(m_space_flags);
+
+ if (m_page_size == 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Page size is 0");
+ return(DB_CORRUPTION);
+ } else if (!is_compressed_table() && m_page_size != UNIV_PAGE_SIZE) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Page size %lu of ibd file is not the same "
+ "as the server page size %lu",
+ m_page_size, UNIV_PAGE_SIZE);
+
+ return(DB_CORRUPTION);
+
+ } else if ((file_size % m_page_size)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "File size " UINT64PF " is not a multiple "
+ "of the page size %lu",
+ (ib_uint64_t) file_size, (ulong) m_page_size);
+
+ return(DB_CORRUPTION);
+ }
+
+ ut_a(m_space == ULINT_UNDEFINED);
+
+ m_size = mach_read_from_4(page + FSP_SIZE);
+ m_free_limit = mach_read_from_4(page + FSP_FREE_LIMIT);
+ m_space = mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID);
+
+ if ((err = set_current_xdes(0, page)) != DB_SUCCESS) {
+ return(err);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**
+Try to determine the index root pages by checking if the next/prev
+pointers are both FIL_NULL. We need to ensure that we skip deleted
+pages. */
+struct FetchIndexRootPages : public AbstractCallback {
+
+ /** Index information gathered from the .ibd file. */
+ struct Index {
+
+ Index(index_id_t id, ulint page_no)
+ :
+ m_id(id),
+ m_page_no(page_no) { }
+
+ index_id_t m_id; /*!< Index id */
+ ulint m_page_no; /*!< Root page number */
+ };
+
+ typedef std::vector<Index> Indexes;
+
+ /** Constructor
+ @param trx - covering (user) transaction
+ @param table - table definition in server .*/
+ FetchIndexRootPages(const dict_table_t* table, trx_t* trx)
+ :
+ AbstractCallback(trx),
+ m_table(table) UNIV_NOTHROW { }
+
+ /** Destructor */
+ virtual ~FetchIndexRootPages() UNIV_NOTHROW { }
+
+ /**
+ @retval the space id of the tablespace being iterated over */
+ virtual ulint get_space_id() const UNIV_NOTHROW
+ {
+ return(m_space);
+ }
+
+ /**
+ Check if the .ibd file row format is the same as the table's.
+ @param ibd_table_flags - determined from space and page.
+ @return DB_SUCCESS or error code. */
+ dberr_t check_row_format(ulint ibd_table_flags) UNIV_NOTHROW
+ {
+ dberr_t err;
+ rec_format_t ibd_rec_format;
+ rec_format_t table_rec_format;
+
+ if (!dict_tf_is_valid(ibd_table_flags)) {
+
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+				".ibd file has invalid table flags: %lx",
+ ibd_table_flags);
+
+ return(DB_CORRUPTION);
+ }
+
+ ibd_rec_format = dict_tf_get_rec_format(ibd_table_flags);
+ table_rec_format = dict_tf_get_rec_format(m_table->flags);
+
+ if (table_rec_format != ibd_rec_format) {
+
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Table has %s row format, .ibd "
+ "file has %s row format.",
+ dict_tf_to_row_format_string(m_table->flags),
+ dict_tf_to_row_format_string(ibd_table_flags));
+
+ err = DB_CORRUPTION;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+ }
+
+ /**
+ Called for each block as it is read from the file.
+ @param offset - physical offset in the file
+ @param block - block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t operator() (
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW;
+
+ /** Update the import configuration that will be used to import
+ the tablespace. */
+ dberr_t build_row_import(row_import* cfg) const UNIV_NOTHROW;
+
+ /** Table definition in server. */
+ const dict_table_t* m_table;
+
+ /** Index information */
+ Indexes m_indexes;
+};
+
+/**
+Called for each block as it is read from the file. Check index pages to
+determine the exact row format. We can't get that from the tablespace
+header flags alone.
+
+@param offset - physical offset in the file
+@param block - block to convert, it is not from the buffer pool.
+@retval DB_SUCCESS or error code. */
+dberr_t
+FetchIndexRootPages::operator() (
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW
+{
+ dberr_t err;
+
+ if ((err = periodic_check()) != DB_SUCCESS) {
+ return(err);
+ }
+
+ const page_t* page = get_frame(block);
+
+ ulint page_type = fil_page_get_type(page);
+
+ if (block->page.offset * m_page_size != offset) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Page offset doesn't match file offset: "
+ "page offset: %lu, file offset: %lu",
+ (ulint) block->page.offset,
+ (ulint) (offset / m_page_size));
+
+ err = DB_CORRUPTION;
+ } else if (page_type == FIL_PAGE_TYPE_XDES) {
+ err = set_current_xdes(block->page.offset, page);
+ } else if (page_type == FIL_PAGE_INDEX
+ && !is_free(block->page.offset)
+ && is_root_page(page)) {
+
+ index_id_t id = btr_page_get_index_id(page);
+ ulint page_no = buf_block_get_page_no(block);
+
+ m_indexes.push_back(Index(id, page_no));
+
+ if (m_indexes.size() == 1) {
+
+ m_table_flags = dict_sys_tables_type_to_tf(
+ m_space_flags,
+ page_is_comp(page) ? DICT_N_COLS_COMPACT : 0);
+
+ err = check_row_format(m_table_flags);
+ }
+ }
+
+ return(err);
+}
+
+/**
+Update the import configuration that will be used to import the tablespace.
+@return error code or DB_SUCCESS */
+dberr_t
+FetchIndexRootPages::build_row_import(row_import* cfg) const UNIV_NOTHROW
+{
+ Indexes::const_iterator end = m_indexes.end();
+
+ ut_a(cfg->m_table == m_table);
+ cfg->m_page_size = m_page_size;
+ cfg->m_n_indexes = m_indexes.size();
+
+ if (cfg->m_n_indexes == 0) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "No B+Tree found in tablespace");
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_11",
+ delete [] cfg->m_indexes; cfg->m_indexes = 0;);
+
+ if (cfg->m_indexes == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ row_index_t* cfg_index = cfg->m_indexes;
+
+ for (Indexes::const_iterator it = m_indexes.begin();
+ it != end;
+ ++it, ++cfg_index) {
+
+ char name[BUFSIZ];
+
+ ut_snprintf(name, sizeof(name), "index" IB_ID_FMT, it->m_id);
+
+ ulint len = strlen(name) + 1;
+
+ cfg_index->m_name = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_12",
+ delete [] cfg_index->m_name;
+ cfg_index->m_name = 0;);
+
+ if (cfg_index->m_name == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memcpy(cfg_index->m_name, name, len);
+
+ cfg_index->m_id = it->m_id;
+
+ cfg_index->m_space = m_space;
+
+ cfg_index->m_page_no = it->m_page_no;
+ }
+
+ return(DB_SUCCESS);
+}
+
+/** Functor that is called for each physical page that is read from the
+tablespace file.
+
+ 1. Check each page for corruption.
+
+ 2. Update the space id and LSN on every page
+ * For the header page
+ - Validate the flags
+ - Update the LSN
+
+ 3. On Btree pages
+ * Set the index id
+ * Update the max trx id
+     * In a clustered index, update the system columns
+     * In a clustered index, update the BLOB ptr, set the space id
+     * Purge delete marked records, but only if they can be easily
+       removed from the page
+     * Keep a counter of the number of rows, ie. non-delete-marked rows
+     * Keep a counter of the number of delete marked rows
+     * Keep a counter of the number of purge failures
+ * If a page is stamped with an index id that isn't in the .cfg file
+ we assume it is deleted and the page can be ignored.
+
+ 4. Set the page state to dirty so that it will be written to disk.
+*/
+class PageConverter : public AbstractCallback {
+public:
+ /** Constructor
+ * @param cfg - config of table being imported.
+ * @param trx - transaction covering the import */
+ PageConverter(row_import* cfg, trx_t* trx) UNIV_NOTHROW;
+
+ virtual ~PageConverter() UNIV_NOTHROW
+ {
+ if (m_heap != 0) {
+ mem_heap_free(m_heap);
+ }
+ }
+
+ /**
+ @retval the server space id of the tablespace being iterated over */
+ virtual ulint get_space_id() const UNIV_NOTHROW
+ {
+ return(m_cfg->m_table->space);
+ }
+
+ /**
+ Called for each block as it is read from the file.
+ @param offset - physical offset in the file
+ @param block - block to convert, it is not from the buffer pool.
+ @retval DB_SUCCESS or error code. */
+ virtual dberr_t operator() (
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW;
+private:
+
+ /** Status returned by PageConverter::validate() */
+ enum import_page_status_t {
+ IMPORT_PAGE_STATUS_OK, /*!< Page is OK */
+ IMPORT_PAGE_STATUS_ALL_ZERO, /*!< Page is all zeros */
+ IMPORT_PAGE_STATUS_CORRUPTED /*!< Page is corrupted */
+ };
+
+ /**
+ Update the page, set the space id, max trx id and index id.
+ @param block - block read from file
+ @param page_type - type of the page
+ @retval DB_SUCCESS or error code */
+ dberr_t update_page(
+ buf_block_t* block,
+ ulint& page_type) UNIV_NOTHROW;
+
+#if defined UNIV_DEBUG
+ /**
+	@return true if the error condition is enabled. */
+ bool trigger_corruption() UNIV_NOTHROW
+ {
+ return(false);
+ }
+#else
+#define trigger_corruption() (false)
+#endif /* UNIV_DEBUG */
+
+ /**
+ Update the space, index id, trx id.
+ @param block - block to convert
+ @return DB_SUCCESS or error code */
+ dberr_t update_index_page(buf_block_t* block) UNIV_NOTHROW;
+
+	/** Update the BLOB references and write UNDO log entries for
+ rows that can't be purged optimistically.
+ @param block - block to update
+ @retval DB_SUCCESS or error code */
+ dberr_t update_records(buf_block_t* block) UNIV_NOTHROW;
+
+ /**
+ Validate the page, check for corruption.
+ @param offset - physical offset within file.
+ @param page - page read from file.
+	@return IMPORT_PAGE_STATUS_OK, IMPORT_PAGE_STATUS_ALL_ZERO or
+	IMPORT_PAGE_STATUS_CORRUPTED */
+ import_page_status_t validate(
+ os_offset_t offset,
+ buf_block_t* page) UNIV_NOTHROW;
+
+ /**
+ Validate the space flags and update tablespace header page.
+ @param block - block read from file, not from the buffer pool.
+ @retval DB_SUCCESS or error code */
+ dberr_t update_header(buf_block_t* block) UNIV_NOTHROW;
+
+ /**
+ Adjust the BLOB reference for a single column that is externally stored
+ @param rec - record to update
+ @param offsets - column offsets for the record
+ @param i - column ordinal value
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const ulint* offsets,
+ ulint i) UNIV_NOTHROW;
+
+ /**
+ Adjusts the BLOB reference in the clustered index row for all
+ externally stored columns.
+ @param rec - record to update
+ @param offsets - column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const ulint* offsets) UNIV_NOTHROW;
+
+ /**
+	In the clustered index, adjust the BLOB pointers as needed.
+ Also update the BLOB reference, write the new space id.
+ @param rec - record to update
+ @param offsets - column offsets for the record
+ @return DB_SUCCESS or error code */
+ dberr_t adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const ulint* offsets) UNIV_NOTHROW;
+
+ /**
+ Purge delete-marked records, only if it is possible to do
+ so without re-organising the B+tree.
+ @param offsets - current row offsets.
+ @retval true if purged */
+ bool purge(const ulint* offsets) UNIV_NOTHROW;
+
+ /**
+ Adjust the BLOB references and sys fields for the current record.
+ @param index - the index being converted
+ @param rec - record to update
+ @param offsets - column offsets for the record
+ @param deleted - true if row is delete marked
+ @return DB_SUCCESS or error code. */
+ dberr_t adjust_cluster_record(
+ const dict_index_t* index,
+ rec_t* rec,
+ const ulint* offsets,
+ bool deleted) UNIV_NOTHROW;
+
+ /**
+ Find an index with the matching id.
+ @return row_index_t* instance or 0 */
+ row_index_t* find_index(index_id_t id) UNIV_NOTHROW
+ {
+ row_index_t* index = &m_cfg->m_indexes[0];
+
+ for (ulint i = 0; i < m_cfg->m_n_indexes; ++i, ++index) {
+ if (id == index->m_id) {
+ return(index);
+ }
+ }
+
+		return(0);
+	}
+private:
+ /** Config for table that is being imported. */
+ row_import* m_cfg;
+
+ /** Current index whose pages are being imported */
+ row_index_t* m_index;
+
+ /** Current system LSN */
+ lsn_t m_current_lsn;
+
+ /** Alias for m_page_zip, only set for compressed pages. */
+ page_zip_des_t* m_page_zip_ptr;
+
+ /** Iterator over records in a block */
+ RecIterator m_rec_iter;
+
+ /** Record offset */
+ ulint m_offsets_[REC_OFFS_NORMAL_SIZE];
+
+ /** Pointer to m_offsets_ */
+ ulint* m_offsets;
+
+ /** Memory heap for the record offsets */
+ mem_heap_t* m_heap;
+
+ /** Cluster index instance */
+ dict_index_t* m_cluster_index;
+};
+
+/**
+row_import destructor. */
+row_import::~row_import() UNIV_NOTHROW
+{
+ for (ulint i = 0; m_indexes != 0 && i < m_n_indexes; ++i) {
+ delete [] m_indexes[i].m_name;
+
+ if (m_indexes[i].m_fields == 0) {
+ continue;
+ }
+
+ dict_field_t* fields = m_indexes[i].m_fields;
+ ulint n_fields = m_indexes[i].m_n_fields;
+
+ for (ulint j = 0; j < n_fields; ++j) {
+ delete [] fields[j].name;
+ }
+
+ delete [] fields;
+ }
+
+ for (ulint i = 0; m_col_names != 0 && i < m_n_cols; ++i) {
+ delete [] m_col_names[i];
+ }
+
+ delete [] m_cols;
+ delete [] m_indexes;
+ delete [] m_col_names;
+ delete [] m_table_name;
+ delete [] m_hostname;
+}
+
+/**
+Find the index entry in the indexes array.
+@param name - index name
+@return instance if found else 0. */
+row_index_t*
+row_import::get_index(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_indexes; ++i) {
+ const char* index_name;
+ row_index_t* index = &m_indexes[i];
+
+ index_name = reinterpret_cast<const char*>(index->m_name);
+
+ if (strcmp(index_name, name) == 0) {
+
+ return(index);
+ }
+ }
+
+ return(0);
+}
+
+/**
+Get the number of rows in the index.
+@param name - index name
+@return number of rows (doesn't include delete marked rows). */
+ulint
+row_import::get_n_rows(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_rows);
+}
+
+/**
+Get the number of rows for which purge failed during the convert phase.
+@param name - index name
+@return number of rows for which purge failed. */
+ulint
+row_import::get_n_purge_failed(
+ const char* name) const UNIV_NOTHROW
+{
+ const row_index_t* index = get_index(name);
+
+	ut_a(index != 0);
+
+ return(index->m_stats.m_n_purge_failed);
+}
+
+/**
+Find the ordinal value of the column name in the cfg table columns.
+@param name - of column to look for.
+@return ULINT_UNDEFINED if not found. */
+ulint
+row_import::find_col(
+ const char* name) const UNIV_NOTHROW
+{
+ for (ulint i = 0; i < m_n_cols; ++i) {
+ const char* col_name;
+
+ col_name = reinterpret_cast<const char*>(m_col_names[i]);
+
+ if (strcmp(col_name, name) == 0) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**
+Find the index field entry in the cfg indexes fields.
+@param name - of the index to look for
+@return instance if found else 0. */
+const dict_field_t*
+row_import::find_field(
+ const row_index_t* cfg_index,
+ const char* name) const UNIV_NOTHROW
+{
+ const dict_field_t* field = cfg_index->m_fields;
+
+ for (ulint i = 0; i < cfg_index->m_n_fields; ++i, ++field) {
+ const char* field_name;
+
+ field_name = reinterpret_cast<const char*>(field->name);
+
+ if (strcmp(field_name, name) == 0) {
+ return(field);
+ }
+ }
+
+ return(0);
+}
+
+/**
+Check if the index schema that was read from the .cfg file matches the
+in memory index definition.
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_index_columns(
+ THD* thd,
+ const dict_index_t* index) UNIV_NOTHROW
+{
+ row_index_t* cfg_index;
+ dberr_t err = DB_SUCCESS;
+
+ cfg_index = get_index(index->name);
+
+ if (cfg_index == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s not found in tablespace meta-data file.",
+ index->name);
+
+ return(DB_ERROR);
+ }
+
+ cfg_index->m_srv_index = index;
+
+ const dict_field_t* field = index->fields;
+
+ for (ulint i = 0; i < index->n_fields; ++i, ++field) {
+
+ const dict_field_t* cfg_field;
+
+ cfg_field = find_field(cfg_index, field->name);
+
+ if (cfg_field == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s not found in tablespace "
+ "meta-data file.",
+ index->name, field->name);
+
+ err = DB_ERROR;
+ } else {
+
+ if (cfg_field->prefix_len != field->prefix_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s prefix len %lu "
+ "doesn't match meta-data file value "
+ "%lu",
+ index->name, field->name,
+ (ulong) field->prefix_len,
+ (ulong) cfg_field->prefix_len);
+
+ err = DB_ERROR;
+ }
+
+ if (cfg_field->fixed_len != field->fixed_len) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Index %s field %s fixed len %lu "
+ "doesn't match meta-data file value "
+ "%lu",
+ index->name, field->name,
+ (ulong) field->fixed_len,
+ (ulong) cfg_field->fixed_len);
+
+ err = DB_ERROR;
+ }
+ }
+ }
+
+ return(err);
+}
+
+/**
+Check if the table schema that was read from the .cfg file matches the
+in memory table definition.
+@param thd - MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_table_columns(
+ THD* thd) UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+ const dict_col_t* col = m_table->cols;
+
+ for (ulint i = 0; i < m_table->n_cols; ++i, ++col) {
+
+ const char* col_name;
+ ulint cfg_col_index;
+
+ col_name = dict_table_get_col_name(
+ m_table, dict_col_get_no(col));
+
+ cfg_col_index = find_col(col_name);
+
+ if (cfg_col_index == ULINT_UNDEFINED) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s not found in tablespace.",
+ col_name);
+
+ err = DB_ERROR;
+ } else if (cfg_col_index != col->ind) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordinal value mismatch, it's at "
+ "%lu in the table and %lu in the tablespace "
+ "meta-data file",
+ col_name,
+ (ulong) col->ind, (ulong) cfg_col_index);
+
+ err = DB_ERROR;
+ } else {
+ const dict_col_t* cfg_col;
+
+ cfg_col = &m_cols[cfg_col_index];
+ ut_a(cfg_col->ind == cfg_col_index);
+
+ if (cfg_col->prtype != col->prtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s precise type mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mtype != col->mtype) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s main type mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->len != col->len) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s length mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->mbminmaxlen != col->mbminmaxlen) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s multi-byte len mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ind != col->ind) {
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->ord_part != col->ord_part) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s ordering mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+
+ if (cfg_col->max_prefix != col->max_prefix) {
+ ib_errf(thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_TABLE_SCHEMA_MISMATCH,
+ "Column %s max prefix mismatch.",
+ col_name);
+ err = DB_ERROR;
+ }
+ }
+ }
+
+ return(err);
+}
+
+/**
+Check if the table (and index) schema that was read from the .cfg file
+matches the in memory table definition.
+@param thd - MySQL session variable
+@return DB_SUCCESS or error code. */
+dberr_t
+row_import::match_schema(
+ THD* thd) UNIV_NOTHROW
+{
+ /* Do some simple checks. */
+
+ if (m_flags != m_table->flags) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Table flags don't match, server table has 0x%lx "
+ "and the meta-data file has 0x%lx",
+			(ulong) m_table->flags, (ulong) m_flags);
+
+ return(DB_ERROR);
+ } else if (m_table->n_cols != m_n_cols) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Number of columns don't match, table has %lu "
+ "columns but the tablespace meta-data file has "
+ "%lu columns",
+ (ulong) m_table->n_cols, (ulong) m_n_cols);
+
+ return(DB_ERROR);
+ } else if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+		/* If the number of indexes doesn't match then it is better
+ to abort the IMPORT. It is easy for the user to create a
+ table matching the IMPORT definition. */
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Number of indexes don't match, table has %lu "
+ "indexes but the tablespace meta-data file has "
+ "%lu indexes",
+ (ulong) UT_LIST_GET_LEN(m_table->indexes),
+ (ulong) m_n_indexes);
+
+ return(DB_ERROR);
+ }
+
+ dberr_t err = match_table_columns(thd);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Check if the index definitions match. */
+
+ const dict_index_t* index;
+
+ for (index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ dberr_t index_err;
+
+ index_err = match_index_columns(thd, index);
+
+ if (index_err != DB_SUCCESS) {
+ err = index_err;
+ }
+ }
+
+ return(err);
+}
+
+/**
+Set the index root <space, pageno>, using index name. */
+void
+row_import::set_root_by_name() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ for (ulint i = 0; i < m_n_indexes; ++i, ++cfg_index) {
+ dict_index_t* index;
+
+ const char* index_name;
+
+ index_name = reinterpret_cast<const char*>(cfg_index->m_name);
+
+ index = dict_table_get_index_on_name(m_table, index_name);
+
+ /* We've already checked that it exists. */
+ ut_a(index != 0);
+
+ /* Set the root page number and space id. */
+ index->space = m_table->space;
+ index->page = cfg_index->m_page_no;
+ }
+}
+
+/**
+Set the index root <space, pageno>, using a heuristic.
+@return DB_SUCCESS or error code */
+dberr_t
+row_import::set_root_by_heuristic() UNIV_NOTHROW
+{
+ row_index_t* cfg_index = m_indexes;
+
+ ut_a(m_n_indexes > 0);
+
+ // TODO: For now use brute force, based on ordinality
+
+ if (UT_LIST_GET_LEN(m_table->indexes) != m_n_indexes) {
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), m_table->name, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Table %s should have %lu indexes but the tablespace "
+ "has %lu indexes",
+ table_name,
+ UT_LIST_GET_LEN(m_table->indexes),
+ m_n_indexes);
+ }
+
+ dict_mutex_enter_for_mysql();
+
+ ulint i = 0;
+ dberr_t err = DB_SUCCESS;
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(m_table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ index->type |= DICT_CORRUPT;
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Skipping FTS index: %s", index->name);
+ } else if (i < m_n_indexes) {
+
+ delete [] cfg_index[i].m_name;
+
+ ulint len = strlen(index->name) + 1;
+
+ cfg_index[i].m_name = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_14",
+ delete[] cfg_index[i].m_name;
+ cfg_index[i].m_name = 0;);
+
+ if (cfg_index[i].m_name == 0) {
+ err = DB_OUT_OF_MEMORY;
+ break;
+ }
+
+ memcpy(cfg_index[i].m_name, index->name, len);
+
+ cfg_index[i].m_srv_index = index;
+
+ index->space = m_table->space;
+ index->page = cfg_index[i].m_page_no;
+
+ ++i;
+ }
+ }
+
+ dict_mutex_exit_for_mysql();
+
+ return(err);
+}
+
+/**
+Purge delete marked records.
+@return DB_SUCCESS or error code. */
+dberr_t
+IndexPurge::garbage_collect() UNIV_NOTHROW
+{
+ dberr_t err;
+ ibool comp = dict_table_is_comp(m_index->table);
+
+ /* Open the persistent cursor and start the mini-transaction. */
+
+ open();
+
+ while ((err = next()) == DB_SUCCESS) {
+
+ rec_t* rec = btr_pcur_get_rec(&m_pcur);
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ if (!deleted) {
+ ++m_n_rows;
+ } else {
+ purge();
+ }
+ }
+
+ /* Close the persistent cursor and commit the mini-transaction. */
+
+ close();
+
+ return(err == DB_END_OF_INDEX ? DB_SUCCESS : err);
+}
+
+/**
+Begin import, position the cursor on the first record. */
+void
+IndexPurge::open() UNIV_NOTHROW
+{
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_open_at_index_side(
+ true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr);
+}
+
+/**
+Close the persistent cursor and commit the mini-transaction. */
+void
+IndexPurge::close() UNIV_NOTHROW
+{
+ btr_pcur_close(&m_pcur);
+ mtr_commit(&m_mtr);
+}
+
+/**
+Position the cursor on the next record.
+@return DB_SUCCESS or error code */
+dberr_t
+IndexPurge::next() UNIV_NOTHROW
+{
+ btr_pcur_move_to_next_on_page(&m_pcur);
+
+ /* When switching pages, commit the mini-transaction
+ in order to release the latch on the old page. */
+
+ if (!btr_pcur_is_after_last_on_page(&m_pcur)) {
+ return(DB_SUCCESS);
+ } else if (trx_is_interrupted(m_trx)) {
+ /* Check after every page because the check
+ is expensive. */
+ return(DB_INTERRUPTED);
+ }
+
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+
+ mtr_commit(&m_mtr);
+
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+
+ if (!btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr)) {
+
+ return(DB_END_OF_INDEX);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**
+Store the persistent cursor position and reopen the
+B-tree cursor in BTR_MODIFY_TREE mode, because the
+tree structure may be changed during a pessimistic delete. */
+void
+IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW
+{
+ dberr_t err;
+
+ btr_pcur_restore_position(BTR_MODIFY_TREE, &m_pcur, &m_mtr);
+
+ ut_ad(rec_get_deleted_flag(
+ btr_pcur_get_rec(&m_pcur),
+ dict_table_is_comp(m_index->table)));
+
+ btr_cur_pessimistic_delete(
+ &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, RB_NONE, &m_mtr);
+
+ ut_a(err == DB_SUCCESS);
+
+	/* Commit the mini-transaction; purge() will reopen the
+	B-tree cursor in BTR_MODIFY_LEAF mode. */
+ mtr_commit(&m_mtr);
+}
+
+/**
+Purge delete-marked records. */
+void
+IndexPurge::purge() UNIV_NOTHROW
+{
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+
+ purge_pessimistic_delete();
+
+ mtr_start(&m_mtr);
+
+ mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+}
+
+/**
+Constructor
+@param cfg - config of table being imported.
+@param trx - transaction covering the import */
+PageConverter::PageConverter(
+ row_import* cfg,
+ trx_t* trx)
+ :
+ AbstractCallback(trx),
+ m_cfg(cfg),
+ m_page_zip_ptr(0),
+ m_heap(0) UNIV_NOTHROW
+{
+ m_index = m_cfg->m_indexes;
+
+ m_current_lsn = log_get_lsn();
+ ut_a(m_current_lsn > 0);
+
+ m_offsets = m_offsets_;
+ rec_offs_init(m_offsets_);
+
+ m_cluster_index = dict_table_get_first_index(m_cfg->m_table);
+}
+
+/**
+Adjust the BLOB reference for a single column that is externally stored
+@param rec - record to update
+@param offsets - column offsets for the record
+@param i - column ordinal value
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::adjust_cluster_index_blob_column(
+ rec_t* rec,
+ const ulint* offsets,
+ ulint i) UNIV_NOTHROW
+{
+ ulint len;
+ byte* field;
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ DBUG_EXECUTE_IF("ib_import_trigger_corruption_2",
+ len = BTR_EXTERN_FIELD_REF_SIZE - 1;);
+
+ if (len < BTR_EXTERN_FIELD_REF_SIZE) {
+
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ index_name, sizeof(index_name),
+ m_cluster_index->name, TRUE);
+
+ ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Externally stored column(%lu) has a reference "
+			"length of %lu in the clustered index %s",
+ (ulong) i, (ulong) len, index_name);
+
+ return(DB_CORRUPTION);
+ }
+
+ field += BTR_EXTERN_SPACE_ID - BTR_EXTERN_FIELD_REF_SIZE + len;
+
+ if (is_compressed_table()) {
+ mach_write_to_4(field, get_space_id());
+
+ page_zip_write_blob_ptr(
+ m_page_zip_ptr, rec, m_cluster_index, offsets, i, 0);
+ } else {
+ mlog_write_ulint(field, get_space_id(), MLOG_4BYTES, 0);
+ }
+
+ return(DB_SUCCESS);
+}
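The pointer arithmetic above relies on the external-field reference occupying the last BTR_EXTERN_FIELD_REF_SIZE bytes of the column, with the space id at offset BTR_EXTERN_SPACE_ID within it. A standalone sketch of the same patch, with the two constants treated as assumptions (20 and 0 here):

    #include <cstdint>
    #include <cstddef>

    static const size_t FIELD_REF_SIZE = 20;   /* assumed */
    static const size_t SPACE_ID_OFFSET = 0;   /* assumed */

    /* Patch the space id inside the field reference that terminates an
    externally stored column of 'len' bytes starting at 'col'. */
    void patch_blob_space_id(unsigned char* col, size_t len,
                             uint32_t space_id)
    {
            unsigned char* ref = col + len - FIELD_REF_SIZE
                    + SPACE_ID_OFFSET;

            /* Big-endian write, the standalone equivalent of
            mach_write_to_4(). */
            ref[0] = (unsigned char) (space_id >> 24);
            ref[1] = (unsigned char) (space_id >> 16);
            ref[2] = (unsigned char) (space_id >> 8);
            ref[3] = (unsigned char) (space_id);
    }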
+
+/**
+Adjusts the BLOB reference in the clustered index row for all externally
+stored columns.
+@param rec - record to update
+@param offsets - column offsets for the record
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::adjust_cluster_index_blob_columns(
+ rec_t* rec,
+ const ulint* offsets) UNIV_NOTHROW
+{
+ ut_ad(rec_offs_any_extern(offsets));
+
+ /* Adjust the space_id in the BLOB pointers. */
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); ++i) {
+
+ /* Only if the column is stored "externally". */
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_column(rec, offsets, i);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**
+In the clustered index, adjust the BLOB references as needed: write the
+new space id into each externally stored column.
+@param rec - record to update
+@param offsets - column offsets for the record
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::adjust_cluster_index_blob_ref(
+ rec_t* rec,
+ const ulint* offsets) UNIV_NOTHROW
+{
+ if (rec_offs_any_extern(offsets)) {
+ dberr_t err;
+
+ err = adjust_cluster_index_blob_columns(rec, offsets);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**
+Purge delete-marked records, only if it is possible to do so without
+re-organising the B+tree.
+@param offsets - current row offsets.
+@return true if purge succeeded */
+bool
+PageConverter::purge(const ulint* offsets) UNIV_NOTHROW
+{
+ const dict_index_t* index = m_index->m_srv_index;
+
+ /* We can't have a page that is empty and not root. */
+ if (m_rec_iter.remove(index, m_page_zip_ptr, m_offsets)) {
+
+ ++m_index->m_stats.m_n_purged;
+
+ return(true);
+ } else {
+ ++m_index->m_stats.m_n_purge_failed;
+ }
+
+ return(false);
+}
+
+/**
+Adjust the BLOB references and sys fields for the current record.
+@param index - index the record belongs to
+@param rec - record to update
+@param offsets - column offsets for the record
+@param deleted - true if row is delete marked
+@return DB_SUCCESS or error code. */
+dberr_t
+PageConverter::adjust_cluster_record(
+ const dict_index_t* index,
+ rec_t* rec,
+ const ulint* offsets,
+ bool deleted) UNIV_NOTHROW
+{
+ dberr_t err;
+
+ if ((err = adjust_cluster_index_blob_ref(rec, offsets)) == DB_SUCCESS) {
+
+ /* Reset DB_TRX_ID and DB_ROLL_PTR. Normally, these fields
+ are only written in conjunction with other changes to the
+ record. */
+
+ row_upd_rec_sys_fields(
+ rec, m_page_zip_ptr, m_cluster_index, m_offsets,
+ m_trx, 0);
+ }
+
+ return(err);
+}
+
+/**
+Update the BLOB references and sys fields for the records on a page,
+and try an optimistic purge of any delete-marked records.
+@param block - block to update
+@retval DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_records(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ ibool comp = dict_table_is_comp(m_cfg->m_table);
+ bool clust_index = m_index->m_srv_index == m_cluster_index;
+
+ /* This will also position the cursor on the first user record. */
+
+ m_rec_iter.open(block);
+
+ while (!m_rec_iter.end()) {
+
+ rec_t* rec = m_rec_iter.current();
+
+ /* FIXME: Move out of the loop */
+
+ if (rec_get_status(rec) == REC_STATUS_NODE_PTR) {
+ break;
+ }
+
+ ibool deleted = rec_get_deleted_flag(rec, comp);
+
+ /* For the clustered index we have to adjust the BLOB
+ reference and the system fields irrespective of the
+ delete marked flag. The adjustment of delete marked
+ cluster records is required for purge to work later. */
+
+ if (deleted || clust_index) {
+ m_offsets = rec_get_offsets(
+ rec, m_index->m_srv_index, m_offsets,
+ ULINT_UNDEFINED, &m_heap);
+ }
+
+ if (clust_index) {
+
+ dberr_t err = adjust_cluster_record(
+ m_index->m_srv_index, rec, m_offsets,
+ deleted);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ /* If it is a delete marked record then try an
+ optimistic delete. */
+
+ if (deleted) {
+ /* A successful purge will move the cursor to the
+ next record. */
+
+ if (!purge(m_offsets)) {
+ m_rec_iter.next();
+ }
+
+ ++m_index->m_stats.m_n_deleted;
+ } else {
+ ++m_index->m_stats.m_n_rows;
+ m_rec_iter.next();
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/**
+Update the space, index id, trx id.
+@return DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_index_page(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ index_id_t id;
+ buf_frame_t* page = block->frame;
+
+ if (is_free(buf_block_get_page_no(block))) {
+ return(DB_SUCCESS);
+ } else if ((id = btr_page_get_index_id(page)) != m_index->m_id) {
+
+ row_index_t* index = find_index(id);
+
+ if (index == 0) {
+ m_index = 0;
+ return(DB_CORRUPTION);
+ }
+
+ /* Update current index */
+ m_index = index;
+ }
+
+ /* If the .cfg file is missing and there is an index mismatch
+ then ignore the error. */
+ if (m_cfg->m_missing && (m_index == 0 || m_index->m_srv_index == 0)) {
+ return(DB_SUCCESS);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!is_compressed_table()
+ || page_zip_validate(m_page_zip_ptr, page, m_index->m_srv_index));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* This has to be written to the uncompressed index header. Set it to
+ the current index id. */
+ btr_page_set_index_id(
+ page, m_page_zip_ptr, m_index->m_srv_index->id, 0);
+
+ page_set_max_trx_id(block, m_page_zip_ptr, m_trx->id, 0);
+
+ if (page_is_empty(block->frame)) {
+
+ /* Only a root page can be empty. */
+ if (!is_root_page(block->frame)) {
+ // TODO: We should relax this and skip secondary
+ // indexes. Mark them as corrupt because they can
+ // always be rebuilt.
+ return(DB_CORRUPTION);
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ return(update_records(block));
+}
+
+/**
+Validate the space flags and update the tablespace header page.
+@param block - block read from file, not from the buffer pool.
+@retval DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_header(
+ buf_block_t* block) UNIV_NOTHROW
+{
+ /* Check for valid header */
+ switch(fsp_header_get_space_id(get_frame(block))) {
+ case 0:
+ return(DB_CORRUPTION);
+ case ULINT_UNDEFINED:
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Space id check in the header failed "
+ "- ignored");
+ }
+
+ ulint space_flags = fsp_header_get_flags(get_frame(block));
+
+ if (!fsp_flags_is_valid(space_flags)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unsupported tablespace format %lu",
+ (ulong) space_flags);
+
+ return(DB_UNSUPPORTED);
+ }
+
+ mach_write_to_8(
+ get_frame(block) + FIL_PAGE_FILE_FLUSH_LSN, m_current_lsn);
+
+ /* Write space_id to the tablespace header, page 0. */
+ mach_write_to_4(
+ get_frame(block) + FSP_HEADER_OFFSET + FSP_SPACE_ID,
+ get_space_id());
+
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block) + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ get_space_id());
+
+ return(DB_SUCCESS);
+}
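The function touches three fixed locations in page 0. A standalone sketch, where the numeric offsets (26 for FIL_PAGE_FILE_FLUSH_LSN, 34 for FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 38 + 0 for FSP_HEADER_OFFSET + FSP_SPACE_ID) are assumptions mirroring the InnoDB constants:

    #include <cstdint>

    static void be32(unsigned char* p, uint32_t v)
    {
            p[0] = (unsigned char) (v >> 24);
            p[1] = (unsigned char) (v >> 16);
            p[2] = (unsigned char) (v >> 8);
            p[3] = (unsigned char) (v);
    }

    static void be64(unsigned char* p, uint64_t v)
    {
            be32(p, (uint32_t) (v >> 32));
            be32(p + 4, (uint32_t) v);
    }

    void patch_tablespace_header(unsigned char* page0, uint32_t space_id,
                                 uint64_t lsn)
    {
            be64(page0 + 26, lsn);          /* FIL_PAGE_FILE_FLUSH_LSN */
            be32(page0 + 38 + 0, space_id); /* FSP_HEADER + FSP_SPACE_ID */
            be32(page0 + 34, space_id);     /* ..ARCH_LOG_NO_OR_SPACE_ID */
    }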
+
+/**
+Update the page, set the space id, max trx id and index id.
+@param block - block read from file
+@retval DB_SUCCESS or error code */
+dberr_t
+PageConverter::update_page(
+ buf_block_t* block,
+ ulint& page_type) UNIV_NOTHROW
+{
+ dberr_t err = DB_SUCCESS;
+
+ switch (page_type = fil_page_get_type(get_frame(block))) {
+ case FIL_PAGE_TYPE_FSP_HDR:
+ /* Work directly on the uncompressed page headers. */
+ ut_a(buf_block_get_page_no(block) == 0);
+ return(update_header(block));
+
+ case FIL_PAGE_INDEX:
+ /* We need to decompress the contents into block->frame
+		before we can do anything with Btree pages. */
+
+ if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) {
+ return(DB_CORRUPTION);
+ }
+
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ /* Only update the Btree nodes. */
+ return(update_index_page(block));
+
+ case FIL_PAGE_TYPE_SYS:
+ /* This is page 0 in the system tablespace. */
+ return(DB_CORRUPTION);
+
+ case FIL_PAGE_TYPE_XDES:
+ err = set_current_xdes(
+ buf_block_get_page_no(block), get_frame(block));
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_TYPE_TRX_SYS:
+ case FIL_PAGE_IBUF_FREE_LIST:
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_BLOB:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+
+ /* Work directly on the uncompressed page headers. */
+ /* This is on every page in the tablespace. */
+ mach_write_to_4(
+ get_frame(block)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, get_space_id());
+
+ return(err);
+ }
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Unknown page type (%lu)", page_type);
+
+ return(DB_CORRUPTION);
+}
+
+/**
+Validate the page
+@param offset - physical offset within file.
+@param block - block read from file.
+@return status */
+PageConverter::import_page_status_t
+PageConverter::validate(
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW
+{
+ buf_frame_t* page = get_frame(block);
+
+ /* Check that the page number corresponds to the offset in
+ the file. Flag as corrupt if it doesn't. Disable the check
+ for LSN in buf_page_is_corrupted() */
+
+ if (buf_page_is_corrupted(false, page, get_zip_size())
+ || (page_get_page_no(page) != offset / m_page_size
+ && page_get_page_no(page) != 0)) {
+
+ return(IMPORT_PAGE_STATUS_CORRUPTED);
+
+ } else if (offset > 0 && page_get_page_no(page) == 0) {
+ const byte* b = page;
+ const byte* e = b + m_page_size;
+
+ /* If the page number is zero and offset > 0 then
+ the entire page MUST consist of zeroes. If not then
+ we flag it as corrupt. */
+
+ while (b != e) {
+
+ if (*b++ && !trigger_corruption()) {
+ return(IMPORT_PAGE_STATUS_CORRUPTED);
+ }
+ }
+
+ /* The page is all zero: do nothing. */
+ return(IMPORT_PAGE_STATUS_ALL_ZERO);
+ }
+
+ return(IMPORT_PAGE_STATUS_OK);
+}
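The byte-by-byte scan above can also be expressed with a single comparison; a standalone sketch of the same all-zero test:

    #include <cstring>
    #include <cstddef>

    /* A page is all zero iff its first byte is zero and every byte
    equals its successor. */
    static bool page_is_all_zero(const unsigned char* page, size_t size)
    {
            return(page[0] == 0
                   && memcmp(page, page + 1, size - 1) == 0);
    }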
+
+/**
+Called for every page in the tablespace. If the page was not
+updated then its state must be set to BUF_PAGE_NOT_USED.
+@param offset - physical offset within the file
+@param block - block read from file, note it is not from the buffer pool
+@retval DB_SUCCESS or error code. */
+dberr_t
+PageConverter::operator() (
+ os_offset_t offset,
+ buf_block_t* block) UNIV_NOTHROW
+{
+ ulint page_type;
+ dberr_t err = DB_SUCCESS;
+
+ if ((err = periodic_check()) != DB_SUCCESS) {
+ return(err);
+ }
+
+ if (is_compressed_table()) {
+ m_page_zip_ptr = &block->page.zip;
+ } else {
+ ut_ad(m_page_zip_ptr == 0);
+ }
+
+ switch(validate(offset, block)) {
+ case IMPORT_PAGE_STATUS_OK:
+
+ /* We have to decompress the compressed pages before
+ we can work on them */
+
+ if ((err = update_page(block, page_type)) != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Note: For compressed pages this function will write to the
+ zip descriptor and for uncompressed pages it will write to
+ page (ie. the block->frame). Therefore the caller should write
+ out the descriptor contents and not block->frame for compressed
+ pages. */
+
+ if (!is_compressed_table() || page_type == FIL_PAGE_INDEX) {
+
+ buf_flush_init_for_writing(
+ !is_compressed_table()
+ ? block->frame : block->page.zip.data,
+ !is_compressed_table() ? 0 : m_page_zip_ptr,
+ m_current_lsn);
+ } else {
+ /* Calculate and update the checksum of non-btree
+ pages for compressed tables explicitly here. */
+
+ buf_flush_update_zip_checksum(
+ get_frame(block), get_zip_size(),
+ m_current_lsn);
+ }
+
+ break;
+
+ case IMPORT_PAGE_STATUS_ALL_ZERO:
+ /* The page is all zero: leave it as is. */
+ break;
+
+ case IMPORT_PAGE_STATUS_CORRUPTED:
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "%s: Page %lu at offset " UINT64PF " looks corrupted.",
+ m_filepath, (ulong) (offset / m_page_size), offset);
+
+ return(DB_CORRUPTION);
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Clean up after an import tablespace failure; this function will acquire
+the dictionary latches on behalf of the transaction if the transaction
+hasn't already acquired them. */
+static __attribute__((nonnull))
+void
+row_import_discard_changes(
+/*=======================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ trx_t* trx, /*!< in/out: transaction for import */
+ dberr_t err) /*!< in: error code */
+{
+ dict_table_t* table = prebuilt->table;
+
+ ut_a(err != DB_SUCCESS);
+
+ prebuilt->trx->error_info = NULL;
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ prebuilt->table->name, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Discarding tablespace of table %s: %s",
+ table_name, ut_strerr(err));
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ ut_a(trx->dict_operation_lock_mode == 0);
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Since we only update the index root page numbers on disk
+	after a successful import, the table will not be loadable.
+	However, we need to ensure that the in-memory root page numbers
+	are reset to "NULL". */
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ index->page = FIL_NULL;
+ index->space = FIL_NULL;
+ }
+
+ table->ibd_file_missing = TRUE;
+
+ fil_close_tablespace(trx, table->space);
+}
+
+/*****************************************************************//**
+Clean up after import tablespace. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_cleanup(
+/*===============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ trx_t* trx, /*!< in/out: transaction for import */
+ dberr_t err) /*!< in: error code */
+{
+ ut_a(prebuilt->trx != trx);
+
+ if (err != DB_SUCCESS) {
+ row_import_discard_changes(prebuilt, trx, err);
+ }
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ prebuilt->trx->op_info = "";
+
+ DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
+
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Report error during tablespace import. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_error(
+/*=============*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
+ trx_t* trx, /*!< in/out: transaction for import */
+ dberr_t err) /*!< in: error code */
+{
+ if (!trx_is_interrupted(trx)) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name),
+ prebuilt->table->name, FALSE);
+
+ ib_senderrf(
+ trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_INNODB_IMPORT_ERROR,
+ table_name, (ulong) err, ut_strerr(err));
+ }
+
+ return(row_import_cleanup(prebuilt, trx, err));
+}
+
+/*****************************************************************//**
+Adjust the root page index node and leaf node segment headers and update
+them with the new space id, for all the table's secondary indexes.
+@return error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_adjust_root_pages_of_secondary_indexes(
+/*==============================================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from
+ handler */
+ trx_t* trx, /*!< in: transaction used for
+ the import */
+ dict_table_t* table, /*!< in: table the indexes
+ belong to */
+ const row_import& cfg) /*!< Import context */
+{
+ dict_index_t* index;
+ ulint n_rows_in_table;
+ dberr_t err = DB_SUCCESS;
+
+ /* Skip the clustered index. */
+ index = dict_table_get_first_index(table);
+
+ n_rows_in_table = cfg.get_n_rows(index->name);
+
+ DBUG_EXECUTE_IF("ib_import_sec_rec_count_mismatch_failure",
+ n_rows_in_table++;);
+
+ /* Adjust the root pages of the secondary indexes only. */
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ index_name, sizeof(index_name), index->name, TRUE);
+
+ ut_a(!dict_index_is_clust(index));
+
+ if (!(index->type & DICT_CORRUPT)
+ && index->space != FIL_NULL
+ && index->page != FIL_NULL) {
+
+ /* Update the Btree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+ } else {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Skip adjustment of root pages for "
+ "index %s.", index->name);
+
+ err = DB_CORRUPTION;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ if (index->type & DICT_CLUSTERED) {
+ break;
+ }
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index '%s' not found or corrupt, "
+ "you should recreate this index.",
+ index_name);
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ index->type |= DICT_CORRUPT;
+ continue;
+ }
+
+ /* If we failed to purge any records in the index then
+ do it the hard way.
+
+ TODO: We can do this in the first pass by generating UNDO log
+ records for the failed rows. */
+
+ if (!cfg.requires_purge(index->name)) {
+ continue;
+ }
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "secondary: purge delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+
+ if (err != DB_SUCCESS) {
+ break;
+ } else if (purge.get_n_rows() != n_rows_in_table) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index '%s' contains %lu entries, "
+ "should be %lu, you should recreate "
+ "this index.", index_name,
+ (ulong) purge.get_n_rows(),
+ (ulong) n_rows_in_table);
+
+ index->type |= DICT_CORRUPT;
+
+ /* Do not bail out, so that the data
+ can be recovered. */
+
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ensure that dict_sys->row_id exceeds SELECT MAX(DB_ROW_ID).
+@return error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_set_sys_max_row_id(
+/*==========================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from
+ handler */
+ const dict_table_t* table) /*!< in: table to import */
+{
+ dberr_t err;
+ const rec_t* rec;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ row_id_t row_id = 0;
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+ ut_a(dict_index_is_clust(index));
+
+ mtr_start(&mtr);
+
+ mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
+
+ btr_pcur_open_at_index_side(
+ false, // High end
+ index,
+ BTR_SEARCH_LEAF,
+ &pcur,
+ true, // Init cursor
+ 0, // Leaf level
+ &mtr);
+
+ btr_pcur_move_to_prev_on_page(&pcur);
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Check for empty table. */
+ if (!page_rec_is_infimum(rec)) {
+ ulint len;
+ const byte* field;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[1 + REC_OFFS_HEADER_SIZE];
+ ulint* offsets;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets_, ULINT_UNDEFINED, &heap);
+
+ field = rec_get_nth_field(
+ rec, offsets,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID),
+ &len);
+
+ if (len == DATA_ROW_ID_LEN) {
+ row_id = mach_read_from_6(field);
+ err = DB_SUCCESS;
+ } else {
+ err = DB_CORRUPTION;
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* The table is empty. */
+ err = DB_SUCCESS;
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ DBUG_EXECUTE_IF("ib_import_set_max_rowid_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ index_name, sizeof(index_name), index->name, TRUE);
+
+ ib_errf(prebuilt->trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index '%s' corruption detected, invalid DB_ROW_ID "
+ "in index.", index_name);
+
+ return(err);
+
+ } else if (row_id > 0) {
+
+ /* Update the system row id if the imported index row id is
+ greater than the max system row id. */
+
+ mutex_enter(&dict_sys->mutex);
+
+ if (row_id >= dict_sys->row_id) {
+ dict_sys->row_id = row_id + 1;
+ dict_hdr_flush_row_id();
+ }
+
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ return(DB_SUCCESS);
+}
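DB_ROW_ID occupies DATA_ROW_ID_LEN (6) bytes, read most-significant byte first; a standalone equivalent of the mach_read_from_6() call above:

    #include <cstdint>

    static uint64_t read_be48(const unsigned char* p)
    {
            uint64_t row_id = 0;

            for (int i = 0; i < 6; ++i) {
                    row_id = (row_id << 8) | p[i];
            }

            return(row_id);
    }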
+
+/*****************************************************************//**
+Read a string from the meta-data file.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_cfg_read_string(
+/*=======================*/
+ FILE* file, /*!< in/out: File to read from */
+ byte* ptr, /*!< out: string to read */
+ ulint max_len) /*!< in: maximum length of the output
+ buffer in bytes */
+{
+ DBUG_EXECUTE_IF("ib_import_string_read_error",
+ errno = EINVAL; return(DB_IO_ERROR););
+
+ ulint len = 0;
+
+ while (!feof(file)) {
+ int ch = fgetc(file);
+
+ if (ch == EOF) {
+ break;
+ } else if (ch != 0) {
+ if (len < max_len) {
+ ptr[len++] = ch;
+ } else {
+ break;
+ }
+ /* max_len includes the NUL byte */
+ } else if (len != max_len - 1) {
+ break;
+ } else {
+ ptr[len] = 0;
+ return(DB_SUCCESS);
+ }
+ }
+
+ errno = EINVAL;
+
+ return(DB_IO_ERROR);
+}
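The reader accepts a string only when the terminating NUL arrives exactly at max_len - 1, i.e. the stored length counts the NUL byte. A hypothetical writer matching that convention (cfg_write_string is not a function in this file):

    #include <cstdio>
    #include <cstring>
    #include <cstdint>

    static bool cfg_write_string(FILE* file, const char* str)
    {
            uint32_t      len = (uint32_t) strlen(str) + 1; /* + NUL */
            unsigned char buf[4];

            /* Big-endian length, as mach_read_from_4() expects. */
            buf[0] = (unsigned char) (len >> 24);
            buf[1] = (unsigned char) (len >> 16);
            buf[2] = (unsigned char) (len >> 8);
            buf[3] = (unsigned char) (len);

            return(fwrite(buf, 1, sizeof(buf), file) == sizeof(buf)
                   && fwrite(str, 1, len, file) == len);
    }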
+
+/*********************************************************************//**
+Read the meta-data (index user fields) config file.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_cfg_read_index_fields(
+/*=============================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_index_t* index, /*!< Index being read in */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte row[sizeof(ib_uint32_t) * 3];
+ ulint n_fields = index->m_n_fields;
+
+ index->m_fields = new(std::nothrow) dict_field_t[n_fields];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_4",
+ delete [] index->m_fields; index->m_fields = 0;);
+
+ if (index->m_fields == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dict_field_t* field = index->m_fields;
+
+ memset(field, 0x0, sizeof(*field) * n_fields);
+
+ for (ulint i = 0; i < n_fields; ++i, ++field) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_1",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ field->prefix_len = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ field->fixed_len = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Include the NUL byte in the length. */
+ ulint len = mach_read_from_4(ptr);
+
+ byte* name = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_5", delete [] name; name = 0;);
+
+ if (name == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ field->name = reinterpret_cast<const char*>(name);
+
+ dberr_t err = row_import_cfg_read_string(file, name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+				"while parsing index field name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Read the index names and root page numbers of the indexes and set the values.
+Row format [root_page_no, len of str, str ... ]
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_index_data(
+/*=======================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte* ptr;
+ row_index_t* cfg_index;
+ byte row[sizeof(index_id_t) + sizeof(ib_uint32_t) * 9];
+
+ /* FIXME: What is the max value? */
+ ut_a(cfg->m_n_indexes > 0);
+ ut_a(cfg->m_n_indexes < 1024);
+
+ cfg->m_indexes = new(std::nothrow) row_index_t[cfg->m_n_indexes];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_6",
+ delete [] cfg->m_indexes; cfg->m_indexes = 0;);
+
+ if (cfg->m_indexes == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ memset(cfg->m_indexes, 0x0, sizeof(*cfg->m_indexes) * cfg->m_n_indexes);
+
+ cfg_index = cfg->m_indexes;
+
+ for (ulint i = 0; i < cfg->m_n_indexes; ++i, ++cfg_index) {
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_2",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the index data. */
+ size_t n_bytes = fread(row, 1, sizeof(row), file);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (n_bytes != sizeof(row)) {
+ char msg[BUFSIZ];
+
+ ut_snprintf(msg, sizeof(msg),
+ "while reading index meta-data, expected "
+ "to read %lu bytes but read only %lu "
+ "bytes",
+ (ulong) sizeof(row), (ulong) n_bytes);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno), msg);
+
+ ib_logf(IB_LOG_LEVEL_ERROR, "IO Error: %s", msg);
+
+ return(DB_IO_ERROR);
+ }
+
+ ptr = row;
+
+ cfg_index->m_id = mach_read_from_8(ptr);
+ ptr += sizeof(index_id_t);
+
+ cfg_index->m_space = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_page_no = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_type = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_trx_id_offset = mach_read_from_4(ptr);
+ if (cfg_index->m_trx_id_offset != mach_read_from_4(ptr)) {
+ ut_ad(0);
+ /* Overflow. Pretend that the clustered index
+ has a variable-length PRIMARY KEY. */
+ cfg_index->m_trx_id_offset = 0;
+ }
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_user_defined_cols = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_uniq = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_nullable = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg_index->m_n_fields = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ /* The NUL byte is included in the name length. */
+ ulint len = mach_read_from_4(ptr);
+
+ if (len > OS_FILE_MAX_PATH) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "Index name length (%lu) is too long, "
+ "the meta-data is corrupt", len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg_index->m_name = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_7",
+ delete [] cfg_index->m_name;
+ cfg_index->m_name = 0;);
+
+ if (cfg_index->m_name == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(file, cfg_index->m_name, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while parsing index name.");
+
+ return(err);
+ }
+
+ err = row_import_cfg_read_index_fields(
+ file, thd, cfg_index, cfg);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ }
+
+ return(DB_SUCCESS);
+}
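Collecting the reads above, each serialised index record has the following fixed-size prefix (all integers big-endian on disk); a sketch, not a struct from the InnoDB sources:

    #include <cstdint>

    struct cfg_index_rec {
            uint64_t id;             /* index id */
            uint32_t space;          /* tablespace id */
            uint32_t page_no;        /* root page number */
            uint32_t type;           /* index type flags */
            uint32_t trx_id_offset;  /* 0 on overflow */
            uint32_t n_user_defined_cols;
            uint32_t n_uniq;
            uint32_t n_nullable;
            uint32_t n_fields;
            uint32_t name_len;       /* includes the NUL byte */
            /* Followed by name_len bytes of index name (NUL-terminated)
            and then n_fields serialised field records. */
    };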
+
+/*****************************************************************//**
+Set the index root page number for v1 format.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_import_read_indexes(
+/*====================*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_3",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the number of indexes. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading number of indexes.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_n_indexes = mach_read_from_4(row);
+
+ if (cfg->m_n_indexes == 0) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is 0");
+
+ return(DB_CORRUPTION);
+
+ } else if (cfg->m_n_indexes > 1024) {
+		/* FIXME: What is the upper limit? */
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Number of indexes in meta-data file is too high: %lu",
+ (ulong) cfg->m_n_indexes);
+ cfg->m_n_indexes = 0;
+
+ return(DB_CORRUPTION);
+ }
+
+ return(row_import_read_index_data(file, thd, cfg));
+}
+
+/*********************************************************************//**
+Read the meta-data (table columns) config file. Deserialise the contents of
+the dict_col_t structure, along with the column name. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_columns(
+/*====================*/
+	FILE*		file,		/*!< in: file to read from */
+ THD* thd, /*!< in/out: session */
+ row_import* cfg) /*!< in/out: meta-data read */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 8];
+
+ /* FIXME: What should the upper limit be? */
+ ut_a(cfg->m_n_cols > 0);
+ ut_a(cfg->m_n_cols < 1024);
+
+ cfg->m_cols = new(std::nothrow) dict_col_t[cfg->m_n_cols];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_8",
+ delete [] cfg->m_cols; cfg->m_cols = 0;);
+
+ if (cfg->m_cols == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ cfg->m_col_names = new(std::nothrow) byte* [cfg->m_n_cols];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_9",
+ delete [] cfg->m_col_names; cfg->m_col_names = 0;);
+
+ if (cfg->m_col_names == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+	memset(cfg->m_cols, 0x0, sizeof(*cfg->m_cols) * cfg->m_n_cols);
+	memset(cfg->m_col_names, 0x0,
+	       sizeof(*cfg->m_col_names) * cfg->m_n_cols);
+
+ col = cfg->m_cols;
+
+ for (ulint i = 0; i < cfg->m_n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_4",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading table column meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ col->prtype = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->mtype = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->len = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->mbminmaxlen = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->ind = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->ord_part = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ col->max_prefix = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Read in the column name as [len, byte array]. The len
+ includes the NUL byte. */
+
+ ulint len = mach_read_from_4(ptr);
+
+ /* FIXME: What is the maximum column name length? */
+ if (len == 0 || len > 128) {
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
+ ER_IO_READ_ERROR,
+ "Column name length %lu, is invalid",
+ (ulong) len);
+
+ return(DB_CORRUPTION);
+ }
+
+ cfg->m_col_names[i] = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_10",
+ delete [] cfg->m_col_names[i];
+ cfg->m_col_names[i] = 0;);
+
+ if (cfg->m_col_names[i] == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err;
+
+ err = row_import_cfg_read_string(
+ file, cfg->m_col_names[i], len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while parsing table column name.");
+
+ return(err);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
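Likewise, each serialised column record read above has this shape; again a sketch rather than an InnoDB struct:

    #include <cstdint>

    struct cfg_col_rec {
            uint32_t prtype;        /* precise type */
            uint32_t mtype;         /* main type */
            uint32_t len;           /* column length */
            uint32_t mbminmaxlen;   /* min/max multi-byte length */
            uint32_t ind;           /* column ordinal */
            uint32_t ord_part;      /* ordering column flag */
            uint32_t max_prefix;    /* maximum index prefix */
            uint32_t name_len;      /* includes the NUL byte */
            /* Followed by name_len bytes of column name,
            NUL-terminated. */
    };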
+
+/*****************************************************************//**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_v1(
+/*===============*/
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import* cfg) /*!< out: meta data */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_5",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the hostname where the tablespace was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading meta-data export hostname length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ ulint len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_hostname = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_1",
+ delete [] cfg->m_hostname; cfg->m_hostname = 0;);
+
+ if (cfg->m_hostname == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ dberr_t err = row_import_cfg_read_string(file, cfg->m_hostname, len);
+
+ if (err != DB_SUCCESS) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while parsing export hostname.");
+
+ return(err);
+ }
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_6",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the table name of tablespace that was exported. */
+ if (fread(value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading meta-data table name length.");
+
+ return(DB_IO_ERROR);
+ }
+
+ len = mach_read_from_4(value);
+
+ /* NUL byte is part of name length. */
+ cfg->m_table_name = new(std::nothrow) byte[len];
+
+ /* Trigger OOM */
+ DBUG_EXECUTE_IF("ib_import_OOM_2",
+ delete [] cfg->m_table_name; cfg->m_table_name = 0;);
+
+ if (cfg->m_table_name == 0) {
+ return(DB_OUT_OF_MEMORY);
+ }
+
+ err = row_import_cfg_read_string(file, cfg->m_table_name, len);
+
+ if (err != DB_SUCCESS) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while parsing table name.");
+
+ return(err);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Importing tablespace for table '%s' that was exported "
+ "from host '%s'", cfg->m_table_name, cfg->m_hostname);
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_7",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the autoinc value. */
+ if (fread(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg->m_autoinc = mach_read_from_8(row);
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_8",
+ (void) fseek(file, 0L, SEEK_END););
+
+ /* Read the tablespace page size. */
+ if (fread(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading meta-data header.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte* ptr = row;
+
+ cfg->m_page_size = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ if (cfg->m_page_size != UNIV_PAGE_SIZE) {
+
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
+ "Tablespace to be imported has a different "
+ "page size than this server. Server page size "
+ "is %lu, whereas tablespace page size is %lu",
+ UNIV_PAGE_SIZE, (ulong) cfg->m_page_size);
+
+ return(DB_ERROR);
+ }
+
+ cfg->m_flags = mach_read_from_4(ptr);
+ ptr += sizeof(ib_uint32_t);
+
+ cfg->m_n_cols = mach_read_from_4(ptr);
+
+ if (!dict_tf_is_valid(cfg->m_flags)) {
+
+ return(DB_CORRUPTION);
+
+ } else if ((err = row_import_read_columns(file, thd, cfg))
+ != DB_SUCCESS) {
+
+ return(err);
+
+ } else if ((err = row_import_read_indexes(file, thd, cfg))
+ != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ ut_a(err == DB_SUCCESS);
+ return(err);
+}
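Putting the reads together, the v1 .cfg payload that follows the version number is laid out as below (all integers big-endian; string lengths include the NUL byte):

    [4] hostname length, then that many bytes
    [4] table name length, then that many bytes
    [8] autoinc value
    [4] page size    [4] table flags    [4] number of columns
    column records (see row_import_read_columns)
    [4] number of indexes, then index records
        (see row_import_read_index_data)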
+
+/**
+Read the contents of the <tablespace>.cfg file.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_meta_data(
+/*======================*/
+ dict_table_t* table, /*!< in: table */
+ FILE* file, /*!< in: File to read from */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Trigger EOF */
+ DBUG_EXECUTE_IF("ib_import_io_read_error_9",
+ (void) fseek(file, 0L, SEEK_END););
+
+ if (fread(&row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ errno, strerror(errno),
+ "while reading meta-data version.");
+
+ return(DB_IO_ERROR);
+ }
+
+ cfg.m_version = mach_read_from_4(row);
+
+ /* Check the version number. */
+ switch (cfg.m_version) {
+ case IB_EXPORT_CFG_VERSION_V1:
+
+ return(row_import_read_v1(file, thd, &cfg));
+ default:
+ ib_errf(thd, IB_LOG_LEVEL_ERROR, ER_IO_READ_ERROR,
+ "Unsupported meta-data version number (%lu), "
+ "file ignored", (ulong) cfg.m_version);
+ }
+
+ return(DB_ERROR);
+}
+
+/**
+Read the contents of the <tablename>.cfg file.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_import_read_cfg(
+/*================*/
+ dict_table_t* table, /*!< in: table */
+ THD* thd, /*!< in: session */
+ row_import& cfg) /*!< out: contents of the .cfg file */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ cfg.m_table = table;
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ FILE* file = fopen(name, "rb");
+
+ if (file == NULL) {
+ char msg[BUFSIZ];
+
+ ut_snprintf(msg, sizeof(msg),
+ "Error opening '%s', will attempt to import "
+ "without schema verification", name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_READ_ERROR,
+ errno, strerror(errno), msg);
+
+ cfg.m_missing = true;
+
+ err = DB_FAIL;
+ } else {
+
+ cfg.m_missing = false;
+
+ err = row_import_read_meta_data(table, file, thd, cfg);
+ fclose(file);
+ }
+
+ return(err);
+}
+
+/*****************************************************************//**
+Update the <space, root page> of a table's indexes in the data
+dictionary from the in-memory values.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_import_update_index_root(
+/*=========================*/
+ trx_t* trx, /*!< in/out: transaction that
+ covers the update */
+ const dict_table_t* table, /*!< in: Table for which we want
+ to set the root page_no */
+ bool reset, /*!< in: if true then set to
+					FIL_NULL */
+ bool dict_locked) /*!< in: Set to true if the
+ caller already owns the
+					dict_sys_t::mutex. */
+
+{
+ const dict_index_t* index;
+ que_t* graph = 0;
+ dberr_t err = DB_SUCCESS;
+
+ static const char sql[] = {
+ "PROCEDURE UPDATE_INDEX_ROOT() IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES\n"
+ "SET SPACE = :space,\n"
+ " PAGE_NO = :page,\n"
+ " TYPE = :type\n"
+ "WHERE TABLE_ID = :table_id AND ID = :index_id;\n"
+ "END;\n"};
+
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ for (index = dict_table_get_first_index(table);
+ index != 0;
+ index = dict_table_get_next_index(index)) {
+
+ pars_info_t* info;
+ ib_uint32_t page;
+ ib_uint32_t space;
+ ib_uint32_t type;
+ index_id_t index_id;
+ table_id_t table_id;
+
+ info = (graph != 0) ? graph->info : pars_info_create();
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&type),
+ index->type);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&page),
+ reset ? FIL_NULL : index->page);
+
+ mach_write_to_4(
+ reinterpret_cast<byte*>(&space),
+ reset ? FIL_NULL : index->space);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&index_id),
+ index->id);
+
+ mach_write_to_8(
+ reinterpret_cast<byte*>(&table_id),
+ table->id);
+
+ /* If we set the corrupt bit during the IMPORT phase then
+ we need to update the system tables. */
+ pars_info_bind_int4_literal(info, "type", &type);
+ pars_info_bind_int4_literal(info, "space", &space);
+ pars_info_bind_int4_literal(info, "page", &page);
+ pars_info_bind_ull_literal(info, "index_id", &index_id);
+ pars_info_bind_ull_literal(info, "table_id", &table_id);
+
+ if (graph == 0) {
+ graph = pars_sql(info, sql);
+ ut_a(graph);
+ graph->trx = trx;
+ }
+
+ que_thr_t* thr;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ DBUG_EXECUTE_IF("ib_import_internal_error",
+ trx->error_state = DB_ERROR;);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ char index_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ index_name, sizeof(index_name),
+ index->name, TRUE);
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "While updating the <space, root page "
+ "number> of index %s - %s",
+ index_name, ut_strerr(err));
+
+ break;
+ }
+ }
+
+ que_graph_free(graph);
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ return(err);
+}
+
+/** Callback arg for row_import_set_discarded. */
+struct discard_t {
+ ib_uint32_t flags2; /*!< Value read from column */
+ bool state; /*!< New state of the flag */
+ ulint n_recs; /*!< Number of recs processed */
+};
+
+/******************************************************************//**
+Fetch callback that sets or unsets the DISCARDED tablespace flag in
+SYS_TABLES. The flag is stored in the MIX_LEN column.
+@return FALSE if all OK */
+static
+ibool
+row_import_set_discarded(
+/*=====================*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: bool set/unset flag */
+{
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+ discard_t* discard = static_cast<discard_t*>(user_arg);
+ dfield_t* dfield = que_node_get_val(node->select_list);
+ dtype_t* type = dfield_get_type(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ ut_a(dtype_get_mtype(type) == DATA_INT);
+ ut_a(len == sizeof(ib_uint32_t));
+
+ ulint flags2 = mach_read_from_4(
+ static_cast<byte*>(dfield_get_data(dfield)));
+
+ if (discard->state) {
+ flags2 |= DICT_TF2_DISCARDED;
+ } else {
+ flags2 &= ~DICT_TF2_DISCARDED;
+ }
+
+ mach_write_to_4(reinterpret_cast<byte*>(&discard->flags2), flags2);
+
+ ++discard->n_recs;
+
+ /* There should be at most one matching record. */
+ ut_a(discard->n_recs == 1);
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Update the DICT_TF2_DISCARDED flag in SYS_TABLES.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_import_update_discarded_flag(
+/*=============================*/
+ trx_t* trx, /*!< in/out: transaction that
+ covers the update */
+ table_id_t table_id, /*!< in: Table for which we want
+ to set the root table->flags2 */
+ bool discarded, /*!< in: set MIX_LEN column bit
+ to discarded, if true */
+ bool dict_locked) /*!< in: set to true if the
+ caller already owns the
+					dict_sys_t::mutex. */
+
+{
+ pars_info_t* info;
+ discard_t discard;
+
+ static const char sql[] =
+ "PROCEDURE UPDATE_DISCARDED_FLAG() IS\n"
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS\n"
+ " SELECT MIX_LEN "
+ " FROM SYS_TABLES "
+ " WHERE ID = :table_id FOR UPDATE;"
+ "\n"
+ "BEGIN\n"
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c % NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_TABLES"
+ " SET MIX_LEN = :flags2"
+ " WHERE ID = :table_id;\n"
+ "CLOSE c;\n"
+ "END;\n";
+
+ discard.n_recs = 0;
+ discard.state = discarded;
+ discard.flags2 = ULINT32_UNDEFINED;
+
+ info = pars_info_create();
+
+ pars_info_add_ull_literal(info, "table_id", table_id);
+ pars_info_bind_int4_literal(info, "flags2", &discard.flags2);
+
+ pars_info_bind_function(
+ info, "my_func", row_import_set_discarded, &discard);
+
+ dberr_t err = que_eval_sql(info, sql, !dict_locked, trx);
+
+ ut_a(discard.n_recs == 1);
+ ut_a(discard.flags2 != ULINT32_UNDEFINED);
+
+ return(err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_import_for_mysql(
+/*=================*/
+ dict_table_t* table, /*!< in/out: table */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
+{
+ dberr_t err;
+ trx_t* trx;
+ ib_uint64_t autoinc = 0;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+ char* filepath = NULL;
+
+ ut_ad(!srv_read_only_mode);
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ut_a(table->space);
+ ut_ad(prebuilt->trx);
+ ut_a(table->ibd_file_missing);
+
+ trx_start_if_not_started(prebuilt->trx);
+
+ trx = trx_allocate_for_mysql();
+
+ /* So that the table is not DROPped during recovery. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ trx_start_if_not_started(trx);
+
+ /* So that we can send error messages to the user. */
+ trx->mysql_thd = prebuilt->trx->mysql_thd;
+
+ /* Ensure that the table will be dropped by trx_rollback_active()
+ in case of a crash. */
+
+ trx->table_id = table->id;
+
+ /* Assign an undo segment for the transaction, so that the
+ transaction will be recovered after a crash. */
+
+ mutex_enter(&trx->undo_mutex);
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+ mutex_exit(&trx->undo_mutex);
+
+ DBUG_EXECUTE_IF("ib_import_undo_assign_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ if (err != DB_SUCCESS) {
+
+ return(row_import_cleanup(prebuilt, trx, err));
+
+ } else if (trx->update_undo == 0) {
+
+ err = DB_TOO_MANY_CONCURRENT_TRXS;
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ prebuilt->trx->op_info = "read meta-data file";
+
+ /* Prevent DDL operations while we are checking. */
+
+ rw_lock_s_lock_func(&dict_operation_lock, 0, __FILE__, __LINE__);
+
+ row_import cfg;
+
+ memset(&cfg, 0x0, sizeof(cfg));
+
+ err = row_import_read_cfg(table, trx->mysql_thd, cfg);
+
+ /* Check if the table column definitions match the contents
+ of the config file. */
+
+ if (err == DB_SUCCESS) {
+
+		/* We have a schema file, try to match it with our
+		data dictionary. */
+
+ err = cfg.match_schema(trx->mysql_thd);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO to match the
+ B-tree root page numbers in the tablespace. Use the index
+		name from the .cfg file to find a match. */
+
+ if (err == DB_SUCCESS) {
+ cfg.set_root_by_name();
+ autoinc = cfg.m_autoinc;
+ }
+
+ rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+ DBUG_EXECUTE_IF("ib_import_set_index_root_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ } else if (cfg.m_missing) {
+
+ rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+
+ /* We don't have a schema file, we will have to discover
+ the index root pages from the .ibd file and skip the schema
+ matching step. */
+
+ ut_a(err == DB_FAIL);
+
+ cfg.m_page_size = UNIV_PAGE_SIZE;
+
+ FetchIndexRootPages fetchIndexRootPages(table, trx);
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(cfg.m_page_size),
+ fetchIndexRootPages);
+
+ if (err == DB_SUCCESS) {
+
+ err = fetchIndexRootPages.build_row_import(&cfg);
+
+ /* Update index->page and SYS_INDEXES.PAGE_NO
+ to match the B-tree root page numbers in the
+ tablespace. */
+
+ if (err == DB_SUCCESS) {
+ err = cfg.set_root_by_heuristic();
+ }
+ }
+
+ } else {
+ rw_lock_s_unlock_gen(&dict_operation_lock, 0);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ prebuilt->trx->op_info = "importing tablespace";
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Phase I - Update all pages");
+
+ /* Iterate over all the pages and do the sanity checking and
+ the conversion required to import the tablespace. */
+
+ PageConverter converter(&cfg, trx);
+
+ /* Set the IO buffer size in pages. */
+
+ err = fil_tablespace_iterate(
+ table, IO_BUFFER_SIZE(cfg.m_page_size), converter);
+
+ DBUG_EXECUTE_IF("ib_import_reset_space_and_lsn_failure",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;);
+
+ if (err != DB_SUCCESS) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_INTERNAL_ERROR,
+ "Cannot reset LSNs in table '%s' : %s",
+ table_name, ut_strerr(err));
+
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* If the table is stored in a remote tablespace, we need to
+ determine that filepath from the link file and system tables.
+ Find the space ID in SYS_TABLES since this is an ALTER TABLE. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ dict_get_and_save_data_dir_path(table, true);
+ ut_a(table->data_dir_path);
+
+ filepath = os_file_make_remote_pathname(
+ table->data_dir_path, table->name, "ibd");
+ } else {
+ filepath = fil_make_ibd_name(table->name, false);
+ }
+ ut_a(filepath);
+
+ /* Open the tablespace so that we can access via the buffer pool.
+ We set the 2nd param (fix_dict = true) here because we already
+ have an x-lock on dict_operation_lock and dict_sys->mutex. */
+
+ err = fil_open_single_table_tablespace(
+ true, true, table->space,
+ dict_tf_to_fsp_flags(table->flags),
+ table->name, filepath);
+
+ DBUG_EXECUTE_IF("ib_import_open_tablespace_failure",
+ err = DB_TABLESPACE_NOT_FOUND;);
+
+ if (err != DB_SUCCESS) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_FILE_NOT_FOUND,
+ filepath, err, ut_strerr(err));
+
+ mem_free(filepath);
+
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ mem_free(filepath);
+
+ err = ibuf_check_bitmap_on_import(trx, table->space);
+
+ DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_cleanup(prebuilt, trx, err));
+ }
+
+ /* The first index must always be the clustered index. */
+
+ dict_index_t* index = dict_table_get_first_index(table);
+
+ if (!dict_index_is_clust(index)) {
+ return(row_import_error(prebuilt, trx, DB_CORRUPTION));
+ }
+
+ /* Update the Btree segment headers for index node and
+ leaf nodes in the root page. Set the new space id. */
+
+ err = btr_root_adjust_on_import(index);
+
+ DBUG_EXECUTE_IF("ib_import_cluster_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+	if (err != DB_SUCCESS) {
+		return(row_import_error(prebuilt, trx, err));
+	} else if (cfg.requires_purge(index->name)) {
+
+ /* Purge any delete-marked records that couldn't be
+ purged during the page conversion phase from the
+ cluster index. */
+
+ IndexPurge purge(trx, index);
+
+ trx->op_info = "cluster: purging delete marked records";
+
+ err = purge.garbage_collect();
+
+ trx->op_info = "";
+ }
+
+ DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ /* For secondary indexes, purge any records that couldn't be purged
+ during the page conversion phase. */
+
+ err = row_import_adjust_root_pages_of_secondary_indexes(
+ prebuilt, trx, table, cfg);
+
+ DBUG_EXECUTE_IF("ib_import_sec_root_adjust_failure",
+ err = DB_CORRUPTION;);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ /* Ensure that the next available DB_ROW_ID is not smaller than
+ any DB_ROW_ID stored in the table. */
+
+ if (prebuilt->clust_index_was_generated) {
+
+ err = row_import_set_sys_max_row_id(prebuilt, table);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush changes to disk");
+
+ /* Ensure that all pages dirtied during the IMPORT make it to disk.
+ The only dirty pages generated should be from the pessimistic purge
+ of delete marked records that couldn't be purged in Phase I. */
+
+ buf_LRU_flush_or_remove_pages(
+ prebuilt->table->space, BUF_REMOVE_FLUSH_WRITE, trx);
+
+ if (trx_is_interrupted(trx)) {
+ ib_logf(IB_LOG_LEVEL_INFO, "Phase III - Flush interrupted");
+ return(row_import_error(prebuilt, trx, DB_INTERRUPTED));
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO, "Phase IV - Flush complete");
+ }
+
+	/* The dictionary latches will be released in row_import_cleanup()
+ after the transaction commit, for both success and error. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Update the root pages of the table's indexes. */
+ err = row_import_update_index_root(trx, table, false, true);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ /* Update the table's discarded flag, unset it. */
+ err = row_import_update_discarded_flag(trx, table->id, false, true);
+
+ if (err != DB_SUCCESS) {
+ return(row_import_error(prebuilt, trx, err));
+ }
+
+ table->ibd_file_missing = false;
+ table->flags2 &= ~DICT_TF2_DISCARDED;
+
+ if (autoinc != 0) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_INFO, "%s autoinc value set to " IB_ID_FMT,
+ table_name, autoinc);
+
+ dict_table_autoinc_lock(table);
+ dict_table_autoinc_initialize(table, autoinc);
+ dict_table_autoinc_unlock(table);
+ }
+
+ ut_a(err == DB_SUCCESS);
+
+ return(row_import_cleanup(prebuilt, trx, err));
+}
+
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
new file mode 100644
index 00000000000..e31b44747ae
--- /dev/null
+++ b/storage/innobase/row/row0ins.cc
@@ -0,0 +1,3328 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.cc
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "row0log.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "m_string.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
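A minimal sketch of the pattern the note mandates, for a hypothetical operation that will generate redo:

    /* Reserve redo log space while holding no latches. */
    log_free_check();

    mtr_t   mtr;

    mtr_start(&mtr);

    /* ... latch pages and do the redo-generating work ... */

    mtr_commit(&mtr);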
+
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = static_cast<ins_node_t*>(
+ mem_heap_alloc(heap, sizeof(ins_node_t)));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = 0;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
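+
+/* Illustrative use (a sketch; see e.g. row0mysql.cc for real call
+sites): a node is created once per insert statement and then primed
+with a row via ins_node_set_new_row() below:
+
+	node = ins_node_create(INS_DIRECT, table, heap);
+	ins_node_set_new_row(node, row);
+*/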
+
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+static
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+	/* We will include all indexes (even corrupted
+	secondary indexes) in the entry list. Filtering of these
+	corrupted indexes is done in row_ins() */
+
+ for (index = dict_table_get_first_index(node->table);
+ index != 0;
+ index = dict_table_get_next_index(index)) {
+
+ entry = row_build_index_entry(
+ node->row, NULL, index, node->entry_sys_heap);
+
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+ }
+}
+
+/*****************************************************************//**
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /*!< in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ const dict_col_t* col;
+ dfield_t* dfield;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+	/* Allocate a buffer to hold the system-created hidden columns. */
+ uint len = DATA_ROW_ID_LEN + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ ptr = static_cast<byte*>(mem_heap_zalloc(heap, len));
+
+ /* 1. Populate row-id */
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ ptr += DATA_ROW_ID_LEN;
+
+ /* 2. Populate trx id */
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ ptr += DATA_TRX_ID_LEN;
+
+ /* 3. Populate roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
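+
+/* For reference, the single buffer allocated above holds the three
+system columns back to back (lengths as defined in data0type.h):
+
+	bytes 0..5	DB_ROW_ID	(DATA_ROW_ID_LEN   = 6)
+	bytes 6..11	DB_TRX_ID	(DATA_TRX_ID_LEN   = 6)
+	bytes 12..18	DB_ROLL_PTR	(DATA_ROLL_PTR_LEN = 7)
+*/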
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row) /*!< in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate from entry_sys_heap buffers for sys fields */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = 0;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ ulint** offsets,/*!< in/out: offsets on cursor->page_cur.rec */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* dummy_big_rec;
+ upd_t* update;
+ rec_t* rec;
+ dberr_t err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!dict_index_is_clust(cursor->index));
+ ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!entry->info_bits);
+
+	/* We know that in the alphabetical ordering, entry and rec are
+	identical. But in their binary form there may be differences if
+	there are char fields in them. Therefore we have to calculate the
+	difference. */
+
+ update = row_upd_build_sec_rec_difference_binary(
+ rec, cursor->index, *offsets, entry, heap);
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ /* We should never insert in place of a record that
+ has not been delete-marked. The only exception is when
+ online CREATE INDEX copied the changes that we already
+ made to the clustered index, and completed the
+ secondary index creation before we got here. In this
+ case, the change would already be there. The CREATE
+ INDEX should be waiting for a MySQL meta-data lock
+ upgrade at least until this INSERT or UPDATE
+ returns. After that point, the TEMP_INDEX_PREFIX
+ would be dropped from the index name in
+ commit_inplace_alter_table(). */
+ ut_a(update->n_fields == 0);
+ ut_a(*cursor->index->name == TEMP_INDEX_PREFIX);
+ ut_ad(!dict_index_is_online_ddl(cursor->index));
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ /* TODO: pass only *offsets */
+ err = btr_cur_optimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_SYS_FLAG, cursor,
+ offsets, &offsets_heap,
+ heap, &dummy_big_rec, update, 0,
+ thr, thr_get_trx(thr)->id, mtr);
+ ut_ad(!dummy_big_rec);
+ }
+
+ return(err);
+}
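+
+/* Callers escalate latching on failure; a simplified sketch of the
+retry performed in row_ins_sec_index_entry() later in this file:
+
+	err = row_ins_sec_index_entry_low(.., BTR_MODIFY_LEAF, ..);
+	if (err == DB_FAIL) {
+		err = row_ins_sec_index_entry_low(.., BTR_MODIFY_TREE, ..);
+	}
+*/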
+
+/*******************************************************************//**
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads.
+@return DB_SUCCESS, DB_FAIL, or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: pointer to memory heap that can
+ be emptied, or NULL */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ big_rec_t** big_rec,/*!< out: possible big rec vector of fields
+ which have to be stored externally by the
+ caller */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ const rec_t* rec;
+ const upd_t* update;
+ dberr_t err;
+
+ ut_ad(dict_index_is_clust(cursor->index));
+
+ *big_rec = NULL;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+
+ update = row_upd_build_difference_binary(
+ cursor->index, entry, rec, NULL, true,
+ thr_get_trx(thr), heap);
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF);
+
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(
+ flags, cursor, offsets, offsets_heap, update, 0, thr,
+ thr_get_trx(thr)->id, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+
+ }
+ err = btr_cur_pessimistic_update(
+ flags | BTR_KEEP_POS_FLAG,
+ cursor, offsets, offsets_heap, heap,
+ big_rec, update, 0, thr, thr_get_trx(thr)->id, mtr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if, in a cascaded update/delete, an ancestor node of the
+given node updates (not DELETE, but UPDATE) the given table.
+@return TRUE if an ancestor updates table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ que_node_t* node, /*!< in: node in a query graph */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* parent;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ upd_node_t* upd_node;
+
+ upd_node = static_cast<upd_node_t*>(parent);
+
+ if (upd_node->table == table && upd_node->is_delete == FALSE) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
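+
+/* Illustrative scenario (a sketch): foreign keys that cascade updates
+t1 -> t2 -> t1. An UPDATE on t1 would re-enter t1 through the cascade
+chain; the walk above detects the UPDATE ancestor, and
+row_ins_foreign_check_on_constraint() below then refuses the operation
+with DB_ROW_IS_REFERENCED. */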
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return number of ancestors */
+static __attribute__((nonnull, warn_unused_result))
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ que_node_t* node) /*!< in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ for (parent = que_node_get_parent(node);
+ que_node_get_type(parent) == QUE_NODE_UPDATE;
+ parent = que_node_get_parent(parent)) {
+
+ n_ancestors++;
+ }
+
+ return(n_ancestors);
+}
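+
+/* The returned count is compared against a fixed depth limit of 15 in
+row_ins_foreign_check_on_constraint() below: a deeper chain of
+cascaded updates/deletes is rejected with DB_ROW_IS_REFERENCED rather
+than being allowed to recurse without bound. */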
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return number of fields in the calculated update vector; the value
+can also be 0 if no foreign key fields changed; the returned value is
+ULINT_UNDEFINED if the column type in the child table is too short to
+fit the new value from the parent table: that means the update fails */
+static __attribute__((nonnull, warn_unused_result))
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ upd_node_t* node, /*!< in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap, /*!< in: memory heap to use as
+ temporary storage */
+ trx_t* trx, /*!< in: update transaction */
+ ibool* fts_col_affected)/*!< out: is FTS column affected */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ ulint i;
+ ulint j;
+ ibool doc_id_updated = FALSE;
+ ulint doc_id_pos = 0;
+ doc_id_t new_doc_id = FTS_NULL_DOC_ID;
+
+ ut_a(node);
+ ut_a(foreign);
+ ut_a(cascade);
+ ut_a(table);
+ ut_a(index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ n_fields_updated = 0;
+
+ *fts_col_affected = FALSE;
+
+ if (table->fts) {
+ doc_id_pos = dict_table_get_nth_col_pos(
+ table, table->fts->doc_col);
+ }
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(parent_index, i));
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ const upd_field_t* parent_ufield
+ = &parent_update->fields[j];
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint min_size;
+ const dict_col_t* col;
+ ulint ufield_len;
+ upd_field_t* ufield;
+
+ col = dict_index_get_nth_col(index, i);
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no
+ = dict_table_get_nth_col_pos(
+ table, dict_col_get_no(col));
+
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+ ufield_len = dfield_get_len(&ufield->new_val);
+
+ /* Clear the "external storage" flag */
+ dfield_set_len(&ufield->new_val, ufield_len);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (dfield_is_null(&ufield->new_val)
+ && (col->prtype & DATA_NOT_NULL)) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (!dfield_is_null(&ufield->new_val)
+ && dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminmaxlen,
+ col->len,
+ ufield_len,
+ static_cast<char*>(
+ dfield_get_data(
+ &ufield->new_val)))
+ < ufield_len) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ min_size = dict_col_get_min_size(col);
+
+ /* Because UNIV_SQL_NULL (the marker
+ of SQL NULL values) exceeds all possible
+ values of min_size, the test below will
+ not hold for SQL NULL columns. */
+
+ if (min_size > ufield_len) {
+
+ byte* pad;
+ ulint pad_len;
+ byte* padded_data;
+ ulint mbminlen;
+
+ padded_data = static_cast<byte*>(
+ mem_heap_alloc(
+ heap, min_size));
+
+ pad = padded_data + ufield_len;
+ pad_len = min_size - ufield_len;
+
+ memcpy(padded_data,
+ dfield_get_data(&ufield
+ ->new_val),
+ ufield_len);
+
+ mbminlen = dict_col_get_mbminlen(col);
+
+ ut_ad(!(ufield_len % mbminlen));
+ ut_ad(!(min_size % mbminlen));
+
+ if (mbminlen == 1
+ && dtype_get_charset_coll(
+ col->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL) {
+ /* Do not pad BINARY columns */
+ return(ULINT_UNDEFINED);
+ }
+
+ row_mysql_pad_col(mbminlen,
+ pad, pad_len);
+ dfield_set_data(&ufield->new_val,
+ padded_data, min_size);
+ }
+
+ /* Check whether the current column has
+ FTS index on it */
+ if (table->fts
+ && dict_table_is_fts_column(
+ table->fts->indexes,
+ dict_col_get_no(col))
+ != ULINT_UNDEFINED) {
+ *fts_col_affected = TRUE;
+ }
+
+ /* If Doc ID is updated, check whether the
+ Doc ID is valid */
+ if (table->fts
+ && ufield->field_no == doc_id_pos) {
+ doc_id_t n_doc_id;
+
+ n_doc_id =
+ table->fts->cache->next_doc_id;
+
+ new_doc_id = fts_read_doc_id(
+ static_cast<const byte*>(
+ dfield_get_data(
+ &ufield->new_val)));
+
+ if (new_doc_id <= 0) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID "
+ "must be larger than "
+ "0 \n");
+ return(ULINT_UNDEFINED);
+ }
+
+ if (new_doc_id < n_doc_id) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID "
+ "must be larger than "
+					IB_ID_FMT " for table",
+					n_doc_id - 1);
+
+ ut_print_name(stderr, trx,
+ TRUE,
+ table->name);
+
+ putc('\n', stderr);
+ return(ULINT_UNDEFINED);
+ }
+
+ *fts_col_affected = TRUE;
+ doc_id_updated = TRUE;
+ }
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ /* Generate a new Doc ID if FTS index columns get updated */
+ if (table->fts && *fts_col_affected) {
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t doc_id;
+ upd_field_t* ufield;
+
+ ut_ad(!doc_id_updated);
+ ufield = update->fields + n_fields_updated;
+ fts_get_next_doc_id(table, &trx->fts_next_doc_id);
+ doc_id = fts_update_doc_id(table, ufield,
+ &trx->fts_next_doc_id);
+ n_fields_updated++;
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ } else {
+ if (doc_id_updated) {
+ ut_ad(new_doc_id);
+ fts_trx_add_op(trx, table, new_doc_id,
+ FTS_INSERT, NULL);
+ } else {
+ fprintf(stderr, "InnoDB: FTS Doc ID must be "
+ "updated along with FTS indexed "
+ "column for table ");
+ ut_print_name(stderr, trx, TRUE, table->name);
+ putc('\n', stderr);
+ return(ULINT_UNDEFINED);
+ }
+ }
+ }
+
+ update->n_fields = n_fields_updated;
+
+ return(n_fields_updated);
+}
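+
+/* Padding example (a sketch): if the referenced parent column is
+CHAR(5) and the child column CHAR(10) in a single-byte charset, then
+min_size is 10 while the cascaded value is only 5 bytes, so
+row_mysql_pad_col() space-pads the copy to 10 bytes before it enters
+the cascade update vector. BINARY columns are never padded; such an
+update is refused with ULINT_UNDEFINED instead. */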
+
+/*********************************************************************//**
+Set detailed error message associated with foreign key errors for
+the given transaction. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&srv_misc_tmpfile_mutex);
+ rewind(srv_misc_tmpfile);
+
+ if (os_file_set_eof(srv_misc_tmpfile)) {
+ ut_print_name(srv_misc_tmpfile, trx, TRUE,
+ foreign->foreign_table_name);
+ dict_print_info_on_foreign_key_in_create_format(
+ srv_misc_tmpfile, trx, foreign, FALSE);
+ trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+ } else {
+ trx_set_detailed_error(trx, "temp file operation failed");
+ }
+
+ mutex_exit(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
+and displays information about the given transaction.
+The caller must release dict_foreign_err_mutex. */
+static
+void
+row_ins_foreign_trx_print(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ mutex_enter(&trx_sys->mutex);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(dict_foreign_err_file);
+ ut_print_timestamp(dict_foreign_err_file);
+ fputs(" Transaction:\n", dict_foreign_err_file);
+
+ trx_print_low(dict_foreign_err_file, trx, 600,
+ n_rec_locks, n_trx_locks, heap_size);
+
+ mutex_exit(&trx_sys->mutex);
+
+ ut_ad(mutex_own(&dict_foreign_err_mutex));
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /*!< in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a matching index record in the
+ child table */
+ const dtuple_t* entry) /*!< in: index entry in the parent
+ table */
+{
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fputs(" in parent table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ const dtuple_t* entry) /*!< in: index entry to insert in the
+ child table */
+{
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ fputs("\nTrying to add in child table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+ It would be better to only display the user columns. */
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->referenced_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ fputs(",\nthe closest match we can find is record:\n", ef);
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->referenced_index);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ const char* name) /*!< in: table name prefixed with
+ database name and a '/' character */
+{
+ char* buf;
+ char* ptr;
+ ulint len = strlen(name) + 1;
+
+ buf = mem_strdupl(name, len);
+
+ ptr = strchr(buf, '/');
+ ut_a(ptr);
+ *ptr = '\0';
+
+ innobase_invalidate_query_cache(thr_get_trx(thr), buf, len);
+ mem_free(buf);
+}
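+
+/* Example (a sketch): for name == "test/t1" the copied string is cut
+at the '/', producing "test\0t1\0"; this is the
+database-name\0table-name\0 key format that
+innobase_invalidate_query_cache() passes on to the MySQL query
+cache. */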
+
+/*********************************************************************//**
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /*!< in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /*!< in: index entry in the parent
+ table */
+ mtr_t* mtr) /*!< in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ mem_heap_t* upd_vec_heap = NULL;
+ const rec_t* rec;
+ const rec_t* clust_rec;
+ const buf_block_t* clust_block;
+ upd_t* update;
+ ulint n_to_update;
+ dberr_t err;
+ ulint i;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+ doc_id_t doc_id = FTS_NULL_DOC_ID;
+	ibool		fts_col_affected = FALSE;
+
+ ut_a(thr);
+ ut_a(foreign);
+ ut_a(pcur);
+ ut_a(mtr);
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+	the MySQL query cache for the table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the sync0sync.h rank above the lock_sys_t::mutex. The query cache mutex
+ has a rank just above the lock_sys_t::mutex. */
+
+ row_ins_invalidate_query_cache(thr, table->name);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ /* Extend our query graph by creating a child to current
+ update node. The child is used in the cascade or set null
+ operation. */
+
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+ }
+
+ /* Initialize cascade_node to do the operation we want. Note that we
+ use the SAME cascade node to do all foreign key operations of the
+ SQL DELETE: the table of the cascade node may change if there are
+	several child tables of the table where the delete is done! */
+
+ cascade = node->cascade_node;
+
+ cascade->table = table;
+
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = TRUE;
+ } else {
+ cascade->is_delete = FALSE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is allowed,
+ but not UPDATE) of the same table, as this can lead to an infinite
+ cycle. Check that we are not updating the same table which is
+ already being modified in this cascade chain. We have to check
+ this also because the modification of the indexes of a 'parent'
+ table may still be incomplete, and we must avoid seeing the indexes
+ of the parent table in an inconsistent state! */
+
+ if (!cascade->is_delete
+ && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying an update, possibly causing a cyclic"
+ " cascaded update\n"
+ "in the child table,", thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ tmp_heap = mem_heap_create(256);
+
+ if (dict_index_is_clust(index)) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ clust_block = btr_pcur_get_block(pcur);
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ btr_pcur_open_with_no_init(clust_index, ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+ clust_block = btr_pcur_get_block(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ fputs("InnoDB: error in cascade of a foreign key op\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n", stderr);
+ ut_ad(0);
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(0, table, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(
+ 0, clust_block, clust_rec, clust_index,
+ LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+ /* This can happen if there is a circular reference of
+ rows such that cascading delete comes to delete a row
+ already in the process of being delete marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if (table->fts) {
+ doc_id = fts_get_doc_id_from_rec(table, clust_rec, tmp_heap);
+ }
+
+ if (node->is_delete
+ ? (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ : (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)) {
+
+ /* Build the appropriate update vector which sets
+ foreign->n_fields first fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+ UNIV_MEM_INVALID(update->fields,
+ update->n_fields * sizeof *update->fields);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ upd_field_t* ufield = &update->fields[i];
+
+ ufield->field_no = dict_table_get_nth_col_pos(
+ table,
+ dict_index_get_nth_col_no(index, i));
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+ dfield_set_null(&ufield->new_val);
+
+ if (table->fts && dict_table_is_fts_column(
+ table->fts->indexes,
+ dict_index_get_nth_col_no(index, i))
+ != ULINT_UNDEFINED) {
+				fts_col_affected = TRUE;
+ }
+ }
+
+		if (fts_col_affected) {
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+ } else if (table->fts && cascade->is_delete) {
+ /* DICT_FOREIGN_ON_DELETE_CASCADE case */
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (table->fts && dict_table_is_fts_column(
+ table->fts->indexes,
+ dict_index_get_nth_col_no(index, i))
+ != ULINT_UNDEFINED) {
+				fts_col_affected = TRUE;
+ }
+ }
+
+		if (fts_col_affected) {
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+ /* Build the appropriate update vector which sets changing
+ foreign->n_fields first fields in rec to new values */
+
+ upd_vec_heap = mem_heap_create(256);
+
+ n_to_update = row_ins_cascade_calc_update_vec(
+			node, foreign, upd_vec_heap, trx, &fts_col_affected);
+
+ if (n_to_update == ULINT_UNDEFINED) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a cascaded update where the"
+ " updated value in the child\n"
+ "table would not fit in the length"
+ " of the column, or the value would\n"
+ "be NULL and the column is"
+ " declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (cascade->update->n_fields == 0) {
+
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ /* Mark the old Doc ID as deleted */
+		if (fts_col_affected) {
+ ut_ad(table->fts);
+ fts_trx_add_op(trx, table, doc_id, FTS_DELETE, NULL);
+ }
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+ fprintf(stderr,
+ "InnoDB: error: table %s has the counter 0"
+ " though there is\n"
+ "InnoDB: a FOREIGN KEY check running on it.\n",
+ foreign->foreign_table->name);
+ }
+
+ /* Release the data dictionary latch for a while, so that we do not
+ starve other threads from doing CREATE TABLE etc. if we have a huge
+ cascaded operation running. The counter n_foreign_key_checks_running
+ will prevent other users from dropping or ALTERing the table when we
+ release the latch. */
+
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+
+ DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze");
+
+ row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ return(err);
+
+nonstandard_exit_func:
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ return(err);
+}
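+
+/* Control-flow summary of the function above, for orientation: lock
+the child row; build the cascade update vector (SET NULL or CASCADE);
+store both cursor positions; mtr_commit(); run
+row_update_cascade_for_mysql(), which may recurse into the foreign key
+checks; briefly unfreeze and refreeze the data dictionary so that a
+huge cascade does not starve DDL; finally mtr_start() and restore pcur
+so that the caller's index scan can continue. */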
+
+/*********************************************************************//**
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_shared_rec_lock(
+/*========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+dberr_t
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+dberr_t
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ upd_node_t* upd_node;
+ dict_table_t* check_table;
+ dict_index_t* check_index;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ int cmp;
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ err = DB_SUCCESS;
+
+ if (trx->check_foreigns == FALSE) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ goto exit_func;
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = static_cast<upd_node_t*>(thr->run_node);
+
+ if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+ we come here: if we would check the constraint here
+ for the child row it would fail.
+
+ A QUESTION remains: if in the child table there are
+ several constraints which refer to the same parent
+			table, should we merge all updates to the child into
+			one update? The updates can even be contradictory!
+ Currently we just perform the update associated
+ with each foreign key constraint, one after
+ another, and the user has problems predicting in
+ which order they are performed. */
+
+ goto exit_func;
+ }
+ }
+
+ if (check_ref) {
+ check_table = foreign->referenced_table;
+ check_index = foreign->referenced_index;
+ } else {
+ check_table = foreign->foreign_table;
+ check_index = foreign->foreign_index;
+ }
+
+ if (check_table == NULL
+ || check_table->ibd_file_missing
+ || check_index == NULL) {
+
+ if (!srv_read_only_mode && check_ref) {
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ row_ins_foreign_trx_print(trx);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(
+ ef, trx, foreign, TRUE);
+ fputs("\nTrying to add to index ", ef);
+ ut_print_name(ef, trx, FALSE,
+ foreign->foreign_index->name);
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->referenced_table_name);
+ fputs("\nor its .ibd file does"
+ " not currently exist!\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_NO_REFERENCED_ROW;
+ }
+
+ goto exit_func;
+ }
+
+ if (check_table != table) {
+ /* We already have a LOCK_IX on table, but not necessarily
+ on check_table */
+
+ err = lock_table(0, check_table, LOCK_IS, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto do_possible_lock_wait;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store old value on n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+ btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a matching record */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, check_index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(rec)) {
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+ rec, check_index,
+ offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ continue;
+ default:
+ goto end_scan;
+ }
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, check_index, offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+ } else {
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+
+ goto end_scan;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err = row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ /* Since reporting a plain
+ "duplicate key" error
+ message to the user in
+ cases where a long CASCADE
+ operation would lead to a
+ duplicate key in some
+ other table is very
+ confusing, map duplicate
+ key errors resulting from
+ FK constraints to a
+ separate error code. */
+
+ if (err == DB_DUPLICATE_KEY) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ goto end_scan;
+ }
+
+ /* row_ins_foreign_check_on_constraint
+ may have repositioned pcur on a
+ different block */
+ block = btr_pcur_get_block(&pcur);
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ goto end_scan;
+ }
+ }
+ } else {
+ ut_a(cmp < 0);
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ } else {
+ err = DB_SUCCESS;
+ }
+ default:
+ break;
+ }
+
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+ if (check_ref) {
+ row_ins_foreign_report_add_err(
+ trx, foreign, btr_pcur_get_rec(&pcur), entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+end_scan:
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ bool verified = false;
+
+ trx->error_state = err;
+
+ que_thr_stop_for_mysql(thr);
+
+ lock_wait_suspend_thread(thr);
+
+ if (check_table->to_be_dropped) {
+ /* The table is being dropped. We shall timeout
+ this operation */
+ err = DB_LOCK_WAIT_TIMEOUT;
+ goto exit_func;
+ }
+
+		/* We had temporarily released dict_operation_lock in
+		the lock sleep wait above; now we have the lock again, and
+		we need to re-check whether the foreign key has been
+		dropped. We only need to verify this when the table is the
+		referenced table (check_ref == 0), since an MDL lock will
+		prevent concurrent DDL and DML on the same table */
+ if (!check_ref) {
+ for (dict_foreign_set::iterator it
+ = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+ if (*it == foreign) {
+ verified = true;
+ break;
+ }
+ }
+ } else {
+ verified = true;
+ }
+
+ if (!verified) {
+ err = DB_DICT_CHANGED;
+ } else if (trx->error_state == DB_SUCCESS) {
+ goto run_again;
+ } else {
+ err = trx->error_state;
+ }
+ }
+
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
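+
+/* Locking example (a sketch): suppose the referenced parent index
+holds keys 10 and 30 and a child row with key 20 is inserted. The
+PAGE_CUR_GE search positions pcur on 30, cmp < 0 holds, a shared
+LOCK_GAP lock is set on 30 so that no matching parent row 20 can
+appear concurrently, and DB_NO_REFERENCED_ROW is returned. Inserting
+key 30 instead finds the match, takes a shared LOCK_REC_NOT_GAP lock
+on it and succeeds. */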
+
+/***************************************************************//**
+Checks if foreign key constraints fail for an index entry. If the index
+is not mentioned in any constraint, this function does nothing.
+Otherwise it searches the indexes of the referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_check_foreign_constraints(
+/*==============================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_foreign_t* foreign;
+ dberr_t err;
+ trx_t* trx;
+ ibool got_s_lock = FALSE;
+
+ trx = thr_get_trx(thr);
+
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "foreign_constraint_check_for_ins");
+
+ for (dict_foreign_set::iterator it = table->foreign_set.begin();
+ it != table->foreign_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_index == index) {
+ dict_table_t* ref_table = NULL;
+ dict_table_t* foreign_table = foreign->foreign_table;
+ dict_table_t* referenced_table
+ = foreign->referenced_table;
+
+ if (referenced_table == NULL) {
+
+ ref_table = dict_table_open_on_name(
+ foreign->referenced_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ }
+
+ if (0 == trx->dict_operation_lock_mode) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (referenced_table) {
+ os_inc_counter(dict_sys->mutex,
+ foreign_table
+ ->n_foreign_key_checks_running);
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects the referenced
+ table from being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ DBUG_EXECUTE_IF("row_ins_dict_change_err",
+ err = DB_DICT_CHANGED;);
+
+ if (referenced_table) {
+ os_dec_counter(dict_sys->mutex,
+ foreign_table
+ ->n_foreign_key_checks_running);
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ if (ref_table != NULL) {
+ dict_table_close(ref_table, FALSE, FALSE);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Checks if inserting the index entry would cause a unique key violation
+against rec.
+@return TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ const rec_t* rec, /*!< in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, offsets,
+ &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!dict_index_is_clust(index)) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+}
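+
+/* Example (a sketch): with a UNIQUE secondary index on (a, b), two
+rows (1, NULL) never collide, because an entry containing SQL NULL
+makes this function return FALSE; inserting (1, 2) twice does collide
+once all n_unique fields match a record that is not delete-marked. */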
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation would occur for the key value of the entry.
+Sets shared locks on possible duplicate records.
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: non-clustered unique index */
+ dtuple_t* entry, /*!< in: index entry */
+ que_thr_t* thr, /*!< in: query thread */
+ bool s_latch,/*!< in: whether index->lock is being held */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ mem_heap_t* offsets_heap)
+ /*!< in/out: memory heap that can be emptied */
+{
+ ulint n_unique;
+ int cmp;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ dberr_t err = DB_SUCCESS;
+ ulint allow_duplicates;
+ ulint* offsets = NULL;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(s_latch == rw_lock_own(&index->lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ /* If the secondary index is unique, but one of the fields in the
+ n_unique first fields is NULL, a unique key violation cannot occur,
+ since we define NULL != NULL in this case */
+
+ for (ulint i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(DB_SUCCESS);
+ }
+ }
+
+ /* Store old value on n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, n_unique);
+
+ btr_pcur_open(index, entry, PAGE_CUR_GE,
+ s_latch
+ ? BTR_SEARCH_LEAF | BTR_ALREADY_S_LATCHED
+ : BTR_SEARCH_LEAF,
+ &pcur, mtr);
+
+ allow_duplicates = thr_get_trx(thr)->duplicates;
+
+ /* Scan index records and check if there is a duplicate */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+ const ulint lock_type = LOCK_ORDINARY;
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+ /* Set no locks when applying log
+ in online table rebuild. */
+ } else if (allow_duplicates) {
+
+ /* If the SQL-query will update or replace
+ duplicate key we will take X-lock for
+ duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ lock_type, block, rec, index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ continue;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+ err = DB_DUPLICATE_KEY;
+
+ thr_get_trx(thr)->error_info = index;
+
+ /* If the duplicate is on hidden FTS_DOC_ID,
+ state so in the error log */
+ if (DICT_TF2_FLAG_IS_SET(
+ index->table,
+ DICT_TF2_FTS_HAS_DOC_ID)
+ && strcmp(index->name,
+ FTS_DOC_ID_INDEX_NAME) == 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Duplicate FTS_DOC_ID value"
+ " on table %s",
+ index->table->name);
+ }
+
+ goto end_scan;
+ }
+ } else {
+ ut_a(cmp < 0);
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, mtr));
+
+end_scan:
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ return(err);
+}
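+
+/* Lock-mode note (illustrative): a plain INSERT takes shared
+LOCK_ORDINARY (next-key) locks on the scanned records, so it waits for
+an uncommitted duplicate to commit or roll back; REPLACE, LOAD DATA
+... REPLACE and INSERT ... ON DUPLICATE KEY UPDATE set
+trx->duplicates, making the same scan take exclusive locks, since the
+statement will modify the conflicting row anyway. */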
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_online(
+/*=====================*/
+ ulint n_uniq, /*!< in: offset of DB_TRX_ID */
+ const dtuple_t* entry, /*!< in: entry that is being inserted */
+ const rec_t* rec, /*!< in: clustered index record */
+ ulint* offsets)/*!< in/out: rec_get_offsets(rec) */
+{
+ ulint fields = 0;
+ ulint bytes = 0;
+
+ /* During rebuild, there should not be any delete-marked rows
+ in the new table. */
+ ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+ ut_ad(dtuple_get_n_fields_cmp(entry) == n_uniq);
+
+ /* Compare the PRIMARY KEY fields and the
+ DB_TRX_ID, DB_ROLL_PTR. */
+ cmp_dtuple_rec_with_match_low(
+ entry, rec, offsets, n_uniq + 2, &fields, &bytes);
+
+ if (fields < n_uniq) {
+ /* Not a duplicate. */
+ return(DB_SUCCESS);
+ }
+
+ if (fields == n_uniq + 2) {
+ /* rec is an exact match of entry. */
+ ut_ad(bytes == 0);
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ return(DB_DUPLICATE_KEY);
+}
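+
+/* Decision summary for the match counts above, with n = n_uniq:
+
+	fields matched	result
+	< n		DB_SUCCESS (not a duplicate)
+	n or n + 1	DB_DUPLICATE_KEY (same key, another version)
+	n + 2		DB_SUCCESS_LOCKED_REC (exact match; skip insert)
+*/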
+
+/** Checks for a duplicate when the table is being rebuilt online.
+@retval DB_SUCCESS when no duplicate is detected
+@retval DB_SUCCESS_LOCKED_REC when rec is an exact match of entry or
+a newer version of entry (the entry should not be inserted)
+@retval DB_DUPLICATE_KEY when entry is a duplicate of rec */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust_online(
+/*====================================*/
+ ulint n_uniq, /*!< in: offset of DB_TRX_ID */
+ const dtuple_t* entry, /*!< in: entry that is being inserted */
+ const btr_cur_t*cursor, /*!< in: cursor on insert position */
+ ulint** offsets,/*!< in/out: rec_get_offsets(rec) */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ dberr_t err = DB_SUCCESS;
+ const rec_t* rec = btr_cur_get_rec(cursor);
+
+ if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ rec = page_rec_get_next_const(btr_cur_get_rec(cursor));
+
+ if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) {
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ ULINT_UNDEFINED, heap);
+ err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@retval DB_SUCCESS if no error
+@retval DB_DUPLICATE_KEY if error,
+@retval DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record
+@retval DB_SUCCESS_LOCKED_REC if an exact match of the record was found
+in online table rebuild (flags & (BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG)) */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+ rec_t* rec;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t*heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ UT_NOT_USED(mtr);
+
+ ut_ad(dict_index_is_clust(cursor->index));
+
+	/* NOTE: For unique non-clustered indexes there may be any number
+	of delete marked records with the same value for the non-clustered
+	index key (remember multiversioning), and which differ only in
+	the row reference part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid a race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+
+ /* NOTE: A problem is that in the B-tree node pointers on an
+ upper level may match more to the entry than the actual existing
+ user records on the leaf level. So, even if low_match would suggest
+ that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (trx->duplicates) {
+
+			/* If the SQL statement will update or replace
+			a duplicate key, we take an X-lock on the
+			duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+			INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor), rec,
+ cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+duplicate:
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ if (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (trx->duplicates) {
+
+			/* If the SQL statement will update or replace
+			a duplicate key, we take an X-lock on the
+			duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+			INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ goto duplicate;
+ }
+ }
+
+ /* This should never happen */
+ ut_error;
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Checks if an index entry has long enough common prefix with an
+existing record so that the intended insert of the entry must be
+changed to a modify of the existing record. In the case of a clustered
+index, the prefix must be n_unique fields long. In the case of a
+secondary index, all fields must be equal. InnoDB never updates
+secondary index records in place, other than clearing or setting the
+delete-mark flag. It would be possible to update the non-unique fields
+of a unique secondary index record by checking cursor->up_match,
+but we do not do so, because it could have some locking implications.
+@return TRUE if the existing record should be updated; FALSE if not */
+UNIV_INLINE
+ibool
+row_ins_must_modify_rec(
+/*====================*/
+ const btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+	/* NOTE: (compare to the note in row_ins_duplicate_error_in_clust)
+	Because node pointers on upper levels of the B-tree may match
+	the entry more closely than the user records actually existing
+	on the leaf level, we have to check if the candidate record is
+	actually a user record.
+	A clustered index node pointer contains index->n_unique first fields,
+	and a secondary index node pointer contains all index fields. */
+
+ return(cursor->low_match
+ >= dict_index_get_n_unique_in_tree(cursor->index)
+ && !page_rec_is_infimum(btr_cur_get_rec(cursor)));
+}
+
+/***************************************************************//**
+Tries to insert an entry into a clustered index, ignoring foreign key
+constraints. If a record with the same unique key is found, the other
+record is necessarily marked deleted by a committed transaction, or a
+unique key violation error occurs. The insert is then performed by
+updating the delete-marked record in place, and an undo log record
+must be written for the delete-marked record.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry_low(
+/*==========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint n_uniq, /*!< in: 0 or index->n_uniq */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_cur_t cursor;
+ ulint* offsets = NULL;
+ dberr_t err;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ mem_heap_t* offsets_heap = NULL;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_unique(index)
+ || n_uniq == dict_index_get_n_unique(index));
+ ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index));
+
+ mtr_start(&mtr);
+
+ if (mode == BTR_MODIFY_LEAF && dict_index_is_online_ddl(index)) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ }
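+	/* Acquiring the index->lock S-latch prevents a concurrent
+	change of index->online_status while we insert and possibly
+	write to the online modification log (compare the comment in
+	row_ins_sec_index_entry_low()). */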
+
+ cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor. */
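+	/* With PAGE_CUR_LE, the cursor is positioned on the last
+	record not greater than entry. A duplicate of entry can
+	therefore be either the record at the cursor position
+	(cursor.low_match >= n_uniq) or the record immediately
+	following it (cursor.up_match >= n_uniq); both cases are
+	checked below. */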
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE, mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_get_n_fields(first_rec, index)
+ == dtuple_get_n_fields(entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ if (n_uniq && (cursor.up_match >= n_uniq
+ || cursor.low_match >= n_uniq)) {
+
+ if (flags
+ == (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG)) {
+			/* Do not set any locks when applying the log
+			in an online table rebuild. Only check for
+			duplicates. */
+ err = row_ins_duplicate_error_in_clust_online(
+ n_uniq, entry, &cursor,
+ &offsets, &offsets_heap);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_DUPLICATE_KEY:
+ thr_get_trx(thr)->error_info = cursor.index;
+ }
+ } else {
+ /* Note that the following may return also
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(
+ flags, &cursor, entry, thr, &mtr);
+ }
+
+ if (err != DB_SUCCESS) {
+err_exit:
+ mtr_commit(&mtr);
+ goto func_exit;
+ }
+ }
+
+ if (row_ins_must_modify_rec(&cursor)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ mem_heap_t* entry_heap = mem_heap_create(1024);
+
+ err = row_ins_clust_index_entry_by_modify(
+ flags, mode, &cursor, &offsets, &offsets_heap,
+ entry_heap, &big_rec, entry, thr, &mtr);
+
+ rec_t* rec = btr_cur_get_rec(&cursor);
+
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+ /* Write out the externally stored
+ columns while still x-latching
+ index->lock and block->lock. Allocate
+ pages for big_rec in the mtr that
+ modified the B-tree, but be sure to skip
+ any pages that were freed in mtr. We will
+ write out the big_rec pages before
+ committing the B-tree mini-transaction. If
+ the system crashes so that crash recovery
+ will not replay the mtr_commit(&mtr), the
+ big_rec pages will be left orphaned until
+ the pages are allocated for something else.
+
+ TODO: If the allocation extends the
+ tablespace, it will not be redo
+ logged, in either mini-transaction.
+ Tablespace extension should be
+ redo-logged in the big_rec
+ mini-transaction, so that recovery
+ will not fail when the big_rec was
+ written to the extended portion of the
+ file, in case the file was somehow
+ truncated in the crash. */
+
+ DEBUG_SYNC_C_IF_THD(
+ thr_get_trx(thr)->mysql_thd,
+ "before_row_ins_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(&cursor),
+ rec, offsets, big_rec, &mtr,
+ BTR_STORE_INSERT_UPDATE);
+ DEBUG_SYNC_C_IF_THD(
+ thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_upd_extern");
+ /* If writing big_rec fails (for
+ example, because of DB_OUT_OF_FILE_SPACE),
+ the record will be corrupted. Even if
+ we did not update any externally
+ stored columns, our update could cause
+ the record to grow so that a
+ non-updated column was selected for
+ external storage. This non-update
+ would not have been written to the
+ undo log, and thus the record cannot
+ be rolled back.
+
+ However, because we have not executed
+ mtr_commit(mtr) yet, the update will
+ not be replayed in crash recovery, and
+ the following assertion failure will
+ effectively "roll back" the operation. */
+ ut_a(err == DB_SUCCESS);
+ dtuple_big_rec_free(big_rec);
+ }
+
+ if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(rec, index, offsets);
+ }
+
+ mtr_commit(&mtr);
+ mem_heap_free(entry_heap);
+ } else {
+ rec_t* insert_rec;
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ~BTR_ALREADY_S_LATCHED)
+ == BTR_MODIFY_LEAF);
+ err = btr_cur_optimistic_insert(
+ flags, &cursor, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ } else {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+ goto err_exit;
+ }
+
+ err = btr_cur_optimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(big_rec)) {
+ mtr_commit(&mtr);
+
+ /* Online table rebuild could read (and
+ ignore) the incomplete record at this point.
+			If online rebuild is in progress,
+			row_ins_index_entry_big_rec() will write the log. */
+
+ DBUG_EXECUTE_IF(
+ "row_ins_extern_checkpoint",
+ log_make_checkpoint_at(
+ LSN_MAX, TRUE););
+ err = row_ins_index_entry_big_rec(
+ entry, big_rec, offsets, &offsets_heap, index,
+ thr_get_trx(thr)->mysql_thd,
+ __FILE__, __LINE__);
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ } else {
+ if (err == DB_SUCCESS
+ && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(
+ insert_rec, index, offsets);
+ }
+
+ mtr_commit(&mtr);
+ }
+ }
+
+func_exit:
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Starts a mini-transaction and checks if the index will be dropped.
+@return true if the index is to be dropped */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_ins_sec_mtr_start_and_check_if_aborted(
+/*=======================================*/
+ mtr_t* mtr, /*!< out: mini-transaction */
+ dict_index_t* index, /*!< in/out: secondary index */
+	bool		check,	/*!< in: whether to check the online
+				status of the index */
+	ulint		search_mode)
+			/*!< in: search mode flags; only
+			BTR_ALREADY_S_LATCHED is significant here */
+{
+ ut_ad(!dict_index_is_clust(index));
+
+ mtr_start(mtr);
+
+ if (!check) {
+ return(false);
+ }
+
+ if (search_mode & BTR_ALREADY_S_LATCHED) {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ } else {
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ }
+
+ switch (index->online_status) {
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_ad(*index->name == TEMP_INDEX_PREFIX);
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_CREATION:
+ break;
+ }
+
+ ut_error;
+ return(true);
+}
+
+/***************************************************************//**
+Tries to insert an entry into a secondary index. If a record with exactly the
+same fields is found, the other record is necessarily delete-marked.
+It is then delete-unmarked. Otherwise, the entry is simply inserted
+into the index.
+@retval DB_SUCCESS on success
+@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
+@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@return error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry_low(
+/*========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: secondary index */
+ mem_heap_t* offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during
+ row_log_table_apply(), or 0 */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_cur_t cursor;
+ ulint search_mode = mode | BTR_INSERT;
+ dberr_t err = DB_SUCCESS;
+ ulint n_unique;
+ mtr_t mtr;
+ ulint* offsets = NULL;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
+
+ cursor.thr = thr;
+ ut_ad(thr_get_trx(thr)->id);
+ mtr_start(&mtr);
+
+	/* Ensure that we acquire index->lock when inserting into an
+	index whose index->online_status == ONLINE_INDEX_COMPLETE, but
+	which could still be subject to rollback_inplace_alter_table().
+	This prevents a concurrent change of index->online_status.
+	The memory object cannot be freed as long as we have an open
+	reference to the table, that is, index->table->n_ref_count > 0. */
+ const bool check = *index->name == TEMP_INDEX_PREFIX;
+ if (check) {
+ DEBUG_SYNC_C("row_ins_sec_index_enter");
+ if (mode == BTR_MODIFY_LEAF) {
+ search_mode |= BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ } else {
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ if (row_log_online_op_try(
+ index, entry, thr_get_trx(thr)->id)) {
+ goto func_exit;
+ }
+ }
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return sensible values in both low_match and
+	up_match of the cursor. */
+
+ if (!thr_get_trx(thr)->check_unique_secondary) {
+ search_mode |= BTR_IGNORE_SEC_UNIQUE;
+ }
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ search_mode,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ /* The insert was buffered during the search: we are done */
+ goto func_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_get_n_fields(first_rec, index)
+ == dtuple_get_n_fields(entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (dict_index_is_unique(index)
+ && (cursor.low_match >= n_unique || cursor.up_match >= n_unique)) {
+ mtr_commit(&mtr);
+
+ DEBUG_SYNC_C("row_ins_sec_index_unique");
+
+ if (row_ins_sec_mtr_start_and_check_if_aborted(
+ &mtr, index, check, search_mode)) {
+ goto func_exit;
+ }
+
+ err = row_ins_scan_sec_index_for_duplicate(
+ flags, index, entry, thr, check, &mtr, offsets_heap);
+
+ mtr_commit(&mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ ut_ad(!thr_get_trx(thr)
+ ->dict_operation_lock_mode);
+ mutex_enter(&dict_sys->mutex);
+ dict_set_corrupted_index_cache_only(
+ index, index->table);
+ mutex_exit(&dict_sys->mutex);
+ /* Do not return any error to the
+ caller. The duplicate will be reported
+ by ALTER TABLE or CREATE UNIQUE INDEX.
+ Unfortunately we cannot report the
+ duplicate key value to the DDL thread,
+ because the altered_table object is
+ private to its call stack. */
+ err = DB_SUCCESS;
+ }
+ /* fall through */
+ default:
+ return(err);
+ }
+
+ if (row_ins_sec_mtr_start_and_check_if_aborted(
+ &mtr, index, check, search_mode)) {
+ goto func_exit;
+ }
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion. */
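+
+		/* BTR_INSERT and BTR_IGNORE_SEC_UNIQUE are cleared
+		from the search mode below, so the search cannot be
+		satisfied from the insert buffer; the cursor will be
+		positioned on the actual leaf page for the insert. */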
+
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ search_mode & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE),
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+ }
+
+ if (row_ins_must_modify_rec(&cursor)) {
+ /* There is already an index entry with a long enough common
+ prefix, we must convert the insert into a modify of an
+ existing record */
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(&cursor), index, offsets,
+ ULINT_UNDEFINED, &offsets_heap);
+
+ err = row_ins_sec_index_entry_by_modify(
+ flags, mode, &cursor, &offsets,
+ offsets_heap, heap, entry, thr, &mtr);
+ } else {
+ rec_t* insert_rec;
+ big_rec_t* big_rec;
+
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(
+ flags, &cursor, &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+
+ err = btr_cur_optimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ if (err == DB_FAIL) {
+ err = btr_cur_pessimistic_insert(
+ flags, &cursor,
+ &offsets, &offsets_heap,
+ entry, &insert_rec,
+ &big_rec, 0, thr, &mtr);
+ }
+ }
+
+ if (err == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+ ut_ad(!big_rec);
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+ return(err);
+}
+
+/***************************************************************//**
+Tries to insert the externally stored fields (off-page columns)
+of a clustered index entry.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+row_ins_index_entry_big_rec_func(
+/*=============================*/
+	const dtuple_t*		entry,	/*!< in: index entry to insert */
+ const big_rec_t* big_rec,/*!< in: externally stored fields */
+ ulint* offsets,/*!< in/out: rec offsets */
+ mem_heap_t** heap, /*!< in/out: memory heap */
+ dict_index_t* index, /*!< in: index */
+ const char* file, /*!< in: file name of caller */
+#ifndef DBUG_OFF
+ const void* thd, /*!< in: connection, or NULL */
+#endif /* DBUG_OFF */
+ ulint line) /*!< in: line number of caller */
+{
+ mtr_t mtr;
+ btr_cur_t cursor;
+ rec_t* rec;
+ dberr_t error;
+
+ ut_ad(dict_index_is_clust(index));
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch");
+
+ mtr_start(&mtr);
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ file, line, &mtr);
+ rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, heap);
+
+ DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern");
+ error = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(&cursor),
+ rec, offsets, big_rec, &mtr, BTR_STORE_INSERT);
+ DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern");
+
+ if (error == DB_SUCCESS
+ && dict_index_is_online_ddl(index)) {
+ row_log_table_insert(rec, index, offsets);
+ }
+
+ mtr_commit(&mtr);
+
+ return(error);
+}
+
+/***************************************************************//**
+Inserts an entry into a clustered index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_clust_index_entry(
+/*======================*/
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ dberr_t err;
+ ulint n_uniq;
+
+ if (!index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(
+ index->table, index, entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ n_uniq = dict_index_is_unique(index) ? index->n_uniq : 0;
+
+ /* Try first optimistic descent to the B-tree */
+
+ log_free_check();
+
+ err = row_ins_clust_index_entry_low(
+ 0, BTR_MODIFY_LEAF, index, n_uniq, entry, n_ext, thr);
+
+#ifdef UNIV_DEBUG
+ /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+ Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+ if (!thr_get_trx(thr)->ddl) {
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_clust_index_entry_leaf");
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err != DB_FAIL) {
+ DEBUG_SYNC_C("row_ins_clust_index_entry_leaf_after");
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ log_free_check();
+
+ return(row_ins_clust_index_entry_low(
+ 0, BTR_MODIFY_TREE, index, n_uniq, entry, n_ext, thr));
+}
+
+/***************************************************************//**
+Inserts an entry into a secondary index. Tries first optimistic,
+then pessimistic descent down the tree. If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+ dict_index_t* index, /*!< in: secondary index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+
+ if (!index->table->foreign_set.empty()) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(thr_get_trx(thr)->id);
+
+ offsets_heap = mem_heap_create(1024);
+ heap = mem_heap_create(1024);
+
+ /* Try first optimistic descent to the B-tree */
+
+ log_free_check();
+
+ err = row_ins_sec_index_entry_low(
+ 0, BTR_MODIFY_LEAF, index, offsets_heap, heap, entry, 0, thr);
+ if (err == DB_FAIL) {
+ mem_heap_empty(heap);
+
+ /* Try then pessimistic descent to the B-tree */
+
+ log_free_check();
+
+ err = row_ins_sec_index_entry_low(
+ 0, BTR_MODIFY_TREE, index,
+ offsets_heap, heap, entry, 0, thr);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ return(err);
+}
+
+/***************************************************************//**
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+static
+dberr_t
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ DBUG_EXECUTE_IF("row_ins_index_entry_timeout", {
+ DBUG_SET("-d,row_ins_index_entry_timeout");
+ return(DB_LOCK_WAIT);});
+
+ if (dict_index_is_clust(index)) {
+ return(row_ins_clust_index_entry(index, entry, thr, 0));
+ } else {
+ return(row_ins_sec_index_entry(index, entry, thr));
+ }
+}
+
+/***********************************************************//**
+Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row. */
+static __attribute__((nonnull))
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to make */
+ const dtuple_t* row) /*!< in: row */
+{
+ ulint n_fields;
+ ulint i;
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ dict_field_t* ind_field;
+ dfield_t* field;
+ const dfield_t* row_field;
+ ulint len;
+
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+ row_field = dtuple_get_nth_field(row, ind_field->col->ind);
+ len = dfield_get_len(row_field);
+
+ /* Check column prefix indexes */
+ if (ind_field->prefix_len > 0
+ && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminmaxlen,
+ ind_field->prefix_len,
+ len,
+ static_cast<const char*>(
+ dfield_get_data(row_field)));
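+
+			/* For example, with a 10-character prefix
+			index on a column in a 3-byte-per-character
+			charset such as utf8, the length is capped
+			at 10 characters, which may occupy up to
+			30 bytes. */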
+
+ ut_ad(!dfield_is_ext(row_field));
+ }
+
+ dfield_set_data(field, dfield_get_data(row_field), len);
+ if (dfield_is_ext(row_field)) {
+ ut_ad(dict_index_is_clust(index));
+ dfield_set_ext(field);
+ }
+ }
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins_index_entry_step(
+/*=====================*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->index, node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, thr);
+
+#ifdef UNIV_DEBUG
+ /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+ Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+ if (!thr_get_trx(thr)->ddl) {
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_ins_index_entry_step");
+ }
+#endif /* UNIV_DEBUG */
+
+ return(err);
+}
+
+/***********************************************************//**
+Allocates a row id for row and inits the node->index field. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ row_id_t row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
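+	/* The row id is a 6-byte (DATA_ROW_ID_LEN) value drawn from
+	a global counter in dict_sys; it serves as the clustered index
+	key when the table has no suitable unique key. */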
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+	/* The field values are evaluated into the buffers of the value
+	nodes and it is safe to use them until we evaluate the values
+	list again: therefore we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_ins(
+/*====*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ if (node->index->type != DICT_FTS) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+
+ DBUG_EXECUTE_IF(
+ "row_ins_skip_sec",
+ node->index = NULL; node->entry = NULL; break;);
+
+ /* Skip corrupted secondary index and its entry */
+ while (node->index && dict_index_is_corrupted(node->index)) {
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ dberr_t err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started_xa(trx);
+
+ node = static_cast<ins_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. MySQL's
+ partitioned table code may also call an insert within the same
+ SQL statement AFTER it has used this table handle to do a search.
+ This happens, for example, when a row update moves it to another
+ partition. In that case, we have already set the IX lock on the
+ table during the search operation, and there is no need to set
+ it again here. But we must write trx->id to node->trx_id_buf. */
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (trx->id == node->trx_id) {
+ /* No need to do IX-locking */
+
+ goto same_trx;
+ }
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait",
+ err = DB_LOCK_WAIT;);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+same_trx:
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
new file mode 100644
index 00000000000..48b5dd36cb9
--- /dev/null
+++ b/storage/innobase/row/row0log.cc
@@ -0,0 +1,3633 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0log.cc
+Modification log for online index creation and online table rebuild
+
+Created 2011-05-26 Marko Makela
+*******************************************************/
+
+#include "row0log.h"
+
+#ifdef UNIV_NONINL
+#include "row0log.ic"
+#endif
+
+#include "row0row.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0merge.h"
+#include "row0ext.h"
+#include "data0data.h"
+#include "que0que.h"
+#include "handler0alter.h"
+
+#include <map>
+
+/** Table row modification operations during online table rebuild.
+Delete-marked records are not copied to the rebuilt table. */
+enum row_tab_op {
+ /** Insert a record */
+ ROW_T_INSERT = 0x41,
+ /** Update a record in place */
+ ROW_T_UPDATE,
+ /** Delete (purge) a record */
+ ROW_T_DELETE
+};
+
+/** Index record modification operations during online index creation */
+enum row_op {
+ /** Insert a record */
+ ROW_OP_INSERT = 0x61,
+ /** Delete a record */
+ ROW_OP_DELETE
+};
+
+#ifdef UNIV_DEBUG
+/** Write information about the applied record to the error log */
+# define ROW_LOG_APPLY_PRINT
+#endif /* UNIV_DEBUG */
+
+#ifdef ROW_LOG_APPLY_PRINT
+/** When set, write information about the applied record to the error log */
+static bool row_log_apply_print;
+#endif /* ROW_LOG_APPLY_PRINT */
+
+/** Size of the modification log entry header, in bytes */
+#define ROW_LOG_HEADER_SIZE 2/*op, extra_size*/
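+
+/* A log record written by row_log_online_op() below consists of:
+the op byte (ROW_OP_INSERT or ROW_OP_DELETE), the transaction ID
+(DATA_TRX_ID_LEN bytes, for ROW_OP_INSERT only), extra_size encoded
+in 1 byte (or 2 bytes when extra_size >= 0x80), and the index record
+in the temporary (row_merge) format. */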
+
+/** Log block for modifications during online ALTER TABLE */
+struct row_log_buf_t {
+ byte* block; /*!< file block buffer */
+ mrec_buf_t buf; /*!< buffer for accessing a record
+ that spans two blocks */
+ ulint blocks; /*!< current position in blocks */
+ ulint bytes; /*!< current position within block */
+ ulonglong total; /*!< logical position, in bytes from
+ the start of the row_log_table log;
+ 0 for row_log_online_op() and
+ row_log_apply(). */
+ ulint size; /*!< allocated size of block */
+};
+
+/** Tracks BLOB allocation during online ALTER TABLE */
+class row_log_table_blob_t {
+public:
+ /** Constructor (declaring a BLOB freed)
+ @param offset_arg row_log_t::tail::total */
+#ifdef UNIV_DEBUG
+ row_log_table_blob_t(ulonglong offset_arg) :
+ old_offset (0), free_offset (offset_arg),
+ offset (BLOB_FREED) {}
+#else /* UNIV_DEBUG */
+ row_log_table_blob_t() :
+ offset (BLOB_FREED) {}
+#endif /* UNIV_DEBUG */
+
+ /** Declare a BLOB freed again.
+ @param offset_arg row_log_t::tail::total */
+#ifdef UNIV_DEBUG
+ void blob_free(ulonglong offset_arg)
+#else /* UNIV_DEBUG */
+ void blob_free()
+#endif /* UNIV_DEBUG */
+ {
+ ut_ad(offset < offset_arg);
+ ut_ad(offset != BLOB_FREED);
+ ut_d(old_offset = offset);
+ ut_d(free_offset = offset_arg);
+ offset = BLOB_FREED;
+ }
+ /** Declare a freed BLOB reused.
+ @param offset_arg row_log_t::tail::total */
+ void blob_alloc(ulonglong offset_arg) {
+ ut_ad(free_offset <= offset_arg);
+ ut_d(old_offset = offset);
+ offset = offset_arg;
+ }
+ /** Determine if a BLOB was freed at a given log position
+ @param offset_arg row_log_t::head::total after the log record
+ @return true if freed */
+ bool is_freed(ulonglong offset_arg) const {
+ /* This is supposed to be the offset at the end of the
+ current log record. */
+ ut_ad(offset_arg > 0);
+ /* We should never get anywhere close the magic value. */
+ ut_ad(offset_arg < BLOB_FREED);
+ return(offset_arg < offset);
+ }
+private:
+ /** Magic value for a freed BLOB */
+ static const ulonglong BLOB_FREED = ~0ULL;
+#ifdef UNIV_DEBUG
+ /** Old offset, in case a page was freed, reused, freed, ... */
+ ulonglong old_offset;
+ /** Offset of last blob_free() */
+ ulonglong free_offset;
+#endif /* UNIV_DEBUG */
+ /** Byte offset to the log file */
+ ulonglong offset;
+};
+
+/** @brief Map of off-page column page numbers to BLOB tracking objects.
+
+If there is no mapping for a page number, the page is safe to access.
+Otherwise, the mapped row_log_table_blob_t tells whether the off-page
+column page has been freed and, if it has been reused, the byte offset
+into the index->online_log starting from which the page is again safe
+to access when applying log records. */
+typedef std::map<ulint, row_log_table_blob_t> page_no_map;
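+
+/* Rough lifecycle, as used by the row_log_table_* functions: when an
+off-page column page is freed, a row_log_table_blob_t is created or
+blob_free() is called on the existing one; when the page is reused,
+blob_alloc() records the current logical end of the log; when the log
+is applied, is_freed() tells whether the page may be accessed at a
+given log position. */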
+
+/** @brief Buffer for logging modifications during online index creation
+
+All modifications to an index that is being created will be logged by
+row_log_online_op() to this buffer.
+
+All modifications to a table that is being rebuilt will be logged by
+row_log_table_delete(), row_log_table_update(), row_log_table_insert()
+to this buffer.
+
+When head.blocks == tail.blocks, the reader will access tail.block
+directly. When also head.bytes == tail.bytes, both counts will be
+reset to 0 and the file will be truncated. */
+struct row_log_t {
+ int fd; /*!< file descriptor */
+ ib_mutex_t mutex; /*!< mutex protecting error,
+ max_trx and tail */
+ page_no_map* blobs; /*!< map of page numbers of off-page columns
+ that have been freed during table-rebuilding
+ ALTER TABLE (row_log_table_*); protected by
+ index->lock X-latch only */
+ dict_table_t* table; /*!< table that is being rebuilt,
+ or NULL when this is a secondary
+ index that is being created online */
+ bool same_pk;/*!< whether the definition of the PRIMARY KEY
+ has remained the same */
+ const dtuple_t* add_cols;
+ /*!< default values of added columns, or NULL */
+ const ulint* col_map;/*!< mapping of old column numbers to
+ new ones, or NULL if !table */
+ dberr_t error; /*!< error that occurred during online
+ table rebuild */
+ trx_id_t max_trx;/*!< biggest observed trx_id in
+ row_log_online_op();
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ row_log_buf_t tail; /*!< writer context;
+ protected by mutex and index->lock S-latch,
+ or by index->lock X-latch only */
+ row_log_buf_t head; /*!< reader context; protected by MDL only;
+ modifiable by row_log_apply_ops() */
+};
+
+
+/** Allocate the memory for the log buffer.
+@param[in,out] log_buf Buffer used for log operation
+@return true on success, false on failure */
+static __attribute__((warn_unused_result))
+bool
+row_log_block_allocate(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_allocate");
+ if (log_buf.block == NULL) {
+ log_buf.size = srv_sort_buf_size;
+ log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size);
+ DBUG_EXECUTE_IF("simulate_row_log_allocation_failure",
+ if (log_buf.block)
+ os_mem_free_large(log_buf.block, log_buf.size);
+ log_buf.block = NULL;);
+ if (!log_buf.block) {
+ DBUG_RETURN(false);
+ }
+ }
+ DBUG_RETURN(true);
+}
+
+/** Free the log buffer.
+@param[in,out] log_buf Buffer used for log operation */
+static
+void
+row_log_block_free(
+ row_log_buf_t& log_buf)
+{
+ DBUG_ENTER("row_log_block_free");
+ if (log_buf.block != NULL) {
+ os_mem_free_large(log_buf.block, log_buf.size);
+ log_buf.block = NULL;
+ }
+ DBUG_VOID_RETURN;
+}
+
+/******************************************************//**
+Logs an operation to a secondary index that is (or was) being created. */
+UNIV_INTERN
+void
+row_log_online_op(
+/*==============*/
+ dict_index_t* index, /*!< in/out: index, S or X latched */
+ const dtuple_t* tuple, /*!< in: index tuple */
+ trx_id_t trx_id) /*!< in: transaction ID for insert,
+ or 0 for delete */
+{
+ byte* b;
+ ulint extra_size;
+ ulint size;
+ ulint mrec_size;
+ ulint avail_size;
+ row_log_t* log;
+
+ ut_ad(dtuple_validate(tuple));
+ ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+ || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (dict_index_is_corrupted(index)) {
+ return;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ /* Compute the size of the record. This differs from
+ row_merge_buf_encode(), because here we do not encode
+ extra_size+1 (and reserve 0 as the end-of-chunk marker). */
+
+ size = rec_get_converted_size_temp(
+ index, tuple->fields, tuple->n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+ ut_ad(size <= sizeof log->tail.buf);
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + size
+ + (trx_id ? DATA_TRX_ID_LEN : 0);
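+	/* ROW_LOG_HEADER_SIZE covers the op byte and one byte of
+	extra_size; (extra_size >= 0x80) accounts for the second
+	extra_size byte when the two-byte encoding is used. */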
+
+ log = index->online_log;
+ mutex_enter(&log->mutex);
+
+ if (trx_id > log->max_trx) {
+ log->max_trx = trx_id;
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ avail_size = srv_sort_buf_size - log->tail.bytes;
+
+ if (mrec_size > avail_size) {
+ b = log->tail.buf;
+ } else {
+ b = log->tail.block + log->tail.bytes;
+ }
+
+ if (trx_id != 0) {
+ *b++ = ROW_OP_INSERT;
+ trx_write_trx_id(b, trx_id);
+ b += DATA_TRX_ID_LEN;
+ } else {
+ *b++ = ROW_OP_DELETE;
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = (byte) extra_size;
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = (byte) (0x80 | (extra_size >> 8));
+ *b++ = (byte) extra_size;
+ }
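+	/* For example, extra_size 0x45 is encoded as the single byte
+	0x45, while extra_size 0x1234 is encoded as the two bytes
+	0x92, 0x34. */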
+
+ rec_convert_dtuple_to_temp(
+ b + extra_size, index, tuple->fields, tuple->n_fields);
+ b += size;
+
+ if (mrec_size >= avail_size) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ ibool ret;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ goto write_failed;
+ }
+
+ if (mrec_size == avail_size) {
+ ut_ad(b == &log->tail.block[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + mrec_size);
+ memcpy(log->tail.block + log->tail.bytes,
+ log->tail.buf, avail_size);
+ }
+ UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
+ ret = os_file_write(
+ "(modification log)",
+ OS_FILE_FROM_FD(log->fd),
+ log->tail.block, byte_offset, srv_sort_buf_size);
+ log->tail.blocks++;
+ if (!ret) {
+write_failed:
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ }
+ UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
+ memcpy(log->tail.block, log->tail.buf + avail_size,
+ mrec_size - avail_size);
+ log->tail.bytes = mrec_size - avail_size;
+ } else {
+ log->tail.bytes += mrec_size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+err_exit:
+ mutex_exit(&log->mutex);
+}
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_log_table_get_error(
+/*====================*/
+ const dict_index_t* index) /*!< in: clustered index of a table
+ that is being rebuilt online */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ return(index->online_log->error);
+}
+
+/******************************************************//**
+Starts logging an operation to a table that is being rebuilt.
+@return pointer to log, or NULL if no logging is necessary */
+static __attribute__((nonnull, warn_unused_result))
+byte*
+row_log_table_open(
+/*===============*/
+ row_log_t* log, /*!< in/out: online rebuild log */
+ ulint size, /*!< in: size of log record */
+ ulint* avail) /*!< out: available size for log record */
+{
+ mutex_enter(&log->mutex);
+
+ UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ mutex_exit(&log->mutex);
+ return(NULL);
+ }
+
+ if (!row_log_block_allocate(log->tail)) {
+ log->error = DB_OUT_OF_MEMORY;
+ goto err_exit;
+ }
+
+ ut_ad(log->tail.bytes < srv_sort_buf_size);
+ *avail = srv_sort_buf_size - log->tail.bytes;
+
+ if (size > *avail) {
+ return(log->tail.buf);
+ } else {
+ return(log->tail.block + log->tail.bytes);
+ }
+}
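+
+/* The usual calling pattern (see for example row_log_table_delete()):
+obtain b = row_log_table_open(log, mrec_size, &avail); if b is not
+NULL, write exactly mrec_size bytes at b, and then invoke
+row_log_table_close(log, b, mrec_size, avail), which flushes the full
+block to the log file if needed and releases the log->mutex that was
+acquired in row_log_table_open(). */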
+
+/******************************************************//**
+Stops logging an operation to a table that is being rebuilt. */
+static __attribute__((nonnull))
+void
+row_log_table_close_func(
+/*=====================*/
+ row_log_t* log, /*!< in/out: online rebuild log */
+#ifdef UNIV_DEBUG
+ const byte* b, /*!< in: end of log record */
+#endif /* UNIV_DEBUG */
+ ulint size, /*!< in: size of log record */
+ ulint avail) /*!< in: available size for log record */
+{
+ ut_ad(mutex_own(&log->mutex));
+
+ if (size >= avail) {
+ const os_offset_t byte_offset
+ = (os_offset_t) log->tail.blocks
+ * srv_sort_buf_size;
+ ibool ret;
+
+ if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
+ goto write_failed;
+ }
+
+ if (size == avail) {
+ ut_ad(b == &log->tail.block[srv_sort_buf_size]);
+ } else {
+ ut_ad(b == log->tail.buf + size);
+ memcpy(log->tail.block + log->tail.bytes,
+ log->tail.buf, avail);
+ }
+ UNIV_MEM_ASSERT_RW(log->tail.block, srv_sort_buf_size);
+ ret = os_file_write(
+ "(modification log)",
+ OS_FILE_FROM_FD(log->fd),
+ log->tail.block, byte_offset, srv_sort_buf_size);
+ log->tail.blocks++;
+ if (!ret) {
+write_failed:
+ log->error = DB_ONLINE_LOG_TOO_BIG;
+ }
+ UNIV_MEM_INVALID(log->tail.block, srv_sort_buf_size);
+ memcpy(log->tail.block, log->tail.buf + avail, size - avail);
+ log->tail.bytes = size - avail;
+ } else {
+ log->tail.bytes += size;
+ ut_ad(b == log->tail.block + log->tail.bytes);
+ }
+
+ log->tail.total += size;
+ UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf);
+ mutex_exit(&log->mutex);
+}
+
+#ifdef UNIV_DEBUG
+# define row_log_table_close(log, b, size, avail) \
+ row_log_table_close_func(log, b, size, avail)
+#else /* UNIV_DEBUG */
+# define row_log_table_close(log, b, size, avail) \
+ row_log_table_close_func(log, size, avail)
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+The log record will be applied by row_log_table_apply_delete(). */
+UNIV_INTERN
+void
+row_log_table_delete(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const byte* sys) /*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+ be logged, or NULL to use those in rec */
+{
+ ulint old_pk_extra_size;
+ ulint old_pk_size;
+ ulint ext_size = 0;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ const dtuple_t* old_pk;
+ row_ext_t* ext;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (dict_index_is_corrupted(index)
+ || !dict_index_is_online_ddl(index)
+ || index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(new_table);
+
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+
+ /* Create the tuple PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in new_table. */
+ if (index->online_log->same_pk) {
+ dtuple_t* tuple;
+ ut_ad(new_index->n_uniq == index->n_uniq);
+
+ /* The PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR are in the first
+ fields of the record. */
+ heap = mem_heap_create(
+ DATA_TRX_ID_LEN
+ + DTUPLE_EST_ALLOC(new_index->n_uniq + 2));
+ old_pk = tuple = dtuple_create(heap, new_index->n_uniq + 2);
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_index->n_uniq);
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ dfield_t* dfield = dtuple_get_nth_field(
+ tuple, i);
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ dfield_set_data(dfield, field, len);
+ }
+
+ if (sys) {
+ dfield_set_data(
+ dtuple_get_nth_field(tuple,
+ new_index->n_uniq),
+ sys, DATA_TRX_ID_LEN);
+ dfield_set_data(
+ dtuple_get_nth_field(tuple,
+ new_index->n_uniq + 1),
+ sys + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
+ }
+ } else {
+ /* The PRIMARY KEY has changed. Translate the tuple. */
+ old_pk = row_log_table_get_pk(
+ rec, index, offsets, NULL, &heap);
+
+ if (!old_pk) {
+ ut_ad(index->online_log->error != DB_SUCCESS);
+ if (heap) {
+ goto func_exit;
+ }
+ return;
+ }
+ }
+
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+ old_pk_size = rec_get_converted_size_temp(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+
+ mrec_size = 4 + old_pk_size;
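+	/* The 4 bytes consist of the ROW_T_DELETE op (1 byte),
+	old_pk_extra_size (1 byte) and ext_size (2 bytes), which are
+	written out below. */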
+
+	/* Log a long enough prefix of each BLOB, unless both the
+	old and the new table are in COMPACT or REDUNDANT format,
+	which store the prefix in the clustered index record. */
+ if (rec_offs_any_extern(offsets)
+ && (dict_table_get_format(index->table) >= UNIV_FORMAT_B
+ || dict_table_get_format(new_table) >= UNIV_FORMAT_B)) {
+
+ /* Build a cache of those off-page column prefixes
+ that are referenced by secondary indexes. It can be
+ that none of the off-page columns are needed. */
+ row_build(ROW_COPY_DATA, index, rec,
+ offsets, NULL, NULL, NULL, &ext, heap);
+ if (ext) {
+ /* Log the row_ext_t, ext->ext and ext->buf */
+ ext_size = ext->n_ext * ext->max_len
+ + sizeof(*ext)
+ + ext->n_ext * sizeof(ulint)
+ + (ext->n_ext - 1) * sizeof ext->len;
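+			/* The term (ext->n_ext - 1) * sizeof ext->len
+			accounts for the elements of the prefix length
+			array beyond the one element that is already
+			counted in sizeof(*ext). */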
+ mrec_size += ext_size;
+ }
+ }
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ *b++ = ROW_T_DELETE;
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ /* Log the size of external prefix we saved */
+ mach_write_to_2(b, ext_size);
+ b += 2;
+
+ rec_convert_dtuple_to_temp(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+
+ b += old_pk_size;
+
+ if (ext_size) {
+ ulint cur_ext_size = sizeof(*ext)
+ + (ext->n_ext - 1) * sizeof ext->len;
+
+ memcpy(b, ext, cur_ext_size);
+ b += cur_ext_size;
+
+			/* Check if we need to use col_map to adjust
+			the column numbers: if columns were added,
+			removed or reordered, translate the old column
+			numbers to the new ones. */
+ if (const ulint* col_map =
+ index->online_log->col_map) {
+ for (ulint i = 0; i < ext->n_ext; i++) {
+ const_cast<ulint&>(ext->ext[i]) =
+ col_map[ext->ext[i]];
+ }
+ }
+
+ memcpy(b, ext->ext, ext->n_ext * sizeof(*ext->ext));
+ b += ext->n_ext * sizeof(*ext->ext);
+
+ ext_size -= cur_ext_size
+ + ext->n_ext * sizeof(*ext->ext);
+ memcpy(b, ext->buf, ext_size);
+ b += ext_size;
+ }
+
+ row_log_table_close(
+ index->online_log, b, mrec_size, avail_size);
+ }
+
+func_exit:
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static
+void
+row_log_table_low_redundant(
+/*========================*/
+ const rec_t* rec, /*!< in: clustered index leaf
+ page record in ROW_FORMAT=REDUNDANT,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ bool insert, /*!< in: true if insert,
+ false if update */
+ const dtuple_t* old_pk, /*!< in: old PRIMARY KEY value
+ (if !insert and a PRIMARY KEY
+ is being created) */
+ const dict_index_t* new_index)
+ /*!< in: clustered index of the
+ new table, not latched */
+{
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ mem_heap_t* heap = NULL;
+ dtuple_t* tuple;
+
+ ut_ad(!page_is_comp(page_align(rec)));
+ ut_ad(dict_index_get_n_fields(index) == rec_get_n_fields_old(rec));
+ ut_ad(dict_tf_is_valid(index->table->flags));
+ ut_ad(!dict_table_is_comp(index->table)); /* redundant row format */
+ ut_ad(dict_index_is_clust(new_index));
+
+ heap = mem_heap_create(DTUPLE_EST_ALLOC(index->n_fields));
+ tuple = dtuple_create(heap, index->n_fields);
+ dict_index_copy_types(tuple, index, index->n_fields);
+ dtuple_set_n_fields_cmp(tuple, dict_index_get_n_unique(index));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ for (ulint i = 0; i < index->n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ for (ulint i = 0; i < index->n_fields; i++) {
+ dfield_t* dfield;
+ ulint len;
+ const void* field;
+
+ dfield = dtuple_get_nth_field(tuple, i);
+ field = rec_get_nth_field_old(rec, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_2_is_field_extern(rec, i)) {
+ dfield_set_ext(dfield);
+ }
+ }
+ }
+
+ size = rec_get_converted_size_temp(
+ index, tuple->fields, tuple->n_fields, &extra_size);
+
+ mrec_size = ROW_LOG_HEADER_SIZE + size + (extra_size >= 0x80);
+
+ if (insert || index->online_log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ rec_convert_dtuple_to_temp(
+ b + extra_size, index, tuple->fields, tuple->n_fields);
+ b += size;
+
+ row_log_table_close(
+ index->online_log, b, mrec_size, avail_size);
+ }
+
+ mem_heap_free(heap);
+}
+
+/******************************************************//**
+Logs an insert or update to a table that is being rebuilt. */
+static __attribute__((nonnull(1,2,3)))
+void
+row_log_table_low(
+/*==============*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ bool insert, /*!< in: true if insert, false if update */
+ const dtuple_t* old_pk) /*!< in: old PRIMARY KEY value (if !insert
+ and a PRIMARY KEY is being created) */
+{
+ ulint omit_size;
+ ulint old_pk_size;
+ ulint old_pk_extra_size;
+ ulint extra_size;
+ ulint mrec_size;
+ ulint avail_size;
+ const dict_index_t* new_index = dict_table_get_first_index(
+ index->online_log->table);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_clust(new_index));
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
+ ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+ ut_ad(page_is_leaf(page_align(rec)));
+ ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets));
+
+ if (dict_index_is_corrupted(index)
+ || !dict_index_is_online_ddl(index)
+ || index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ if (!rec_offs_comp(offsets)) {
+ row_log_table_low_redundant(
+ rec, index, insert, old_pk, new_index);
+ return;
+ }
+
+ ut_ad(page_is_comp(page_align(rec)));
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+
+ omit_size = REC_N_NEW_EXTRA_BYTES;
+
+ extra_size = rec_offs_extra_size(offsets) - omit_size;
+
+ mrec_size = ROW_LOG_HEADER_SIZE
+ + (extra_size >= 0x80) + rec_offs_size(offsets) - omit_size;
+
+ if (insert || index->online_log->same_pk) {
+ ut_ad(!old_pk);
+ old_pk_extra_size = old_pk_size = 0;
+ } else {
+ ut_ad(old_pk);
+ ut_ad(old_pk->n_fields == 2 + old_pk->n_fields_cmp);
+ ut_ad(DATA_TRX_ID_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 2)->len);
+ ut_ad(DATA_ROLL_PTR_LEN == dtuple_get_nth_field(
+ old_pk, old_pk->n_fields - 1)->len);
+
+ old_pk_size = rec_get_converted_size_temp(
+ new_index, old_pk->fields, old_pk->n_fields,
+ &old_pk_extra_size);
+ ut_ad(old_pk_extra_size < 0x100);
+ mrec_size += 1/*old_pk_extra_size*/ + old_pk_size;
+ }
+
+ if (byte* b = row_log_table_open(index->online_log,
+ mrec_size, &avail_size)) {
+ *b++ = insert ? ROW_T_INSERT : ROW_T_UPDATE;
+
+ if (old_pk_size) {
+ *b++ = static_cast<byte>(old_pk_extra_size);
+
+ rec_convert_dtuple_to_temp(
+ b + old_pk_extra_size, new_index,
+ old_pk->fields, old_pk->n_fields);
+ b += old_pk_size;
+ }
+
+ if (extra_size < 0x80) {
+ *b++ = static_cast<byte>(extra_size);
+ } else {
+ ut_ad(extra_size < 0x8000);
+ *b++ = static_cast<byte>(0x80 | (extra_size >> 8));
+ *b++ = static_cast<byte>(extra_size);
+ }
+
+ memcpy(b, rec - rec_offs_extra_size(offsets), extra_size);
+ b += extra_size;
+ memcpy(b, rec, rec_offs_data_size(offsets));
+ b += rec_offs_data_size(offsets);
+
+ row_log_table_close(
+ index->online_log, b, mrec_size, avail_size);
+ }
+}
+
+/******************************************************//**
+Logs an update to a table that is being rebuilt.
+This will be applied in row_log_table_apply_update(). */
+UNIV_INTERN
+void
+row_log_table_update(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ const dtuple_t* old_pk) /*!< in: row_log_table_get_pk()
+ before the update */
+{
+ row_log_table_low(rec, index, offsets, false, old_pk);
+}
+
+/** Gets the old table column of a PRIMARY KEY column.
+@param table old table (before ALTER TABLE)
+@param col_map mapping of old column numbers to new ones
+@param col_no column position in the new table
+@return old table column, or NULL if this is an added column */
+static
+const dict_col_t*
+row_log_table_get_pk_old_col(
+/*=========================*/
+ const dict_table_t* table,
+ const ulint* col_map,
+ ulint col_no)
+{
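+	/* col_map[i] is the position of old column i in the new
+	table, or ULINT_UNDEFINED if the column was dropped. As a
+	hypothetical illustration (not taken from this change),
+	dropping the middle column of (a,b,c) would give
+	col_map = {0, ULINT_UNDEFINED, 1, ...}. */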
+ for (ulint i = 0; i < table->n_cols; i++) {
+ if (col_no == col_map[i]) {
+ return(dict_table_get_nth_col(table, i));
+ }
+ }
+
+ return(NULL);
+}
+
+/** Maps an old table column of a PRIMARY KEY column.
+@param col old table column (before ALTER TABLE)
+@param ifield clustered index field in the new table (after ALTER TABLE)
+@param dfield clustered index tuple field in the new table
+@param heap memory heap for allocating dfield contents
+@param rec clustered index leaf page record in the old table
+@param offsets rec_get_offsets(rec)
+@param i rec field corresponding to col
+@param zip_size compressed page size of the old table, or 0 for uncompressed
+@param max_len maximum length of dfield
+@retval DB_SUCCESS if the value was copied to dfield
+@retval DB_INVALID_NULL if a NULL value is encountered
+@retval DB_TOO_BIG_INDEX_COL if the maximum prefix length is exceeded */
+static
+dberr_t
+row_log_table_get_pk_col(
+/*=====================*/
+ const dict_col_t* col,
+ const dict_field_t* ifield,
+ dfield_t* dfield,
+ mem_heap_t* heap,
+ const rec_t* rec,
+ const ulint* offsets,
+ ulint i,
+ ulint zip_size,
+ ulint max_len)
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(ut_is_2pow(zip_size));
+
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len == UNIV_SQL_NULL) {
+ return(DB_INVALID_NULL);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint field_len = ifield->prefix_len;
+ byte* blob_field;
+
+ if (!field_len) {
+ field_len = ifield->fixed_len;
+ if (!field_len) {
+ field_len = max_len + 1;
+ }
+ }
+
+ blob_field = static_cast<byte*>(
+ mem_heap_alloc(heap, field_len));
+
+ len = btr_copy_externally_stored_field_prefix(
+ blob_field, field_len, zip_size, field, len);
+ if (len >= max_len + 1) {
+ return(DB_TOO_BIG_INDEX_COL);
+ }
+
+ dfield_set_data(dfield, blob_field, len);
+ } else {
+ dfield_set_data(dfield, mem_heap_dup(heap, field, len), len);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+UNIV_INTERN
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index) */
+ byte* sys, /*!< out: DB_TRX_ID,DB_ROLL_PTR for
+ row_log_table_delete(), or NULL */
+ mem_heap_t** heap) /*!< in/out: memory heap where allocated */
+{
+ dtuple_t* tuple = NULL;
+ row_log_t* log = index->online_log;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(log);
+ ut_ad(log->table);
+
+ if (log->same_pk) {
+ /* The PRIMARY KEY columns are unchanged. */
+ if (sys) {
+ /* Store the DB_TRX_ID,DB_ROLL_PTR. */
+ ulint trx_id_offs = index->trx_id_offset;
+
+ if (!trx_id_offs) {
+ ulint pos = dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID);
+ ulint len;
+ ut_ad(pos > 0);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(
+ rec, index, NULL, pos + 1,
+ heap);
+ }
+
+ trx_id_offs = rec_get_nth_field_offs(
+ offsets, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ }
+
+ memcpy(sys, rec + trx_id_offs,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+
+ return(NULL);
+ }
+
+ mutex_enter(&log->mutex);
+
+ /* log->error is protected by log->mutex. */
+ if (log->error == DB_SUCCESS) {
+ dict_table_t* new_table = log->table;
+ dict_index_t* new_index
+ = dict_table_get_first_index(new_table);
+ const ulint new_n_uniq
+ = dict_index_get_n_unique(new_index);
+
+ if (!*heap) {
+ ulint size = 0;
+
+ if (!offsets) {
+ size += (1 + REC_OFFS_HEADER_SIZE
+ + index->n_fields)
+ * sizeof *offsets;
+ }
+
+ for (ulint i = 0; i < new_n_uniq; i++) {
+ size += dict_col_get_min_size(
+ dict_index_get_nth_col(new_index, i));
+ }
+
+ *heap = mem_heap_create(
+ DTUPLE_EST_ALLOC(new_n_uniq + 2) + size);
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, NULL,
+ ULINT_UNDEFINED, heap);
+ }
+
+ tuple = dtuple_create(*heap, new_n_uniq + 2);
+ dict_index_copy_types(tuple, new_index, tuple->n_fields);
+ dtuple_set_n_fields_cmp(tuple, new_n_uniq);
+
+ const ulint max_len = DICT_MAX_FIELD_LEN_BY_FORMAT(new_table);
+ const ulint zip_size = dict_table_zip_size(index->table);
+
+ for (ulint new_i = 0; new_i < new_n_uniq; new_i++) {
+ dict_field_t* ifield;
+ dfield_t* dfield;
+ ulint prtype;
+ ulint mbminmaxlen;
+
+ ifield = dict_index_get_nth_field(new_index, new_i);
+ dfield = dtuple_get_nth_field(tuple, new_i);
+
+ const ulint col_no
+ = dict_field_get_col(ifield)->ind;
+
+ if (const dict_col_t* col
+ = row_log_table_get_pk_old_col(
+ index->table, log->col_map, col_no)) {
+ ulint i = dict_col_get_clust_pos(col, index);
+
+ if (i == ULINT_UNDEFINED) {
+ ut_ad(0);
+ log->error = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ log->error = row_log_table_get_pk_col(
+ col, ifield, dfield, *heap,
+ rec, offsets, i, zip_size, max_len);
+
+ if (log->error != DB_SUCCESS) {
+err_exit:
+ tuple = NULL;
+ goto func_exit;
+ }
+
+ mbminmaxlen = col->mbminmaxlen;
+ prtype = col->prtype;
+ } else {
+ /* No matching column was found in the old
+ table, so this must be an added column.
+ Copy the default value. */
+ ut_ad(log->add_cols);
+
+ dfield_copy(dfield, dtuple_get_nth_field(
+ log->add_cols, col_no));
+ mbminmaxlen = dfield->type.mbminmaxlen;
+ prtype = dfield->type.prtype;
+ }
+
+ ut_ad(!dfield_is_ext(dfield));
+ ut_ad(!dfield_is_null(dfield));
+
+ if (ifield->prefix_len) {
+ ulint len = dtype_get_at_most_n_mbchars(
+ prtype, mbminmaxlen,
+ ifield->prefix_len,
+ dfield_get_len(dfield),
+ static_cast<const char*>(
+ dfield_get_data(dfield)));
+
+ ut_ad(len <= dfield_get_len(dfield));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ const byte* trx_roll = rec
+ + row_get_trx_id_offset(index, offsets);
+
+ /* Copy the fields, because the fields will be updated
+ or the record may be moved somewhere else in the B-tree
+ as part of the upcoming operation. */
+ if (sys) {
+ memcpy(sys, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ trx_roll = sys;
+ } else {
+ trx_roll = static_cast<const byte*>(
+ mem_heap_dup(
+ *heap, trx_roll,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+ }
+
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq),
+ trx_roll, DATA_TRX_ID_LEN);
+ dfield_set_data(dtuple_get_nth_field(tuple, new_n_uniq + 1),
+ trx_roll + DATA_TRX_ID_LEN, DATA_ROLL_PTR_LEN);
+ }
+
+func_exit:
+ mutex_exit(&log->mutex);
+ return(tuple);
+}
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be applied in row_log_table_apply_insert(). */
+UNIV_INTERN
+void
+row_log_table_insert(
+/*=================*/
+ const rec_t* rec, /*!< in: clustered index leaf page record,
+ page X-latched */
+ dict_index_t* index, /*!< in/out: clustered index, S-latched
+ or X-latched */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec,index) */
+{
+ row_log_table_low(rec, index, offsets, true, NULL);
+}
+
+/******************************************************//**
+Notes that a BLOB is being freed during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_free(
+/*====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(page_no != FIL_NULL);
+
+ if (index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ page_no_map* blobs = index->online_log->blobs;
+
+ if (!blobs) {
+ index->online_log->blobs = blobs = new page_no_map();
+ }
+
+#ifdef UNIV_DEBUG
+ const ulonglong log_pos = index->online_log->tail.total;
+#else
+# define log_pos /* empty */
+#endif /* UNIV_DEBUG */
+
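+	/* In non-debug builds, the log_pos macro defined above
+	expands to nothing, so the constructor call below becomes
+	row_log_table_blob_t(). */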
+ const page_no_map::value_type v(page_no,
+ row_log_table_blob_t(log_pos));
+
+ std::pair<page_no_map::iterator,bool> p = blobs->insert(v);
+
+ if (!p.second) {
+ /* Update the existing mapping. */
+ ut_ad(p.first->first == page_no);
+ p.first->second.blob_free(log_pos);
+ }
+#undef log_pos
+}
+
+/******************************************************//**
+Notes that a BLOB is being allocated during online ALTER TABLE. */
+UNIV_INTERN
+void
+row_log_table_blob_alloc(
+/*=====================*/
+ dict_index_t* index, /*!< in/out: clustered index, X-latched */
+ ulint page_no)/*!< in: starting page number of the BLOB */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(page_no != FIL_NULL);
+
+ if (index->online_log->error != DB_SUCCESS) {
+ return;
+ }
+
+ /* Only track allocations if the same page has been freed
+ earlier. Double allocation without a free is not allowed. */
+ if (page_no_map* blobs = index->online_log->blobs) {
+ page_no_map::iterator p = blobs->find(page_no);
+
+ if (p != blobs->end()) {
+ ut_ad(p->first == page_no);
+ p->second.blob_alloc(index->online_log->tail.total);
+ }
+ }
+}
+
+/******************************************************//**
+Converts a log record to a table row.
+@return converted row, or NULL if the conversion fails */
+static __attribute__((nonnull, warn_unused_result))
+const dtuple_t*
+row_log_table_apply_convert_mrec(
+/*=============================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ dict_index_t* index, /*!< in: index of mrec */
+ const ulint* offsets, /*!< in: offsets of mrec */
+ const row_log_t* log, /*!< in: rebuild context */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */
+ dberr_t* error) /*!< out: DB_SUCCESS or
+ DB_MISSING_HISTORY or
+ reason of failure */
+{
+ dtuple_t* row;
+
+ *error = DB_SUCCESS;
+
+ /* This is based on row_build(). */
+ if (log->add_cols) {
+ row = dtuple_copy(log->add_cols, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(log->table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(log->table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else {
+ row = dtuple_create(heap, dict_table_get_n_cols(log->table));
+ dict_table_copy_types(row, log->table);
+ }
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+
+ if (ind_field->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+ ulint col_no
+ = log->col_map[dict_col_get_no(col)];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* dropped column */
+ continue;
+ }
+
+ dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+ ulint len;
+ const byte* data;
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ ut_ad(rec_offs_any_extern(offsets));
+ rw_lock_x_lock(dict_index_get_lock(index));
+
+ if (const page_no_map* blobs = log->blobs) {
+ data = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ ulint page_no = mach_read_from_4(
+ data + len - (BTR_EXTERN_FIELD_REF_SIZE
+ - BTR_EXTERN_PAGE_NO));
+ page_no_map::const_iterator p = blobs->find(
+ page_no);
+ if (p != blobs->end()
+ && p->second.is_freed(log->head.total)) {
+ /* This BLOB has been freed.
+ We must not access the row. */
+ *error = DB_MISSING_HISTORY;
+ dfield_set_data(dfield, data, len);
+ dfield_set_ext(dfield);
+ goto blob_done;
+ }
+ }
+
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets,
+ dict_table_zip_size(index->table),
+ i, &len, heap);
+ ut_a(data);
+ dfield_set_data(dfield, data, len);
+blob_done:
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ } else {
+ data = rec_get_nth_field(mrec, offsets, i, &len);
+ dfield_set_data(dfield, data, len);
+ }
+
+ /* See if any columns were changed to NULL or NOT NULL. */
+ const dict_col_t* new_col
+ = dict_table_get_nth_col(log->table, col_no);
+ ut_ad(new_col->mtype == col->mtype);
+
+ /* Assert that prtype matches except for nullability. */
+ ut_ad(!((new_col->prtype ^ col->prtype) & ~DATA_NOT_NULL));
+ ut_ad(!((new_col->prtype ^ dfield_get_type(dfield)->prtype)
+ & ~DATA_NOT_NULL));
+
+ if (new_col->prtype == col->prtype) {
+ continue;
+ }
+
+ if ((new_col->prtype & DATA_NOT_NULL)
+ && dfield_is_null(dfield)) {
+ /* We got a NULL value for a NOT NULL column. */
+ *error = DB_INVALID_NULL;
+ return(NULL);
+ }
+
+ /* Adjust the DATA_NOT_NULL flag in the parsed row. */
+ dfield_get_type(dfield)->prtype = new_col->prtype;
+
+ ut_ad(dict_col_type_assert_equal(new_col,
+ dfield_get_type(dfield)));
+ }
+
+ return(row);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert_low(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const dtuple_t* row, /*!< in: table row
+ in the old table definition */
+ trx_id_t trx_id, /*!< in: trx_id of the row */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup) /*!< in/out: for reporting
+ duplicate key errors */
+{
+ dberr_t error;
+ dtuple_t* entry;
+ const row_log_t*log = dup->index->online_log;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+
+ ut_ad(dtuple_validate(row));
+ ut_ad(trx_id);
+
+#ifdef ROW_LOG_APPLY_PRINT
+ if (row_log_apply_print) {
+ fprintf(stderr, "table apply insert "
+ IB_ID_FMT " " IB_ID_FMT "\n",
+ index->table->id, index->id);
+ dtuple_print(stderr, row);
+ }
+#endif /* ROW_LOG_APPLY_PRINT */
+
+ static const ulint flags
+ = (BTR_CREATE_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG);
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+
+ error = row_ins_clust_index_entry_low(
+ flags, BTR_MODIFY_TREE, index, index->n_uniq, entry, 0, thr);
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ /* The row had already been copied to the table. */
+ return(DB_SUCCESS);
+ default:
+ return(error);
+ }
+
+ do {
+ if (!(index = dict_table_get_next_index(index))) {
+ break;
+ }
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ flags, BTR_MODIFY_TREE,
+ index, offsets_heap, heap, entry, trx_id, thr);
+ } while (error == DB_SUCCESS);
+
+ return(error);
+}
+
+/******************************************************//**
+Replays an insert operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_insert(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ const mrec_t* mrec, /*!< in: record to insert */
+ const ulint* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ trx_id_t trx_id) /*!< in: DB_TRX_ID of mrec */
+{
+ const row_log_t*log = dup->index->online_log;
+ dberr_t error;
+ const dtuple_t* row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, trx_id, &error);
+
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ ut_ad(log->blobs);
+ /* Because some BLOBs are missing, we know that the
+ transaction was rolled back later (a rollback of
+ an insert can free BLOBs).
+ We can simply skip the insert: the subsequent
+ ROW_T_DELETE will be ignored, or a ROW_T_UPDATE will
+ be interpreted as ROW_T_INSERT. */
+ return(DB_SUCCESS);
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ error = row_log_table_apply_insert_low(
+ thr, row, trx_id, offsets_heap, heap, dup);
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+ return(error);
+}
+
+/******************************************************//**
+Deletes a record from a table that is being rebuilt.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull(1, 2, 4, 5), warn_unused_result))
+dberr_t
+row_log_table_apply_delete_low(
+/*===========================*/
+ btr_pcur_t* pcur, /*!< in/out: B-tree cursor,
+ will be trashed */
+ const ulint* offsets, /*!< in: offsets on pcur */
+ const row_ext_t* save_ext, /*!< in: saved external field
+ info, or NULL */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ mtr_t* mtr) /*!< in/out: mini-transaction,
+ will be committed */
+{
+ dberr_t error;
+ row_ext_t* ext;
+ dtuple_t* row;
+ dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_ad(dict_index_is_clust(index));
+
+#ifdef ROW_LOG_APPLY_PRINT
+ if (row_log_apply_print) {
+ fprintf(stderr, "table apply delete "
+ IB_ID_FMT " " IB_ID_FMT "\n",
+ index->table->id, index->id);
+ rec_print_new(stderr, btr_pcur_get_rec(pcur), offsets);
+ }
+#endif /* ROW_LOG_APPLY_PRINT */
+ if (dict_table_get_next_index(index)) {
+ /* Build a row template for purging secondary index entries. */
+ row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(pcur),
+ offsets, NULL, NULL, NULL,
+ save_ext ? NULL : &ext, heap);
+ if (!save_ext) {
+ save_ext = ext;
+ }
+ } else {
+ row = NULL;
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, RB_NONE, mtr);
+ mtr_commit(mtr);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ const dtuple_t* entry = row_build_index_entry(
+ row, save_ext, index, heap);
+ mtr_start(mtr);
+ btr_pcur_open(index, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, pcur, mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(pcur))
+ || btr_pcur_get_low_match(pcur) < index->n_uniq) {
+ /* All secondary index entries should be
+ found, because new_table is being modified by
+ this thread only, and all indexes should be
+ updated in sync. */
+ mtr_commit(mtr);
+ return(DB_INDEX_CORRUPT);
+ }
+
+ btr_cur_pessimistic_delete(&error, FALSE,
+ btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, RB_NONE, mtr);
+ mtr_commit(mtr);
+ }
+
+ return(error);
+}
+
+/******************************************************//**
+Replays a delete operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull(1, 3, 4, 5, 6, 7), warn_unused_result))
+dberr_t
+row_log_table_apply_delete(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: merge record */
+ const ulint* moffsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const row_log_t* log, /*!< in: online log */
+ const row_ext_t* save_ext) /*!< in: saved external field
+ info, or NULL */
+{
+ dict_table_t* new_table = log->table;
+ dict_index_t* index = dict_table_get_first_index(new_table);
+ dtuple_t* old_pk;
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ ulint* offsets;
+
+ ut_ad(rec_offs_n_fields(moffsets)
+ == dict_index_get_n_unique(index) + 2);
+ ut_ad(!rec_offs_any_extern(moffsets));
+
+ /* Convert the row to a search tuple. */
+ old_pk = dtuple_create(heap, index->n_uniq);
+ dict_index_copy_types(old_pk, index, index->n_uniq);
+
+ for (ulint i = 0; i < index->n_uniq; i++) {
+ ulint len;
+ const void* field;
+ field = rec_get_nth_field(mrec, moffsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+ dfield_set_data(dtuple_get_nth_field(old_pk, i),
+ field, len);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &pcur, &mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ /* We did not request buffering. */
+ break;
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ goto flag_ok;
+ }
+ ut_ad(0);
+flag_ok:
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+all_done:
+ mtr_commit(&mtr);
+ /* The record was not found. All done. */
+ /* This should only happen when an earlier
+ ROW_T_INSERT was skipped or
+ ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ due to BLOBs having been freed by rollback. */
+ return(DB_SUCCESS);
+ }
+
+ offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, NULL,
+ ULINT_UNDEFINED, &offsets_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(btr_pcur_get_rec(&pcur), offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ /* Only remove the record if DB_TRX_ID,DB_ROLL_PTR match. */
+
+ {
+ ulint len;
+ const byte* mrec_trx_id
+ = rec_get_nth_field(mrec, moffsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ const byte* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ ut_ad(rec_get_nth_field(mrec, moffsets, trx_id_col + 1, &len)
+ == mrec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ ut_ad(rec_get_nth_field(btr_pcur_get_rec(&pcur), offsets,
+ trx_id_col + 1, &len)
+ == rec_trx_id + DATA_TRX_ID_LEN);
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ if (memcmp(mrec_trx_id, rec_trx_id,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ /* The ROW_T_DELETE was logged for a different
+ PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR.
+ This is possible if a ROW_T_INSERT was skipped
+ or a ROW_T_UPDATE was interpreted as ROW_T_DELETE
+ because some BLOBs were missing due to
+ (1) rolling back the initial insert, or
+			(2) purging the BLOB for a later ROW_T_DELETE, or
+ (3) purging 'old values' for a later ROW_T_UPDATE
+ or ROW_T_DELETE. */
+ ut_ad(!log->same_pk);
+ goto all_done;
+ }
+ }
+
+ return(row_log_table_apply_delete_low(&pcur, offsets, save_ext,
+ heap, &mtr));
+}
+
+/******************************************************//**
+Replays an update operation on a table that was rebuilt.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_update(
+/*=======================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in the new
+ clustered index */
+ const mrec_t* mrec, /*!< in: new value */
+ const ulint* offsets, /*!< in: offsets of mrec */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ trx_id_t trx_id, /*!< in: DB_TRX_ID of mrec */
+ const dtuple_t* old_pk) /*!< in: PRIMARY KEY and
+ DB_TRX_ID,DB_ROLL_PTR
+ of the old value,
+ or PRIMARY KEY if same_pk */
+{
+ const row_log_t*log = dup->index->online_log;
+ const dtuple_t* row;
+ dict_index_t* index = dict_table_get_first_index(log->table);
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ dberr_t error;
+
+ ut_ad(dtuple_get_n_fields_cmp(old_pk)
+ == dict_index_get_n_unique(index));
+ ut_ad(dtuple_get_n_fields(old_pk)
+ == dict_index_get_n_unique(index)
+ + (log->same_pk ? 0 : 2));
+
+ row = row_log_table_apply_convert_mrec(
+ mrec, dup->index, offsets, log, heap, trx_id, &error);
+
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ /* The record contained BLOBs that are now missing. */
+ ut_ad(log->blobs);
+ /* Whether or not we are updating the PRIMARY KEY, we
+ know that there should be a subsequent
+ ROW_T_DELETE for rolling back a preceding ROW_T_INSERT,
+ overriding this ROW_T_UPDATE record. (*1)
+
+ This allows us to interpret this ROW_T_UPDATE
+ as ROW_T_DELETE.
+
+ When applying the subsequent ROW_T_DELETE, no matching
+ record will be found. */
+ case DB_SUCCESS:
+ ut_ad(row != NULL);
+ break;
+ default:
+ ut_ad(0);
+ case DB_INVALID_NULL:
+ ut_ad(row == NULL);
+ return(error);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_open(index, old_pk, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &pcur, &mtr);
+#ifdef UNIV_DEBUG
+ switch (btr_pcur_get_btr_cur(&pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ ut_ad(0);/* We did not request buffering. */
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
+ /* The record was not found. This should only happen
+ when an earlier ROW_T_INSERT or ROW_T_UPDATE was
+ diverted because BLOBs were freed when the insert was
+ later rolled back. */
+
+ ut_ad(log->blobs);
+
+ if (error == DB_SUCCESS) {
+ /* An earlier ROW_T_INSERT could have been
+ skipped because of a missing BLOB, like this:
+
+ BEGIN;
+ INSERT INTO t SET blob_col='blob value';
+ UPDATE t SET blob_col='';
+ ROLLBACK;
+
+ This would generate the following records:
+ ROW_T_INSERT (referring to 'blob value')
+ ROW_T_UPDATE
+ ROW_T_UPDATE (referring to 'blob value')
+ ROW_T_DELETE
+ [ROLLBACK removes the 'blob value']
+
+ The ROW_T_INSERT would have been skipped
+ because of a missing BLOB. Now we are
+ executing the first ROW_T_UPDATE.
+ The second ROW_T_UPDATE (for the ROLLBACK)
+ would be interpreted as ROW_T_DELETE, because
+ the BLOB would be missing.
+
+ We could probably assume that the transaction
+ has been rolled back and simply skip the
+ 'insert' part of this ROW_T_UPDATE record.
+ However, there might be some complex scenario
+ that could interfere with such a shortcut.
+ So, we will insert the row (and risk
+ introducing a bogus duplicate key error
+ for the ALTER TABLE), and a subsequent
+ ROW_T_UPDATE or ROW_T_DELETE will delete it. */
+ mtr_commit(&mtr);
+ error = row_log_table_apply_insert_low(
+ thr, row, trx_id, offsets_heap, heap, dup);
+ } else {
+ /* Some BLOBs are missing, so we are interpreting
+ this ROW_T_UPDATE as ROW_T_DELETE (see *1).
+ Because the record was not found, we do nothing. */
+ ut_ad(error == DB_MISSING_HISTORY);
+ error = DB_SUCCESS;
+func_exit:
+ mtr_commit(&mtr);
+ }
+func_exit_committed:
+ ut_ad(mtr.state == MTR_COMMITTED);
+
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+
+ return(error);
+ }
+
+ /* Prepare to update (or delete) the record. */
+ ulint* cur_offsets = rec_get_offsets(
+ btr_pcur_get_rec(&pcur),
+ index, NULL, ULINT_UNDEFINED, &offsets_heap);
+
+ if (!log->same_pk) {
+ /* Only update the record if DB_TRX_ID,DB_ROLL_PTR match what
+ was buffered. */
+ ulint len;
+ const void* rec_trx_id
+ = rec_get_nth_field(btr_pcur_get_rec(&pcur),
+ cur_offsets, index->n_uniq, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq)->len
+ == DATA_TRX_ID_LEN);
+ ut_ad(dtuple_get_nth_field(old_pk, index->n_uniq + 1)->len
+ == DATA_ROLL_PTR_LEN);
+ ut_ad(DATA_TRX_ID_LEN + static_cast<const char*>(
+ dtuple_get_nth_field(old_pk,
+ index->n_uniq)->data)
+ == dtuple_get_nth_field(old_pk,
+ index->n_uniq + 1)->data);
+ if (memcmp(rec_trx_id,
+ dtuple_get_nth_field(old_pk, index->n_uniq)->data,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
+ /* The ROW_T_UPDATE was logged for a different
+ DB_TRX_ID,DB_ROLL_PTR. This is possible if an
+ earlier ROW_T_INSERT or ROW_T_UPDATE was diverted
+ because some BLOBs were missing due to rolling
+ back the initial insert or due to purging
+ the old BLOB values of an update. */
+ ut_ad(log->blobs);
+ if (error != DB_SUCCESS) {
+ ut_ad(error == DB_MISSING_HISTORY);
+ /* Some BLOBs are missing, so we are
+ interpreting this ROW_T_UPDATE as
+ ROW_T_DELETE (see *1).
+ Because this is a different row,
+ we will do nothing. */
+ error = DB_SUCCESS;
+ } else {
+ /* Because the user record is missing due to
+ BLOBs that were missing when processing
+ an earlier log record, we should
+ interpret the ROW_T_UPDATE as ROW_T_INSERT.
+ However, there is a different user record
+ with the same PRIMARY KEY value already. */
+ error = DB_DUPLICATE_KEY;
+ }
+
+ goto func_exit;
+ }
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_ad(error == DB_MISSING_HISTORY);
+ ut_ad(log->blobs);
+ /* Some BLOBs are missing, so we are interpreting
+ this ROW_T_UPDATE as ROW_T_DELETE (see *1). */
+ error = row_log_table_apply_delete_low(
+ &pcur, cur_offsets, NULL, heap, &mtr);
+ goto func_exit_committed;
+ }
+
+ dtuple_t* entry = row_build_index_entry(
+ row, NULL, index, heap);
+ const upd_t* update = row_upd_build_difference_binary(
+ index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
+ false, NULL, heap);
+
+ if (!update->n_fields) {
+ /* Nothing to do. */
+ goto func_exit;
+ }
+
+ const bool pk_updated
+ = upd_get_nth_field(update, 0)->field_no < new_trx_id_col;
+
+ if (pk_updated || rec_offs_any_extern(cur_offsets)) {
+ /* If the record contains any externally stored
+ columns, perform the update by delete and insert,
+ because we will not write any undo log that would
+ allow purge to free any orphaned externally stored
+ columns. */
+
+ if (pk_updated && log->same_pk) {
+ /* The ROW_T_UPDATE log record should only be
+ written when the PRIMARY KEY fields of the
+ record did not change in the old table. We
+ can only get a change of PRIMARY KEY columns
+ in the rebuilt table if the PRIMARY KEY was
+ redefined (!same_pk). */
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ error = row_log_table_apply_delete_low(
+ &pcur, cur_offsets, NULL, heap, &mtr);
+ ut_ad(mtr.state == MTR_COMMITTED);
+
+ if (error == DB_SUCCESS) {
+ error = row_log_table_apply_insert_low(
+ thr, row, trx_id, offsets_heap, heap, dup);
+ }
+
+ goto func_exit_committed;
+ }
+
+ dtuple_t* old_row;
+ row_ext_t* old_ext;
+
+ if (dict_table_get_next_index(index)) {
+ /* Construct the row corresponding to the old value of
+ the record. */
+ old_row = row_build(
+ ROW_COPY_DATA, index, btr_pcur_get_rec(&pcur),
+ cur_offsets, NULL, NULL, NULL, &old_ext, heap);
+ ut_ad(old_row);
+#ifdef ROW_LOG_APPLY_PRINT
+ if (row_log_apply_print) {
+ fprintf(stderr, "table apply update "
+ IB_ID_FMT " " IB_ID_FMT "\n",
+ index->table->id, index->id);
+ dtuple_print(stderr, old_row);
+ dtuple_print(stderr, row);
+ }
+#endif /* ROW_LOG_APPLY_PRINT */
+ } else {
+ old_row = NULL;
+ old_ext = NULL;
+ }
+
+ big_rec_t* big_rec;
+
+ error = btr_cur_pessimistic_update(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG
+ | BTR_KEEP_POS_FLAG,
+ btr_pcur_get_btr_cur(&pcur),
+ &cur_offsets, &offsets_heap, heap, &big_rec,
+ update, 0, thr, 0, &mtr);
+
+ if (big_rec) {
+ if (error == DB_SUCCESS) {
+ error = btr_store_big_rec_extern_fields(
+ index, btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), cur_offsets,
+ big_rec, &mtr, BTR_STORE_UPDATE);
+ }
+
+ dtuple_big_rec_free(big_rec);
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ if (!row_upd_changes_ord_field_binary(
+ index, update, thr, old_row, NULL)) {
+ continue;
+ }
+
+ mtr_commit(&mtr);
+
+ entry = row_build_index_entry(old_row, old_ext, index, heap);
+ if (!entry) {
+ ut_ad(0);
+ return(DB_CORRUPTION);
+ }
+
+ mtr_start(&mtr);
+
+ if (ROW_FOUND != row_search_index_entry(
+ index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+ ut_ad(0);
+ error = DB_CORRUPTION;
+ break;
+ }
+
+ btr_cur_pessimistic_delete(
+ &error, FALSE, btr_pcur_get_btr_cur(&pcur),
+ BTR_CREATE_FLAG, RB_NONE, &mtr);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ mtr_commit(&mtr);
+
+ entry = row_build_index_entry(row, NULL, index, heap);
+ error = row_ins_sec_index_entry_low(
+ BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG,
+ BTR_MODIFY_TREE, index, offsets_heap, heap,
+ entry, trx_id, thr);
+
+ mtr_start(&mtr);
+ }
+
+ goto func_exit;
+}
+
+/******************************************************//**
+Applies an operation to a table that was rebuilt.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static __attribute__((nonnull, warn_unused_result))
+const mrec_t*
+row_log_table_apply_op(
+/*===================*/
+ que_thr_t* thr, /*!< in: query graph */
+ ulint trx_id_col, /*!< in: position of
+ DB_TRX_ID in old index */
+ ulint new_trx_id_col, /*!< in: position of
+ DB_TRX_ID in new index */
+ row_merge_dup_t* dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS
+ or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap
+ that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ ulint* offsets) /*!< in/out: work area
+ for parsing mrec */
+{
+ row_log_t* log = dup->index->online_log;
+ dict_index_t* new_index = dict_table_get_first_index(log->table);
+ ulint extra_size;
+ const mrec_t* next_mrec;
+ dtuple_t* old_pk;
+ row_ext_t* ext;
+ ulint ext_size;
+
+ ut_ad(dict_index_is_clust(dup->index));
+ ut_ad(dup->index->table != log->table);
+ ut_ad(log->head.total <= log->tail.total);
+
+ *error = DB_SUCCESS;
+
+	/* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */
+ if (mrec + 3 >= mrec_end) {
+ return(NULL);
+ }
+
+ const mrec_t* const mrec_start = mrec;
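+	/* mrec_start is remembered so that log->head.total can be
+	advanced by the full length of the mini-record once it has
+	been parsed. */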
+
+ switch (*mrec++) {
+ default:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ case ROW_T_INSERT:
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ } else {
+ log->head.total += next_mrec - mrec_start;
+
+ ulint len;
+ const byte* db_trx_id
+ = rec_get_nth_field(
+ mrec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ *error = row_log_table_apply_insert(
+ thr, mrec, offsets, offsets_heap,
+ heap, dup, trx_read_trx_id(db_trx_id));
+ }
+ break;
+
+ case ROW_T_DELETE:
+ /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
+ if (mrec + 4 >= mrec_end) {
+ return(NULL);
+ }
+
+ extra_size = *mrec++;
+ ext_size = mach_read_from_2(mrec);
+ mrec += 2;
+ ut_ad(mrec < mrec_end);
+
+ /* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
+		For fixed-length PRIMARY KEY columns, it is 0. */
+ mrec += extra_size;
+
+ rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
+ rec_init_offsets_temp(mrec, new_index, offsets);
+ next_mrec = mrec + rec_offs_data_size(offsets) + ext_size;
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ log->head.total += next_mrec - mrec_start;
+
+		/* If there are external fields, retrieve the logged
+		prefix info and reconstruct the row_ext_t. */
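+		/* The layout assumed for the serialized image is
+		[row_ext_t header, including its len[] array]
+		[page-number array, to be pointed to by ext->ext]
+		[prefix data buffer, to be pointed to by ext->buf];
+		the pointers are recomputed below to refer into the
+		copy that was allocated from the heap. */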
+ if (ext_size) {
+ /* We use memcpy to avoid unaligned
+ access on some non-x86 platforms.*/
+ ext = static_cast<row_ext_t*>(
+ mem_heap_dup(heap,
+ mrec + rec_offs_data_size(offsets),
+ ext_size));
+
+ byte* ext_start = reinterpret_cast<byte*>(ext);
+
+ ulint ext_len = sizeof(*ext)
+ + (ext->n_ext - 1) * sizeof ext->len;
+
+ ext->ext = reinterpret_cast<ulint*>(ext_start + ext_len);
+ ext_len += ext->n_ext * sizeof(*ext->ext);
+
+ ext->buf = static_cast<byte*>(ext_start + ext_len);
+ } else {
+ ext = NULL;
+ }
+
+ *error = row_log_table_apply_delete(
+ thr, new_trx_id_col,
+ mrec, offsets, offsets_heap, heap,
+ log, ext);
+ break;
+
+ case ROW_T_UPDATE:
+		/* Logically, the log entry consists of the
+		(PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR) of the old value
+		(converted to the new PRIMARY KEY definition) followed
+		by the new value in the old table definition. If the
+		definition of the columns belonging to the PRIMARY KEY
+		is not changed, the log will only contain
+		DB_TRX_ID,new_row. */
+
+ if (dup->index->online_log->same_pk) {
+ ut_ad(new_index->n_uniq == dup->index->n_uniq);
+
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+
+ old_pk = dtuple_create(heap, new_index->n_uniq);
+ dict_index_copy_types(
+ old_pk, new_index, old_pk->n_fields);
+
+ /* Copy the PRIMARY KEY fields from mrec to old_pk. */
+ for (ulint i = 0; i < new_index->n_uniq; i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+ } else {
+ /* We assume extra_size < 0x100
+ for the PRIMARY KEY prefix. */
+ mrec += *mrec + 1;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ /* Get offsets for PRIMARY KEY,
+ DB_TRX_ID, DB_ROLL_PTR. */
+ rec_offs_set_n_fields(offsets, new_index->n_uniq + 2);
+ rec_init_offsets_temp(mrec, new_index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+ if (next_mrec + 2 > mrec_end) {
+ return(NULL);
+ }
+
+ /* Copy the PRIMARY KEY fields and
+ DB_TRX_ID, DB_ROLL_PTR from mrec to old_pk. */
+ old_pk = dtuple_create(heap, new_index->n_uniq + 2);
+ dict_index_copy_types(old_pk, new_index,
+ old_pk->n_fields);
+
+ for (ulint i = 0;
+ i < dict_index_get_n_unique(new_index) + 2;
+ i++) {
+ const void* field;
+ ulint len;
+ dfield_t* dfield;
+
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+
+ field = rec_get_nth_field(
+ mrec, offsets, i, &len);
+ ut_ad(len != UNIV_SQL_NULL);
+
+ dfield = dtuple_get_nth_field(old_pk, i);
+ dfield_set_data(dfield, field, len);
+ }
+
+ mrec = next_mrec;
+
+ /* Fetch the new value of the row as it was
+ in the old table definition. */
+ extra_size = *mrec++;
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_offs_set_n_fields(offsets, dup->index->n_fields);
+ rec_init_offsets_temp(mrec, dup->index, offsets);
+
+ next_mrec = mrec + rec_offs_data_size(offsets);
+
+ if (next_mrec > mrec_end) {
+ return(NULL);
+ }
+ }
+
+ ut_ad(next_mrec <= mrec_end);
+ log->head.total += next_mrec - mrec_start;
+ dtuple_set_n_fields_cmp(old_pk, new_index->n_uniq);
+
+ {
+ ulint len;
+ const byte* db_trx_id
+ = rec_get_nth_field(
+ mrec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ *error = row_log_table_apply_update(
+ thr, new_trx_id_col,
+ mrec, offsets, offsets_heap,
+ heap, dup, trx_read_trx_id(db_trx_id), old_pk);
+ }
+
+ break;
+ }
+
+ ut_ad(log->head.total <= log->tail.total);
+ mem_heap_empty(offsets_heap);
+ mem_heap_empty(heap);
+ return(next_mrec);
+}
+
+/******************************************************//**
+Applies operations to a table that was rebuilt.
+@return DB_SUCCESS, or error code on failure */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_log_table_apply_ops(
+/*====================*/
+ que_thr_t* thr, /*!< in: query graph */
+ row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key
+ errors */
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end = NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ ulint* offsets;
+ bool has_index_lock;
+ dict_index_t* index = const_cast<dict_index_t*>(
+ dup->index);
+ dict_table_t* new_table = index->online_log->table;
+ dict_index_t* new_index = dict_table_get_first_index(
+ new_table);
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + ut_max(dict_index_get_n_fields(index),
+ dict_index_get_n_unique(new_index) + 2);
+ const ulint trx_id_col = dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, DATA_TRX_ID), index);
+ const ulint new_trx_id_col = dict_col_get_clust_pos(
+ dict_table_get_sys_col(new_table, DATA_TRX_ID), new_index);
+ trx_t* trx = thr_get_trx(thr);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(trx->mysql_thd);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!dict_index_is_online_ddl(new_index));
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+ ut_ad(new_trx_id_col > 0);
+ ut_ad(new_trx_id_col != ULINT_UNDEFINED);
+
+ UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
+ offsets[0] = i;
+ offsets[1] = dict_index_get_n_fields(index);
+
+ heap = mem_heap_create(UNIV_PAGE_SIZE);
+ offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
+ has_index_lock = true;
+
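+	/* The log is consumed one block at a time: head tracks the
+	read position and tail the write position. Full blocks are
+	read from the temporary file and applied without holding
+	index->lock, so that concurrent DML can keep appending to the
+	log; the final, partially filled block (tail.block) is applied
+	while holding index->lock exclusively, so that the log can be
+	fully caught up with. */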
+next_block:
+ ut_ad(has_index_lock);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(index->online_log->head.bytes == 0);
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ if (dict_index_is_corrupted(index)) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ fprintf(stderr, "InnoDB: unexpected end of temporary file"
+ " for table %s\n", index->table_name);
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (ftruncate(index->online_log->fd, 0) == -1) {
+ perror("ftruncate");
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ index->online_log->head.bytes = 0;
+ index->online_log->tail.bytes = 0;
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs;
+ ibool success;
+
+ ofs = (os_offset_t) index->online_log->head.blocks
+ * srv_sort_buf_size;
+
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ log_free_check();
+
+ ut_ad(dict_index_is_online_ddl(index));
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ success = os_file_read_no_error_handling(
+ OS_FILE_FROM_FD(index->online_log->fd),
+ index->online_log->head.block, ofs,
+ srv_sort_buf_size);
+
+ if (!success) {
+ fprintf(stderr, "InnoDB: unable to read temporary file"
+ " for table %s\n", index->table_name);
+ goto corruption;
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+#if 0 //def FALLOC_FL_PUNCH_HOLE
+ /* Try to deallocate the space for the file on disk.
+ This should work on ext4 on Linux 2.6.39 and later,
+ and be ignored when the operation is unsupported. */
+ fallocate(index->online_log->fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+			  ofs, srv_sort_buf_size);
+#endif /* FALLOC_FL_PUNCH_HOLE */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ /* This read is not protected by index->online_log->mutex for
+ performance reasons. We will eventually notice any error that
+ was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (mrec) {
+ /* A partial record was read from the previous block.
+ Copy the temporary buffer full, as we do not know the
+ length of the record. Parse subsequent records from
+ the bigger buffer index->online_log->head.block
+ or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ (&index->online_log->head.buf)[1] - mrec_end);
+ mrec = row_log_table_apply_op(
+ thr, trx_id_col, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be truncated.
+		Now that the parse buffer has been extended, parsing
+		should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = mrec - mrec_end;
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+ /* mrec!=NULL means that the next record starts from the
+ middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec < mrec_end);
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ /* This read is not protected by index->online_log->mutex
+ for performance reasons. We will eventually notice any
+ error that was flagged by a DML thread. */
+ error = index->online_log->error;
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ next_mrec = row_log_table_apply_op(
+ thr, trx_id_col, new_trx_id_col,
+ dup, &error, offsets_heap, heap,
+ mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes += next_mrec - mrec;
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ mrec_end - mrec);
+ mrec_end += index->online_log->head.buf - mrec;
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+
+ mem_heap_free(offsets_heap);
+ mem_heap_free(heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
+
+/******************************************************//**
+Apply the row_log_table log to a table upon completing the rebuild.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_table_apply(
+/*================*/
+ que_thr_t* thr, /*!< in: query graph */
+ dict_table_t* old_table,
+ /*!< in: old table */
+ struct TABLE* table) /*!< in/out: MySQL table
+ (for reporting duplicates) */
+{
+ dberr_t error;
+ dict_index_t* clust_index;
+
+ thr_get_trx(thr)->error_key_num = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ clust_index = dict_table_get_first_index(old_table);
+
+ rw_lock_x_lock(dict_index_get_lock(clust_index));
+
+ if (!clust_index->online_log) {
+ ut_ad(dict_index_get_online_status(clust_index)
+ == ONLINE_INDEX_COMPLETE);
+ /* This function should not be called unless
+ rebuilding a table online. Build in some fault
+ tolerance. */
+ ut_ad(0);
+ error = DB_ERROR;
+ } else {
+ row_merge_dup_t dup = {
+ clust_index, table,
+ clust_index->online_log->col_map, 0
+ };
+
+ error = row_log_table_apply_ops(thr, &dup);
+
+ ut_ad(error != DB_SUCCESS
+ || clust_index->online_log->head.total
+ == clust_index->online_log->tail.total);
+ }
+
+ rw_lock_x_unlock(dict_index_get_lock(clust_index));
+ return(error);
+}
+
+/******************************************************//**
+Allocate the row log for an index and flag the index
+for online creation.
+@retval true on success
+@retval false on failure */
+UNIV_INTERN
+bool
+row_log_allocate(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index */
+ dict_table_t* table, /*!< in/out: new table being rebuilt,
+ or NULL when creating a secondary index */
+ bool same_pk,/*!< in: whether the definition of the
+ PRIMARY KEY has remained the same */
+ const dtuple_t* add_cols,
+ /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map)/*!< in: mapping of old column
+ numbers to new ones, or NULL if !table */
+{
+ row_log_t* log;
+ DBUG_ENTER("row_log_allocate");
+
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_clust(index) == !!table);
+ ut_ad(!table || index->table != table);
+ ut_ad(same_pk || table);
+ ut_ad(!table || col_map);
+ ut_ad(!add_cols || col_map);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ log = (row_log_t*) ut_malloc(sizeof *log);
+ if (!log) {
+ DBUG_RETURN(false);
+ }
+
+ log->fd = row_merge_file_create_low();
+ if (log->fd < 0) {
+ ut_free(log);
+ DBUG_RETURN(false);
+ }
+ mutex_create(index_online_log_key, &log->mutex,
+ SYNC_INDEX_ONLINE_LOG);
+ log->blobs = NULL;
+ log->table = table;
+ log->same_pk = same_pk;
+ log->add_cols = add_cols;
+ log->col_map = col_map;
+ log->error = DB_SUCCESS;
+ log->max_trx = 0;
+ log->tail.blocks = log->tail.bytes = 0;
+ log->tail.total = 0;
+ log->tail.block = log->head.block = NULL;
+ log->head.blocks = log->head.bytes = 0;
+ log->head.total = 0;
+ dict_index_set_online_status(index, ONLINE_INDEX_CREATION);
+ index->online_log = log;
+
+ /* While we might be holding an exclusive data dictionary lock
+ here, in row_log_abort_sec() we will not always be holding it. Use
+ atomic operations in both cases. */
+ MONITOR_ATOMIC_INC(MONITOR_ONLINE_CREATE_INDEX);
+
+ DBUG_RETURN(true);
+}
+
+/******************************************************//**
+Free the row log for an index that was being created online. */
+UNIV_INTERN
+void
+row_log_free(
+/*=========*/
+ row_log_t*& log) /*!< in,own: row log */
+{
+ MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
+
+ delete log->blobs;
+ row_log_block_free(log->tail);
+ row_log_block_free(log->head);
+ row_merge_file_destroy_low(log->fd);
+ mutex_free(&log->mutex);
+ ut_free(log);
+ log = 0;
+}
+
+/******************************************************//**
+Get the latest transaction ID that has invoked row_log_online_op()
+during online creation.
+@return latest transaction ID, or 0 if nothing was logged */
+UNIV_INTERN
+trx_id_t
+row_log_get_max_trx(
+/*================*/
+ dict_index_t* index) /*!< in: index, must be locked */
+{
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_SHARED)
+ && mutex_own(&index->online_log->mutex))
+ || rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ return(index->online_log->max_trx);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created. */
+static __attribute__((nonnull))
+void
+row_log_apply_op_low(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ enum row_op op, /*!< in: operation being applied */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ const dtuple_t* entry) /*!< in: row */
+{
+ mtr_t mtr;
+ btr_cur_t cursor;
+ ulint* offsets = NULL;
+
+ ut_ad(!dict_index_is_clust(index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
+ == has_index_lock);
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!dict_index_is_corrupted(index));
+ ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
+
+ mtr_start(&mtr);
+
+ /* We perform the pessimistic variant of the operations if we
+ already hold index->lock exclusively. First, search the
+ record. The operation may already have been performed,
+ depending on when the row in the clustered index was
+ scanned. */
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ has_index_lock
+ ? BTR_MODIFY_TREE
+ : BTR_MODIFY_LEAF,
+ &cursor, 0, __FILE__, __LINE__,
+ &mtr);
+
+ ut_ad(dict_index_get_n_unique(index) > 0);
+ /* This test is somewhat similar to row_ins_must_modify_rec(),
+ but not identical for unique secondary indexes. */
+ if (cursor.low_match >= dict_index_get_n_unique(index)
+ && !page_rec_is_infimum(btr_cur_get_rec(&cursor))) {
+ /* We have a matching record. */
+ bool exists = (cursor.low_match
+ == dict_index_get_n_fields(index));
+#ifdef UNIV_DEBUG
+ rec_t* rec = btr_cur_get_rec(&cursor);
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(!rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+#endif /* UNIV_DEBUG */
+
+ ut_ad(exists || dict_index_is_unique(index));
+
+ switch (op) {
+ case ROW_OP_DELETE:
+ if (!exists) {
+ /* The existing record matches the
+ unique secondary index key, but the
+ PRIMARY KEY columns differ. So, this
+ exact record does not exist. For
+ example, we could detect a duplicate
+ key error in some old index before
+			logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could have
+ been logged for rolling back
+ TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ }
+
+ if (btr_cur_optimistic_delete(
+ &cursor, BTR_CREATE_FLAG, &mtr)) {
+ *error = DB_SUCCESS;
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+
+ /* No other thread than the current one
+ is allowed to modify the index tree.
+ Thus, the record should still exist. */
+ ut_ad(cursor.low_match
+ >= dict_index_get_n_fields(index));
+ ut_ad(page_rec_is_user_rec(
+ btr_cur_get_rec(&cursor)));
+ }
+
+ /* As there are no externally stored fields in
+ a secondary index record, the parameter
+ rb_ctx = RB_NONE will be ignored. */
+
+ btr_cur_pessimistic_delete(
+ error, FALSE, &cursor,
+ BTR_CREATE_FLAG, RB_NONE, &mtr);
+ break;
+ case ROW_OP_INSERT:
+ if (exists) {
+ /* The record already exists. There
+ is nothing to be inserted.
+ This could happen when processing
+ TRX_UNDO_DEL_MARK_REC in statement
+ rollback:
+
+ UPDATE of PRIMARY KEY can lead to
+ statement rollback if the updated
+ value of the PRIMARY KEY already
+ exists. In this case, the UPDATE would
+ be mapped to DELETE;INSERT, and we
+ only wrote undo log for the DELETE
+ part. The duplicate key error would be
+ triggered before logging the INSERT
+ part.
+
+ Theoretically, we could also get a
+ similar situation when a DELETE operation
+ is blocked by a FOREIGN KEY constraint. */
+ goto func_exit;
+ }
+
+ if (dtuple_contains_null(entry)) {
+ /* The UNIQUE KEY columns match, but
+ there is a NULL value in the key, and
+ NULL!=NULL. */
+ goto insert_the_rec;
+ }
+
+ goto duplicate;
+ }
+ } else {
+ switch (op) {
+ rec_t* rec;
+ big_rec_t* big_rec;
+ case ROW_OP_DELETE:
+ /* The record does not exist. For example, we
+ could detect a duplicate key error in some old
+		index before logging a ROW_OP_INSERT for our
+ index. This ROW_OP_DELETE could be logged for
+ rolling back TRX_UNDO_INSERT_REC. */
+ goto func_exit;
+ case ROW_OP_INSERT:
+ if (dict_index_is_unique(index)
+ && (cursor.up_match
+ >= dict_index_get_n_unique(index)
+ || cursor.low_match
+ >= dict_index_get_n_unique(index))
+ && (!index->n_nullable
+ || !dtuple_contains_null(entry))) {
+duplicate:
+ /* Duplicate key */
+ ut_ad(dict_index_is_unique(index));
+ row_merge_dup_report(dup, entry->fields);
+ *error = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+insert_the_rec:
+ /* Insert the record. As we are inserting into
+ a secondary index, there cannot be externally
+ stored columns (!big_rec). */
+ *error = btr_cur_optimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec, 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ if (*error != DB_FAIL) {
+ break;
+ }
+
+ if (!has_index_lock) {
+ /* This needs a pessimistic operation.
+ Lock the index tree exclusively. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_cur_search_to_nth_level(
+ index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ }
+
+ /* We already determined that the
+ record did not exist. No other thread
+ than the current one is allowed to
+ modify the index tree. Thus, the
+ record should still not exist. */
+
+ *error = btr_cur_pessimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_CREATE_FLAG,
+ &cursor, &offsets, &offsets_heap,
+ const_cast<dtuple_t*>(entry),
+ &rec, &big_rec,
+ 0, NULL, &mtr);
+ ut_ad(!big_rec);
+ break;
+ }
+ mem_heap_empty(offsets_heap);
+ }
+
+ if (*error == DB_SUCCESS && trx_id) {
+ page_update_max_trx_id(btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+}
+
+/******************************************************//**
+Applies an operation to a secondary index that was being created.
+@return NULL on failure (mrec corruption) or when out of data;
+pointer to next record on success */
+static __attribute__((nonnull, warn_unused_result))
+const mrec_t*
+row_log_apply_op(
+/*=============*/
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup, /*!< in/out: for reporting
+ duplicate key errors */
+ dberr_t* error, /*!< out: DB_SUCCESS or error code */
+ mem_heap_t* offsets_heap, /*!< in/out: memory heap for
+ allocating offsets; can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap for
+ allocating data tuples */
+ bool has_index_lock, /*!< in: true if holding index->lock
+ in exclusive mode */
+ const mrec_t* mrec, /*!< in: merge record */
+ const mrec_t* mrec_end, /*!< in: end of buffer */
+ ulint* offsets) /*!< in/out: work area for
+ rec_init_offsets_temp() */
+
+{
+ enum row_op op;
+ ulint extra_size;
+ ulint data_size;
+ ulint n_ext;
+ dtuple_t* entry;
+ trx_id_t trx_id;
+
+ /* Online index creation is only used for secondary indexes. */
+ ut_ad(!dict_index_is_clust(index));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX)
+ == has_index_lock);
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (dict_index_is_corrupted(index)) {
+ *error = DB_INDEX_CORRUPT;
+ return(NULL);
+ }
+
+ *error = DB_SUCCESS;
+
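+	/* Informal sketch of the log record layout parsed below
+	(derived from this parser, not an authoritative format spec):
+	1 byte        op (ROW_OP_INSERT or ROW_OP_DELETE)
+	6 bytes       transaction ID (DATA_TRX_ID_LEN; ROW_OP_INSERT only)
+	1 or 2 bytes  extra_size of the index record
+	extra_size    record header bytes
+	data_size     record data bytes, in the temporary-file
+	              record format of rec_init_offsets_temp() */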
+ if (mrec + ROW_LOG_HEADER_SIZE >= mrec_end) {
+ return(NULL);
+ }
+
+ switch (*mrec) {
+ case ROW_OP_INSERT:
+ if (ROW_LOG_HEADER_SIZE + DATA_TRX_ID_LEN + mrec >= mrec_end) {
+ return(NULL);
+ }
+
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = trx_read_trx_id(mrec);
+ mrec += DATA_TRX_ID_LEN;
+ break;
+ case ROW_OP_DELETE:
+ op = static_cast<enum row_op>(*mrec++);
+ trx_id = 0;
+ break;
+ default:
+corrupted:
+ ut_ad(0);
+ *error = DB_CORRUPTION;
+ return(NULL);
+ }
+
+ extra_size = *mrec++;
+
+ ut_ad(mrec < mrec_end);
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *mrec++;
+ }
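+	/* For example (illustration only): a first byte 0x05 means
+	extra_size = 5, while the two-byte sequence 0x81 0x23 decodes
+	to ((0x81 & 0x7f) << 8) | 0x23 = 0x123 = 291. */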
+
+ mrec += extra_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ rec_init_offsets_temp(mrec, index, offsets);
+
+ if (rec_offs_any_extern(offsets)) {
+ /* There should never be any externally stored fields
+ in a secondary index, which is what online index
+ creation is used for. Therefore, the log file must be
+ corrupted. */
+ goto corrupted;
+ }
+
+ data_size = rec_offs_data_size(offsets);
+
+ mrec += data_size;
+
+ if (mrec > mrec_end) {
+ return(NULL);
+ }
+
+ entry = row_rec_to_index_entry_low(
+ mrec - data_size, index, offsets, &n_ext, heap);
+ /* Online index creation is only implemented for secondary
+ indexes, which never contain off-page columns. */
+ ut_ad(n_ext == 0);
+#ifdef ROW_LOG_APPLY_PRINT
+ if (row_log_apply_print) {
+ fprintf(stderr, "apply " IB_ID_FMT " " TRX_ID_FMT " %u %u ",
+ index->id, trx_id,
+ unsigned (op), unsigned (has_index_lock));
+ for (const byte* m = mrec - data_size; m < mrec; m++) {
+ fprintf(stderr, "%02x", *m);
+ }
+ putc('\n', stderr);
+ }
+#endif /* ROW_LOG_APPLY_PRINT */
+ row_log_apply_op_low(index, dup, error, offsets_heap,
+ has_index_lock, op, trx_id, entry);
+ return(mrec);
+}
+
+/******************************************************//**
+Applies operations to a secondary index that was being created.
+@return DB_SUCCESS, or error code on failure */
+static __attribute__((nonnull))
+dberr_t
+row_log_apply_ops(
+/*==============*/
+ trx_t* trx, /*!< in: transaction (for checking if
+ the operation was interrupted) */
+ dict_index_t* index, /*!< in/out: index */
+ row_merge_dup_t*dup) /*!< in/out: for reporting duplicate key
+ errors */
+{
+ dberr_t error;
+ const mrec_t* mrec = NULL;
+ const mrec_t* next_mrec;
+ const mrec_t* mrec_end= NULL; /* silence bogus warning */
+ const mrec_t* next_mrec_end;
+ mem_heap_t* offsets_heap;
+ mem_heap_t* heap;
+ ulint* offsets;
+ bool has_index_lock;
+ const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(*index->name == TEMP_INDEX_PREFIX);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(index->online_log);
+ UNIV_MEM_INVALID(&mrec_end, sizeof mrec_end);
+
+ offsets = static_cast<ulint*>(ut_malloc(i * sizeof *offsets));
+ offsets[0] = i;
+ offsets[1] = dict_index_get_n_fields(index);
+
+ offsets_heap = mem_heap_create(UNIV_PAGE_SIZE);
+ heap = mem_heap_create(UNIV_PAGE_SIZE);
+ has_index_lock = true;
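+	/* Informal sketch of how the log is consumed: completed
+	blocks head.blocks .. tail.blocks - 1 are read back from the
+	temporary file without holding index->lock, so that other
+	threads can keep appending; the final, partially filled
+	tail.block is applied while holding index->lock exclusively,
+	which lets this thread catch up and synchronize. */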
+
+next_block:
+ ut_ad(has_index_lock);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(index->online_log->head.bytes == 0);
+
+ if (trx_is_interrupted(trx)) {
+ goto interrupted;
+ }
+
+ error = index->online_log->error;
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ if (dict_index_is_corrupted(index)) {
+ error = DB_INDEX_CORRUPT;
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(index->online_log->head.blocks
+ > index->online_log->tail.blocks)) {
+unexpected_eof:
+ fprintf(stderr, "InnoDB: unexpected end of temporary file"
+ " for index %s\n", index->name + 1);
+corruption:
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (index->online_log->head.blocks
+ == index->online_log->tail.blocks) {
+ if (index->online_log->head.blocks) {
+#ifdef HAVE_FTRUNCATE
+ /* Truncate the file in order to save space. */
+ if (ftruncate(index->online_log->fd, 0) == -1) {
+ perror("ftruncate");
+ }
+#endif /* HAVE_FTRUNCATE */
+ index->online_log->head.blocks
+ = index->online_log->tail.blocks = 0;
+ }
+
+ next_mrec = index->online_log->tail.block;
+ next_mrec_end = next_mrec + index->online_log->tail.bytes;
+
+ if (next_mrec_end == next_mrec) {
+ /* End of log reached. */
+all_done:
+ ut_ad(has_index_lock);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ error = DB_SUCCESS;
+ goto func_exit;
+ }
+ } else {
+ os_offset_t ofs;
+ ibool success;
+
+ ofs = (os_offset_t) index->online_log->head.blocks
+ * srv_sort_buf_size;
+
+ ut_ad(has_index_lock);
+ has_index_lock = false;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ log_free_check();
+
+ if (!row_log_block_allocate(index->online_log->head)) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ success = os_file_read_no_error_handling(
+ OS_FILE_FROM_FD(index->online_log->fd),
+ index->online_log->head.block, ofs,
+ srv_sort_buf_size);
+
+ if (!success) {
+ fprintf(stderr, "InnoDB: unable to read temporary file"
+ " for index %s\n", index->name + 1);
+ goto corruption;
+ }
+
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(index->online_log->fd,
+ ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+#if 0 //def FALLOC_FL_PUNCH_HOLE
+ /* Try to deallocate the space for the file on disk.
+ This should work on ext4 on Linux 2.6.39 and later,
+ and be ignored when the operation is unsupported. */
+ fallocate(index->online_log->fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+			  ofs, srv_sort_buf_size);
+#endif /* FALLOC_FL_PUNCH_HOLE */
+
+ next_mrec = index->online_log->head.block;
+ next_mrec_end = next_mrec + srv_sort_buf_size;
+ }
+
+ if (mrec) {
+ /* A partial record was read from the previous block.
+ Copy the temporary buffer full, as we do not know the
+ length of the record. Parse subsequent records from
+ the bigger buffer index->online_log->head.block
+ or index->online_log->tail.block. */
+
+ ut_ad(mrec == index->online_log->head.buf);
+ ut_ad(mrec_end > mrec);
+ ut_ad(mrec_end < (&index->online_log->head.buf)[1]);
+
+ memcpy((mrec_t*) mrec_end, next_mrec,
+ (&index->online_log->head.buf)[1] - mrec_end);
+ mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, index->online_log->head.buf,
+ (&index->online_log->head.buf)[1], offsets);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (UNIV_UNLIKELY(mrec == NULL)) {
+ /* The record was not reassembled properly. */
+ goto corruption;
+ }
+		/* The record was previously found to be
+		truncated. Now that the parse buffer has been extended,
+		parsing should proceed beyond the old end of the buffer. */
+ ut_a(mrec > mrec_end);
+
+ index->online_log->head.bytes = mrec - mrec_end;
+ next_mrec += index->online_log->head.bytes;
+ }
+
+ ut_ad(next_mrec <= next_mrec_end);
+ /* The following loop must not be parsing the temporary
+ buffer, but head.block or tail.block. */
+
+ /* mrec!=NULL means that the next record starts from the
+ middle of the block */
+ ut_ad((mrec == NULL) == (index->online_log->head.bytes == 0));
+
+#ifdef UNIV_DEBUG
+ if (next_mrec_end == index->online_log->head.block
+ + srv_sort_buf_size) {
+ /* If tail.bytes == 0, next_mrec_end can also be at
+ the end of tail.block. */
+ if (index->online_log->tail.bytes == 0) {
+ ut_ad(next_mrec == next_mrec_end);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes == 0);
+ } else {
+ ut_ad(next_mrec == index->online_log->head.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks
+ > index->online_log->head.blocks);
+ }
+ } else if (next_mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes) {
+ ut_ad(next_mrec == index->online_log->tail.block
+ + index->online_log->head.bytes);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->head.bytes
+ <= index->online_log->tail.bytes);
+ } else {
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ mrec_end = next_mrec_end;
+
+ while (!trx_is_interrupted(trx)) {
+ mrec = next_mrec;
+ ut_ad(mrec < mrec_end);
+
+ if (!has_index_lock) {
+ /* We are applying operations from a different
+ block than the one that is being written to.
+ We do not hold index->lock in order to
+ allow other threads to concurrently buffer
+ modifications. */
+ ut_ad(mrec >= index->online_log->head.block);
+ ut_ad(mrec_end == index->online_log->head.block
+ + srv_sort_buf_size);
+ ut_ad(index->online_log->head.bytes
+ < srv_sort_buf_size);
+
+ /* Take the opportunity to do a redo log
+ checkpoint if needed. */
+ log_free_check();
+ } else {
+ /* We are applying operations from the last block.
+ Do not allow other threads to buffer anything,
+ so that we can finally catch up and synchronize. */
+ ut_ad(index->online_log->head.blocks == 0);
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(mrec >= index->online_log->tail.block);
+ }
+
+ next_mrec = row_log_apply_op(
+ index, dup, &error, offsets_heap, heap,
+ has_index_lock, mrec, mrec_end, offsets);
+
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ } else if (next_mrec == next_mrec_end) {
+ /* The record happened to end on a block boundary.
+ Do we have more blocks left? */
+ if (has_index_lock) {
+ /* The index will be locked while
+ applying the last block. */
+ goto all_done;
+ }
+
+ mrec = NULL;
+process_next_block:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ has_index_lock = true;
+
+ index->online_log->head.bytes = 0;
+ index->online_log->head.blocks++;
+ goto next_block;
+ } else if (next_mrec != NULL) {
+ ut_ad(next_mrec < next_mrec_end);
+ index->online_log->head.bytes += next_mrec - mrec;
+ } else if (has_index_lock) {
+ /* When mrec is within tail.block, it should
+ be a complete record, because we are holding
+ index->lock and thus excluding the writer. */
+ ut_ad(index->online_log->tail.blocks == 0);
+ ut_ad(mrec_end == index->online_log->tail.block
+ + index->online_log->tail.bytes);
+ ut_ad(0);
+ goto unexpected_eof;
+ } else {
+ memcpy(index->online_log->head.buf, mrec,
+ mrec_end - mrec);
+ mrec_end += index->online_log->head.buf - mrec;
+ mrec = index->online_log->head.buf;
+ goto process_next_block;
+ }
+ }
+
+interrupted:
+ error = DB_INTERRUPTED;
+func_exit:
+ if (!has_index_lock) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_INDEX_CORRUPT:
+ if (((os_offset_t) index->online_log->tail.blocks + 1)
+ * srv_sort_buf_size >= srv_online_max_size) {
+ /* The log file grew too big. */
+ error = DB_ONLINE_LOG_TOO_BIG;
+ }
+ /* fall through */
+ default:
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ row_log_block_free(index->online_log->head);
+ ut_free(offsets);
+ return(error);
+}
+
+/******************************************************//**
+Apply the row log to the index upon completing index creation.
+@return DB_SUCCESS, or error code on failure */
+UNIV_INTERN
+dberr_t
+row_log_apply(
+/*==========*/
+ trx_t* trx, /*!< in: transaction (for checking if
+ the operation was interrupted) */
+ dict_index_t* index, /*!< in/out: secondary index */
+ struct TABLE* table) /*!< in/out: MySQL table
+ (for reporting duplicates) */
+{
+ dberr_t error;
+ row_log_t* log;
+ row_merge_dup_t dup = { index, table, NULL, 0 };
+ DBUG_ENTER("row_log_apply");
+
+ ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(!dict_index_is_clust(index));
+
+ log_free_check();
+
+ rw_lock_x_lock(dict_index_get_lock(index));
+
+ if (!dict_table_is_corrupted(index->table)) {
+ error = row_log_apply_ops(trx, index, &dup);
+ } else {
+ error = DB_SUCCESS;
+ }
+
+ if (error != DB_SUCCESS) {
+ ut_a(!dict_table_is_discarded(index->table));
+ /* We set the flag directly instead of invoking
+ dict_set_corrupted_index_cache_only(index) here,
+ because the index is not "public" yet. */
+ index->type |= DICT_CORRUPT;
+ index->table->drop_aborted = TRUE;
+
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ } else {
+ ut_ad(dup.n_dup == 0);
+ dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
+ }
+
+ log = index->online_log;
+ index->online_log = NULL;
+ /* We could remove the TEMP_INDEX_PREFIX and update the data
+ dictionary to say that this index is complete, if we had
+ access to the .frm file here. If the server crashes before
+ all requested indexes have been created, this completed index
+ will be dropped. */
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ row_log_free(log);
+
+ DBUG_RETURN(error);
+}
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
new file mode 100644
index 00000000000..c4224424fdb
--- /dev/null
+++ b/storage/innobase/row/row0merge.cc
@@ -0,0 +1,3732 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.cc
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0log.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0crea.h"
+#include "trx0purge.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "ut0sort.h"
+#include "row0ftsort.h"
+#include "row0import.h"
+#include "handler0alter.h"
+#include "ha_prototypes.h"
+
+/* Ignore posix_fadvise() on those platforms where it does not exist */
+#if defined __WIN__
+# define posix_fadvise(fd, offset, len, advice) /* nothing */
+#endif /* __WIN__ */
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+/** Log each record read from temporary file. */
+static ibool row_merge_print_read;
+/** Log each record write to temporary file. */
+static ibool row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool row_merge_print_block_read;
+/** Log each block write to temporary file. */
+static ibool row_merge_print_block_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/* Whether to disable file system cache */
+UNIV_INTERN char srv_disable_sort_file_cache;
+
+/* Maximum pending doc memory limit in bytes for an FTS tokenization thread */
+#define FTS_PENDING_DOC_MEMORY_LIMIT 1000000
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple. */
+static __attribute__((nonnull))
+void
+row_merge_tuple_print(
+/*==================*/
+ FILE* f, /*!< in: output stream */
+ const mtuple_t* entry, /*!< in: tuple to print */
+ ulint n_fields)/*!< in: number of fields in the tuple */
+{
+ ulint j;
+
+ for (j = 0; j < n_fields; j++) {
+ const dfield_t* field = &entry->fields[j];
+
+ if (dfield_is_null(field)) {
+ fputs("\n NULL;", f);
+ } else {
+ ulint field_len = dfield_get_len(field);
+ ulint len = ut_min(field_len, 20);
+ if (dfield_is_ext(field)) {
+ fputs("\nE", f);
+ } else {
+ fputs("\n ", f);
+ }
+ ut_print_buf(f, dfield_get_data(field), len);
+ if (len != field_len) {
+ fprintf(f, " (total %lu bytes)", field_len);
+ }
+ }
+ }
+ putc('\n', f);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Encode an index record. */
+static __attribute__((nonnull))
+void
+row_merge_buf_encode(
+/*=================*/
+ byte** b, /*!< in/out: pointer to
+ current end of output buffer */
+ const dict_index_t* index, /*!< in: index */
+ const mtuple_t* entry, /*!< in: index fields
+ of the record to encode */
+ ulint n_fields) /*!< in: number of fields
+ in the entry */
+{
+ ulint size;
+ ulint extra_size;
+
+ size = rec_get_converted_size_temp(
+ index, entry->fields, n_fields, &extra_size);
+ ut_ad(size >= extra_size);
+
+ /* Encode extra_size + 1 */
+ if (extra_size + 1 < 0x80) {
+ *(*b)++ = (byte) (extra_size + 1);
+ } else {
+ ut_ad((extra_size + 1) < 0x8000);
+ *(*b)++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+ *(*b)++ = (byte) (extra_size + 1);
+ }
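+	/* For example (illustration only): extra_size = 4 is encoded
+	as the single byte 0x05, while extra_size = 300 is encoded as
+	the two bytes 0x81 0x2d (0x80 | (301 >> 8), then 301 & 0xff). */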
+
+ rec_convert_dtuple_to_temp(*b + extra_size, index,
+ entry->fields, n_fields);
+
+ *b += size;
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static __attribute__((malloc, nonnull))
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+ mem_heap_t* heap, /*!< in: heap where allocated */
+ dict_index_t* index, /*!< in: secondary index */
+ ulint max_tuples, /*!< in: maximum number of
+ data tuples */
+ ulint buf_size) /*!< in: size of the buffer,
+ in bytes */
+{
+ row_merge_buf_t* buf;
+
+ ut_ad(max_tuples > 0);
+
+ ut_ad(max_tuples <= srv_sort_buf_size);
+
+ buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = static_cast<mtuple_t*>(
+ ut_malloc(2 * max_tuples * sizeof *buf->tuples));
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+{
+ row_merge_buf_t* buf;
+ ulint max_tuples;
+ ulint buf_size;
+ mem_heap_t* heap;
+
+ max_tuples = srv_sort_buf_size
+ / ut_max(1, dict_index_get_min_size(index));
+
+ buf_size = (sizeof *buf);
+
+ heap = mem_heap_create(buf_size);
+
+ buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+ return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+UNIV_INTERN
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+{
+ ulint buf_size = sizeof *buf;
+ ulint max_tuples = buf->max_tuples;
+ mem_heap_t* heap = buf->heap;
+ dict_index_t* index = buf->index;
+ mtuple_t* tuples = buf->tuples;
+
+ mem_heap_empty(heap);
+
+ buf = static_cast<row_merge_buf_t*>(mem_heap_zalloc(heap, buf_size));
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = tuples;
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+UNIV_INTERN
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer to be freed */
+{
+ ut_free(buf->tuples);
+ mem_heap_free(buf->heap);
+}
+
+/******************************************************//**
+Insert a data tuple into a sort buffer.
+@return number of rows added, 0 if out of space */
+static
+ulint
+row_merge_buf_add(
+/*==============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ dict_index_t* fts_index,/*!< in: fts index to be created */
+ const dict_table_t* old_table,/*!< in: original table */
+ fts_psort_t* psort_info, /*!< in: parallel sort info */
+ const dtuple_t* row, /*!< in: table row */
+ const row_ext_t* ext, /*!< in: cache of externally stored
+ column prefixes, or NULL */
+ doc_id_t* doc_id) /*!< in/out: Doc ID if we are
+ creating FTS index */
+{
+ ulint i;
+ const dict_index_t* index;
+ mtuple_t* entry;
+ dfield_t* field;
+ const dict_field_t* ifield;
+ ulint n_fields;
+ ulint data_size;
+ ulint extra_size;
+ ulint bucket = 0;
+ doc_id_t write_doc_id;
+ ulint n_row_added = 0;
+ DBUG_ENTER("row_merge_buf_add");
+
+ if (buf->n_tuples >= buf->max_tuples) {
+ DBUG_RETURN(0);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_row_merge_buf_add_two",
+ if (buf->n_tuples >= 2) DBUG_RETURN(0););
+
+ UNIV_PREFETCH_R(row->fields);
+
+ /* If we are building FTS index, buf->index points to
+ the 'fts_sort_idx', and real FTS index is stored in
+ fts_index */
+ index = (buf->index->type & DICT_FTS) ? fts_index : buf->index;
+
+ n_fields = dict_index_get_n_fields(index);
+
+ entry = &buf->tuples[buf->n_tuples];
+ field = entry->fields = static_cast<dfield_t*>(
+ mem_heap_alloc(buf->heap, n_fields * sizeof *entry->fields));
+
+ data_size = 0;
+ extra_size = UT_BITS_IN_BYTES(index->n_nullable);
+
+ ifield = dict_index_get_nth_field(index, 0);
+
+ for (i = 0; i < n_fields; i++, field++, ifield++) {
+ ulint len;
+ const dict_col_t* col;
+ ulint col_no;
+ ulint fixed_len;
+ const dfield_t* row_field;
+
+ col = ifield->col;
+ col_no = dict_col_get_no(col);
+
+ /* Process the Doc ID column */
+ if (*doc_id > 0
+ && col_no == index->table->fts->doc_col) {
+ fts_write_doc_id((byte*) &write_doc_id, *doc_id);
+
+ /* Note: field->data now points to a value on the
+ stack: &write_doc_id after dfield_set_data(). Because
+ there is only one doc_id per row, it shouldn't matter.
+ We allocate a new buffer before we leave the function
+ later below. */
+
+ dfield_set_data(
+ field, &write_doc_id, sizeof(write_doc_id));
+
+ field->type.mtype = ifield->col->mtype;
+ field->type.prtype = ifield->col->prtype;
+ field->type.mbminmaxlen = DATA_MBMINMAXLEN(0, 0);
+ field->type.len = ifield->col->len;
+ } else {
+ row_field = dtuple_get_nth_field(row, col_no);
+
+ dfield_copy(field, row_field);
+
+ /* Tokenize and process data for FTS */
+ if (index->type & DICT_FTS) {
+ fts_doc_item_t* doc_item;
+ byte* value;
+ void* ptr;
+ const ulint max_trial_count = 10000;
+ ulint trial_count = 0;
+
+				/* Fetch the Doc ID if it already exists
+				in the row and was not supplied by the
+				caller. Even if the value column is
+				NULL, we still need to get the Doc ID
+				in order to maintain the correct max
+				Doc ID. */
+ if (*doc_id == 0) {
+ const dfield_t* doc_field;
+ doc_field = dtuple_get_nth_field(
+ row,
+ index->table->fts->doc_col);
+ *doc_id = (doc_id_t) mach_read_from_8(
+ static_cast<byte*>(
+ dfield_get_data(doc_field)));
+
+ if (*doc_id == 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "FTS Doc ID is zero. "
+ "Record Skipped");
+ DBUG_RETURN(0);
+ }
+ }
+
+ if (dfield_is_null(field)) {
+ n_row_added = 1;
+ continue;
+ }
+
+ ptr = ut_malloc(sizeof(*doc_item)
+ + field->len);
+
+ doc_item = static_cast<fts_doc_item_t*>(ptr);
+ value = static_cast<byte*>(ptr)
+ + sizeof(*doc_item);
+ memcpy(value, field->data, field->len);
+ field->data = value;
+
+ doc_item->field = field;
+ doc_item->doc_id = *doc_id;
+
+ bucket = *doc_id % fts_sort_pll_degree;
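+				/* For example (illustration only):
+				with fts_sort_pll_degree == 4, Doc IDs
+				1, 2, 3, 4 map to buckets 1, 2, 3, 0. */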
+
+ /* Add doc item to fts_doc_list */
+ mutex_enter(&psort_info[bucket].mutex);
+
+ if (psort_info[bucket].error == DB_SUCCESS) {
+ UT_LIST_ADD_LAST(
+ doc_list,
+ psort_info[bucket].fts_doc_list,
+ doc_item);
+ psort_info[bucket].memory_used +=
+ sizeof(*doc_item) + field->len;
+ } else {
+ ut_free(doc_item);
+ }
+
+ mutex_exit(&psort_info[bucket].mutex);
+
+				/* Sleep while the memory used exceeds
+				the limit. */
+ while (psort_info[bucket].memory_used
+ > FTS_PENDING_DOC_MEMORY_LIMIT
+ && trial_count++ < max_trial_count) {
+ os_thread_sleep(1000);
+ }
+
+ n_row_added = 1;
+ continue;
+ }
+ }
+
+ len = dfield_get_len(field);
+
+ if (dfield_is_null(field)) {
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ } else if (!ext) {
+ } else if (dict_index_is_clust(index)) {
+ /* Flag externally stored fields. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ if (i < dict_index_get_n_unique(index)) {
+ dfield_set_data(field, buf, len);
+ } else {
+ dfield_set_ext(field);
+ len = dfield_get_len(field);
+ }
+ }
+ } else {
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ dfield_set_data(field, buf, len);
+ }
+ }
+
+ /* If a column prefix index, take only the prefix */
+
+ if (ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminmaxlen,
+ ifield->prefix_len,
+ len,
+ static_cast<char*>(dfield_get_data(field)));
+ dfield_set_len(field, len);
+ }
+
+ ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+ fixed_len = ifield->fixed_len;
+ if (fixed_len && !dict_table_is_comp(index->table)
+ && DATA_MBMINLEN(col->mbminmaxlen)
+ != DATA_MBMAXLEN(col->mbminmaxlen)) {
+ /* CHAR in ROW_FORMAT=REDUNDANT is always
+ fixed-length, but in the temporary file it is
+ variable-length for variable-length character
+ sets. */
+ fixed_len = 0;
+ }
+
+ if (fixed_len) {
+#ifdef UNIV_DEBUG
+ ulint mbminlen = DATA_MBMINLEN(col->mbminmaxlen);
+ ulint mbmaxlen = DATA_MBMAXLEN(col->mbminmaxlen);
+
+		/* len should be between the sizes calculated
+		based on mbminlen and mbmaxlen */
+ ut_ad(len <= fixed_len);
+ ut_ad(!mbmaxlen || len >= mbminlen
+ * (fixed_len / mbmaxlen));
+
+ ut_ad(!dfield_is_ext(field));
+#endif /* UNIV_DEBUG */
+ } else if (dfield_is_ext(field)) {
+ extra_size += 2;
+ } else if (len < 128
+ || (col->len < 256 && col->mtype != DATA_BLOB)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+	/* If this is an FTS index, we have already populated the sort
+	buffer; return here. */
+ if (index->type & DICT_FTS) {
+ DBUG_RETURN(n_row_added);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint size;
+ ulint extra;
+
+ size = rec_get_converted_size_temp(
+ index, entry->fields, n_fields, &extra);
+
+ ut_ad(data_size + extra_size == size);
+ ut_ad(extra_size == extra);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
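+	/* For example (illustration only): extra_size = 4 adds
+	4 + 1 = 5 bytes (payload plus one length byte), while
+	extra_size = 150 adds 150 + 2 = 152 bytes, because the
+	encoded length 151 >= 0x80 takes two bytes. */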
+
+ ut_ad(data_size < srv_sort_buf_size);
+
+ /* Reserve one byte for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= srv_sort_buf_size - 1) {
+ DBUG_RETURN(0);
+ }
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+ n_row_added++;
+
+ field = entry->fields;
+
+ /* Copy the data fields. */
+
+ do {
+ dfield_dup(field++, buf->heap);
+ } while (--n_fields);
+
+ DBUG_RETURN(n_row_added);
+}
+
+/*************************************************************//**
+Report a duplicate key. */
+UNIV_INTERN
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+{
+ if (!dup->n_dup++) {
+ /* Only report the first duplicate record,
+ but count all duplicate records. */
+ innobase_fields_to_mysql(dup->table, dup->index, entry);
+ }
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+static __attribute__((warn_unused_result))
+int
+row_merge_tuple_cmp(
+/*================*/
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ const mtuple_t& a, /*!< in: first tuple to be compared */
+ const mtuple_t& b, /*!< in: second tuple to be compared */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates,
+ NULL if non-unique index */
+{
+ int cmp;
+ const dfield_t* af = a.fields;
+ const dfield_t* bf = b.fields;
+ ulint n = n_uniq;
+
+ ut_ad(n_uniq > 0);
+ ut_ad(n_uniq <= n_field);
+
+ /* Compare the fields of the tuples until a difference is
+ found or we run out of fields to compare. If !cmp at the
+ end, the tuples are equal. */
+ do {
+ cmp = cmp_dfield_dfield(af++, bf++);
+ } while (!cmp && --n);
+
+ if (cmp) {
+ return(cmp);
+ }
+
+ if (dup) {
+ /* Report a duplicate value error if the tuples are
+ logically equal. NULL columns are logically inequal,
+ although they are equal in the sorting order. Find
+ out if any of the fields are NULL. */
+ for (const dfield_t* df = a.fields; df != af; df++) {
+ if (dfield_is_null(df)) {
+ goto no_report;
+ }
+ }
+
+ row_merge_dup_report(dup, a.fields);
+ }
+
+no_report:
+ /* The n_uniq fields were equal, but we compare all fields so
+ that we will get the same (internal) order as in the B-tree. */
+ for (n = n_field - n_uniq + 1; --n; ) {
+ cmp = cmp_dfield_dfield(af++, bf++);
+ if (cmp) {
+ return(cmp);
+ }
+ }
+
+	/* This should never be reached, except when creating a
+	PRIMARY KEY together with secondary indexes and the PRIMARY
+	KEY contains a duplicate that has not been detected yet.
+	Internally, an index must never contain duplicates. */
+ return(cmp);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param tuples array of tuples that are being sorted
+@param aux work area, same size as tuples[]
+@param low lower bound of the sorting area, inclusive
+@param high upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(tuples, aux, low, high) \
+ row_merge_tuple_sort(n_uniq, n_field, dup, tuples, aux, low, high)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a first tuple to be compared
+@param b second tuple to be compared
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+#define row_merge_tuple_cmp_ctx(a,b) \
+ row_merge_tuple_cmp(n_uniq, n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory. */
+static __attribute__((nonnull(4,5)))
+void
+row_merge_tuple_sort(
+/*=================*/
+ ulint n_uniq, /*!< in: number of unique fields */
+ ulint n_field,/*!< in: number of fields */
+ row_merge_dup_t* dup, /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+ mtuple_t* tuples, /*!< in/out: tuples */
+ mtuple_t* aux, /*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the
+ sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the
+ sorting area, exclusive */
+{
+ ut_ad(n_field > 0);
+ ut_ad(n_uniq <= n_field);
+
+ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+ tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer. */
+UNIV_INTERN
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: reporter of duplicates
+ (NULL if non-unique index) */
+{
+ row_merge_tuple_sort(dict_index_get_n_unique(buf->index),
+ dict_index_get_n_fields(buf->index),
+ dup,
+ buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block. */
+UNIV_INTERN
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+ const merge_file_t* of UNIV_UNUSED,
+ /*!< in: output file */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+{
+ const dict_index_t* index = buf->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ byte* b = &block[0];
+
+ for (ulint i = 0; i < buf->n_tuples; i++) {
+ const mtuple_t* entry = &buf->tuples[i];
+
+ row_merge_buf_encode(&b, index, entry, n_fields);
+ ut_ad(b < &block[srv_sort_buf_size]);
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+ (void*) b, of->fd, (ulong) of->offset,
+ (ulong) i);
+ row_merge_tuple_print(stderr, entry, n_fields);
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Write an "end-of-chunk" marker. */
+ ut_a(b < &block[srv_sort_buf_size]);
+ ut_a(b == &block[0] + buf->total_size);
+ *b++ = 0;
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+ (void*) b, of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Create a memory heap and allocate space for two record offsets
+arrays and mrec_buf_t[3].
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
+ ulint** offsets1, /*!< out: offsets */
+ ulint** offsets2) /*!< out: offsets */
+{
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
+
+ *buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, 3 * sizeof **buf));
+ *offsets1 = static_cast<ulint*>(
+ mem_heap_alloc(heap, i * sizeof **offsets1));
+ *offsets2 = static_cast<ulint*>(
+ mem_heap_alloc(heap, i * sizeof **offsets2));
+
+ (*offsets1)[0] = (*offsets2)[0] = i;
+ (*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
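+	/* Informal note: by InnoDB convention, offsets[0] holds the
+	allocated size of the offsets array and offsets[1] the number
+	of fields; rec_init_offsets_temp() fills in the rest when a
+	record is read. */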
+
+ return(heap);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_read(
+/*===========*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read
+ in number of row_merge_block_t
+ elements */
+ row_merge_block_t* buf) /*!< out: data */
+{
+ os_offset_t ofs = ((os_offset_t) offset) * srv_sort_buf_size;
+ ibool success;
+
+ DBUG_EXECUTE_IF("row_merge_read_failure", return(FALSE););
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block_read) {
+ fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+ fd, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+ ofs, srv_sort_buf_size);
+#ifdef POSIX_FADV_DONTNEED
+ /* Each block is read exactly once. Free up the file cache. */
+ posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ if (UNIV_UNLIKELY(!success)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: failed to read merge block at " UINT64PF "\n",
+ ofs);
+ }
+
+ return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+row_merge_write(
+/*============*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t elements */
+ const void* buf) /*!< in: data */
+{
+ size_t buf_len = srv_sort_buf_size;
+ os_offset_t ofs = buf_len * (os_offset_t) offset;
+ ibool ret;
+
+ DBUG_EXECUTE_IF("row_merge_write_failure", return(FALSE););
+
+ ret = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf, ofs, buf_len);
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block_write) {
+ fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
+ fd, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+#ifdef POSIX_FADV_DONTNEED
+ /* The block will be needed on the next merge pass,
+ but it can be evicted from the file cache meanwhile. */
+ posix_fadvise(fd, ofs, buf_len, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+ return(UNIV_LIKELY(ret));
+}
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+UNIV_INTERN
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ ulint* offsets)/*!< out: offsets of mrec */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(index);
+ ut_ad(foffs);
+ ut_ad(mrec);
+ ut_ad(offsets);
+
+ ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index));
+
+ extra_size = *b++;
+
+ if (UNIV_UNLIKELY(!extra_size)) {
+ /* End of list */
+ *mrec = NULL;
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+ return(NULL);
+ }
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ if (UNIV_UNLIKELY(b >= &block[srv_sort_buf_size])) {
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+err_exit:
+ /* Signal I/O error. */
+ *mrec = b;
+ return(NULL);
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+ }
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *b++;
+ }
+
+ /* Normalize extra_size. Above, value 0 signals "end of list". */
+ extra_size--;
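+	/* For example (illustration only): a stored length byte 0x06
+	means extra_size = 5 after normalization; the encoder added 1
+	so that 0 could serve as the end-of-list marker. */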
+
+ /* Read the extra bytes. */
+
+ if (UNIV_UNLIKELY(b + extra_size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks. Copy the entire record
+ to the auxiliary buffer and handle this as a special
+ case. */
+
+ avail_size = &block[srv_sort_buf_size] - b;
+ ut_ad(avail_size < sizeof *buf);
+ memcpy(*buf, b, avail_size);
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the record. */
+ memcpy(*buf + avail_size, b, extra_size - avail_size);
+ b += extra_size - avail_size;
+
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+
+ /* These overflows should be impossible given that
+ records are much smaller than either buffer, and
+ the record starts near the beginning of each buffer. */
+ ut_a(extra_size + data_size < sizeof *buf);
+ ut_a(b + data_size < &block[srv_sort_buf_size]);
+
+ /* Copy the data bytes. */
+ memcpy(*buf + extra_size, b, data_size);
+ b += data_size;
+
+ goto func_exit;
+ }
+
+ *mrec = b + extra_size;
+
+ rec_init_offsets_temp(*mrec, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+ ut_ad(extra_size + data_size < sizeof *buf);
+
+ b += extra_size + data_size;
+
+ if (UNIV_LIKELY(b < &block[srv_sort_buf_size])) {
+ /* The record fits entirely in the block.
+ This is the normal case. */
+ goto func_exit;
+ }
+
+ /* The record spans two blocks. Copy it to buf. */
+
+ b -= extra_size + data_size;
+ avail_size = &block[srv_sort_buf_size] - b;
+ memcpy(*buf, b, avail_size);
+ *mrec = *buf + extra_size;
+#ifdef UNIV_DEBUG
+ /* We cannot invoke rec_offs_make_valid() here, because there
+ are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
+ Similarly, rec_offs_validate() would fail, because it invokes
+ rec_get_status(). */
+ offsets[2] = (ulint) *mrec;
+ offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = &block[0];
+
+ /* Copy the rest of the record. */
+ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+ b += extra_size + data_size - avail_size;
+
+func_exit:
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ rec_print_comp(stderr, *mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(b);
+}
+
+/********************************************************************//**
+Write a merge record. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+ byte* b, /*!< out: buffer */
+ ulint e, /*!< in: encoded extra_size */
+#ifdef UNIV_DEBUG
+ ulint size, /*!< in: total size to write */
+ int fd, /*!< in: file descriptor */
+ ulint foffs, /*!< in: file offset */
+#endif /* UNIV_DEBUG */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+#ifndef UNIV_DEBUG
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
+ row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* !UNIV_DEBUG */
+{
+#ifdef UNIV_DEBUG
+ const byte* const end = b + size;
+ ut_ad(e == rec_offs_extra_size(offsets) + 1);
+
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%d,%lu ",
+ (void*) b, fd, (ulong) foffs);
+ rec_print_comp(stderr, mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (e < 0x80) {
+ *b++ = (byte) e;
+ } else {
+ *b++ = (byte) (0x80 | (e >> 8));
+ *b++ = (byte) e;
+ }
+
+ memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+ ut_ad(b + rec_offs_size(offsets) == end);
+}
+
+/********************************************************************//**
+Write a merge record.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+{
+ ulint extra_size;
+ ulint size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(mrec);
+ ut_ad(foffs);
+ ut_ad(mrec < &block[0] || mrec > &block[srv_sort_buf_size]);
+ ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+ /* Normalize extra_size. Value 0 signals "end of list". */
+ extra_size = rec_offs_extra_size(offsets) + 1;
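+	/* For example (illustration only): a record with 5 extra
+	(header) bytes is encoded with length 6; the matching decoder
+	is in row_merge_read_rec(). */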
+
+ size = extra_size + (extra_size >= 0x80)
+ + rec_offs_data_size(offsets);
+
+ if (UNIV_UNLIKELY(b + size >= &block[srv_sort_buf_size])) {
+ /* The record spans two blocks.
+ Copy it to the temporary buffer first. */
+ avail_size = &block[srv_sort_buf_size] - b;
+
+ row_merge_write_rec_low(buf[0],
+ extra_size, size, fd, *foffs,
+ mrec, offsets);
+
+ /* Copy the head of the temporary buffer, write
+ the completed block, and copy the tail of the
+ record to the head of the new block. */
+ memcpy(b, buf[0], avail_size);
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+
+ /* Copy the rest. */
+ b = &block[0];
+ memcpy(b, buf[0] + avail_size, size - avail_size);
+ b += size - avail_size;
+ } else {
+ row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+ mrec, offsets);
+ b += size;
+ }
+
+ return(b);
+}
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs) /*!< in/out: file offset */
+{
+ ut_ad(block);
+ ut_ad(b >= &block[0]);
+ ut_ad(b < &block[srv_sort_buf_size]);
+ ut_ad(foffs);
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+ (void*) b, (void*) block, fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+
+ *b++ = 0;
+ UNIV_MEM_ASSERT_RW(&block[0], b - &block[0]);
+ UNIV_MEM_ASSERT_W(&block[0], srv_sort_buf_size);
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, &block[srv_sort_buf_size] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+ return(&block[0]);
+}
+
+/********************************************************************//**
+Read the clustered index of the table and create temporary files
+containing the index entries for the indexes to be built.
+@return DB_SUCCESS or error */
+static __attribute__((nonnull(1,2,3,4,6,9,10,16), warn_unused_result))
+dberr_t
+row_merge_read_clustered_index(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ struct TABLE* table, /*!< in/out: MySQL table object,
+ for reporting erroneous records */
+ const dict_table_t* old_table,/*!< in: table where rows are
+ read from */
+ const dict_table_t* new_table,/*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ bool online, /*!< in: true if creating indexes
+ online */
+ dict_index_t** index, /*!< in: indexes to be created */
+ dict_index_t* fts_sort_idx,
+ /*!< in: full-text index to be created,
+ or NULL */
+ fts_psort_t* psort_info,
+ /*!< in: parallel sort info for
+ fts_sort_idx creation, or NULL */
+ merge_file_t* files, /*!< in: temporary files */
+ const ulint* key_numbers,
+ /*!< in: MySQL key numbers to create */
+ ulint n_index,/*!< in: number of indexes to create */
+ const dtuple_t* add_cols,
+ /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL
+ if old_table == new_table */
+ ulint add_autoinc,
+ /*!< in: number of added
+ AUTO_INCREMENT column, or
+ ULINT_UNDEFINED if none is added */
+ ib_sequence_t& sequence,/*!< in/out: autoinc sequence */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ dict_index_t* clust_index; /* Clustered index */
+ mem_heap_t* row_heap; /* Heap memory to create
+ clustered index tuples */
+ row_merge_buf_t** merge_buf; /* Temporary list for records*/
+ btr_pcur_t pcur; /* Cursor on the clustered
+ index */
+ mtr_t mtr; /* Mini transaction */
+ dberr_t err = DB_SUCCESS;/* Return code */
+ ulint n_nonnull = 0; /* number of columns
+ changed to NOT NULL */
+ ulint* nonnull = NULL; /* NOT NULL columns */
+ dict_index_t* fts_index = NULL;/* FTS index */
+ doc_id_t doc_id = 0;
+ doc_id_t max_doc_id = 0;
+ ibool add_doc_id = FALSE;
+ os_event_t fts_parallel_sort_event = NULL;
+ ibool fts_pll_sort = FALSE;
+ ib_int64_t sig_count = 0;
+ DBUG_ENTER("row_merge_read_clustered_index");
+
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!add_cols || col_map);
+
+ trx->op_info = "reading clustered index";
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Start Create Index\n");
+#endif
+
+ /* Create and initialize memory for record buffers */
+
+ merge_buf = static_cast<row_merge_buf_t**>(
+ mem_alloc(n_index * sizeof *merge_buf));
+
+ for (ulint i = 0; i < n_index; i++) {
+ if (index[i]->type & DICT_FTS) {
+
+ /* We are building a FT index, make sure
+ we have the temporary 'fts_sort_idx' */
+ ut_a(fts_sort_idx);
+
+ fts_index = index[i];
+
+ merge_buf[i] = row_merge_buf_create(fts_sort_idx);
+
+ add_doc_id = DICT_TF2_FLAG_IS_SET(
+ new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ /* If Doc ID does not exist in the table itself,
+ fetch the first FTS Doc ID */
+ if (add_doc_id) {
+ fts_get_next_doc_id(
+ (dict_table_t*) new_table,
+ &doc_id);
+ ut_ad(doc_id > 0);
+ }
+
+ fts_pll_sort = TRUE;
+ row_fts_start_psort(psort_info);
+ fts_parallel_sort_event =
+ psort_info[0].psort_common->sort_event;
+ } else {
+ merge_buf[i] = row_merge_buf_create(index[i]);
+ }
+ }
+
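+ /* Each non-FTS index gets its own in-memory sort buffer, while
+ a FULLTEXT index is built through the shared 'fts_sort_idx' and
+ tokenized by the parallel sort threads started above. */
+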
+ mtr_start(&mtr);
+
+ /* Find the clustered index and create a persistent cursor
+ based on that. */
+
+ clust_index = dict_table_get_first_index(old_table);
+
+ btr_pcur_open_at_index_side(
+ true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ if (old_table != new_table) {
+ /* The table is being rebuilt. Identify the columns
+ that were flagged NOT NULL in the new table, so that
+ we can quickly check that the records in the old table
+ do not violate the added NOT NULL constraints. */
+
+ nonnull = static_cast<ulint*>(
+ mem_alloc(dict_table_get_n_cols(new_table)
+ * sizeof *nonnull));
+
+ for (ulint i = 0; i < dict_table_get_n_cols(old_table); i++) {
+ if (dict_table_get_nth_col(old_table, i)->prtype
+ & DATA_NOT_NULL) {
+ continue;
+ }
+
+ const ulint j = col_map[i];
+
+ if (j == ULINT_UNDEFINED) {
+ /* The column was dropped. */
+ continue;
+ }
+
+ if (dict_table_get_nth_col(new_table, j)->prtype
+ & DATA_NOT_NULL) {
+ nonnull[n_nonnull++] = j;
+ }
+ }
+
+ if (!n_nonnull) {
+ mem_free(nonnull);
+ nonnull = NULL;
+ }
+ }
+
+ row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+ /* Scan the clustered index. */
+ for (;;) {
+ const rec_t* rec;
+ ulint* offsets;
+ const dtuple_t* row;
+ row_ext_t* ext;
+ page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
+
+ page_cur_move_to_next(cur);
+
+ if (page_cur_is_after_last(cur)) {
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ err = DB_INTERRUPTED;
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+
+ if (online && old_table != new_table) {
+ err = row_log_table_get_error(clust_index);
+ if (err != DB_SUCCESS) {
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+ }
+#ifdef DBUG_OFF
+# define dbug_run_purge false
+#else /* DBUG_OFF */
+ bool dbug_run_purge = false;
+#endif /* DBUG_OFF */
+ DBUG_EXECUTE_IF(
+ "ib_purge_on_create_index_page_switch",
+ dbug_run_purge = true;);
+
+ if (dbug_run_purge
+ || rw_lock_get_waiters(
+ dict_index_get_lock(clust_index))) {
+ /* There are waiters on the clustered
+ index tree lock, likely the purge
+ thread. Store and restore the cursor
+ position, and yield so that scanning a
+ large table will not starve other
+ threads. */
+
+ /* Store the cursor position on the last user
+ record on the page. */
+ btr_pcur_move_to_prev_on_page(&pcur);
+ /* Leaf pages must never be empty, unless
+ this is the only page in the index tree. */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur)
+ || buf_block_get_page_no(
+ btr_pcur_get_block(&pcur))
+ == clust_index->page);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ if (dbug_run_purge) {
+ /* This is for testing
+ purposes only (see
+ DBUG_EXECUTE_IF above). We
+ signal the purge thread and
+ hope that the purge batch will
+ complete before we execute
+ btr_pcur_restore_position(). */
+ trx_purge_run();
+ os_thread_sleep(1000000);
+ }
+
+ /* Give the waiters a chance to proceed. */
+ os_thread_yield();
+
+ mtr_start(&mtr);
+ /* Restore position on the record, or its
+ predecessor if the record was purged
+ meanwhile. */
+ btr_pcur_restore_position(
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ /* Move to the successor of the
+ original record. */
+ if (!btr_pcur_move_to_next_user_rec(
+ &pcur, &mtr)) {
+end_of_index:
+ row = NULL;
+ mtr_commit(&mtr);
+ mem_heap_free(row_heap);
+ if (nonnull) {
+ mem_free(nonnull);
+ }
+ goto write_buffers;
+ }
+ } else {
+ ulint next_page_no;
+ buf_block_t* block;
+
+ next_page_no = btr_page_get_next(
+ page_cur_get_page(cur), &mtr);
+
+ if (next_page_no == FIL_NULL) {
+ goto end_of_index;
+ }
+
+ block = page_cur_get_block(cur);
+ block = btr_block_get(
+ buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ next_page_no, BTR_SEARCH_LEAF,
+ clust_index, &mtr);
+
+ btr_leaf_page_release(page_cur_get_block(cur),
+ BTR_SEARCH_LEAF, &mtr);
+ page_cur_set_before_first(block, cur);
+ page_cur_move_to_next(cur);
+
+ ut_ad(!page_cur_is_after_last(cur));
+ }
+ }
+
+ rec = page_cur_get_rec(cur);
+
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &row_heap);
+
+ if (online) {
+ /* Perform a REPEATABLE READ.
+
+ When rebuilding the table online,
+ row_log_table_apply() must not see a newer
+ state of the table when applying the log.
+ This is mainly to prevent false duplicate key
+ errors, because the log will identify records
+ by the PRIMARY KEY, and also to prevent unsafe
+ BLOB access.
+
+ When creating a secondary index online, this
+ table scan must not see records that have only
+ been inserted to the clustered index, but have
+ not been written to the online_log of
+ index[]. If we performed READ UNCOMMITTED, it
+ could happen that the ADD INDEX reaches
+ ONLINE_INDEX_COMPLETE state between the time
+ the DML thread has updated the clustered index
+ but has not yet accessed the secondary index. */
+ ut_ad(trx->read_view);
+
+ if (!read_view_sees_trx_id(
+ trx->read_view,
+ row_get_rec_trx_id(
+ rec, clust_index, offsets))) {
+ rec_t* old_vers;
+
+ row_vers_build_for_consistent_read(
+ rec, &mtr, clust_index, &offsets,
+ trx->read_view, &row_heap,
+ row_heap, &old_vers);
+
+ rec = old_vers;
+
+ if (!rec) {
+ continue;
+ }
+ }
+
+ if (rec_get_deleted_flag(
+ rec,
+ dict_table_is_comp(old_table))) {
+ /* This record was deleted in the latest
+ committed version, or it was deleted and
+ then reinserted-by-update before purge
+ kicked in. Skip it. */
+ continue;
+ }
+
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+ } else if (rec_get_deleted_flag(
+ rec, dict_table_is_comp(old_table))) {
+ /* Skip delete-marked records.
+
+ Skipping delete-marked records will make the
+ created indexes unusable for transactions
+ whose read views were created before the index
+ creation completed, but preserving the history
+ would make it tricky to detect duplicate
+ keys. */
+ continue;
+ }
+
+ /* When !online, we are holding a lock on old_table, preventing
+ any inserts that could have written a record 'stub' before
+ writing out off-page columns. */
+ ut_ad(!rec_offs_any_null_extern(rec, offsets));
+
+ /* Build a row based on the clustered index. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, offsets, new_table,
+ add_cols, col_map, &ext, row_heap);
+ ut_ad(row);
+
+ for (ulint i = 0; i < n_nonnull; i++) {
+ const dfield_t* field = &row->fields[nonnull[i]];
+
+ ut_ad(dfield_get_type(field)->prtype & DATA_NOT_NULL);
+
+ if (dfield_is_null(field)) {
+ err = DB_INVALID_NULL;
+ trx->error_key_num = 0;
+ goto func_exit;
+ }
+ }
+
+ /* Get the next Doc ID */
+ if (add_doc_id) {
+ doc_id++;
+ } else {
+ doc_id = 0;
+ }
+
+ if (add_autoinc != ULINT_UNDEFINED) {
+
+ ut_ad(add_autoinc
+ < dict_table_get_n_user_cols(new_table));
+
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(row, add_autoinc);
+ if (dfield_is_null(dfield)) {
+ goto write_buffers;
+ }
+
+ const dtype_t* dtype = dfield_get_type(dfield);
+ byte* b = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (sequence.eof()) {
+ err = DB_ERROR;
+ trx->error_key_num = 0;
+
+ ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_AUTOINC_READ_FAILED, "[NULL]");
+
+ goto func_exit;
+ }
+
+ ulonglong value = sequence++;
+
+ switch (dtype_get_mtype(dtype)) {
+ case DATA_INT: {
+ ibool usign;
+ ulint len = dfield_get_len(dfield);
+
+ usign = dtype_get_prtype(dtype) & DATA_UNSIGNED;
+ mach_write_ulonglong(b, value, len, usign);
+
+ break;
+ }
+
+ case DATA_FLOAT:
+ mach_float_write(
+ b, static_cast<float>(value));
+ break;
+
+ case DATA_DOUBLE:
+ mach_double_write(
+ b, static_cast<double>(value));
+ break;
+
+ default:
+ ut_ad(0);
+ }
+ }
+
+write_buffers:
+ /* Build all entries for all the indexes to be created
+ in a single scan of the clustered index. */
+
+ for (ulint i = 0; i < n_index; i++) {
+ row_merge_buf_t* buf = merge_buf[i];
+ merge_file_t* file = &files[i];
+ ulint rows_added = 0;
+
+ if (UNIV_LIKELY
+ (row && (rows_added = row_merge_buf_add(
+ buf, fts_index, old_table,
+ psort_info, row, ext, &doc_id)))) {
+
+ /* If we are creating an FTS index,
+ a single row can generate multiple
+ records, one per tokenized word */
+ file->n_rec += rows_added;
+ if (doc_id > max_doc_id) {
+ max_doc_id = doc_id;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+ /* Check if an error occurred in a child thread */
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = i;
+ break;
+ }
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ continue;
+ }
+
+ if (buf->index->type & DICT_FTS) {
+ if (!row || !doc_id) {
+ continue;
+ }
+ }
+
+ /* The buffer must be sufficiently large
+ to hold at least one record. It may only
+ be empty when we reach the end of the
+ clustered index. row_merge_buf_add()
+ must not have been called in this loop. */
+ ut_ad(buf->n_tuples || row == NULL);
+
+ /* We have enough data tuples to form a block.
+ Sort them and write to disk. */
+
+ if (buf->n_tuples) {
+ if (dict_index_is_unique(buf->index)) {
+ row_merge_dup_t dup = {
+ buf->index, table, col_map, 0};
+
+ row_merge_buf_sort(buf, &dup);
+
+ if (dup.n_dup) {
+ err = DB_DUPLICATE_KEY;
+ trx->error_key_num
+ = key_numbers[i];
+ break;
+ }
+ } else {
+ row_merge_buf_sort(buf, NULL);
+ }
+ } else if (online && new_table == old_table) {
+ /* Note the newest transaction that
+ modified this index when the scan was
+ completed. We prevent older readers
+ from accessing this index, to ensure
+ read consistency. */
+
+ trx_id_t max_trx_id;
+
+ ut_a(row == NULL);
+ rw_lock_x_lock(
+ dict_index_get_lock(buf->index));
+ ut_a(dict_index_get_online_status(buf->index)
+ == ONLINE_INDEX_CREATION);
+
+ max_trx_id = row_log_get_max_trx(buf->index);
+
+ if (max_trx_id > buf->index->trx_id) {
+ buf->index->trx_id = max_trx_id;
+ }
+
+ rw_lock_x_unlock(
+ dict_index_get_lock(buf->index));
+ }
+
+ row_merge_buf_write(buf, file, block);
+
+ if (!row_merge_write(file->fd, file->offset++,
+ block)) {
+ err = DB_TEMP_FILE_WRITE_FAILURE;
+ trx->error_key_num = i;
+ break;
+ }
+
+ UNIV_MEM_INVALID(&block[0], srv_sort_buf_size);
+ merge_buf[i] = row_merge_buf_empty(buf);
+
+ if (UNIV_LIKELY(row != NULL)) {
+ /* Try writing the record again, now
+ that the buffer has been written out
+ and emptied. */
+
+ if (UNIV_UNLIKELY
+ (!(rows_added = row_merge_buf_add(
+ buf, fts_index, old_table,
+ psort_info, row, ext,
+ &doc_id)))) {
+ /* An empty buffer should have enough
+ room for at least one record. */
+ ut_error;
+ }
+
+ file->n_rec += rows_added;
+ }
+ }
+
+ if (row == NULL) {
+ goto all_done;
+ }
+
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ mem_heap_empty(row_heap);
+ }
+
+func_exit:
+ mtr_commit(&mtr);
+ mem_heap_free(row_heap);
+
+ if (nonnull) {
+ mem_free(nonnull);
+ }
+
+all_done:
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
+#endif
+ if (fts_pll_sort) {
+ bool all_exit = false;
+ ulint trial_count = 0;
+ const ulint max_trial_count = 10000;
+
+wait_again:
+ /* Check if an error occurred in a child thread */
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].error != DB_SUCCESS) {
+ err = psort_info[j].error;
+ trx->error_key_num = j;
+ break;
+ }
+ }
+
+ /* Tell all children that parent has done scanning */
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (err == DB_SUCCESS) {
+ psort_info[i].state = FTS_PARENT_COMPLETE;
+ } else {
+ psort_info[i].state = FTS_PARENT_EXITING;
+ }
+ }
+
+ /* Now wait for all children to report back as completed */
+ os_event_wait_time_low(fts_parallel_sort_event,
+ 1000000, sig_count);
+
+ for (ulint i = 0; i < fts_sort_pll_degree; i++) {
+ if (psort_info[i].child_status != FTS_CHILD_COMPLETE
+ && psort_info[i].child_status != FTS_CHILD_EXITING) {
+ sig_count = os_event_reset(
+ fts_parallel_sort_event);
+ goto wait_again;
+ }
+ }
+
+ /* All children should have completed by now; wait a bit
+ until they all finish setting the event before we free
+ everything. With up to 10000 trials of a 1 ms sleep, this
+ amounts to a 10 second timeout. */
+ do {
+ all_exit = true;
+
+ for (ulint j = 0; j < fts_sort_pll_degree; j++) {
+ if (psort_info[j].child_status
+ != FTS_CHILD_EXITING) {
+ all_exit = false;
+ os_thread_sleep(1000);
+ break;
+ }
+ }
+ trial_count++;
+ } while (!all_exit && trial_count < max_trial_count);
+
+ if (!all_exit) {
+ ut_ad(0);
+ ib_logf(IB_LOG_LEVEL_FATAL,
+ "Not all child sort threads exited"
+ " when creating FTS index '%s'",
+ fts_sort_idx->name);
+ }
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Tokenization\n");
+#endif
+ for (ulint i = 0; i < n_index; i++) {
+ row_merge_buf_free(merge_buf[i]);
+ }
+
+ row_fts_free_pll_merge_buf(psort_info);
+
+ mem_free(merge_buf);
+
+ btr_pcur_close(&pcur);
+
+ /* Update the next Doc ID we used. The table should be locked,
+ so there is no concurrent DML. */
+ if (max_doc_id && err == DB_SUCCESS) {
+ /* Sync fts cache for other fts indexes to keep all
+ fts indexes consistent in sync_doc_id. */
+ err = fts_sync_table(const_cast<dict_table_t*>(new_table));
+
+ if (err == DB_SUCCESS) {
+ fts_update_next_doc_id(
+ 0, new_table, old_table->name, max_doc_id);
+ }
+ }
+
+ trx->op_info = "";
+
+ DBUG_RETURN(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+@param N number of the buffer (0 or 1)
+@param INDEX record descriptor
+@param AT_END statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT(N, INDEX, AT_END) \
+ do { \
+ b2 = row_merge_write_rec(&block[2 * srv_sort_buf_size], \
+ &buf[2], b2, \
+ of->fd, &of->offset, \
+ mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \
+ goto corrupt; \
+ } \
+ b##N = row_merge_read_rec(&block[N * srv_sort_buf_size],\
+ &buf[N], b##N, INDEX, \
+ file->fd, foffs##N, \
+ &mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b##N)) { \
+ if (mrec##N) { \
+ goto corrupt; \
+ } \
+ AT_END; \
+ } \
+ } while (0)
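+
+/* Within row_merge_blocks() and row_merge_blocks_copy(), b0 and b1
+track the read positions in the two input portions of block[], and b2
+the write position in the output portion. The AT_END statement runs
+once the input list behind buffer N is exhausted. */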
+
+/*************************************************************//**
+Merge two blocks of records on disk and write a bigger block.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_blocks(
+/*=============*/
+ const row_merge_dup_t* dup, /*!< in: descriptor of
+ index being created */
+ const merge_file_t* file, /*!< in: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ ulint* foffs0, /*!< in/out: offset of first
+ source list in the file */
+ ulint* foffs1, /*!< in/out: offset of second
+ source list in the file */
+ merge_file_t* of) /*!< in/out: output file */
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ const byte* b1; /*!< pointer to block[srv_sort_buf_size] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
+ const mrec_t* mrec1; /*!< merge rec, points to
+ block[srv_sort_buf_size] or buf[1] */
+ ulint* offsets0;/* offsets of mrec0 */
+ ulint* offsets1;/* offsets of mrec1 */
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block) {
+ fprintf(stderr,
+ "row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
+ " = fd=%d ofs=%lu\n",
+ file->fd, (ulong) *foffs0,
+ file->fd, (ulong) *foffs1,
+ of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = row_merge_heap_create(dup->index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record, merging the two
+ input runs into the output file. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0])
+ || !row_merge_read(file->fd, *foffs1, &block[srv_sort_buf_size])) {
+corrupt:
+ mem_heap_free(heap);
+ return(DB_CORRUPTION);
+ }
+
+ b0 = &block[0];
+ b1 = &block[srv_sort_buf_size];
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(
+ &block[0], &buf[0], b0, dup->index,
+ file->fd, foffs0, &mrec0, offsets0);
+ b1 = row_merge_read_rec(
+ &block[srv_sort_buf_size],
+ &buf[srv_sort_buf_size], b1, dup->index,
+ file->fd, foffs1, &mrec1, offsets1);
+ if (UNIV_UNLIKELY(!b0 && mrec0)
+ || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+ goto corrupt;
+ }
+
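+ /* Merge the two sorted lists: write out the record with the
+ smaller key, then fetch the next record from that list. If the
+ keys compare equal, report a duplicate key. */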
+ while (mrec0 && mrec1) {
+ switch (cmp_rec_rec_simple(
+ mrec0, mrec1, offsets0, offsets1,
+ dup->index, dup->table)) {
+ case 0:
+ mem_heap_free(heap);
+ return(DB_DUPLICATE_KEY);
+ case -1:
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto merged);
+ break;
+ case 1:
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto merged);
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+merged:
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, dup->index, goto done0);
+ }
+ }
+done0:
+ if (mrec1) {
+ /* append all mrec1 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(1, dup->index, goto done1);
+ }
+ }
+done1:
+
+ mem_heap_free(heap);
+ b2 = row_merge_write_eof(&block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset);
+ return(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/*************************************************************//**
+Copy a block of index entries.
+@return TRUE on success, FALSE on failure */
+static __attribute__((nonnull, warn_unused_result))
+ibool
+row_merge_blocks_copy(
+/*==================*/
+ const dict_index_t* index, /*!< in: index being created */
+ const merge_file_t* file, /*!< in: input file */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ ulint* foffs0, /*!< in/out: input file offset */
+ merge_file_t* of) /*!< in/out: output file */
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ byte* b2; /*!< pointer to block[2 * srv_sort_buf_size] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] */
+ ulint* offsets0;/* offsets of mrec0 */
+ ulint* offsets1;/* dummy offsets */
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block) {
+ fprintf(stderr,
+ "row_merge_blocks_copy fd=%d ofs=%lu"
+ " = fd=%d ofs=%lu\n",
+ file->fd, (ulong) *foffs0,
+ of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+ /* Copy the records from the input run to the output file,
+ reading one block at a time. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0])) {
+corrupt:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ b0 = &block[0];
+
+ b2 = &block[2 * srv_sort_buf_size];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index,
+ file->fd, foffs0, &mrec0, offsets0);
+ if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+ goto corrupt;
+ }
+
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, index, goto done0);
+ }
+ }
+done0:
+
+ /* The file offset points to the beginning of the last page
+ that has been read. Update it to point to the next block. */
+ (*foffs0)++;
+
+ mem_heap_free(heap);
+ return(row_merge_write_eof(&block[2 * srv_sort_buf_size],
+ b2, of->fd, &of->offset)
+ != NULL);
+}
+
+/*************************************************************//**
+Perform one merge pass over the sorted runs in a temporary file.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull))
+dberr_t
+row_merge(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ const row_merge_dup_t* dup, /*!< in: descriptor of
+ index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ ulint* num_run,/*!< in/out: number of runs that
+ remain to be merged */
+ ulint* run_offset) /*!< in/out: array containing the
+ first offset number for each merge
+ run */
+{
+ ulint foffs0; /*!< first input offset */
+ ulint foffs1; /*!< second input offset */
+ dberr_t error; /*!< error code */
+ merge_file_t of; /*!< output file */
+ const ulint ihalf = run_offset[*num_run / 2];
+ /*!< half the input file */
+ ulint n_run = 0;
+ /*!< num of runs generated from this merge */
+
+ UNIV_MEM_ASSERT_W(&block[0], 3 * srv_sort_buf_size);
+
+ ut_ad(ihalf < file->offset);
+
+ of.fd = *tmpfd;
+ of.offset = 0;
+ of.n_rec = 0;
+
+#ifdef POSIX_FADV_SEQUENTIAL
+ /* The input file will be read sequentially, starting from the
+ beginning and from the middle. On Linux, POSIX_FADV_SEQUENTIAL
+ affects the entire file. Each block will be read exactly once. */
+ posix_fadvise(file->fd, 0, 0,
+ POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE);
+#endif /* POSIX_FADV_SEQUENTIAL */
+
+ /* Merge blocks to the output file. */
+ foffs0 = 0;
+ foffs1 = ihalf;
+
+ UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);
+
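+ /* Each iteration merges one run from the first half of the
+ input file with one run from the second half, so every pass
+ roughly halves the number of remaining runs. */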
+ for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ error = row_merge_blocks(dup, file, block,
+ &foffs0, &foffs1, &of);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ }
+
+ /* Copy the last blocks, if there are any. */
+
+ while (foffs0 < ihalf) {
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs0, &of)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs0 == ihalf);
+
+ while (foffs1 < file->offset) {
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(dup->index, file, block,
+ &foffs1, &of)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs1 == file->offset);
+
+ if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(n_run <= *num_run);
+
+ *num_run = n_run;
+
+ /* Each run can contain one or more offsets. As the merge goes
+ on, the number of runs to merge decreases until only a single
+ run remains. Thus the number of runs will never exceed the
+ number of offsets in the file. */
+ ut_ad((*num_run) <= file->offset);
+
+ /* The number of offsets in the output file is always less
+ than or equal to that of the input file. */
+ ut_ad(of.offset <= file->offset);
+
+ /* Swap file descriptors for the next pass. */
+ *tmpfd = file->fd;
+ *file = of;
+
+ UNIV_MEM_INVALID(&block[0], 3 * srv_sort_buf_size);
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Merge the sorted runs in a temporary file until a single run remains.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_sort(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const row_merge_dup_t* dup, /*!< in: descriptor of
+ index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd) /*!< in/out: temporary file handle */
+{
+ const ulint half = file->offset / 2;
+ ulint num_runs;
+ ulint* run_offset;
+ dberr_t error = DB_SUCCESS;
+ DBUG_ENTER("row_merge_sort");
+
+ /* Record the number of merge runs we need to perform */
+ num_runs = file->offset;
+
+ /* If there is at most one run, there is nothing to merge */
+ if (num_runs <= 1) {
+ DBUG_RETURN(error);
+ }
+
+ /* "run_offset" records each run's first offset number */
+ run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint));
+
+ /* This tells row_merge() where to start for the first round
+ of merge. */
+ run_offset[half] = half;
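+
+ /* On the first pass, row_merge() reads run_offset[num_runs / 2]
+ to find the midpoint of the file; later passes use the run
+ boundaries recorded during the preceding pass. */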
+
+ /* The file should always contain at least one byte (the end
+ of file marker). Thus, it must be at least one block. */
+ ut_ad(file->offset > 0);
+
+ /* Merge the runs until we have one big run */
+ do {
+ error = row_merge(trx, dup, file, block, tmpfd,
+ &num_runs, run_offset);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+
+ UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset);
+ } while (num_runs > 1);
+
+ mem_free(run_offset);
+
+ DBUG_RETURN(error);
+}
+
+/*************************************************************//**
+Copy externally stored columns to the data tuple. */
+static __attribute__((nonnull))
+void
+row_merge_copy_blobs(
+/*=================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ const ulint* offsets,/*!< in: offsets of mrec */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ mem_heap_t* heap) /*!< in/out: memory heap */
+{
+ ut_ad(rec_offs_any_extern(offsets));
+
+ for (ulint i = 0; i < dtuple_get_n_fields(tuple); i++) {
+ ulint len;
+ const void* data;
+ dfield_t* field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_is_ext(field)) {
+ continue;
+ }
+
+ ut_ad(!dfield_is_null(field));
+
+ /* During the creation of a PRIMARY KEY, the table is
+ X-locked, and we skip copying records that have been
+ marked for deletion. Therefore, externally stored
+ columns cannot possibly be freed between the time the
+ BLOB pointers are read (row_merge_read_clustered_index())
+ and dereferenced (below). */
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets, zip_size, i, &len, heap);
+ /* Because we have locked the table, any records
+ written by incomplete transactions must have been
+ rolled back already. There must not be any incomplete
+ BLOB columns. */
+ ut_a(data);
+
+ dfield_set_data(field, data, len);
+ }
+}
+
+/********************************************************************//**
+Read a sorted file containing index data tuples and insert these data
+tuples into the index.
+@return DB_SUCCESS or error number */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_insert_index_tuples(
+/*==========================*/
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ dict_index_t* index, /*!< in: index */
+ const dict_table_t* old_table,/*!< in: old table */
+ int fd, /*!< in: file descriptor */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ const byte* b;
+ mem_heap_t* heap;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* ins_heap;
+ dberr_t error = DB_SUCCESS;
+ ulint foffs = 0;
+ ulint* offsets;
+ mrec_buf_t* buf;
+ DBUG_ENTER("row_merge_insert_index_tuples");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(trx_id);
+
+ tuple_heap = mem_heap_create(1000);
+
+ {
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+ ins_heap = mem_heap_create(sizeof *buf + i * sizeof *offsets);
+ offsets = static_cast<ulint*>(
+ mem_heap_alloc(heap, i * sizeof *offsets));
+ offsets[0] = i;
+ offsets[1] = dict_index_get_n_fields(index);
+ }
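+
+ /* offsets[] follows the rec_get_offsets() convention: slot 0
+ holds the allocated size of the array and slot 1 the number of
+ index fields; the actual field offsets are filled in later. */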
+
+ b = block;
+
+ if (!row_merge_read(fd, foffs, block)) {
+ error = DB_CORRUPTION;
+ } else {
+ buf = static_cast<mrec_buf_t*>(
+ mem_heap_alloc(heap, sizeof *buf));
+
+ for (;;) {
+ const mrec_t* mrec;
+ dtuple_t* dtuple;
+ ulint n_ext;
+ big_rec_t* big_rec;
+ rec_t* rec;
+ btr_cur_t cursor;
+ mtr_t mtr;
+
+ b = row_merge_read_rec(block, buf, b, index,
+ fd, &foffs, &mrec, offsets);
+ if (UNIV_UNLIKELY(!b)) {
+ /* End of list, or I/O error */
+ if (mrec) {
+ error = DB_CORRUPTION;
+ }
+ break;
+ }
+
+ dict_index_t* old_index
+ = dict_table_get_first_index(old_table);
+
+ if (dict_index_is_clust(index)
+ && dict_index_is_online_ddl(old_index)) {
+ error = row_log_table_get_error(old_index);
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec, index, offsets, &n_ext, tuple_heap);
+
+ if (!n_ext) {
+ /* There are no externally stored columns. */
+ } else {
+ ut_ad(dict_index_is_clust(index));
+ /* Off-page columns can be fetched safely
+ when concurrent modifications to the table
+ are disabled. (Purge can process delete-marked
+ records, but row_merge_read_clustered_index()
+ would have skipped them.)
+
+ When concurrent modifications are enabled,
+ row_merge_read_clustered_index() will
+ only see rows from transactions that were
+ committed before the ALTER TABLE started
+ (REPEATABLE READ).
+
+ Any modifications after the
+ row_merge_read_clustered_index() scan
+ will go through row_log_table_apply().
+ Any modifications to off-page columns
+ will be tracked by
+ row_log_table_blob_alloc() and
+ row_log_table_blob_free(). */
+ row_merge_copy_blobs(
+ mrec, offsets,
+ dict_table_zip_size(old_table),
+ dtuple, tuple_heap);
+ }
+
+ ut_ad(dtuple_validate(dtuple));
+ log_free_check();
+
+ mtr_start(&mtr);
+ /* Insert after the last user record. */
+ btr_cur_open_at_index_side(
+ false, index, BTR_MODIFY_LEAF,
+ &cursor, 0, &mtr);
+ page_cur_position(
+ page_rec_get_prev(btr_cur_get_rec(&cursor)),
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_cur(&cursor));
+ cursor.flag = BTR_CUR_BINARY;
+#ifdef UNIV_DEBUG
+ /* Check that the records are inserted in order. */
+ rec = btr_cur_get_rec(&cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ ulint* rec_offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &tuple_heap);
+ ut_ad(cmp_dtuple_rec(dtuple, rec, rec_offsets)
+ > 0);
+ }
+#endif /* UNIV_DEBUG */
+ ulint* ins_offsets = NULL;
+
+ error = btr_cur_optimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG,
+ &cursor, &ins_offsets, &ins_heap,
+ dtuple, &rec, &big_rec, 0, NULL, &mtr);
+
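+ /* If the record did not fit on the leaf page, retry with a
+ pessimistic insert, which may split pages to make room. */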
+ if (error == DB_FAIL) {
+ ut_ad(!big_rec);
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_cur_open_at_index_side(
+ false, index, BTR_MODIFY_TREE,
+ &cursor, 0, &mtr);
+ page_cur_position(
+ page_rec_get_prev(btr_cur_get_rec(
+ &cursor)),
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_cur(&cursor));
+
+ error = btr_cur_pessimistic_insert(
+ BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG | BTR_CREATE_FLAG,
+ &cursor, &ins_offsets, &ins_heap,
+ dtuple, &rec, &big_rec, 0, NULL, &mtr);
+ }
+
+ if (!dict_index_is_clust(index)) {
+ page_update_max_trx_id(
+ btr_cur_get_block(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ trx_id, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ if (UNIV_LIKELY_NULL(big_rec)) {
+ /* If the system crashes at this
+ point, the clustered index record will
+ contain a null BLOB pointer. This
+ should not matter, because the copied
+ table will be dropped on crash
+ recovery anyway. */
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(error == DB_SUCCESS);
+ error = row_ins_index_entry_big_rec(
+ dtuple, big_rec,
+ ins_offsets, &ins_heap,
+ index, NULL, __FILE__, __LINE__);
+ dtuple_convert_back_big_rec(
+ index, dtuple, big_rec);
+ }
+
+ if (error != DB_SUCCESS) {
+ goto err_exit;
+ }
+
+ mem_heap_empty(tuple_heap);
+ mem_heap_empty(ins_heap);
+ }
+ }
+
+err_exit:
+ mem_heap_free(tuple_heap);
+ mem_heap_free(ins_heap);
+ mem_heap_free(heap);
+
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ sel_node_t* node;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = "setting table lock for creating or dropping index";
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = static_cast<que_thr_t*>(
+ que_fork_get_first_thr(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_QUE_THR_SUSPENDED) {
+ bool was_lock_wait;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+ } else {
+ que_thr_t* run_thr;
+ que_node_t* parent;
+
+ parent = que_node_get_parent(thr);
+
+ run_thr = que_fork_start_command(
+ static_cast<que_fork_t*>(parent));
+
+ ut_a(run_thr == thr);
+
+ /* There was a lock wait but the thread was not
+ in a ready to run or running state. */
+ trx->error_state = DB_LOCK_WAIT;
+
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drop an index that was created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+static
+void
+row_merge_drop_index_dict(
+/*======================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ index_id_t index_id)/*!< in: index identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "indexid", index_id);
+ trx->op_info = "dropping index from dictionary";
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: row_merge_drop_index_dict "
+ "failed with error code: %u.\n", (unsigned) error);
+ }
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes_dict(
+/*========================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ table_id_t table_id)/*!< in: table identifier */
+{
+ static const char sql[] =
+ "PROCEDURE DROP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID=:tableid AND\n"
+ " SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+
+ "END;\n";
+ dberr_t error;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* It is possible that table->n_ref_count > 1 when
+ locked=TRUE. In this case, all code that has an open
+ handle to the table should be waiting for the next statement
+ to execute, or waiting for a meta-data lock.
+
+ A concurrent purge will be prevented by dict_operation_lock. */
+
+ info = pars_info_create();
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ trx->op_info = "dropping indexes";
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: row_merge_drop_indexes_dict "
+ "failed with error code: %u.\n", (unsigned) error);
+ }
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in/out: dictionary transaction */
+ dict_table_t* table, /*!< in/out: table containing the indexes */
+ ibool locked) /*!< in: TRUE=table locked,
+ FALSE=may need to do a lazy drop */
+{
+ dict_index_t* index;
+ dict_index_t* next_index;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = dict_table_get_first_index(table);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_COMPLETE);
+
+ /* the caller should have an open handle to the table */
+ ut_ad(table->n_ref_count >= 1);
+
+ /* It is possible that table->n_ref_count > 1 when
+ locked=TRUE. In this case, all code that has an open
+ handle to the table should be waiting for the next statement
+ to execute, or waiting for a meta-data lock.
+
+ A concurrent purge will be prevented by dict_operation_lock. */
+
+ if (!locked && table->n_ref_count > 1) {
+ /* We will have to drop the indexes later, when the
+ table is guaranteed to be no longer in use. Mark the
+ indexes as incomplete and corrupted, so that other
+ threads will stop using them. Let dict_table_close()
+ or crash recovery or the next invocation of
+ prepare_inplace_alter_table() take care of dropping
+ the indexes. */
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ ut_ad(!dict_index_is_clust(index));
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ continue;
+ case ONLINE_INDEX_COMPLETE:
+ if (*index->name != TEMP_INDEX_PREFIX) {
+ /* Do nothing to already
+ published indexes. */
+ } else if (index->type & DICT_FTS) {
+ /* Drop a completed FULLTEXT
+ index, due to a timeout during
+ MDL upgrade for
+ commit_inplace_alter_table().
+ Because only concurrent reads
+ are allowed (and they are not
+ seeing this index yet) we
+ are safe to drop the index. */
+ dict_index_t* prev = UT_LIST_GET_PREV(
+ indexes, index);
+ /* At least there should be
+ the clustered index before
+ this one. */
+ ut_ad(prev);
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ /* Since
+ INNOBASE_SHARE::idx_trans_tbl
+ is shared between all open
+ ha_innobase handles to this
+ table, no thread should be
+ accessing this dict_index_t
+ object. Also, we should be
+ holding LOCK=SHARED MDL on the
+ table even after the MDL
+ upgrade timeout. */
+
+ /* We can remove a DICT_FTS
+ index from the cache, because
+ we do not allow ADD FULLTEXT INDEX
+ with LOCK=NONE. If we allowed that,
+ we should exclude FTS entries from
+ prebuilt->ins_node->entry_list
+ in ins_node_create_entry_list(). */
+ dict_index_remove_from_cache(
+ table, index);
+ index = prev;
+ } else {
+ rw_lock_x_lock(
+ dict_index_get_lock(index));
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED);
+ index->type |= DICT_CORRUPT;
+ table->drop_aborted = TRUE;
+ goto drop_aborted;
+ }
+ continue;
+ case ONLINE_INDEX_CREATION:
+ rw_lock_x_lock(dict_index_get_lock(index));
+ ut_ad(*index->name == TEMP_INDEX_PREFIX);
+ row_log_abort_sec(index);
+ drop_aborted:
+ rw_lock_x_unlock(dict_index_get_lock(index));
+
+ DEBUG_SYNC_C("merge_drop_index_after_abort");
+ /* covered by dict_sys->mutex */
+ MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
+ /* fall through */
+ case ONLINE_INDEX_ABORTED:
+ /* Drop the index tree from the
+ data dictionary and free it from
+ the tablespace, but keep the object
+ in the data dictionary cache. */
+ row_merge_drop_index_dict(trx, index->id);
+ rw_lock_x_lock(dict_index_get_lock(index));
+ dict_index_set_online_status(
+ index, ONLINE_INDEX_ABORTED_DROPPED);
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ table->drop_aborted = TRUE;
+ continue;
+ }
+ ut_error;
+ }
+
+ return;
+ }
+
+ row_merge_drop_indexes_dict(trx, table->id);
+
+ /* Invalidate all row_prebuilt_t::ins_graph that are referring
+ to this table. That is, force row_get_prebuilt_insert_row() to
+ rebuild prebuilt->ins_node->entry_list. */
+ ut_ad(table->def_trx_id <= trx->id);
+ table->def_trx_id = trx->id;
+
+ next_index = dict_table_get_next_index(index);
+
+ while ((index = next_index) != NULL) {
+ /* read the next pointer before freeing the index */
+ next_index = dict_table_get_next_index(index);
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* If it is FTS index, drop from table->fts
+ and also drop its auxiliary tables */
+ if (index->type & DICT_FTS) {
+ ut_a(table->fts);
+ fts_drop_index(table, index, trx);
+ }
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ /* This state should only be possible
+ when prepare_inplace_alter_table() fails
+ after invoking row_merge_create_index().
+ In inplace_alter_table(),
+ row_merge_build_indexes()
+ should never leave the index in this state.
+ It would invoke row_log_abort_sec() on
+ failure. */
+ case ONLINE_INDEX_COMPLETE:
+ /* In these cases, we are able to drop
+ the index straight. The DROP INDEX was
+ never deferred. */
+ break;
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ /* covered by dict_sys->mutex */
+ MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
+ }
+
+ dict_index_remove_from_cache(table, index);
+ }
+ }
+
+ table->drop_aborted = FALSE;
+ ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+ static const char sql[] =
+ "PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
+ "ixid CHAR;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR index_cur IS\n"
+ " SELECT ID FROM SYS_INDEXES\n"
+ " WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN index_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH index_cur INTO ixid;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=ixid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF index_cur;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE index_cur;\n"
+ "END;\n";
+ trx_t* trx;
+ dberr_t error;
+
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping partially created indexes";
+ row_mysql_lock_data_dictionary(trx);
+ /* Ensure that this transaction will be rolled back and locks
+ will be released, if the server gets killed before the commit
+ gets written to the redo log. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ trx->op_info = "dropping indexes";
+ error = que_eval_sql(NULL, sql, FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: row_merge_drop_temp_indexes "
+ "failed with error code: %u.\n", (unsigned) error);
+ }
+
+ trx_commit_for_mysql(trx);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/*********************************************************************//**
+Create a temporary merge file and, if UNIV_PFS_IO is defined, register
+the file descriptor with Performance Schema.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create_low(void)
+/*===========================*/
+{
+ int fd;
+#ifdef UNIV_PFS_IO
+ /* This temporary file open does not go through the normal
+ file APIs; add instrumentation to register it with
+ Performance Schema */
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+ register_pfs_file_open_begin(&state, locker, innodb_file_temp_key,
+ PSI_FILE_OPEN,
+ "Innodb Merge Temp File",
+ __FILE__, __LINE__);
+#endif
+ fd = innobase_mysql_tmpfile();
+#ifdef UNIV_PFS_IO
+ register_pfs_file_open_end(locker, fd);
+#endif
+
+ if (fd < 0) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create temporary merge file");
+ return (-1);
+ }
+ return(fd);
+}
+
+/*********************************************************************//**
+Create a merge file.
+@return file descriptor, or -1 on failure */
+UNIV_INTERN
+int
+row_merge_file_create(
+/*==================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+{
+ merge_file->fd = row_merge_file_create_low();
+ merge_file->offset = 0;
+ merge_file->n_rec = 0;
+
+ if (merge_file->fd >= 0) {
+ if (srv_disable_sort_file_cache) {
+ os_file_set_nocache(merge_file->fd,
+ "row0merge.cc", "sort");
+ }
+ }
+ return(merge_file->fd);
+}
+
+/*********************************************************************//**
+Destroy a merge file and, if UNIV_PFS_IO is defined, de-register the
+file from Performance Schema. */
+UNIV_INTERN
+void
+row_merge_file_destroy_low(
+/*=======================*/
+ int fd) /*!< in: merge file descriptor */
+{
+#ifdef UNIV_PFS_IO
+ struct PSI_file_locker* locker = NULL;
+ PSI_file_locker_state state;
+ register_pfs_file_io_begin(&state, locker,
+ fd, 0, PSI_FILE_CLOSE,
+ __FILE__, __LINE__);
+#endif
+ if (fd >= 0) {
+ close(fd);
+ }
+#ifdef UNIV_PFS_IO
+ register_pfs_file_io_end(locker, 0);
+#endif
+}
+/*********************************************************************//**
+Destroy a merge file. */
+UNIV_INTERN
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< in/out: merge file structure */
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (merge_file->fd != -1) {
+ row_merge_file_destroy_low(merge_file->fd);
+ merge_file->fd = -1;
+ }
+}
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+{
+ dberr_t err = DB_SUCCESS;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_index[] =
+ "PROCEDURE RENAME_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+ "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+ "END;\n";
+
+ ut_ad(trx);
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ trx->op_info = "renaming index to add";
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_ull_literal(info, "indexid", index_id);
+
+ err = que_eval_sql(info, rename_index, FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: row_merge_rename_index_to_add "
+ "failed with error code: %u.\n", (unsigned) err);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Rename an index in the dictionary that is to be dropped. The data
+dictionary must have been locked exclusively by the caller, because
+the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+dberr_t
+row_merge_rename_index_to_drop(
+/*===========================*/
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t table_id, /*!< in: table identifier */
+ index_id_t index_id) /*!< in: index identifier */
+{
+ dberr_t err;
+ pars_info_t* info = pars_info_create();
+
+ ut_ad(!srv_read_only_mode);
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_index[] =
+ "PROCEDURE RENAME_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=CONCAT('"
+ TEMP_INDEX_PREFIX_STR "',NAME)\n"
+ "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
+ "END;\n";
+
+ ut_ad(trx);
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ trx->op_info = "renaming index to drop";
+
+ pars_info_add_ull_literal(info, "tableid", table_id);
+ pars_info_add_ull_literal(info, "indexid", index_id);
+
+ err = que_eval_sql(info, rename_index, FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ /* Even though we ensure that DDL transactions are WAIT
+ and DEADLOCK free, we could encounter other errors e.g.,
+ DB_TOO_MANY_CONCURRENT_TRXS. */
+ trx->error_state = DB_SUCCESS;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: row_merge_rename_index_to_drop "
+ "failed with error code: %u.\n", (unsigned) err);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Provide a new pathname for a table that is being renamed if it belongs to
+a file-per-table tablespace. The caller is responsible for freeing the
+memory allocated for the return value.
+@return new pathname of the tablespace file */
+UNIV_INTERN
+char*
+row_make_new_pathname(
+/*==================*/
+ dict_table_t* table, /*!< in: table to be renamed */
+ const char* new_name) /*!< in: new name */
+{
+ char* new_path;
+ char* old_path;
+
+ ut_ad(table->space != TRX_SYS_SPACE);
+
+ old_path = fil_space_get_first_path(table->space);
+ ut_a(old_path);
+
+ new_path = os_file_make_new_pathname(old_path, new_name);
+
+ mem_free(old_path);
+
+ return(new_path);
+}
+
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_merge_rename_tables_dict(
+/*=========================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+{
+ dberr_t err = DB_ERROR;
+ pars_info_t* info;
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(old_table != new_table);
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE
+ || trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+
+ trx->op_info = "renaming tables";
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data in system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_name", new_table->name);
+ pars_info_add_str_literal(info, "old_name", old_table->name);
+ pars_info_add_str_literal(info, "tmp_name", tmp_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLES () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
+ " WHERE NAME = :old_name;\n"
+ "UPDATE SYS_TABLES SET NAME = :old_name\n"
+ " WHERE NAME = :new_name;\n"
+ "END;\n", FALSE, trx);
+
+ /* Update SYS_TABLESPACES and SYS_DATAFILES if the old
+ table is in a non-system tablespace where space > 0. */
+ if (err == DB_SUCCESS
+ && old_table->space != TRX_SYS_SPACE
+ && !old_table->ibd_file_missing) {
+ /* Make pathname to update SYS_DATAFILES. */
+ char* tmp_path = row_make_new_pathname(old_table, tmp_name);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "tmp_name", tmp_name);
+ pars_info_add_str_literal(info, "tmp_path", tmp_path);
+ pars_info_add_int4_literal(info, "old_space",
+ (lint) old_table->space);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_OLD_SPACE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLESPACES"
+ " SET NAME = :tmp_name\n"
+ " WHERE SPACE = :old_space;\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :tmp_path\n"
+ " WHERE SPACE = :old_space;\n"
+ "END;\n", FALSE, trx);
+
+ mem_free(tmp_path);
+ }
+
+ /* Update SYS_TABLESPACES and SYS_DATAFILES if the new
+ table is in a non-system tablespace where space > 0. */
+ if (err == DB_SUCCESS && new_table->space != TRX_SYS_SPACE) {
+ /* Make pathname to update SYS_DATAFILES. */
+ char* old_path = row_make_new_pathname(
+ new_table, old_table->name);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "old_name", old_table->name);
+ pars_info_add_str_literal(info, "old_path", old_path);
+ pars_info_add_int4_literal(info, "new_space",
+ (lint) new_table->space);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_NEW_SPACE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLESPACES"
+ " SET NAME = :old_name\n"
+ " WHERE SPACE = :new_space;\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :old_path\n"
+ " WHERE SPACE = :new_space;\n"
+ "END;\n", FALSE, trx);
+
+ mem_free(old_path);
+ }
+
+ if (err == DB_SUCCESS && dict_table_is_discarded(new_table)) {
+ err = row_import_update_discarded_flag(
+ trx, new_table->id, true, true);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create and execute a query graph for creating an index.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_merge_create_index_graph(
+/*=========================*/
+ trx_t* trx, /*!< in: trx */
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: index */
+{
+ ind_node_t* node; /*!< Index creation node */
+ mem_heap_t* heap; /*!< Memory heap */
+ que_thr_t* thr; /*!< Query thread */
+ dberr_t err;
+
+ ut_ad(trx);
+ ut_ad(table);
+ ut_ad(index);
+
+ heap = mem_heap_create(512);
+
+ index->table = table;
+ node = ind_create_graph_create(index, heap, false);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const index_def_t* index_def)
+ /*!< in: the index definition */
+{
+ dict_index_t* index;
+ dberr_t err;
+ ulint n_fields = index_def->n_fields;
+ ulint i;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* Create the index prototype, using the passed-in definition;
+ this is not a persistent operation. We pass 0 as the space id, and
+ the space id where the table is stored is determined at a lower
+ level. */
+
+ index = dict_mem_index_create(table->name, index_def->name,
+ 0, index_def->ind_type, n_fields);
+
+ ut_a(index);
+
+ for (i = 0; i < n_fields; i++) {
+ index_field_t* ifield = &index_def->fields[i];
+
+ dict_mem_index_add_field(
+ index, dict_table_get_col_name(table, ifield->col_no),
+ ifield->prefix_len);
+ }
+
+ /* Add the index to SYS_INDEXES, using the index prototype. */
+ err = row_merge_create_index_graph(trx, table, index);
+
+ if (err == DB_SUCCESS) {
+
+ index = dict_table_get_index_on_name(table, index_def->name);
+
+ ut_a(index);
+
+ /* Note the id of the transaction that created this
+ index, we use it to restrict readers from accessing
+ this index, to ensure read consistency. */
+ ut_ad(index->trx_id == trx->id);
+ } else {
+ index = NULL;
+ }
+
+ return(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index. */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+{
+ if (!dict_index_is_clust(index)
+ && dict_index_is_online_ddl(index)) {
+ /* Indexes that are being created are not usable. */
+ return(FALSE);
+ }
+
+ return(!dict_index_is_corrupted(index)
+ && (dict_table_is_temporary(index->table)
+ || !trx->read_view
+ || read_view_sees_trx_id(trx->read_view, index->trx_id)));
+}
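+
+/* Illustrative sketch (editorial note, not part of this change): a
+read path could guard its index choice with the check above; the
+helper name below is hypothetical. */
+#if 0
+static const dict_index_t*
+row_pick_usable_index(const trx_t* trx, const dict_index_t* index)
+{
+ /* Fall back to the clustered index when the secondary index is
+ still being built or is not visible to this read view. */
+ return(row_merge_is_index_usable(trx, index)
+ ? index
+ : dict_table_get_first_index(index->table));
+}
+#endif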
+
+/*********************************************************************//**
+Drop a table. The caller must have ensured that the background stats
+thread is not processing the table. This can be done by calling
+dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
+before calling this function.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table to drop */
+{
+ ut_ad(!srv_read_only_mode);
+
+ /* There must be no open transactions on the table. */
+ ut_a(table->n_ref_count == 0);
+
+ return(row_drop_table_for_mysql(table->name, trx, false, false));
+}
+
+/*********************************************************************//**
+Build indexes on a table by reading its clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting the sorted index entries into the indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ bool online, /*!< in: true if creating indexes
+ online */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ const ulint* key_numbers, /*!< in: MySQL key numbers */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ struct TABLE* table, /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+ const dtuple_t* add_cols, /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map, /*!< in: mapping of old column
+ numbers to new ones, or NULL
+ if old_table == new_table */
+ ulint add_autoinc, /*!< in: number of added
+ AUTO_INCREMENT column, or
+ ULINT_UNDEFINED if none is added */
+ ib_sequence_t& sequence) /*!< in: autoinc instance if
+ add_autoinc != ULINT_UNDEFINED */
+{
+ merge_file_t* merge_files;
+ row_merge_block_t* block;
+ ulint block_size;
+ ulint i;
+ ulint j;
+ dberr_t error;
+ int tmpfd = -1;
+ dict_index_t* fts_sort_idx = NULL;
+ fts_psort_t* psort_info = NULL;
+ fts_psort_t* merge_info = NULL;
+ ib_int64_t sig_count = 0;
+ bool fts_psort_initiated = false;
+ DBUG_ENTER("row_merge_build_indexes");
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad((old_table == new_table) == !col_map);
+ ut_ad(!add_cols || col_map);
+
+ /* Allocate memory for merge file data structure and initialize
+ fields */
+
+ block_size = 3 * srv_sort_buf_size;
+ block = static_cast<row_merge_block_t*>(
+ os_mem_alloc_large(&block_size));
+
+ if (block == NULL) {
+ DBUG_RETURN(DB_OUT_OF_MEMORY);
+ }
+
+ trx_start_if_not_started_xa(trx);
+
+ merge_files = static_cast<merge_file_t*>(
+ mem_alloc(n_indexes * sizeof *merge_files));
+
+ /* Initialize all the merge file descriptors, so that we
+ don't call row_merge_file_destroy() on uninitialized
+ merge file descriptor */
+
+ for (i = 0; i < n_indexes; i++) {
+ merge_files[i].fd = -1;
+ }
+
+ for (i = 0; i < n_indexes; i++) {
+ if (row_merge_file_create(&merge_files[i]) < 0) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ if (indexes[i]->type & DICT_FTS) {
+ ibool opt_doc_id_size = FALSE;
+
+ /* To build an FTS index we need to extract each
+ document's words, their Doc IDs and word positions,
+ so we build a "fts sort index" indexing on those
+ three 'fields' */
+ fts_sort_idx = row_merge_create_fts_sort_index(
+ indexes[i], old_table, &opt_doc_id_size);
+
+ row_merge_dup_t* dup = static_cast<row_merge_dup_t*>(
+ ut_malloc(sizeof *dup));
+ dup->index = fts_sort_idx;
+ dup->table = table;
+ dup->col_map = col_map;
+ dup->n_dup = 0;
+
+ row_fts_psort_info_init(
+ trx, dup, new_table, opt_doc_id_size,
+ &psort_info, &merge_info);
+
+ /* "We need to ensure that we free the resources
+ allocated */
+ fts_psort_initiated = true;
+ }
+ }
+
+ tmpfd = row_merge_file_create_low();
+
+ if (tmpfd < 0) {
+ error = DB_OUT_OF_MEMORY;
+ goto func_exit;
+ }
+
+ /* Reset the MySQL row buffer that is used when reporting
+ duplicate keys. */
+ innobase_rec_reset(table);
+
+ /* Read clustered index of the table and create files for
+ secondary index entries for merge sort */
+
+ error = row_merge_read_clustered_index(
+ trx, table, old_table, new_table, online, indexes,
+ fts_sort_idx, psort_info, merge_files, key_numbers,
+ n_indexes, add_cols, col_map,
+ add_autoinc, sequence, block);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ DEBUG_SYNC_C("row_merge_after_scan");
+
+ /* Now we have files containing index entries ready for
+ sorting and inserting. */
+
+ for (i = 0; i < n_indexes; i++) {
+ dict_index_t* sort_idx = indexes[i];
+
+ if (indexes[i]->type & DICT_FTS) {
+ os_event_t fts_parallel_merge_event;
+
+ sort_idx = fts_sort_idx;
+
+ fts_parallel_merge_event
+ = merge_info[0].psort_common->merge_event;
+
+ if (FTS_PLL_MERGE) {
+ ulint trial_count = 0;
+ bool all_exit = false;
+
+ os_event_reset(fts_parallel_merge_event);
+ row_fts_start_parallel_merge(merge_info);
+wait_again:
+ os_event_wait_time_low(
+ fts_parallel_merge_event, 1000000,
+ sig_count);
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX; j++) {
+ if (merge_info[j].child_status
+ != FTS_CHILD_COMPLETE
+ && merge_info[j].child_status
+ != FTS_CHILD_EXITING) {
+ sig_count = os_event_reset(
+ fts_parallel_merge_event);
+
+ goto wait_again;
+ }
+ }
+
+ /* All children should have completed by now; wait
+ a bit until they have all finished using the event */
+ while (!all_exit && trial_count < 10000) {
+ all_exit = true;
+
+ for (j = 0; j < FTS_NUM_AUX_INDEX;
+ j++) {
+ if (merge_info[j].child_status
+ != FTS_CHILD_EXITING) {
+ all_exit = false;
+ os_thread_sleep(1000);
+ break;
+ }
+ }
+ trial_count++;
+ }
+
+ if (!all_exit) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Not all child merge threads"
+ " exited when creating FTS"
+ " index '%s'",
+ indexes[i]->name);
+ }
+ } else {
+ /* This cannot report duplicates; an
+ assertion would fail in that case. */
+ error = row_fts_merge_insert(
+ sort_idx, new_table,
+ psort_info, 0);
+ }
+
+#ifdef FTS_INTERNAL_DIAG_PRINT
+ DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Insert\n");
+#endif
+ } else {
+ row_merge_dup_t dup = {
+ sort_idx, table, col_map, 0};
+
+ error = row_merge_sort(
+ trx, &dup, &merge_files[i],
+ block, &tmpfd);
+
+ if (error == DB_SUCCESS) {
+ error = row_merge_insert_index_tuples(
+ trx->id, sort_idx, old_table,
+ merge_files[i].fd, block);
+ }
+ }
+
+ /* Close the temporary file to free up space. */
+ row_merge_file_destroy(&merge_files[i]);
+
+ if (indexes[i]->type & DICT_FTS) {
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ } else if (error != DB_SUCCESS || !online) {
+ /* Do not apply any online log. */
+ } else if (old_table != new_table) {
+ ut_ad(!sort_idx->online_log);
+ ut_ad(sort_idx->online_status
+ == ONLINE_INDEX_COMPLETE);
+ } else {
+ DEBUG_SYNC_C("row_log_apply_before");
+ error = row_log_apply(trx, sort_idx, table);
+ DEBUG_SYNC_C("row_log_apply_after");
+ }
+
+ if (error != DB_SUCCESS) {
+ trx->error_key_num = key_numbers[i];
+ goto func_exit;
+ }
+
+ if (indexes[i]->type & DICT_FTS && fts_enable_diag_print) {
+ char* name = (char*) indexes[i]->name;
+
+ if (*name == TEMP_INDEX_PREFIX) {
+ name++;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Finished building "
+ "full-text index %s\n", name);
+ }
+ }
+
+func_exit:
+ DBUG_EXECUTE_IF(
+ "ib_build_indexes_too_many_concurrent_trxs",
+ error = DB_TOO_MANY_CONCURRENT_TRXS;
+ trx->error_state = error;);
+
+ if (fts_psort_initiated) {
+ /* Clean up FTS psort related resource */
+ row_fts_psort_info_destroy(psort_info, merge_info);
+ fts_psort_initiated = false;
+ }
+
+ row_merge_file_destroy_low(tmpfd);
+
+ for (i = 0; i < n_indexes; i++) {
+ row_merge_file_destroy(&merge_files[i]);
+ }
+
+ if (fts_sort_idx) {
+ dict_mem_index_free(fts_sort_idx);
+ }
+
+ mem_free(merge_files);
+ os_mem_free_large(block, block_size);
+
+ DICT_TF2_FLAG_UNSET(new_table, DICT_TF2_FTS_ADD_DOC_ID);
+
+ if (online && old_table == new_table && error != DB_SUCCESS) {
+ /* On error, flag all online secondary index creation
+ as aborted. */
+ for (i = 0; i < n_indexes; i++) {
+ ut_ad(!(indexes[i]->type & DICT_FTS));
+ ut_ad(*indexes[i]->name == TEMP_INDEX_PREFIX);
+ ut_ad(!dict_index_is_clust(indexes[i]));
+
+ /* Completed indexes should be dropped as
+ well, and indexes whose creation was aborted
+ should be dropped from the persistent
+ storage. However, at this point we can only
+ set some flags in the not-yet-published
+ indexes. These indexes will be dropped later
+ in row_merge_drop_indexes(), called by
+ rollback_inplace_alter_table(). */
+
+ switch (dict_index_get_online_status(indexes[i])) {
+ case ONLINE_INDEX_COMPLETE:
+ break;
+ case ONLINE_INDEX_CREATION:
+ rw_lock_x_lock(
+ dict_index_get_lock(indexes[i]));
+ row_log_abort_sec(indexes[i]);
+ indexes[i]->type |= DICT_CORRUPT;
+ rw_lock_x_unlock(
+ dict_index_get_lock(indexes[i]));
+ new_table->drop_aborted = TRUE;
+ /* fall through */
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ case ONLINE_INDEX_ABORTED:
+ MONITOR_MUTEX_INC(
+ &dict_sys->mutex,
+ MONITOR_BACKGROUND_DROP_INDEX);
+ }
+ }
+ }
+
+ DBUG_RETURN(error);
+}
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
new file mode 100644
index 00000000000..bf17673a036
--- /dev/null
+++ b/storage/innobase/row/row0mysql.cc
@@ -0,0 +1,5446 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.cc
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include <debug_sync.h>
+#include <my_dbug.h>
+
+#include <sql_const.h>
+#include "row0ins.h"
+#include "row0merge.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "srv0start.h"
+#include "row0import.h"
+#include "m_string.h"
+#include "my_sys.h"
+#include "ha_prototypes.h"
+#include <algorithm>
+
+/** Provide optional 4.x backwards compatibility for 5.0 and above */
+UNIV_INTERN ibool row_rollback_on_timeout = FALSE;
+
+/** Chain node of the list of tables to drop in the background. */
+struct row_mysql_drop_t{
+ char* table_name; /*!< table name */
+ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
+ /*!< list chain node */
+};
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register drop list mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t row_drop_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** @brief List of tables we should drop in background.
+
+ALTER TABLE in MySQL requires that the table handler can drop the
+table in background when there are no queries to it any
+more. Protected by row_drop_list_mutex. */
+static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+
+/** Mutex protecting the background table drop list. */
+static ib_mutex_t row_drop_list_mutex;
+
+/** Flag: has row_mysql_drop_list been initialized? */
+static ibool row_mysql_drop_list_inited = FALSE;
+
+/** Magic table names for invoking various monitor threads */
+/* @{ */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+#ifdef UNIV_MEM_DEBUG
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+#endif /* UNIV_MEM_DEBUG */
+/* @} */
+
+/** Evaluates to true if str1 equals str2_onstack, used for comparing
+the magic table names.
+@param str1 in: string to compare
+@param str1_len in: length of str1, in bytes, including terminating NUL
+@param str2_onstack in: char[] array containing a NUL terminated string
+@return TRUE if str1 equals str2_onstack */
+#define STR_EQ(str1, str1_len, str2_onstack) \
+ ((str1_len) == sizeof(str2_onstack) \
+ && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0)
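+
+/* Usage example (editorial note): STR_EQ(name, len, S_innodb_monitor)
+matches only when len counts the terminating NUL as well, i.e. when
+name is exactly the string "innodb_monitor". */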
+
+/*******************************************************************//**
+Determine if the given name is a name reserved for MySQL system tables.
+@return TRUE if name is a MySQL system table name */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+ const char* name)
+{
+ if (strncmp(name, "mysql/", 6) != 0) {
+
+ return(FALSE);
+ }
+
+ return(0 == strcmp(name + 6, "host")
+ || 0 == strcmp(name + 6, "user")
+ || 0 == strcmp(name + 6, "db"));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name); /*!< in: table name */
+
+/*******************************************************************//**
+Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+ if (srv_dml_needed_delay) {
+ os_thread_sleep(srv_dml_needed_delay);
+ }
+}
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+}
+
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
+
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen) /*!< in: storage length of len: either 1
+ or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
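+
+/* Worked example (editorial note): to store the 3-byte string "abc"
+with lenlen == 2, row_mysql_store_true_var_len() writes the length
+little-endian as 03 00 and returns dest + 2, where the caller copies
+'a' 'b' 'c'; with lenlen == 1 it writes the single byte 03.
+row_mysql_read_true_varchar() reverses this: it sets *len = 3 and
+returns a pointer just past the length bytes. */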
+
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+ /* MySQL might assume the field is zero-filled except for the
+ length and the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ memcpy(dest + col_len - 8, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len) /*!< in: BLOB reference length
+ (not BLOB length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
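+
+/* Worked example (editorial note): with col_len == 12 a reference
+consists of 12 - 8 == 4 length bytes followed by 8 pointer bytes.
+For a 300-byte BLOB at address p, row_mysql_store_blob_ref() writes
+2C 01 00 00 (little-endian 300) followed by the bytes of p; reading
+the reference back yields *len == 300 and returns p. */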
+
+/**************************************************************//**
+Pad a column with spaces. */
+UNIV_INTERN
+void
+row_mysql_pad_col(
+/*==============*/
+ ulint mbminlen, /*!< in: minimum size of a character,
+ in bytes */
+ byte* pad, /*!< out: padded buffer */
+ ulint len) /*!< in: number of bytes to pad */
+{
+ const byte* pad_end;
+
+ switch (UNIV_EXPECT(mbminlen, 1)) {
+ default:
+ ut_error;
+ case 1:
+ /* space=0x20 */
+ memset(pad, 0x20, len);
+ break;
+ case 2:
+ /* space=0x0020 */
+ pad_end = pad + len;
+ ut_a(!(len % 2));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ };
+ break;
+ case 4:
+ /* space=0x00000020 */
+ pad_end = pad + len;
+ ut_a(!(len % 4));
+ while (pad < pad_end) {
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x00;
+ *pad++ = 0x20;
+ }
+ break;
+ }
+}
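+
+/* Worked example (editorial note): with mbminlen == 2 and len == 6
+the buffer receives three UCS-2 spaces, 00 20 00 20 00 20; with
+mbminlen == 1 it is simply memset() to six 0x20 bytes. */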
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.cc.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! NOTE that dfield
+ may also get a pointer to 'buf',
+ therefore do not discard this as long
+ as dfield is used! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp) /*!< in: nonzero=compact format */
+{
+ const byte* ptr = mysql_data;
+ const dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated if the data is a signed integer. In MySQL,
+ integers are stored in a little-endian format. */
+
+ byte* p = buf + col_len;
+
+ for (;;) {
+ p--;
+ *p = *mysql_data;
+ if (p == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *buf ^= 128;
+ }
+
+ ptr = buf;
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored to 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle Unicode strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ switch (mbminlen) {
+ default:
+ ut_error;
+ case 4:
+ /* space=0x00000020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~3;
+
+ while (col_len >= 4
+ && ptr[col_len - 4] == 0x00
+ && ptr[col_len - 3] == 0x00
+ && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 4;
+ }
+ break;
+ case 2:
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ break;
+ case 1:
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from FIXED-length
+ CHAR columns in UTF-8 and other multibyte charsets, to save
+ space. UTF-8 would otherwise reserve 3 bytes per character
+ even for a plain ASCII string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string ".a "
+ where "." denotes a 3-byte character represented by
+ the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string ".abc "
+ will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.cc, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (type == DATA_BLOB && row_format_col) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
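+
+/* Worked example (editorial note): a signed 4-byte MySQL INT with
+value 5 arrives little-endian as 05 00 00 00. The byte-reversal loop
+above stores it big-endian as 00 00 00 05, and flipping the sign bit
+of the most significant byte gives 80 00 00 05. An unsigned byte-wise
+comparison then orders negatives below positives: -1 (FF FF FF FF)
+becomes 7F FF FF FF, which sorts below 80 00 00 05. */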
+
+/**************************************************************//**
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.cc. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /*!< in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /*!< in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ const mysql_row_templ_t*templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset]
+ & (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_null(dfield);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ prebuilt->ins_upd_rec_buff + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ dict_table_is_comp(prebuilt->table));
+next_column:
+ ;
+ }
+
+ /* If there is an FTS doc id column and it is not user supplied
+ (i.e. it is generated by the server), then assign it a new doc id. */
+ if (prebuilt->table->fts) {
+
+ ut_a(prebuilt->table->fts->doc_col != ULINT_UNDEFINED);
+
+ fts_create_doc_id(prebuilt->table, row, prebuilt->heap);
+ }
+}
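+
+/* Worked example (editorial note): for a template column with
+mysql_null_byte_offset == 0 and mysql_null_bit_mask == 0x04, the
+column above is treated as SQL NULL exactly when bit 2 of the first
+null-bitmap byte of mysql_rec is set. */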
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return true if it was a lock wait and we should continue running the
+query thread; in that case the thr is ALREADY in the running state. */
+UNIV_INTERN
+bool
+row_mysql_handle_errors(
+/*====================*/
+ dberr_t* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread, or NULL */
+ trx_savept_t* savept) /*!< in: savepoint, or NULL */
+{
+ dberr_t err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (row_rollback_on_timeout) {
+ trx_rollback_to_savepoint(trx, NULL);
+ break;
+ }
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_TOO_BIG_FOR_REDO:
+ case DB_UNDO_RECORD_TOO_BIG:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ case DB_READ_ONLY:
+ case DB_FTS_INVALID_DOCID:
+ case DB_INTERRUPTED:
+ case DB_DICT_CHANGED:
+ if (savept) {
+ /* Roll back the latest, possibly incomplete insertion
+ or update */
+
+ trx_rollback_to_savepoint(trx, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ break;
+ case DB_LOCK_WAIT:
+ lock_wait_suspend_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(true);
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx_rollback_to_savepoint(trx, NULL);
+ break;
+
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ fputs("InnoDB: The database cannot continue"
+ " operation because of\n"
+ "InnoDB: lack of space. You must add"
+ " a new data file to\n"
+ "InnoDB: my.cnf and restart the database.\n", stderr);
+
+ ut_ad(0);
+ exit(1);
+
+ case DB_CORRUPTION:
+ fputs("InnoDB: We detected index corruption"
+ " in an InnoDB type table.\n"
+ "InnoDB: You have to dump + drop + reimport"
+ " the table or, in\n"
+ "InnoDB: a case of widespread corruption,"
+ " dump all InnoDB\n"
+ "InnoDB: tables and recreate the"
+ " whole InnoDB tablespace.\n"
+ "InnoDB: If the mysqld server crashes"
+ " after the startup or when\n"
+ "InnoDB: you dump the tables, look at\n"
+ "InnoDB: " REFMAN "forcing-innodb-recovery.html"
+ " for help.\n", stderr);
+ break;
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ fprintf(stderr, "InnoDB: Cannot delete/update rows with"
+ " cascading foreign key constraints that exceed max"
+ " depth of %lu\n"
+ "Please drop excessive foreign constraints"
+ " and try again\n", (ulong) DICT_FK_MAX_RECURSIVE_LOAD);
+ break;
+ default:
+ fprintf(stderr, "InnoDB: unknown error code %lu\n",
+ (ulong) err);
+ ut_error;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(false);
+}
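+
+/* Illustrative sketch (editorial note): the canonical caller pattern
+for this function, as used by the *_for_mysql() functions below; the
+surrounding declarations are elided. */
+#if 0
+run_again:
+ thr->run_node = node;
+ row_ins_step(thr);
+ err = trx->error_state;
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+ if (row_mysql_handle_errors(&err, trx, thr, &savept)) {
+ /* It was a lock wait: thr is running again. */
+ goto run_again;
+ }
+ return(err);
+ }
+#endif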
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table, /*!< in: Innobase table handle */
+ ulint mysql_row_len) /*!< in: length in bytes of a row in
+ the MySQL format */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dict_index_t* temp_index;
+ dtuple_t* ref;
+ ulint ref_len;
+ uint srch_key_len = 0;
+ ulint search_tuple_n_fields;
+
+ search_tuple_n_fields = 2 * dict_table_get_n_cols(table);
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+
+ /* Maximum size of the buffer needed for conversion of INTs from
+ little endian format to big endian format in an index. An index
+ can have at most 16 columns (MAX_REF_PARTS) in it. Therefore the
+ max size for a PK is 16 * 8 bytes (BIGINT's size) = 128 bytes,
+ and for a secondary index 16 * 8 bytes + PK = 256 bytes. */
+#define MAX_SRCH_KEY_VAL_BUFFER 2 * (8 * MAX_REF_PARTS)
+
+#define PREBUILT_HEAP_INITIAL_SIZE \
+ ( \
+ sizeof(*prebuilt) \
+ /* allocd in this function */ \
+ + DTUPLE_EST_ALLOC(search_tuple_n_fields) \
+ + DTUPLE_EST_ALLOC(ref_len) \
+ /* allocd in row_prebuild_sel_graph() */ \
+ + sizeof(sel_node_t) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_update_vector() */ \
+ + sizeof(upd_node_t) \
+ + sizeof(upd_t) \
+ + sizeof(upd_field_t) \
+ * dict_table_get_n_cols(table) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ /* allocd in row_get_prebuilt_insert_row() */ \
+ + sizeof(ins_node_t) \
+ /* mysql_row_len could be huge and we are not \
+ sure if this prebuilt instance is going to be \
+ used in inserts */ \
+ + (mysql_row_len < 256 ? mysql_row_len : 0) \
+ + DTUPLE_EST_ALLOC(dict_table_get_n_cols(table)) \
+ + sizeof(que_fork_t) \
+ + sizeof(que_thr_t) \
+ )
+
+ /* Calculate size of key buffer used to store search key in
+ InnoDB format. MySQL stores INTs in little endian format and
+ InnoDB stores INTs in big endian format with the sign bit
+ flipped. All other field types are stored/compared the same
+ in MySQL and InnoDB, so we must create a buffer containing
+ the INT key parts in InnoDB format. We need two such buffers
+ since both start and end keys are used in records_in_range(). */
+
+ for (temp_index = dict_table_get_first_index(table); temp_index;
+ temp_index = dict_table_get_next_index(temp_index)) {
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(temp_index->n_user_defined_cols
+ == MAX_REF_PARTS););
+ uint temp_len = 0;
+ for (uint i = 0; i < temp_index->n_uniq; i++) {
+ if (temp_index->fields[i].col->mtype == DATA_INT) {
+ temp_len +=
+ temp_index->fields[i].fixed_len;
+ }
+ }
+ srch_key_len = max(srch_key_len, temp_len);
+ }
+
+ ut_a(srch_key_len <= MAX_SRCH_KEY_VAL_BUFFER);
+
+ DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
+ ut_a(srch_key_len == MAX_SRCH_KEY_VAL_BUFFER););
+
+ /* We allocate enough space for the objects that are likely to
+ be created later in order to minimize the number of malloc()
+ calls */
+ heap = mem_heap_create(PREBUILT_HEAP_INITIAL_SIZE + 2 * srch_key_len);
+
+ prebuilt = static_cast<row_prebuilt_t*>(
+ mem_heap_zalloc(heap, sizeof(*prebuilt)));
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->heap = heap;
+
+ prebuilt->srch_key_val_len = srch_key_len;
+ if (prebuilt->srch_key_val_len) {
+ prebuilt->srch_key_val1 = static_cast<byte*>(
+ mem_heap_alloc(prebuilt->heap,
+ 2 * prebuilt->srch_key_val_len));
+ prebuilt->srch_key_val2 = prebuilt->srch_key_val1 +
+ prebuilt->srch_key_val_len;
+ } else {
+ prebuilt->srch_key_val1 = NULL;
+ prebuilt->srch_key_val2 = NULL;
+ }
+
+ btr_pcur_reset(&prebuilt->pcur);
+ btr_pcur_reset(&prebuilt->clust_pcur);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE_UNSET;
+
+ prebuilt->search_tuple = dtuple_create(heap, search_tuple_n_fields);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ prebuilt->autoinc_error = DB_SUCCESS;
+ prebuilt->autoinc_offset = 0;
+
+ /* Default to 1, we will set the actual value later in
+ ha_innobase::get_auto_increment(). */
+ prebuilt->autoinc_increment = 1;
+
+ prebuilt->autoinc_last_value = 0;
+
+ /* During UPDATE and DELETE we need the doc id. */
+ prebuilt->fts_doc_id = 0;
+
+ prebuilt->mysql_row_len = mysql_row_len;
+
+ return(prebuilt);
+}
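+
+/* Worked example (editorial note): an index whose unique prefix is
+two BIGINT columns contributes 2 * 8 == 16 bytes to srch_key_len;
+the cap MAX_SRCH_KEY_VAL_BUFFER == 2 * (8 * MAX_REF_PARTS) == 256
+bytes covers the secondary-index worst case described above. The
+area is allocated twice over (srch_key_val1 and srch_key_val2)
+because records_in_range() converts both a start and an end key. */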
+
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+{
+ ulint i;
+
+ if (UNIV_UNLIKELY
+ (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+ || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu,"
+ " magic n2 %lu, table name ",
+ (ulong) prebuilt->magic_n,
+ (ulong) prebuilt->magic_n2);
+ ut_print_name(stderr, NULL, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_reset(&prebuilt->pcur);
+ btr_pcur_reset(&prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ if (prebuilt->fetch_cache[0] != NULL) {
+ byte* base = prebuilt->fetch_cache[0] - 4;
+ byte* ptr = base;
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ byte* row;
+ ulint magic1;
+ ulint magic2;
+
+ magic1 = mach_read_from_4(ptr);
+ ptr += 4;
+
+ row = ptr;
+ ptr += prebuilt->mysql_row_len;
+
+ magic2 = mach_read_from_4(ptr);
+ ptr += 4;
+
+ if (ROW_PREBUILT_FETCH_MAGIC_N != magic1
+ || row != prebuilt->fetch_cache[i]
+ || ROW_PREBUILT_FETCH_MAGIC_N != magic2) {
+
+ fputs("InnoDB: Error: trying to free"
+ " a corrupt fetch buffer.\n", stderr);
+
+ mem_analyze_corruption(base);
+ ut_error;
+ }
+ }
+
+ mem_free(base);
+ }
+
+ dict_table_close(prebuilt->table, dict_locked, TRUE);
+
+ mem_heap_free(prebuilt->heap);
+}
+
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (trx->magic_n != TRX_MAGIC_N) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: trx handle. Magic n %lu\n",
+ (ulong) trx->magic_n);
+
+ mem_analyze_corruption(trx);
+
+ ut_error;
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it.
+@return prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node != 0) {
+
+ /* Check if indexes have been dropped or added and we
+ may need to rebuild the row insert template. */
+
+ if (prebuilt->trx_id == table->def_trx_id
+ && UT_LIST_GET_LEN(prebuilt->ins_node->entry_list)
+ == UT_LIST_GET_LEN(table->indexes)) {
+
+ return(prebuilt->ins_node->row);
+ }
+
+ ut_ad(prebuilt->trx_id < table->def_trx_id);
+
+ que_graph_free_recursive(prebuilt->ins_graph);
+
+ prebuilt->ins_graph = 0;
+ }
+
+ /* Create an insert node and query graph to the prebuilt struct */
+
+ ins_node_t* node;
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == 0) {
+ prebuilt->ins_upd_rec_buff = static_cast<byte*>(
+ mem_heap_alloc(
+ prebuilt->heap,
+ prebuilt->mysql_row_len));
+ }
+
+ dtuple_t* row;
+
+ row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ node,
+ prebuilt->trx, prebuilt->heap)));
+
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+ prebuilt->trx_id = table->def_trx_id;
+
+ return(prebuilt->ins_node->row);
+}
+
+/*********************************************************************//**
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ib_uint64_t counter;
+ ib_uint64_t n_rows;
+
+ if (!table->stat_initialized) {
+ DBUG_EXECUTE_IF(
+ "test_upd_stats_if_needed_not_inited",
+ fprintf(stderr, "test_upd_stats_if_needed_not_inited "
+ "was executed\n");
+ );
+ return;
+ }
+
+ counter = table->stat_modified_counter++;
+ n_rows = dict_table_get_n_rows(table);
+
+ if (dict_stats_is_persistent_enabled(table)) {
+ if (counter > n_rows / 10 /* 10% */
+ && dict_stats_auto_recalc_is_enabled(table)) {
+
+ dict_stats_recalc_pool_add(table);
+ table->stat_modified_counter = 0;
+ }
+ return;
+ }
+
+ /* Calculate new statistics if 1 / 16 of table has been modified
+ since the last time a statistics batch was run.
+ We calculate statistics at most every 16th round, since we may have
+ a counter table which is very small and updated very often. */
+
+ if (counter > 16 + n_rows / 16 /* 6.25% */) {
+
+ ut_ad(!mutex_own(&dict_sys->mutex));
+ /* this will reset table->stat_modified_counter to 0 */
+ dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
+ }
+}
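+
+/* Worked example (editorial note): for a table with n_rows == 1600,
+transient statistics are recalculated once the modification counter
+exceeds 16 + 1600 / 16 == 116 rows; with persistent statistics and
+auto-recalc enabled, the threshold is 1600 / 10 == 160 rows. */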
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ const dict_table_t* table = prebuilt->table;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+
+ /* If we already hold an AUTOINC lock on the table then do nothing.
+ Note: We peek at the value of the current owner without acquiring
+ the lock mutex. */
+ if (trx == table->autoinc_trx) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx);
+
+ err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode) /*!< in: lock mode of table
+ (ignored if table==NULL) */
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(trx);
+
+ if (table) {
+ err = lock_table(
+ 0, table,
+ static_cast<enum lock_mode>(mode), thr);
+ } else {
+ err = lock_table(
+ 0, prebuilt->table,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type),
+ thr);
+ }
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ dberr_t err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(trx);
+
+ if (dict_table_is_discarded(prebuilt->table)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "The table %s doesn't have a corresponding "
+ "tablespace, it was discarded.",
+ prebuilt->table->name);
+
+ return(DB_TABLESPACE_DELETED);
+
+ } else if (prebuilt->table->ibd_file_missing) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ ".ibd file is missing for table %s",
+ prebuilt->table->name);
+
+ return(DB_TABLESPACE_NOT_FOUND);
+
+ } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ } else if (srv_created_new_raw || srv_force_recovery) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that"
+ " newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+ if (srv_force_recovery) {
+ return(DB_READ_ONLY);
+ }
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started_xa(trx);
+
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+error_exit:
+ que_thr_stop_for_mysql(thr);
+
+ /* FIXME: What's this? */
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, &savept);
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES
+ || node->state == INS_NODE_ALLOC_ROW_ID);
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ if (dict_table_has_fts_index(table)) {
+ doc_id_t doc_id;
+
+ /* Extract the doc id from the hidden FTS column */
+ doc_id = fts_get_doc_id_from_row(table, node->row);
+
+ if (doc_id <= 0) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID must be large than 0 \n");
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ doc_id_t next_doc_id
+ = table->fts->cache->next_doc_id;
+
+ if (doc_id < next_doc_id) {
+ fprintf(stderr,
+ "InnoDB: FTS Doc ID must be large than"
+ " " UINT64PF " for table",
+ next_doc_id - 1);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ putc('\n', stderr);
+
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+
+ /* The difference between Doc IDs is restricted to
+ a 4-byte integer. See fts_get_encoded_len() */
+
+ if (doc_id - next_doc_id >= FTS_DOC_ID_MAX_STEP) {
+ fprintf(stderr,
+ "InnoDB: Doc ID " UINT64PF " is too"
+ " big. Its difference with largest"
+ " used Doc ID " UINT64PF " cannot"
+ " exceed or equal to %d\n",
+ doc_id, next_doc_id - 1,
+ FTS_DOC_ID_MAX_STEP);
+ err = DB_FTS_INVALID_DOCID;
+ trx->error_state = DB_FTS_INVALID_DOCID;
+ goto error_exit;
+ }
+ }
+
+ /* Pass NULL for the columns affected, since an INSERT affects
+ all FTS indexes. */
+ fts_trx_add_op(trx, table, doc_id, FTS_INSERT, NULL);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ srv_stats.n_rows_inserted.add((size_t)trx->id, 1);
+
+ /* Not protected by dict_table_stats_lock() for performance
+ reasons, we would rather get garbage in stat_n_rows (which is
+ just an estimate anyway) than protecting the following code
+ with a latch. */
+ dict_table_n_rows_inc(table);
+
+ row_update_statistics_if_needed(table);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ static_cast<sel_node_t*>(node),
+ prebuilt->trx, prebuilt->heap)));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap) /*!< in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ node = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ prebuilt->upd_graph = static_cast<que_fork_t*>(
+ que_node_get_parent(
+ pars_complete_graph_for_exec(
+ static_cast<upd_node_t*>(node),
+ prebuilt->trx, prebuilt->heap)));
+
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/********************************************************************
+Handle an update of a column that has an FTS index. */
+static
+void
+row_fts_do_update(
+/*==============*/
+ trx_t* trx, /* in: transaction */
+ dict_table_t* table, /* in: Table with FTS index */
+ doc_id_t old_doc_id, /* in: old document id */
+ doc_id_t new_doc_id) /* in: new document id */
+{
+ if (trx->fts_next_doc_id) {
+ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+ fts_trx_add_op(trx, table, new_doc_id, FTS_INSERT, NULL);
+ }
+}
+
+/************************************************************************
+Handles FTS matters for an update or a delete.
+NOTE: should not be called if the table does not have an FTS index. */
+static
+dberr_t
+row_fts_update_or_delete(
+/*=====================*/
+ row_prebuilt_t* prebuilt) /* in: prebuilt struct in MySQL
+ handle */
+{
+ trx_t* trx = prebuilt->trx;
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node = prebuilt->upd_node;
+ doc_id_t old_doc_id = prebuilt->fts_doc_id;
+
+ ut_a(dict_table_has_fts_index(prebuilt->table));
+
+ /* Deletes are simple; get them out of the way first. */
+ if (node->is_delete) {
+ /* A delete affects all FTS indexes, so we pass NULL */
+ fts_trx_add_op(trx, table, old_doc_id, FTS_DELETE, NULL);
+ } else {
+ doc_id_t new_doc_id;
+
+ new_doc_id = fts_read_doc_id((byte*) &trx->fts_next_doc_id);
+
+ if (new_doc_id == 0) {
+ fprintf(stderr, " InnoDB FTS: Doc ID cannot be 0 \n");
+ return(DB_FTS_INVALID_DOCID);
+ }
+
+ row_fts_do_update(trx, table, old_doc_id, new_doc_id);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Initialize the Doc ID system for an FK table with an FTS index */
+static
+void
+init_fts_doc_id_for_ref(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ ulint* depth) /*!< in: recusive call depth */
+{
+ dict_foreign_t* foreign;
+
+ table->fk_max_recusive_level = 0;
+
+ (*depth)++;
+
+ /* Limit on tables involved in cascading delete/update */
+ if (*depth > FK_MAX_CASCADE_DEL) {
+ return;
+ }
+
+ /* Loop through this table's referenced list and also
+ recursively traverse each table's foreign table list */
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ if (foreign->foreign_table == NULL) {
+ break;
+ }
+
+ if (foreign->foreign_table->fts != NULL) {
+ fts_init_doc_id(foreign->foreign_table);
+ }
+
+ if (!foreign->foreign_table->referenced_set.empty()
+ && foreign->foreign_table != table) {
+ init_fts_doc_id_for_ref(
+ foreign->foreign_table, depth);
+ }
+ }
+}
+
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ dberr_t err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+ /* ulint ref_len; */
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+ ulint fk_depth = 0;
+
+ ut_ad(prebuilt && trx);
+ UT_NOT_USED(mysql_rec);
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+		if (srv_force_recovery) {
+ return(DB_READ_ONLY);
+ }
+ return(DB_ERROR);
+ }
+
+ DEBUG_SYNC_C("innodb_row_update_for_mysql_begin");
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started_xa(trx);
+
+ if (dict_table_is_referenced_by_foreign_key(table)) {
+		/* Share-lock the data dictionary to prevent any
+		change to the table dictionary (for foreign
+		constraints). This is similar to
+		row_ins_check_foreign_constraint(), whose check is
+		protected by the dictionary lock as well.
+		In the future, this can be removed once foreign
+		key MDL is implemented. */
+ row_mysql_freeze_data_dictionary(trx);
+ init_fts_doc_id_for_ref(table, &fk_depth);
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->pcur.btr_cur.index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur, &prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ &prebuilt->clust_pcur);
+ }
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+ thr->fk_cascade_depth = 0;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ /* Reset fk_cascade_depth back to 0 */
+ thr->fk_cascade_depth = 0;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+
+ return(err);
+ }
+
+		thr->lock_state = QUE_THR_LOCK_ROW;
+
+ DEBUG_SYNC(trx->mysql_thd, "row_update_for_mysql_error");
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (dict_table_has_fts_index(table)
+ && trx->fts_next_doc_id != UINT64_UNDEFINED) {
+ err = row_fts_update_or_delete(prebuilt);
+ if (err != DB_SUCCESS) {
+ trx->op_info = "";
+ return(err);
+ }
+ }
+
+ if (node->is_delete) {
+ /* Not protected by dict_table_stats_lock() for performance
+ reasons, we would rather get garbage in stat_n_rows (which is
+ just an estimate anyway) than protecting the following code
+ with a latch. */
+ dict_table_n_rows_dec(prebuilt->table);
+
+ srv_stats.n_rows_deleted.add((size_t)trx->id, 1);
+ } else {
+ srv_stats.n_rows_updated.add((size_t)trx->id, 1);
+ }
+
+	/* We update table statistics only if it is a DELETE or an UPDATE
+	that changes indexed columns; UPDATEs that change only non-indexed
+	columns do not affect statistics. */
+ if (node->is_delete || !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ row_update_statistics_if_needed(prebuilt->table);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
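+
+/* Illustrative sketch, not part of the original file: the run_again
+label above implements the usual lock-wait retry idiom of this file,
+roughly equivalent to
+
+	do {
+		row_upd_step(thr);
+		err = trx->error_state;
+	} while (err != DB_SUCCESS
+		 && row_mysql_handle_errors(&err, trx, thr, &savept));
+*/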
+
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set. */
+UNIV_INTERN
+void
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs)/*!< in: TRUE if called so
+ that we have the latches on
+ the records under pcur and
+ clust_pcur, and we do not need
+ to reposition the cursors. */
+{
+ btr_pcur_t* pcur = &prebuilt->pcur;
+ btr_pcur_t* clust_pcur = &prebuilt->clust_pcur;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+
+ if (UNIV_UNLIKELY
+ (!srv_locks_unsafe_for_binlog
+ && trx->isolation_level > TRX_ISO_READ_COMMITTED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: calling row_unlock_for_mysql though\n"
+ "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n"
+ "InnoDB: this session is not using"
+ " READ COMMITTED isolation level.\n");
+ return;
+ }
+
+ trx->op_info = "unlock_row";
+
+ if (prebuilt->new_rec_locks >= 1) {
+
+ const rec_t* rec;
+ dict_index_t* index;
+ trx_id_t rec_trx_id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* Restore the cursor position and find the record */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ if (prebuilt->new_rec_locks >= 2) {
+ /* Restore the cursor position and find the record
+ in the clustered index. */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ clust_pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+ }
+
+ if (!dict_index_is_clust(index)) {
+ /* This is not a clustered index record. We
+ do not know how to unlock the record. */
+ goto no_unlock;
+ }
+
+ /* If the record has been modified by this
+ transaction, do not unlock it. */
+
+ if (index->trx_id_offset) {
+ rec_trx_id = trx_read_trx_id(rec
+ + index->trx_id_offset);
+ } else {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ if (rec_trx_id != trx->id) {
+ /* We did not update the record: unlock it */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ lock_rec_unlock(
+ trx,
+ btr_pcur_get_block(pcur),
+ rec,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type));
+
+ if (prebuilt->new_rec_locks >= 2) {
+ rec = btr_pcur_get_rec(clust_pcur);
+
+ lock_rec_unlock(
+ trx,
+ btr_pcur_get_block(clust_pcur),
+ rec,
+ static_cast<enum lock_mode>(
+ prebuilt->select_lock_type));
+ }
+ }
+no_unlock:
+ mtr_commit(&mtr);
+ }
+
+ trx->op_info = "";
+}
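+
+/* Illustrative sketch, not part of the original file: under READ
+COMMITTED, a row fetched by row_search_for_mysql() that turns out not
+to match the WHERE clause can be unlocked immediately:
+
+	row_unlock_for_mysql(prebuilt, FALSE);
+
+This relies on prebuilt->new_rec_locks having been set by the
+preceding search. */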
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+{
+ dberr_t err;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ /* Increment fk_cascade_depth to record the recursive call depth on
+ a single update/delete that affects multiple tables chained
+ together with foreign key relations. */
+ thr->fk_cascade_depth++;
+
+ if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
+ return(DB_FOREIGN_EXCEED_MAX_CASCADE);
+ }
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ DEBUG_SYNC_C("foreign_constraint_update_cascade");
+
+ row_upd_step(thr);
+
+	/* The recursive call for a cascading update/delete happens
+	in row_upd_step() above. Reset the counter once we come
+	out of the recursive call, so that it does not accumulate
+	across different row deletes. */
+ thr->fk_cascade_depth = 0;
+
+ err = trx->error_state;
+
+ /* Note that the cascade node is a subnode of another InnoDB
+ query graph node. We do a normal lock wait in this node, but
+ all errors are handled by the parent node. */
+
+ if (err == DB_LOCK_WAIT) {
+ /* Handle lock wait here */
+
+ que_thr_stop_for_mysql(thr);
+
+ lock_wait_suspend_thread(thr);
+
+ /* Note that a lock wait may also end in a lock wait timeout,
+ or this transaction is picked as a victim in selective
+ deadlock resolution */
+
+ if (trx->error_state != DB_SUCCESS) {
+
+ return(trx->error_state);
+ }
+
+ /* Retry operation after a normal lock wait */
+
+ goto run_again;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (node->is_delete) {
+ /* Not protected by dict_table_stats_lock() for performance
+ reasons, we would rather get garbage in stat_n_rows (which is
+ just an estimate anyway) than protecting the following code
+ with a latch. */
+ dict_table_n_rows_dec(table);
+
+ srv_stats.n_rows_deleted.add((size_t)trx->id, 1);
+ } else {
+ srv_stats.n_rows_updated.add((size_t)trx->id, 1);
+ }
+
+ row_update_statistics_if_needed(table);
+
+ return(err);
+}
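+
+/* Illustrative note, not part of the original file: with a chain of
+tables linked by ON DELETE CASCADE, each nested row_upd_step() re-enters
+this function, so thr->fk_cascade_depth counts 1, 2, 3, ... and the
+operation fails with DB_FOREIGN_EXCEED_MAX_CASCADE as soon as the depth
+exceeds FK_MAX_CASCADE_DEL. */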
+
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ const dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS);
+}
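+
+/* Illustrative note, not part of the original file: when a table has no
+user-defined primary key, InnoDB generates a clustered index on the
+hidden DB_ROW_ID column, whose main type is DATA_SYS; the check above
+exploits exactly that. */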
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ rw_lock_s_lock_inline(&dict_operation_lock, 0, file, line);
+
+ trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+
+ ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+ rw_lock_s_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks or lock waits can occur then in these operations */
+
+ rw_lock_x_lock_inline(&dict_operation_lock, 0, file, line);
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ mutex_exit(&(dict_sys->mutex));
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
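+
+/* Illustrative sketch, not part of the original file: the four functions
+above are used in strictly paired fashion, with
+trx->dict_operation_lock_mode recording the latch mode that is held:
+
+	row_mysql_freeze_data_dictionary(trx);		shared: FK checks etc.
+	...
+	row_mysql_unfreeze_data_dictionary(trx);
+
+	row_mysql_lock_data_dictionary(trx);		exclusive: DDL
+	...
+	row_mysql_unlock_data_dictionary(trx);
+*/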
+
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate(). On failure the transaction will
+be rolled back and the 'table' object will be freed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed, or on DB_SUCCESS
+ added to the data dictionary cache) */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit) /*!< in: if true, commit the transaction */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ const char* table_name;
+ ulint table_name_len;
+ dberr_t err;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_start_of_row_create_table_for_mysql",
+ goto err_exit;
+ );
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+err_exit:
+ dict_mem_table_free(table);
+
+ if (commit) {
+ trx_commit_for_mysql(trx);
+ }
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "creating table";
+
+ if (row_mysql_is_system_table(table->name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL system"
+ " table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ table->name);
+ goto err_exit;
+ }
+
+ trx_start_if_not_started_xa(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = dict_remove_db_name(table->name);
+ table_name_len = strlen(table_name) + 1;
+
+ if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
+
+ /* Table equals "innodb_monitor":
+ start monitor prints */
+
+ srv_print_innodb_monitor = TRUE;
+
+ /* The lock timeout monitor thread also takes care
+ of InnoDB monitor prints */
+
+ os_event_set(lock_sys->timeout_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_lock_monitor)) {
+
+ srv_print_innodb_monitor = TRUE;
+ srv_print_innodb_lock_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+#ifdef UNIV_MEM_DEBUG
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_mem_validate)) {
+ /* We define here a debugging feature intended for
+ developers */
+
+ fputs("Validating InnoDB memory:\n"
+ "to use this feature you must compile InnoDB with\n"
+ "UNIV_MEM_DEBUG defined in univ.i and"
+ " the server must be\n"
+ "quiet because allocation from a mem heap"
+ " is not protected\n"
+ "by any semaphore.\n", stderr);
+ ut_a(mem_validate());
+ fputs("Memory validated\n", stderr);
+#endif /* UNIV_MEM_DEBUG */
+ }
+
+ heap = mem_heap_create(512);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ case TRX_DICT_OP_TABLE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* If the transaction was previously flagged as
+ TRX_DICT_OP_INDEX, we should be creating auxiliary
+ tables for full-text indexes. */
+ ut_ad(strstr(table->name, "/FTS_") != NULL);
+ }
+
+ node = tab_create_graph_create(table, heap, commit);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (table->space != TRX_SYS_SPACE) {
+ ut_a(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_TABLESPACE));
+
+ /* Update SYS_TABLESPACES and SYS_DATAFILES if a new
+ tablespace was created. */
+ if (err == DB_SUCCESS) {
+ char* path;
+ path = fil_space_get_first_path(table->space);
+
+ err = dict_create_add_tablespace_to_dictionary(
+ table->space, table->name,
+ fil_space_get_flags(table->space),
+ path, trx, commit);
+
+ mem_free(path);
+ }
+
+ if (err != DB_SUCCESS) {
+ /* We must delete the link file. */
+ fil_delete_link_file(table->name);
+ }
+ }
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: cannot create table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" because tablespace full\n", stderr);
+
+ if (dict_table_open_on_name(table->name, TRUE, FALSE,
+ DICT_ERR_IGNORE_NONE)) {
+
+ /* Make things easy for the drop table code. */
+
+ if (table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(table);
+ }
+
+ dict_table_close(table, TRUE, FALSE);
+
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+
+ if (commit) {
+ trx_commit_for_mysql(trx);
+ }
+ } else {
+ dict_mem_table_free(table);
+ }
+
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* We already have an .ibd file here; it should be deleted. */
+
+ if (table->space
+ && fil_delete_tablespace(
+ table->space,
+ BUF_REMOVE_FLUSH_NO_WRITE)
+ != DB_SUCCESS) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: not able to"
+ " delete tablespace %lu of table ",
+ (ulong) table->space);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("!\n", stderr);
+ }
+ /* fall through */
+
+ case DB_DUPLICATE_KEY:
+ case DB_TABLESPACE_EXISTS:
+ default:
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ dict_mem_table_free(table);
+ break;
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return(err);
+}
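+
+/* Illustrative sketch, not part of the original file: on every error
+path above, the passed-in 'table' object is freed (directly, or via the
+drop-table path) and the transaction is rolled back, so a caller must
+not touch the object after a failure:
+
+	err = row_create_table_for_mysql(table, trx, true);
+	if (err != DB_SUCCESS) {
+		table = NULL;	already freed by the callee
+	}
+*/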
+
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently a failure
+to create an index results in dropping the whole table! This is not a
+problem as long as all indexes must be created at the same time as the
+table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ ulint i;
+ ulint len;
+ char* table_name;
+ char* index_name;
+ dict_table_t* table;
+ ibool is_fts;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx->op_info = "creating index";
+
+ /* Copy the table name because we may want to drop the
+ table later, after the index object is freed (inside
+ que_run_threads()) and thus index->table_name is not available. */
+ table_name = mem_strdup(index->table_name);
+ index_name = mem_strdup(index->name);
+
+ is_fts = (index->type == DICT_FTS);
+
+ table = dict_table_open_on_name(table_name, TRUE, TRUE,
+ DICT_ERR_IGNORE_NONE);
+
+ trx_start_if_not_started_xa(trx);
+
+ for (i = 0; i < index->n_def; i++) {
+ /* Check that prefix_len and actual length
+ < DICT_MAX_INDEX_COL_LEN */
+
+ len = dict_index_get_nth_field(index, i)->prefix_len;
+
+ if (field_lengths && field_lengths[i]) {
+ len = ut_max(len, field_lengths[i]);
+ }
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_at_create_index",
+ len = DICT_MAX_FIELD_LEN_BY_FORMAT(table) + 1;
+ );
+
+ /* Column or prefix length exceeds maximum column length */
+ if (len > (ulint) DICT_MAX_FIELD_LEN_BY_FORMAT(table)) {
+ err = DB_TOO_BIG_INDEX_COL;
+
+ dict_mem_index_free(index);
+ goto error_handling;
+ }
+ }
+
+ heap = mem_heap_create(512);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ /* Note that the space id where we store the index is inherited from
+ the table in dict_build_index_def_step() in dict0crea.cc. */
+
+ node = ind_create_graph_create(index, heap, true);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ /* Create the index specific FTS auxiliary tables. */
+ if (err == DB_SUCCESS && is_fts) {
+ dict_index_t* idx;
+
+ idx = dict_table_get_index_on_name(table, index_name);
+
+ ut_ad(idx);
+ err = fts_create_index_tables(trx, idx);
+ }
+
+error_handling:
+ dict_table_close(table, TRUE, FALSE);
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ mem_free(table_name);
+ mem_free(index_name);
+
+ return(err);
+}
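+
+/* Illustrative sketch, not part of the original file, with hypothetical
+lengths: the optional field_lengths array supplies the actual column
+lengths, which are checked against DICT_MAX_FIELD_LEN_BY_FORMAT():
+
+	ulint	lens[2] = {255, 767};
+	err = row_create_index_for_mysql(index, trx, lens);
+
+A NULL array skips the actual-length check, and only the prefix lengths
+are validated. */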
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+{
+ dberr_t err;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_a(sql_string);
+
+ trx->op_info = "adding foreign keys";
+
+ trx_start_if_not_started_xa(trx);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ err = dict_create_foreign_constraints(trx, sql_string, sql_length,
+ name, reject_fks);
+
+ DBUG_EXECUTE_IF("ib_table_add_foreign_fail",
+ err = DB_DUPLICATE_KEY;);
+
+ DEBUG_SYNC_C("table_add_foreign_constraints");
+
+ if (err == DB_SUCCESS) {
+ /* Check that also referencing constraints are ok */
+ err = dict_load_foreigns(name, NULL, false, true,
+ DICT_ERR_IGNORE_NONE);
+ }
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ row_drop_table_for_mysql(name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. On Unix, MySQL relies
+in ALTER TABLE on the fact that the table handler does not remove the
+table before all handles to it have been closed. Furthermore, MySQL's
+call to drop a table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.cc.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+ const char* name) /*!< in: table name */
+{
+ dberr_t error;
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ /* If the original transaction was dropping a table referenced by
+ foreign keys, we must set the following to be able to drop the
+ table: */
+
+ trx->check_foreigns = FALSE;
+
+ /* fputs("InnoDB: Error: Dropping table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" in background drop list\n", stderr); */
+
+ /* Try to drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(name, trx, FALSE);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_background(trx);
+
+ return(error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.cc calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+{
+ row_mysql_drop_t* drop;
+ dict_table_t* table;
+ ulint n_tables;
+ ulint n_tables_dropped = 0;
+loop:
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&row_drop_list_mutex);
+
+ if (drop == NULL) {
+ /* All tables dropped */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ table = dict_table_open_on_name(drop->table_name, FALSE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (table == NULL) {
+ /* If for some reason the table has already been dropped
+ through some other mechanism, do not try to drop it */
+
+ goto already_dropped;
+ }
+
+ ut_a(!table->can_be_evicted);
+
+ dict_table_close(table, FALSE, FALSE);
+
+ if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
+ drop->table_name)) {
+ /* If the DROP fails for some table, we return, and let the
+ main thread retry later */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ n_tables_dropped++;
+
+already_dropped:
+ mutex_enter(&row_drop_list_mutex);
+
+ UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Dropped table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, drop->table_name);
+ fputs(" in background drop queue.\n", stderr);
+
+ mem_free(drop->table_name);
+
+ mem_free(drop);
+
+ mutex_exit(&row_drop_list_mutex);
+
+ goto loop;
+}
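+
+/* Illustrative sketch, not part of the original file: the master thread
+can poll the function above until the background queue drains, e.g.
+
+	while (row_drop_tables_for_mysql_in_background() > 0) {
+		os_thread_sleep(100000);	hypothetical back-off
+	}
+
+The return value counts dropped plus still-queued tables, so a return
+of 0 means the queue is empty. */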
+
+/*********************************************************************//**
+Get the background drop list length. The function acquires (and
+releases) the drop list mutex internally.
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+ ulint len;
+
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+
+ len = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&row_drop_list_mutex);
+
+ return(len);
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name) /*!< in: table name */
+{
+ row_mysql_drop_t* drop;
+
+ mutex_enter(&row_drop_list_mutex);
+
+ ut_a(row_mysql_drop_list_inited);
+
+ /* Look if the table already is in the drop list */
+ for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+ drop != NULL;
+ drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) {
+
+ if (strcmp(drop->table_name, name) == 0) {
+ /* Already in the list */
+
+ mutex_exit(&row_drop_list_mutex);
+
+ return(FALSE);
+ }
+ }
+
+ drop = static_cast<row_mysql_drop_t*>(
+ mem_alloc(sizeof(row_mysql_drop_t)));
+
+ drop->table_name = mem_strdup(name);
+
+ UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE);
+
+ /* fputs("InnoDB: Adding table ", stderr);
+ ut_print_name(stderr, trx, TRUE, drop->table_name);
+ fputs(" to background drop list\n", stderr); */
+
+ mutex_exit(&row_drop_list_mutex);
+
+ return(TRUE);
+}
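+
+/* Illustrative sketch, not part of the original file: the typical use is
+from the DROP path when the table still has queries running on it:
+
+	if (row_add_table_to_background_drop_list(table->name)) {
+		the master thread will retry the drop later
+	}
+
+A FALSE return means the table had already been queued. */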
+
+/*********************************************************************//**
+Reassigns the table identifier of a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_table_id_reassign(
+/*========================*/
+ dict_table_t* table, /*!< in/out: table */
+ trx_t* trx, /*!< in/out: transaction */
+ table_id_t* new_id) /*!< out: new table id */
+{
+ dberr_t err;
+ pars_info_t* info = pars_info_create();
+
+ dict_hdr_get_new_id(new_id, NULL, NULL);
+
+ /* Remove all locks except the table-level S and X locks. */
+ lock_remove_all_on_table(table, FALSE);
+
+ pars_info_add_ull_literal(info, "old_id", table->id);
+ pars_info_add_ull_literal(info, "new_id", *new_id);
+
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENUMBER_TABLE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET ID = :new_id\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "END;\n", FALSE, trx);
+
+ return(err);
+}
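+
+/* Illustrative sketch, not part of the original file:
+
+	table_id_t	new_id;
+	err = row_mysql_table_id_reassign(table, trx, &new_id);
+
+This only renumbers the persistent definition; purge and rollback look
+a table up by its id, so they will treat the old rows as belonging to a
+dropped table. The memory cache is updated separately with
+dict_table_change_id_in_cache(). */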
+
+/*********************************************************************//**
+Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction,
+acquire the data dictionary lock in X mode and open the table.
+@return table instance or 0 if not found. */
+static
+dict_table_t*
+row_discard_tablespace_begin(
+/*=========================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ trx->op_info = "discarding tablespace";
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx_start_if_not_started_xa(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ this is to avoid deadlocks during data dictionary operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ dict_table_t* table;
+
+ table = dict_table_open_on_name(
+ name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+ if (table) {
+ dict_stats_wait_bg_to_stop_using_table(table, trx);
+ ut_a(table->space != TRX_SYS_SPACE);
+ ut_a(table->n_foreign_key_checks_running == 0);
+ }
+
+ return(table);
+}
+
+/*********************************************************************//**
+Do the foreign key constraint checks.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace_foreign_key_checks(
+/*======================================*/
+ const trx_t* trx, /*!< in: transaction handle */
+ const dict_table_t* table) /*!< in: table to be discarded */
+{
+
+ if (srv_read_only_mode || !trx->check_foreigns) {
+ return(DB_SUCCESS);
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+ dict_foreign_set::iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_different_tables());
+
+ if (it == table->referenced_set.end()) {
+ return(DB_SUCCESS);
+ }
+
+ const dict_foreign_t* foreign = *it;
+ FILE* ef = dict_foreign_err_file;
+
+ ut_ad(foreign->foreign_table != table);
+ ut_ad(foreign->referenced_table == table);
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ rewind(ef);
+
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+	ut_print_name(ef, trx, TRUE, table->name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*********************************************************************//**
+Cleanup after the DISCARD TABLESPACE operation.
+@return error code. */
+static
+dberr_t
+row_discard_tablespace_end(
+/*=======================*/
+ trx_t* trx, /*!< in/out: transaction handle */
+ dict_table_t* table, /*!< in/out: table to be discarded */
+ dberr_t err) /*!< in: error code */
+{
+ if (table != 0) {
+ dict_table_close(table, TRUE, FALSE);
+ }
+
+ DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+ DBUG_SUICIDE(););
+
+ trx_commit_for_mysql(trx);
+
+ DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
+ log_make_checkpoint_at(LSN_MAX, TRUE);
+ DBUG_SUICIDE(););
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Do the DISCARD TABLESPACE operation.
+@return DB_SUCCESS or error code. */
+static
+dberr_t
+row_discard_tablespace(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction handle */
+ dict_table_t* table) /*!< in/out: table to be discarded */
+{
+ dberr_t err;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages. MySQL will block all DML on the table using MDL and a
+ DISCARD will not start unless all existing operations on the
+ table to be discarded are completed.
+
+ 1) Acquire the data dictionary latch in X mode. To prevent any
+	internal operations that MySQL is not aware of and also for
+ the internal SQL parser.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: we remove all entries for the tablespace in
+ the insert buffer tree.
+
+ 4) FOREIGN KEY operations: if table->n_foreign_key_checks_running > 0,
+ we do not allow the discard. */
+
+ /* Play safe and remove all insert buffer entries, though we should
+ have removed them already when DISCARD TABLESPACE was called */
+
+ ibuf_delete_for_discarded_space(table->space);
+
+ table_id_t new_id;
+
+ /* Set the TABLESPACE DISCARD flag in the table definition on disk. */
+
+ err = row_import_update_discarded_flag(trx, table->id, true, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Update the index root pages in the system tables, on disk */
+
+ err = row_import_update_index_root(trx, table, true, true);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Drop all the FTS auxiliary tables. */
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+
+ fts_drop_tables(trx, table);
+ }
+
+	/* Assign a new table ID to the table definition so that purge
+	can ignore the changes. Update the system table on disk. */
+
+ err = row_mysql_table_id_reassign(table, trx, &new_id);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Discard the physical file that is used for the tablespace. */
+
+ err = fil_discard_tablespace(table->space);
+
+	switch (err) {
+ case DB_SUCCESS:
+ case DB_IO_ERROR:
+ case DB_TABLESPACE_NOT_FOUND:
+ /* All persistent operations successful, update the
+ data dictionary memory cache. */
+
+ table->ibd_file_missing = TRUE;
+
+ table->flags2 |= DICT_TF2_DISCARDED;
+
+ dict_table_change_id_in_cache(table, new_id);
+
+ /* Reset the root page numbers. */
+
+ for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ index->page = FIL_NULL;
+ index->space = FIL_NULL;
+ }
+
+ /* If the tablespace did not already exist or we couldn't
+ write to it, we treat that as a successful DISCARD. It is
+ unusable anyway. */
+
+ err = DB_SUCCESS;
+ break;
+
+ default:
+ /* We need to rollback the disk changes, something failed. */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_rollback_to_savepoint(trx, NULL);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function renames the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set to TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+ dict_table_t* table;
+
+ /* Open the table and start the transaction if not started. */
+
+ table = row_discard_tablespace_begin(name, trx);
+
+ if (table == 0) {
+ err = DB_TABLE_NOT_FOUND;
+ } else if (table->space == TRX_SYS_SPACE) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+ err = DB_ERROR;
+
+ } else if (table->n_foreign_key_checks_running > 0) {
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ER_DISCARD_FK_CHECKS_RUNNING, table_name);
+
+ err = DB_ERROR;
+
+ } else {
+ /* Do foreign key constraint checks. */
+
+ err = row_discard_tablespace_foreign_key_checks(trx, table);
+
+ if (err == DB_SUCCESS) {
+ err = row_discard_tablespace(trx, table);
+ }
+ }
+
+ return(row_discard_tablespace_end(trx, table, err));
+}
+
+/*********************************************************************//**
+Sets an exclusive (LOCK_X) or shared (LOCK_S) lock on a table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_mysql_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
+ const char* op_info) /*!< in: string for trx->op_info */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ dberr_t err;
+ sel_node_t* node;
+
+ ut_ad(trx);
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = op_info;
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(
+ static_cast<que_fork_t*>(que_node_get_parent(thr)));
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_QUE_THR_SUSPENDED) {
+ ibool was_lock_wait;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+ } else {
+ que_thr_t* run_thr;
+ que_node_t* parent;
+
+ parent = que_node_get_parent(thr);
+
+ run_thr = que_fork_start_command(
+ static_cast<que_fork_t*>(parent));
+
+ ut_a(run_thr == thr);
+
+ /* There was a lock wait but the thread was not
+ in a ready to run or running state. */
+ trx->error_state = DB_LOCK_WAIT;
+
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
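+
+/* Illustrative sketch, not part of the original file:
+
+	err = row_mysql_lock_table(trx, table, LOCK_X, "setting table lock");
+
+LOCK_S is the only other accepted mode; any other value trips the
+ut_ad() at the top of the function. */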
+
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+ mem_heap_t* heap;
+ byte* buf;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ table_id_t new_id;
+ ulint recreate_space = 0;
+ pars_info_t* info = NULL;
+ ibool has_internal_doc_id;
+ ulint old_space = table->space;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages.
+
+ 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+ InnoDB table lock on the table before we can do TRUNCATE
+ TABLE. Then there are no running queries on the table.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE,
+ so we do not have to remove insert buffer records, as the
+ insert buffer works at a low level. If a freed page is later
+ reallocated, the allocator will remove the ibuf entries for
+ it.
+
+ When we truncate *.ibd files by recreating them (analogous to
+ DISCARD TABLESPACE), we remove all entries for the table in the
+ insert buffer tree. This is not strictly necessary, because
+ in 6) we will assign a new tablespace identifier, but we can
+ free up some space in the system tablespace.
+
+ 4) Linear readahead and random readahead: we use the same
+ method as in 3) to discard ongoing operations. (This is only
+ relevant for TRUNCATE TABLE by DISCARD TABLESPACE.)
+
+ 5) FOREIGN KEY operations: if
+ table->n_foreign_key_checks_running > 0, we do not allow the
+ TRUNCATE. We also reserve the data dictionary latch.
+
+ 6) Crash recovery: To prevent the application of pre-truncation
+ redo log records on the truncated tablespace, we will assign
+ a new tablespace identifier to the truncated tablespace. */
+
+ ut_ad(table);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ return(DB_ERROR);
+ }
+
+ if (dict_table_is_discarded(table)) {
+ return(DB_TABLESPACE_DELETED);
+ } else if (table->ibd_file_missing) {
+ return(DB_TABLESPACE_NOT_FOUND);
+ }
+
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+
+ trx->op_info = "truncating table";
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+ /* Prevent foreign key checks etc. while we are truncating the
+ table */
+ row_mysql_lock_data_dictionary(trx);
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ dict_stats_wait_bg_to_stop_using_table(table, trx);
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ dict_foreign_set::iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_different_tables());
+
+ if (!srv_read_only_mode
+ && it != table->referenced_set.end()
+ && trx->check_foreigns) {
+
+ FILE* ef = dict_foreign_err_file;
+ dict_foreign_t* foreign = *it;
+
+ /* We only allow truncating a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot truncate table ", ef);
+ ut_print_name(ef, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+ they can cope with the table having been truncated here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot truncate table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because there is a foreign key check"
+ " running on it.\n",
+ stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+	/* Check if the memcached plugin is running on this table. If it
+	is, we do not allow truncating the table. */
+ if (table->memcached_sync_count != 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot truncate table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because there are memcached operations"
+ " running on it.\n",
+ stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ } else {
+		/* We need to set this counter to -1 (DICT_TABLE_IN_DDL)
+		to block memcached operations. */
+ table->memcached_sync_count = DICT_TABLE_IN_DDL;
+ }
+
+ /* Remove all locks except the table-level X lock. */
+
+ lock_remove_all_on_table(table, FALSE);
+
+ /* Ensure that the table will be dropped by
+ trx_rollback_active() in case of a crash. */
+
+ trx->table_id = table->id;
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ /* Assign an undo segment for the transaction, so that the
+ transaction will be recovered after a crash. */
+
+ mutex_enter(&trx->undo_mutex);
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+ mutex_exit(&trx->undo_mutex);
+
+ if (err != DB_SUCCESS) {
+
+ goto funct_exit;
+ }
+
+ if (table->space && !table->dir_path_of_temp_table) {
+ /* Discard and create the single-table tablespace. */
+ ulint space = table->space;
+ ulint flags = fil_space_get_flags(space);
+
+ ut_a(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY));
+
+ dict_get_and_save_data_dir_path(table, true);
+
+ if (flags != ULINT_UNDEFINED
+ && fil_discard_tablespace(space) == DB_SUCCESS) {
+
+ dict_index_t* index;
+
+ dict_hdr_get_new_id(NULL, NULL, &space);
+
+ /* Lock all index trees for this table. We must
+ do so after dict_hdr_get_new_id() to preserve
+ the latch order */
+ dict_table_x_lock_indexes(table);
+
+ if (space == ULINT_UNDEFINED
+ || fil_create_new_single_table_tablespace(
+ space, table->name,
+ table->data_dir_path,
+ flags, table->flags2,
+ FIL_IBD_FILE_INITIAL_SIZE)
+ != DB_SUCCESS) {
+ dict_table_x_unlock_indexes(table);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "TRUNCATE TABLE %s failed to "
+ "create a new tablespace",
+ table->name);
+
+ table->ibd_file_missing = 1;
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ recreate_space = space;
+
+			/* Replace the space_id in the data dictionary cache.
+			The persistent data dictionary (SYS_TABLES.SPACE
+			and SYS_INDEXES.SPACE) is updated later in this
+			function. */
+ table->space = space;
+ index = dict_table_get_first_index(table);
+ do {
+ index->space = space;
+ index = dict_table_get_next_index(index);
+ } while (index);
+
+ mtr_start(&mtr);
+ fsp_header_init(space,
+ FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+ mtr_commit(&mtr);
+ }
+ } else {
+		/* Lock all index trees for this table, as we will
+		truncate the table/indexes and possibly change their metadata.
+		All DML/DDL is blocked by the table-level lock, with
+		a few exceptions, such as queries into information schema
+		about the table: MySQL could try to access index stats
+		for such a query, so we need the index locks to
+		sync up. */
+ dict_table_x_lock_indexes(table);
+ }
+
+ /* scan SYS_INDEXES for all indexes of the table */
+ heap = mem_heap_create(800);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ mtr_start(&mtr);
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ for (;;) {
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint root_page_no;
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* The end of SYS_INDEXES has been reached. */
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len);
+ ut_ad(len == 8);
+
+ if (memcmp(buf, field, len) != 0) {
+ /* End of indexes for the table (TABLE_ID mismatch). */
+ break;
+ }
+
+ if (rec_get_deleted_flag(rec, FALSE)) {
+ /* The index has been dropped. */
+ goto next_rec;
+ }
+
+ /* This call may commit and restart mtr
+ and reposition pcur. */
+ root_page_no = dict_truncate_index_tree(table, recreate_space,
+ &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (root_page_no != FIL_NULL) {
+ page_rec_write_field(
+ rec, DICT_FLD__SYS_INDEXES__PAGE_NO,
+ root_page_no, &mtr);
+ /* We will need to commit and restart the
+ mini-transaction in order to avoid deadlocks.
+ The dict_truncate_index_tree() call has allocated
+ a page in this mini-transaction, and the rest of
+ this loop could latch another index page. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+ }
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+ /* Done with index truncation, release index tree locks,
+ subsequent work relates to table level metadata change */
+ dict_table_x_unlock_indexes(table);
+
+ dict_hdr_get_new_id(&new_id, NULL, NULL);
+
+	/* Create new FTS auxiliary tables with the new_id, and
+	drop the old ones later, but only if everything runs successfully. */
+ has_internal_doc_id = dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(
+ table, DICT_TF2_FTS_HAS_DOC_ID);
+ if (has_internal_doc_id) {
+ dict_table_t fts_table;
+ ulint i;
+
+ fts_table.name = table->name;
+ fts_table.id = new_id;
+ fts_table.flags2 = table->flags2;
+
+ err = fts_create_common_tables(
+ trx, &fts_table, table->name, TRUE);
+
+ for (i = 0;
+ i < ib_vector_size(table->fts->indexes)
+ && err == DB_SUCCESS;
+ i++) {
+
+ dict_index_t* fts_index;
+
+ fts_index = static_cast<dict_index_t*>(
+ ib_vector_getp(table->fts->indexes, i));
+
+ err = fts_create_index_tables_low(
+ trx, fts_index, table->name, new_id);
+ }
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Unable to truncate FTS index for"
+ " table", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n", stderr);
+
+ goto funct_exit;
+ } else {
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ }
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "new_space", (lint) table->space);
+ pars_info_add_ull_literal(info, "old_id", table->id);
+ pars_info_add_ull_literal(info, "new_id", new_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET ID = :new_id, SPACE = :new_space\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES"
+ " SET TABLE_ID = :new_id, SPACE = :new_space\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err == DB_SUCCESS && old_space != table->space) {
+ info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "old_space", (lint) old_space);
+
+ pars_info_add_int4_literal(
+ info, "new_space", (lint) table->space);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLESPACES"
+ " SET SPACE = :new_space\n"
+ " WHERE SPACE = :old_space;\n"
+ "UPDATE SYS_DATAFILES"
+ " SET SPACE = :new_space"
+ " WHERE SPACE = :old_space;\n"
+ "END;\n"
+ , FALSE, trx);
+ }
+ DBUG_EXECUTE_IF("ib_ddl_crash_before_fts_truncate", err = DB_ERROR;);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+
+		/* Updating the system tables failed. The in-memory table
+		metadata could be in an inconsistent state; mark
+		table->corrupted as true. In the long run, this should
+		be fixed by an atomic TRUNCATE TABLE. */
+ table->corrupted = true;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Unable to assign a new identifier to table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: after truncating it. Background processes"
+ " may corrupt the table!\n", stderr);
+
+ /* Failed to update the table id, so drop the new
+ FTS auxiliary tables */
+ if (has_internal_doc_id) {
+ ut_ad(trx->state == TRX_STATE_NOT_STARTED);
+
+ table_id_t id = table->id;
+
+ table->id = new_id;
+
+ fts_drop_tables(trx, table);
+
+ table->id = id;
+
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ }
+
+ err = DB_ERROR;
+ } else {
+		/* Drop the old FTS auxiliary tables */
+ if (has_internal_doc_id) {
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ fts_drop_tables(trx, table);
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ }
+
+ DBUG_EXECUTE_IF("ib_truncate_crash_after_fts_drop",
+ DBUG_SUICIDE(););
+
+ dict_table_change_id_in_cache(table, new_id);
+
+ /* Reset the Doc ID in cache to 0 */
+ if (has_internal_doc_id && table->fts->cache) {
+ table->fts->fts_status |= TABLE_DICT_LOCKED;
+ fts_update_next_doc_id(trx, table, NULL, 0);
+ fts_cache_clear(table->fts->cache);
+ fts_cache_init(table->fts->cache);
+ table->fts->fts_status &= ~TABLE_DICT_LOCKED;
+ }
+ }
+
+ /* Reset auto-increment. */
+ dict_table_autoinc_lock(table);
+ dict_table_autoinc_initialize(table, 1);
+ dict_table_autoinc_unlock(table);
+
+ trx_commit_for_mysql(trx);
+
+funct_exit:
+
+ if (table->memcached_sync_count == DICT_TABLE_IN_DDL) {
+		/* We need to set the memcached sync count back to 0
+		to unblock memcached operations. */
+ table->memcached_sync_count = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ dict_stats_update(table, DICT_STATS_EMPTY_TABLE);
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return(err);
+}
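+
+/* Illustrative note, not part of the original file: TRUNCATE here is a
+delete-and-recreate, so after a successful return the table keeps its
+name and definition but has a new table id, a new tablespace id (for
+file-per-table tables), freshly recreated index trees and an
+auto-increment counter reset to 1. */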
+
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: transaction handle */
+ bool drop_db,/*!< in: true=dropping whole database */
+ bool nonatomic)
+ /*!< in: whether it is permitted
+ to release and reacquire dict_operation_lock */
+{
+ dberr_t err;
+ dict_foreign_t* foreign;
+ dict_table_t* table;
+ ibool print_msg;
+ ulint space_id;
+ char* filepath = NULL;
+ const char* tablename_minus_db;
+ char* tablename = NULL;
+ bool ibd_file_missing;
+ ulint namelen;
+ bool locked_dictionary = false;
+ pars_info_t* info = NULL;
+ mem_heap_t* heap = NULL;
+
+ DBUG_ENTER("row_drop_table_for_mysql");
+
+ DBUG_PRINT("row_drop_table_for_mysql", ("table: %s", name));
+
+ ut_a(name != NULL);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ DBUG_RETURN(DB_ERROR);
+ }
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ tablename_minus_db = strchr(name, '/');
+
+ if (tablename_minus_db) {
+ tablename_minus_db++;
+ } else {
+ /* Ancillary FTS tables don't have '/' characters. */
+ tablename_minus_db = name;
+ }
+
+ namelen = strlen(tablename_minus_db) + 1;
+
+ if (namelen == sizeof S_innodb_monitor
+ && !memcmp(tablename_minus_db, S_innodb_monitor,
+ sizeof S_innodb_monitor)) {
+
+ /* Table name equals "innodb_monitor":
+ stop monitor prints */
+
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_lock_monitor
+ && !memcmp(tablename_minus_db, S_innodb_lock_monitor,
+ sizeof S_innodb_lock_monitor)) {
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_tablespace_monitor
+ && !memcmp(tablename_minus_db, S_innodb_tablespace_monitor,
+ sizeof S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_table_monitor
+ && !memcmp(tablename_minus_db, S_innodb_table_monitor,
+ sizeof S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = FALSE;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ trx->op_info = "dropping table";
+
+ /* This function is called recursively via fts_drop_tables(). */
+ if (trx->state == TRX_STATE_NOT_STARTED) {
+ trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
+ }
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ /* Prevent foreign key checks etc. while we are dropping the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ locked_dictionary = true;
+ nonatomic = true;
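+		/* This trx has not acquired any locks on dictionary
+		records yet, so releasing and reacquiring the latches
+		below is safe. */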
+ }
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = dict_table_open_on_name(
+ name, TRUE, FALSE,
+ static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to drop it.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ }
+
+	/* Turn on this drop flag before we release the dictionary
+	latch. */
+ table->to_be_dropped = true;
+
+ if (nonatomic) {
+ /* This trx did not acquire any locks on dictionary
+ table records yet. Thus it is safe to release and
+ reacquire the data dictionary latches. */
+ if (table->fts) {
+ ut_ad(!table->fts->add_wq);
+ ut_ad(lock_trx_has_sys_table_locks(trx) == 0);
+
+ row_mysql_unlock_data_dictionary(trx);
+ fts_optimize_remove_table(table);
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ /* Do not bother to deal with persistent stats for temp
+ tables since we know temp tables do not use persistent
+ stats. */
+ if (!dict_table_is_temporary(table)) {
+ dict_stats_wait_bg_to_stop_using_table(
+ table, trx);
+ }
+ }
+
+ /* make sure background stats thread is not running on the table */
+ ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS));
+
+ /* Delete the link file if used. */
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ fil_delete_link_file(name);
+ }
+
+ if (!dict_table_is_temporary(table)) {
+
+ dict_stats_recalc_pool_del(table);
+
+ /* Remove stats for this table and all of its indexes from the
+ persistent storage if it exists and if there are stats for this
+ table in there. This function creates its own trx and commits
+ it. */
+ char errstr[1024];
+ err = dict_stats_drop_table(name, errstr, sizeof(errstr));
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_WARN, "%s", errstr);
+ }
+ }
+
+	/* Move the table to the non-LRU list so that it isn't
+ considered for eviction. */
+
+ if (table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(table);
+ }
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ if (!srv_read_only_mode && trx->check_foreigns) {
+
+ for (dict_foreign_set::iterator it
+ = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ const bool ref_ok = drop_db
+ && dict_tables_have_same_db(
+ name,
+ foreign->foreign_table_name_lookup);
+
+ if (foreign->foreign_table != table && !ref_ok) {
+
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow dropping a referenced table
+ if FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot drop table ", ef);
+ ut_print_name(ef, trx, TRUE, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+ }
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
+ they can cope with the table having been dropped here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ const char* save_tablename = table->name;
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(save_tablename);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to drop table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, save_tablename);
+ fputs("\n"
+ "InnoDB: though there is a"
+ " foreign key check running on it.\n"
+ "InnoDB: Adding the table to"
+ " the background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+	/* Remove all locks that are on the table or its records. If there
+	are no references to the table but it has record locks, we release
+	the record locks unconditionally. One use case is:
+
+ CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
+
+ If after the user transaction has done the SELECT and there is a
+ problem in completing the CREATE TABLE operation, MySQL will drop
+ the table. InnoDB will create a new background transaction to do the
+ actual drop, the trx instance that is passed to this function. To
+ preserve existing behaviour we remove the locks but ideally we
+ shouldn't have to. There should never be record locks on a table
+ that is going to be dropped. */
+
+ if (table->n_ref_count == 0) {
+ lock_remove_all_on_table(table, TRUE);
+ ut_a(table->n_rec_locks == 0);
+ } else if (table->n_ref_count > 0 || table->n_rec_locks > 0) {
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table->name);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is"
+ " trying to drop table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to it.\n"
+ "InnoDB: Adding the table to the"
+ " background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+	/* The "to_be_dropped" flag marks a table that is to be dropped,
+	but has not been dropped yet because it was put in the background
+	drop list while being used by concurrent DML operations. Clear it
+	here since there are no longer any concurrent activities on it,
+	and it is free to be dropped. */
+ table->to_be_dropped = false;
+
+ /* If we get this far then the table to be dropped must not have
+ any table or record locks on it. */
+
+ ut_a(!lock_table_has_locks(table));
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = table->id;
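+		/* fall through */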
+ case TRX_DICT_OP_TABLE:
+ break;
+ case TRX_DICT_OP_INDEX:
+ /* If the transaction was previously flagged as
+ TRX_DICT_OP_INDEX, we should be dropping auxiliary
+ tables for full-text indexes. */
+ ut_ad(strstr(table->name, "/FTS_") != NULL);
+ }
+
+ /* Mark all indexes unavailable in the data dictionary cache
+ before starting to drop the table. */
+
+ unsigned* page_no;
+ unsigned* page_nos;
+ heap = mem_heap_create(
+ 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos);
+ tablename = mem_heap_strdup(heap, name);
+
+ page_no = page_nos = static_cast<unsigned*>(
+ mem_heap_alloc(
+ heap,
+ UT_LIST_GET_LEN(table->indexes) * sizeof *page_no));
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ /* Save the page numbers so that we can restore them
+ if the operation fails. */
+ *page_no++ = index->page;
+ /* Mark the index unusable. */
+ index->page = FIL_NULL;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+
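+	/* If the dictionary update below fails, the error handling
+	further down restores index->page from page_nos[]. */
+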
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "sys_foreign_id CHAR;\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "foreign_id CHAR;\n"
+ "space_id INT;\n"
+ "found INT;\n"
+
+ "DECLARE CURSOR cur_fk IS\n"
+ "SELECT ID FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME = :table_name\n"
+ "AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:table_name)\n"
+ "LOCK IN SHARE MODE;\n"
+
+ "DECLARE CURSOR cur_idx IS\n"
+ "SELECT ID FROM SYS_INDEXES\n"
+ "WHERE TABLE_ID = table_id\n"
+ "LOCK IN SHARE MODE;\n"
+
+ "BEGIN\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "SELECT SPACE INTO space_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "SELECT ID INTO sys_foreign_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = 'SYS_FOREIGN'\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "OPEN cur_fk;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur_fk INTO foreign_id;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur_fk;\n"
+ "found := 1;\n"
+ "OPEN cur_idx;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur_idx INTO index_id;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FIELDS\n"
+ " WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES\n"
+ " WHERE ID = index_id\n"
+ " AND TABLE_ID = table_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur_idx;\n"
+ "DELETE FROM SYS_TABLESPACES\n"
+ "WHERE SPACE = space_id;\n"
+ "DELETE FROM SYS_DATAFILES\n"
+ "WHERE SPACE = space_id;\n"
+ "DELETE FROM SYS_COLUMNS\n"
+ "WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ switch (err) {
+ ibool is_temp;
+
+ case DB_SUCCESS:
+ /* Clone the name, in case it has been allocated
+ from table->heap, which will be freed by
+ dict_table_remove_from_cache(table) below. */
+ space_id = table->space;
+ ibd_file_missing = table->ibd_file_missing;
+
+ is_temp = DICT_TF2_FLAG_IS_SET(table, DICT_TF2_TEMPORARY);
+
+ /* If there is a temp path then the temp flag is set.
+ However, during recovery, we might have a temp flag but
+ not know the temp path */
+ ut_a(table->dir_path_of_temp_table == NULL || is_temp);
+ if (dict_table_is_discarded(table)
+ || table->ibd_file_missing) {
+ /* Do not attempt to drop known-to-be-missing
+ tablespaces. */
+ space_id = 0;
+ }
+
+ /* We do not allow temporary tables with a remote path. */
+ ut_a(!(is_temp && DICT_TF_HAS_DATA_DIR(table->flags)));
+
+ if (space_id && DICT_TF_HAS_DATA_DIR(table->flags)) {
+ dict_get_and_save_data_dir_path(table, true);
+ ut_a(table->data_dir_path);
+
+ filepath = os_file_make_remote_pathname(
+ table->data_dir_path, table->name, "ibd");
+ } else if (table->dir_path_of_temp_table) {
+ filepath = fil_make_ibd_name(
+ table->dir_path_of_temp_table, true);
+ } else {
+ filepath = fil_make_ibd_name(tablename, false);
+ }
+
+ if (dict_table_has_fts_index(table)
+ || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ ut_ad(table->n_ref_count == 0);
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ err = fts_drop_tables(trx, table);
+
+ if (err != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr," InnoDB: Error: (%s) not "
+ "able to remove ancillary FTS tables "
+ "for table ", ut_strerr(err));
+ ut_print_name(stderr, trx, TRUE, tablename);
+ fputs("\n", stderr);
+
+ goto funct_exit;
+ }
+ }
+
+	/* The table->fts flag can be set on a table whose
+	clustered index is being rebuilt. Such a table might not have
+	the DICT_TF2_FTS flag set, so keep this check outside the
+	dict_table_has_fts_index() condition above. */
+ if (table->fts) {
+ /* Need to set TABLE_DICT_LOCKED bit, since
+ fts_que_graph_free_check_lock would try to acquire
+ dict mutex lock */
+ table->fts->fts_status |= TABLE_DICT_LOCKED;
+
+ fts_free(table);
+ }
+
+ dict_table_remove_from_cache(table);
+
+ if (dict_load_table(tablename, TRUE,
+ DICT_ERR_IGNORE_NONE) != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: not able to remove table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, tablename);
+ fputs(" from the dictionary cache!\n", stderr);
+ err = DB_ERROR;
+ }
+
+ /* Do not drop possible .ibd tablespace if something went
+ wrong: we do not want to delete valuable data of the user */
+
+ /* Don't spam the log if we can't find the tablespace of
+	a temp table or if the tablespace has been discarded. */
+ print_msg = !(is_temp || ibd_file_missing);
+
+ if (err == DB_SUCCESS && space_id > TRX_SYS_SPACE) {
+ if (!is_temp
+ && !fil_space_for_table_exists_in_mem(
+ space_id, tablename, FALSE,
+ print_msg, false, NULL, 0)) {
+ /* This might happen if we are dropping a
+ discarded tablespace */
+ err = DB_SUCCESS;
+
+ if (print_msg) {
+ char msg_tablename[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+					msg_tablename, sizeof(msg_tablename),
+ tablename, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Removed the table %s from "
+ "InnoDB's data dictionary",
+ msg_tablename);
+ }
+
+ /* Force a delete of any discarded
+ or temporary files. */
+
+ fil_delete_file(filepath);
+
+ } else if (fil_delete_tablespace(
+ space_id,
+ BUF_REMOVE_FLUSH_NO_WRITE)
+ != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: We removed now the InnoDB"
+ " internal data dictionary entry\n"
+ "InnoDB: of table ");
+ ut_print_name(stderr, trx, TRUE, tablename);
+ fprintf(stderr, ".\n");
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: not able to"
+ " delete tablespace %lu of table ",
+ (ulong) space_id);
+ ut_print_name(stderr, trx, TRUE, tablename);
+ fputs("!\n", stderr);
+ err = DB_ERROR;
+ }
+ }
+
+ break;
+
+ case DB_OUT_OF_FILE_SPACE:
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ /* raise error */
+ ut_error;
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* Cannot even find a free slot for
+		the undo log. We can directly exit here
+ and return the DB_TOO_MANY_CONCURRENT_TRXS
+ error. */
+
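+		/* fall through */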
+ default:
+ /* This is some error we do not expect. Print
+ the error number and rollback transaction */
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr, "InnoDB: unknown error code %lu"
+ " while dropping table:", (ulong) err);
+ ut_print_name(stderr, trx, TRUE, tablename);
+ fprintf(stderr, ".\n");
+
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+
+ /* Mark all indexes available in the data dictionary
+ cache again. */
+
+ page_no = page_nos;
+
+ for (dict_index_t* index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ ut_a(index->page == FIL_NULL);
+ index->page = *page_no++;
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+ }
+
+funct_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ if (filepath) {
+ mem_free(filepath);
+ }
+
+ if (locked_dictionary) {
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ DBUG_RETURN(err);
+}
+
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void)
+/*============================*/
+{
+ trx_t* trx;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ mem_heap_t* heap;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping temporary tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ heap = mem_heap_create(200);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ true,
+ dict_table_get_first_index(dict_sys->sys_tables),
+ BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ const char* table_name;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ /* The high order bit of N_COLS is set unless
+ ROW_FORMAT=REDUNDANT. */
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__N_COLS, &len);
+ if (len != 4
+ || !(mach_read_from_4(field) & DICT_N_COLS_COMPACT)) {
+ continue;
+ }
+
+ /* Older versions of InnoDB, which only supported tables
+ in ROW_FORMAT=REDUNDANT could write garbage to
+ SYS_TABLES.MIX_LEN, where we now store the is_temp flag.
+ Above, we assumed is_temp=0 if ROW_FORMAT=REDUNDANT. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len);
+ if (len != 4
+ || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
+ continue;
+ }
+
+ /* This is a temporary table. */
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ if (len == UNIV_SQL_NULL || len == 0) {
+ /* Corrupted SYS_TABLES.NAME */
+ continue;
+ }
+
+ table_name = mem_heap_strdupl(heap, (const char*) field, len);
+
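+		/* Release the page latches while the table is being
+		dropped; the cursor position is stored here and
+		restored below. */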
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table(table_name, TRUE, DICT_ERR_IGNORE_NONE);
+
+ if (table) {
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/*******************************************************************//**
+Drop all foreign keys in a database, see Bug#18942.
+Called at the end of row_drop_database_for_mysql().
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+drop_all_foreign_keys_in_db(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* pinfo;
+ dberr_t err;
+
+ ut_a(name[strlen(name) - 1] == '/');
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "dbname", name);
+
+/** true if for_name is not prefixed with dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
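+	/* Because cur scans SYS_FOREIGN in FOR_NAME order starting at
+	:dbname, the loop can stop at the first FOR_NAME that is not
+	prefixed with :dbname (see TABLE_NOT_IN_THIS_DB above). */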
+ err = que_eval_sql(pinfo,
+ "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+ "foreign_id CHAR;\n"
+ "for_name CHAR;\n"
+ "found INT;\n"
+ "DECLARE CURSOR cur IS\n"
+ "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME >= :dbname\n"
+ "LOCK IN SHARE MODE\n"
+ "ORDER BY FOR_NAME;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur INTO foreign_id, for_name;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+ " found := 0;\n"
+ " ELSIF (1=1) THEN\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur;\n"
+ "COMMIT WORK;\n"
+ "END;\n",
+ FALSE, /* do not reserve dict mutex,
+ we are already holding it */
+ trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_drop_database_for_mysql(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+ char* table_name;
+ dberr_t err = DB_SUCCESS;
+ ulint namelen = strlen(name);
+
+ ut_a(name != NULL);
+ ut_a(name[namelen - 1] == '/');
+
+ trx->op_info = "dropping database";
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ trx_start_if_not_started_xa(trx);
+loop:
+ row_mysql_lock_data_dictionary(trx);
+
+ while ((table_name = dict_get_first_table_name_in_db(name))) {
+ ut_a(memcmp(table_name, name, namelen) == 0);
+
+ table = dict_table_open_on_name(
+ table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>(
+ DICT_ERR_IGNORE_INDEX_ROOT
+ | DICT_ERR_IGNORE_CORRUPT));
+
+ if (!table) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot load table %s from InnoDB internal "
+ "data dictionary during drop database",
+ table_name);
+ mem_free(table_name);
+ err = DB_TABLE_NOT_FOUND;
+ break;
+
+ }
+
+ if (!row_is_mysql_tmp_table_name(table->name)) {
+ /* There could be orphan temp tables left from
+ interrupted alter table. Leave them, and handle
+ the rest.*/
+ if (table->can_be_evicted) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Orphan table encountered during "
+ "DROP DATABASE. This is possible if "
+ "'%s.frm' was lost.", table->name);
+ }
+
+ if (table->ibd_file_missing) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Missing %s.ibd file for table %s.",
+ table->name, table->name);
+ }
+ }
+
+ dict_table_close(table, TRUE, FALSE);
+
+ /* The dict_table_t object must not be accessed before
+ dict_table_open() or after dict_table_close(). But this is OK
+	if we are holding the dict_sys->mutex. */
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* Wait until MySQL does not have any queries running on
+ the table */
+
+ if (table->n_ref_count > 0) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is trying to"
+ " drop database ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ fputs(".\n", stderr);
+
+ os_thread_sleep(1000000);
+
+ mem_free(table_name);
+
+ goto loop;
+ }
+
+ err = row_drop_table_for_mysql(table_name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error (%s) for table ",
+ ut_strerr(err));
+ ut_print_name(stderr, trx, TRUE, table_name);
+ putc('\n', stderr);
+ mem_free(table_name);
+ break;
+ }
+
+ mem_free(table_name);
+ }
+
+ if (err == DB_SUCCESS) {
+ /* after dropping all tables try to drop all leftover
+ foreign keys in case orphaned ones exist */
+ err = drop_all_foreign_keys_in_db(name, trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error %d while "
+ "dropping all foreign keys", err);
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return true if temporary table */
+UNIV_INTERN __attribute__((warn_unused_result))
+bool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ const char* name) /*!< in: table name in the form
+ 'database/tablename' */
+{
+ return(strstr(name, "/#sql") != NULL);
+ /* return(strstr(name, "/@0023sql") != NULL); */
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint_low(
+/*======================*/
+ const char* id, /*!< in: constraint id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", id);
+
+ return(que_eval_sql(info,
+ "PROCEDURE DELETE_CONSTRAINT () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+ "END;\n"
+ , FALSE, trx));
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_delete_constraint(
+/*==================*/
+ const char* id, /*!< in: constraint id */
+ const char* database_name, /*!< in: database name, with the
+ trailing '/' */
+ mem_heap_t* heap, /*!< in: memory heap */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dberr_t err;
+
+ /* New format constraints have ids <databasename>/<constraintname>. */
+ err = row_delete_constraint_low(
+ mem_heap_strcat(heap, database_name, id), trx);
+
+ if ((err == DB_SUCCESS) && !strchr(id, '/')) {
+ /* Old format < 4.0.18 constraints have constraint ids
+ NUMBER_NUMBER. We only try deleting them if the
+ constraint name does not contain a '/' character, otherwise
+ deleting a new format constraint named 'foo/bar' from
+ database 'baz' would remove constraint 'bar' from database
+ 'foo', if it existed. */
+
+ err = row_delete_constraint_low(id, trx);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in/out: transaction */
+ bool commit) /*!< in: whether to commit trx */
+{
+ dict_table_t* table = NULL;
+ ibool dict_locked = FALSE;
+ dberr_t err = DB_ERROR;
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool old_is_tmp, new_is_tmp;
+ pars_info_t* info = NULL;
+ int retry;
+
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+		if (srv_force_recovery) {
+ err = DB_READ_ONLY;
+ }
+
+ goto funct_exit;
+ } else if (row_mysql_is_system_table(new_name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL"
+ " system table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ new_name);
+
+ goto funct_exit;
+ }
+
+ trx->op_info = "renaming table";
+
+ old_is_tmp = row_is_mysql_tmp_table_name(old_name);
+ new_is_tmp = row_is_mysql_tmp_table_name(new_name);
+
+ dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
+
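+	/* If the caller already holds dict_operation_lock in X mode
+	(e.g. during ALTER TABLE), dict_table_open_on_name() must not
+	latch the dictionary again. */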
+ table = dict_table_open_on_name(old_name, dict_locked, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to rename the table.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+
+ } else if (table->ibd_file_missing
+ && !dict_table_is_discarded(table)) {
+
+ err = DB_TABLE_NOT_FOUND;
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Table %s does not have an .ibd file in the database "
+ "directory. See " REFMAN "innodb-troubleshooting.html",
+ old_name);
+
+ goto funct_exit;
+
+ } else if (new_is_tmp) {
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(
+ heap, trx, table, &n_constraints_to_drop,
+ &constraints_to_drop);
+
+ if (err != DB_SUCCESS) {
+ goto funct_exit;
+ }
+ }
+
+ /* Is a foreign key check running on this table? */
+ for (retry = 0; retry < 100
+ && table->n_foreign_key_checks_running > 0; ++retry) {
+ row_mysql_unlock_data_dictionary(trx);
+ os_thread_yield();
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: in ALTER TABLE ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fprintf(stderr, "\n"
+ "InnoDB: a FOREIGN KEY check is running.\n"
+ "InnoDB: Cannot rename table.\n");
+ err = DB_TABLE_IN_FK_CHECK;
+ goto funct_exit;
+ }
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data from system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET NAME = :new_table_name\n"
+ " WHERE NAME = :old_table_name;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ /* SYS_TABLESPACES and SYS_DATAFILES track non-system tablespaces
+ which have space IDs > 0. */
+ if (err == DB_SUCCESS
+ && table->space != TRX_SYS_SPACE
+ && !table->ibd_file_missing) {
+ /* Make a new pathname to update SYS_DATAFILES. */
+ char* new_path = row_make_new_pathname(table, new_name);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "new_path_name", new_path);
+ pars_info_add_int4_literal(info, "space_id", table->space);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_SPACE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLESPACES"
+ " SET NAME = :new_table_name\n"
+ " WHERE SPACE = :space_id;\n"
+ "UPDATE SYS_DATAFILES"
+ " SET PATH = :new_path_name\n"
+ " WHERE SPACE = :space_id;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ mem_free(new_path);
+ }
+ if (err != DB_SUCCESS) {
+ goto end;
+ }
+
+ if (!new_is_tmp) {
+ /* Rename all constraints. */
+ char new_table_name[MAX_TABLE_NAME_LEN] = "";
+ char old_table_utf8[MAX_TABLE_NAME_LEN] = "";
+ uint errors = 0;
+
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ innobase_convert_to_system_charset(
+ strchr(old_table_utf8, '/') + 1,
+			strchr(old_name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql#50). */
+ strncpy(old_table_utf8, old_name, MAX_TABLE_NAME_LEN);
+ }
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+ pars_info_add_str_literal(info, "old_table_name_utf8",
+ old_table_utf8);
+
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ innobase_convert_to_system_charset(
+ strchr(new_table_name, '/') + 1,
+			strchr(new_name, '/') + 1,
+ MAX_TABLE_NAME_LEN, &errors);
+
+ if (errors) {
+ /* Table name could not be converted from charset
+ my_charset_filename to UTF-8. This means that the
+ table name is already in UTF-8 (#mysql#50). */
+ strncpy(new_table_name, new_name, MAX_TABLE_NAME_LEN);
+ }
+
+ pars_info_add_str_literal(info, "new_table_utf8", new_table_name);
+
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "offset INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+ "new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+ "new_db_name := SUBSTR(:new_table_name, 0,\n"
+ " new_db_name_len);\n"
+ "old_t_name_len := LENGTH(:old_table_name);\n"
+ "gen_constr_prefix := CONCAT(:old_table_name_utf8,\n"
+ " '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :old_table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:old_table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = :new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " offset := INSTR(foreign_id, '_ibfk_') - 1;\n"
+ " new_foreign_id :=\n"
+ " CONCAT(:new_table_utf8,\n"
+ " SUBSTR(foreign_id, offset,\n"
+ " id_len - offset));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+ "WHERE REF_NAME = :old_table_name\n"
+ " AND TO_BINARY(REF_NAME)\n"
+ " = TO_BINARY(:old_table_name);\n"
+ "END;\n"
+ , FALSE, trx);
+
+ } else if (n_constraints_to_drop > 0) {
+ /* Drop some constraints of tmp tables. */
+
+ ulint db_name_len = dict_get_db_name_len(old_name) + 1;
+ char* db_name = mem_heap_strdupl(heap, old_name,
+ db_name_len);
+ ulint i;
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ err = row_delete_constraint(constraints_to_drop[i],
+ db_name, heap, trx);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+ if (dict_table_has_fts_index(table)
+ && !dict_tables_have_same_db(old_name, new_name)) {
+ err = fts_rename_aux_tables(table, new_name, trx);
+
+ if (err != DB_SUCCESS && (table->space != 0)) {
+ char* orig_name = table->name;
+ trx_t* trx_bg = trx_allocate_for_background();
+
+			/* If the first fts_rename fails, the trx would
+			have been rolled back and committed; we can't use
+			it any more, so we have to start a new background
+			trx here. */
+ ut_a(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ trx_bg->op_info = "Revert the failing rename "
+ "for fts aux tables";
+ trx_bg->dict_operation_lock_mode = RW_X_LATCH;
+ trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE);
+
+ /* If rename fails and table has its own tablespace,
+ we need to call fts_rename_aux_tables again to
+ revert the ibd file rename, which is not under the
+ control of trx. Also notice the parent table name
+ in cache is not changed yet. If the reverting fails,
+ the ibd data may be left in the new database, which
+ can be fixed only manually. */
+ table->name = const_cast<char*>(new_name);
+ fts_rename_aux_tables(table, old_name, trx_bg);
+ table->name = orig_name;
+
+ trx_bg->dict_operation_lock_mode = 0;
+ trx_commit_for_mysql(trx_bg);
+ trx_free_for_background(trx_bg);
+ }
+ }
+
+end:
+ if (err != DB_SUCCESS) {
+ if (err == DB_DUPLICATE_KEY) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error; possible reasons:\n"
+ "InnoDB: 1) Table rename would cause"
+ " two FOREIGN KEY constraints\n"
+ "InnoDB: to have the same internal name"
+ " in case-insensitive comparison.\n"
+ "InnoDB: 2) table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" exists in the InnoDB internal data\n"
+ "InnoDB: dictionary though MySQL is"
+ " trying to rename table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" to it.\n"
+ "InnoDB: Have you deleted the .frm file"
+ " and not used DROP TABLE?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: If table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" is a temporary table #sql..., then"
+ " it can be that\n"
+ "InnoDB: there are still queries running"
+ " on the table, and it will be\n"
+ "InnoDB: dropped automatically when"
+ " the queries end.\n"
+ "InnoDB: You can drop the orphaned table"
+ " inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with"
+ " the same name in another\n"
+ "InnoDB: database and copying the .frm file"
+ " to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists,"
+ " and DROP TABLE will\n"
+ "InnoDB: succeed.\n", stderr);
+ }
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ err = dict_table_rename_in_cache(
+ table, new_name, !new_is_tmp);
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ goto funct_exit;
+ }
+
+ /* We only want to switch off some of the type checking in
+ an ALTER, not in a RENAME. */
+
+ err = dict_load_foreigns(
+ new_name, NULL,
+ false, !old_is_tmp || trx->check_foreigns,
+ DICT_ERR_IGNORE_NONE);
+
+ if (err != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+
+ if (old_is_tmp) {
+ fputs(" InnoDB: Error: in ALTER TABLE ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: has or is referenced"
+ " in foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ } else {
+ fputs(" InnoDB: Error: in RENAME TABLE"
+ " table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: is referenced in"
+ " foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ }
+
+ ut_a(DB_SUCCESS == dict_table_rename_in_cache(
+ table, old_name, FALSE));
+ trx->error_state = DB_SUCCESS;
+ trx_rollback_to_savepoint(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+ }
+
+funct_exit:
+ if (table != NULL) {
+ dict_table_close(table, dict_locked, FALSE);
+ }
+
+ if (commit) {
+ trx_commit_for_mysql(trx);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return true if ok */
+UNIV_INTERN
+bool
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+{
+ dtuple_t* prev_entry = NULL;
+ ulint matched_fields;
+ ulint matched_bytes;
+ byte* buf;
+ ulint ret;
+ rec_t* rec;
+ bool is_ok = true;
+ int cmp;
+ ibool contains_null;
+ ulint i;
+ ulint cnt;
+ mem_heap_t* heap = NULL;
+ ulint n_ext;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ *n_rows = 0;
+
+ if (dict_index_is_clust(index)) {
+ /* The clustered index of a table is always available.
+ During online ALTER TABLE that rebuilds the table, the
+ clustered index in the old table will have
+ index->online_log pointing to the new table. All
+ indexes of the old table will remain valid and the new
+ table will be unaccessible to MySQL until the
+ completion of the ALTER TABLE. */
+ } else if (dict_index_is_online_ddl(index)
+ || (index->type & DICT_FTS)) {
+		/* Full-text indexes are implemented by auxiliary tables,
+		not the B-tree. We also skip secondary indexes that are
+ being created online. */
+ return(true);
+ }
+
+ buf = static_cast<byte*>(mem_alloc(UNIV_PAGE_SIZE));
+ heap = mem_heap_create(100);
+
+ cnt = 1000;
+
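+	/* Position the cursor at the start of the index. */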
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+ /* Check thd->killed every 1,000 scanned rows */
+ if (--cnt == 0) {
+ if (trx_is_interrupted(prebuilt->trx)) {
+ goto func_exit;
+ }
+ cnt = 1000;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: CHECK TABLE on ", stderr);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, " returned %lu\n", ret);
+ /* fall through (this error is ignored by CHECK TABLE) */
+ case DB_END_OF_INDEX:
+func_exit:
+ mem_free(buf);
+ mem_heap_free(heap);
+
+ return(is_ok);
+ }
+
+ *n_rows = *n_rows + 1;
+
+	/* row_search_for_mysql() returns the index record in buf, with the
+	record origin offset within buf stored in the first 4 bytes, because
+	we have built a dummy template. */
+
+ rec = buf + mach_read_from_4(buf);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (prev_entry != NULL) {
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields,
+ &matched_bytes);
+ contains_null = FALSE;
+
+ /* In a unique secondary index we allow equal key values if
+ they contain SQL NULLs */
+
+ for (i = 0;
+ i < dict_index_get_n_ordering_defined_by_user(index);
+ i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(prev_entry, i))) {
+
+ contains_null = TRUE;
+ break;
+ }
+ }
+
+ if (cmp > 0) {
+ fputs("InnoDB: index records in a wrong order in ",
+ stderr);
+not_ok:
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fputs("\n"
+ "InnoDB: prev record ", stderr);
+ dtuple_print(stderr, prev_entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ is_ok = false;
+ } else if (dict_index_is_unique(index)
+ && !contains_null
+ && matched_fields
+ >= dict_index_get_n_ordering_defined_by_user(
+ index)) {
+
+ fputs("InnoDB: duplicate key in ", stderr);
+ goto not_ok;
+ }
+ }
+
+ {
+ mem_heap_t* tmp_heap = NULL;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_)) {
+ ulint size = rec_offs_get_n_alloc(offsets)
+ * sizeof *offsets;
+
+ tmp_heap = mem_heap_create(size);
+
+ offsets = static_cast<ulint*>(
+ mem_heap_dup(tmp_heap, offsets, size));
+ }
+
+ mem_heap_empty(heap);
+
+ prev_entry = row_rec_to_index_entry(
+ rec, index, offsets, &n_ext, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap)) {
+ mem_heap_free(tmp_heap);
+ }
+ }
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return true if monitor table */
+UNIV_INTERN
+bool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name) /*!< in: name of the table, in the
+ form database/table_name */
+{
+ const char* name; /* table_name without database/ */
+ ulint len;
+
+ name = dict_remove_db_name(table_name);
+ len = strlen(name) + 1;
+
+ return(STR_EQ(name, len, S_innodb_monitor)
+ || STR_EQ(name, len, S_innodb_lock_monitor)
+ || STR_EQ(name, len, S_innodb_tablespace_monitor)
+ || STR_EQ(name, len, S_innodb_table_monitor)
+#ifdef UNIV_MEM_DEBUG
+ || STR_EQ(name, len, S_innodb_mem_validate)
+#endif /* UNIV_MEM_DEBUG */
+ );
+}
+
+/*********************************************************************//**
+Initialize this module */
+UNIV_INTERN
+void
+row_mysql_init(void)
+/*================*/
+{
+ mutex_create(
+ row_drop_list_mutex_key,
+ &row_drop_list_mutex, SYNC_NO_ORDER_CHECK);
+
+ UT_LIST_INIT(row_mysql_drop_list);
+
+ row_mysql_drop_list_inited = TRUE;
+}
+
+/*********************************************************************//**
+Close this module */
+UNIV_INTERN
+void
+row_mysql_close(void)
+/*================*/
+{
+ ut_a(UT_LIST_GET_LEN(row_mysql_drop_list) == 0);
+
+ mutex_free(&row_drop_list_mutex);
+
+ row_mysql_drop_list_inited = FALSE;
+}
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
new file mode 100644
index 00000000000..8212a7b43e0
--- /dev/null
+++ b/storage/innobase/row/row0purge.cc
@@ -0,0 +1,988 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.cc
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "row0log.h"
+#include "log0log.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = static_cast<purge_node_t*>(
+ mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+ node->done = TRUE;
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***********************************************************//**
+Repositions the pcur in the purge node on the clustered index record,
+if found.
+@return TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ ulint mode, /*!< in: latching mode */
+ purge_node_t* node, /*!< in: row purge node */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (node->found_clust) {
+ ibool found;
+
+ found = btr_pcur_restore_position(mode, &node->pcur, mtr);
+
+ return(found);
+ } else {
+ node->found_clust = row_search_on_row_ref(
+ &node->pcur, mode, node->table, node->ref, mtr);
+
+ if (node->found_clust) {
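+			/* Remember the position so that subsequent
+			calls can restore it instead of searching by
+			row reference again. */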
+ btr_pcur_store_position(&node->pcur, mtr);
+ }
+ }
+
+ return(node->found_clust);
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@retval true if the row was not found, no longer needs to be removed,
+or was successfully removed
+@retval false if the removal failed and should be retried */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ bool success = true;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint* offsets;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = dict_table_get_first_index(node->table);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+ /* The record was already removed. */
+ goto func_exit;
+ }
+
+ rec = btr_pcur_get_rec(&node->pcur);
+
+ offsets = rec_get_offsets(
+ rec, index, offsets_, ULINT_UNDEFINED, &heap);
+
+ if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) {
+ /* Someone else has modified the record later: do not remove */
+ goto func_exit;
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(
+ btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
+ } else {
+ dberr_t err;
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(
+ &err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0,
+ RB_NONE, &mtr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = false;
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a clustered index record if it has not been modified after the delete
+marking.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of running out
+of file space. */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+ return(true);
+ }
+
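+	/* The optimistic delete did not succeed; retry with
+	BTR_MODIFY_TREE, sleeping between attempts in case the
+	pessimistic delete ran out of file space. */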
+ for (ulint n_tries = 0;
+ n_tries < BTR_CUR_RETRY_DELETE_N_TIMES;
+ n_tries++) {
+ if (row_purge_remove_clust_if_poss_low(
+ node, BTR_MODIFY_TREE)) {
+ return(true);
+ }
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ }
+
+ return(false);
+}
+
+/***********************************************************//**
+Determines if it is possible to remove a secondary index entry.
+Removal is possible if the secondary index entry does not refer to any
+not delete marked version of a clustered index record where DB_TRX_ID
+is newer than the purge view.
+
+NOTE: This function should only be called by the purge thread, only
+while holding a latch on the leaf page of the secondary index entry
+(or keeping the buffer pool watch on the page). It is possible that
+this function first returns true and then false, if a user transaction
+inserts a record that the secondary index entry would refer to.
+However, in that case, the user transaction would also re-insert the
+secondary index entry after purge has removed it and released the leaf
+page latch.
+@return true if the secondary index record can be purged */
+UNIV_INTERN
+bool
+row_purge_poss_sec(
+/*===============*/
+ purge_node_t* node, /*!< in/out: row purge node */
+ dict_index_t* index, /*!< in: secondary index */
+ const dtuple_t* entry) /*!< in: secondary index entry */
+{
+ bool can_delete;
+ mtr_t mtr;
+
+ ut_ad(!dict_index_is_clust(index));
+ mtr_start(&mtr);
+
+ can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
+ || !row_vers_old_has_index_entry(TRUE,
+ btr_pcur_get_rec(&node->pcur),
+ &mtr, index, entry);
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+
+ return(can_delete);
+}
+
+/***************************************************************
+Removes a secondary index entry if possible, by modifying the
+index tree. Does not try to buffer the delete.
+@return TRUE if success or if not found */
+static __attribute__((nonnull, warn_unused_result))
+ibool
+row_purge_remove_sec_if_poss_tree(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success = TRUE;
+ dberr_t err;
+ mtr_t mtr;
+ enum row_search_result search_result;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* The index->online_status may change if the
+ index->name starts with TEMP_INDEX_PREFIX (meaning
+ that the index is or was being created online). It is
+ protected by index->lock. */
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ if (dict_index_is_online_ddl(index)) {
+ /* Online secondary index creation will not
+ copy any delete-marked records. Therefore
+ there is nothing to be purged. We must also
+ skip the purge when a completed index is
+ dropped by rollback_inplace_alter_table(). */
+ goto func_exit_no_pcur;
+ }
+ } else {
+		/* For secondary indexes,
+		index->online_status can be ONLINE_INDEX_CREATION
+		only if index->name starts with TEMP_INDEX_PREFIX. */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ search_result = row_search_index_entry(index, entry, BTR_MODIFY_TREE,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_NOT_FOUND:
+ /* Not found. This is a legitimate condition. In a
+ rollback, InnoDB will remove secondary recs that would
+ be purged anyway. Then the actual purge will not find
+ the secondary index record. Also, the purge itself is
+ eager: if it comes to consider a secondary index
+ record, and notices it does not need to exist in the
+ index, it will remove it. Then if/when the purge
+ comes to consider the secondary index record a second
+ time, it will not exist any more in the index. */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(stderr, entry); */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* We should remove the index record if no later version of the row,
+ which cannot be purged yet, requires its existence. If some requires,
+ we should do nothing. */
+
+ if (row_purge_poss_sec(node, index, entry)) {
+ /* Remove the index record, which should have been
+ marked for deletion. */
+ if (!rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table))) {
+ fputs("InnoDB: tried to purge sec index entry not"
+ " marked for deletion in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_cur_get_rec(btr_cur), index);
+ putc('\n', stderr);
+
+ ut_ad(0);
+
+ goto func_exit;
+ }
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ RB_NONE, &mtr);
+ switch (UNIV_EXPECT(err, DB_SUCCESS)) {
+ case DB_SUCCESS:
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ success = FALSE;
+ break;
+ default:
+ ut_error;
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***************************************************************
+Removes a secondary index entry without modifying the index tree,
+if possible.
+@retval true if success or if not found
+@retval false if row_purge_remove_sec_if_poss_tree() should be invoked */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_remove_sec_if_poss_leaf(
+/*==============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ mtr_t mtr;
+ btr_pcur_t pcur;
+ ulint mode;
+ enum row_search_result search_result;
+ bool success = true;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* The index->online_status may change if the
+ index->name starts with TEMP_INDEX_PREFIX (meaning
+ that the index is or was being created online). It is
+ protected by index->lock. */
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ if (dict_index_is_online_ddl(index)) {
+ /* Online secondary index creation will not
+ copy any delete-marked records. Therefore
+ there is nothing to be purged. We must also
+ skip the purge when a completed index is
+ dropped by rollback_inplace_alter_table(). */
+ goto func_exit_no_pcur;
+ }
+
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED | BTR_DELETE;
+ } else {
+		/* For secondary indexes,
+		index->online_status can be ONLINE_INDEX_CREATION
+		only while index->name starts with TEMP_INDEX_PREFIX. */
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ mode = BTR_MODIFY_LEAF | BTR_DELETE;
+ }
+
+ /* Set the purge node for the call to row_purge_poss_sec(). */
+ pcur.btr_cur.purge_node = node;
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
+
+ search_result = row_search_index_entry(
+ index, entry, mode, &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_FOUND:
+ /* Before attempting to purge a record, check
+ if it is safe to do so. */
+ if (row_purge_poss_sec(node, index, entry)) {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* Only delete-marked records should be purged. */
+ if (!rec_get_deleted_flag(
+ btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table))) {
+
+ fputs("InnoDB: tried to purge sec index"
+ " entry not marked for deletion in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_cur_get_rec(btr_cur),
+ index);
+ putc('\n', stderr);
+
+ ut_ad(0);
+
+ btr_pcur_close(&pcur);
+
+ goto func_exit_no_pcur;
+ }
+
+ if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
+
+ /* The index entry could not be deleted. */
+ success = false;
+ }
+ }
+ /* fall through (the index entry is still needed,
+ or the deletion succeeded) */
+ case ROW_NOT_DELETED_REF:
+ /* The index entry is still needed. */
+ case ROW_BUFFERED:
+ /* The deletion was buffered. */
+ case ROW_NOT_FOUND:
+ /* The index entry does not exist, nothing to do. */
+ btr_pcur_close(&pcur);
+ func_exit_no_pcur:
+ mtr_commit(&mtr);
+ return(success);
+ }
+
+ ut_error;
+	return(false);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible. */
+UNIV_INLINE __attribute__((nonnull(1,2)))
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing secondary record\n", stderr); */
+
+ if (!entry) {
+ /* The node->row must have lacked some fields of this
+ index. This is possible when the undo log record was
+ written before this index was created. */
+ return;
+ }
+
+ if (row_purge_remove_sec_if_poss_leaf(node, index, entry)) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_tree(node, index, entry);
+	/* The delete operation may fail if we run out of file space.
+	TODO: it would be easiest to crash the database and restart
+	with more file space. */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
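+
+/* Note on the protocol implemented above: the leaf-only variant is
+always attempted first, because it only latches leaf pages and can
+buffer the delete (BTR_DELETE). Only if it fails do we escalate to
+the BTR_MODIFY_TREE variant, which is retried up to
+BTR_CUR_RETRY_DELETE_N_TIMES times with a sleep of
+BTR_CUR_RETRY_SLEEP_TIME in between, because a pessimistic delete
+can fail when the tablespace runs out of space. */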
+
+/***********************************************************//**
+Purges a delete marking of a record.
+@retval true if the row was not found, or it was successfully removed
+@retval false the purge needs to be suspended because of
+running out of file space */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /*!< in/out: row purge node */
+{
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ /* skip corrupted secondary index */
+ dict_table_skip_corrupt_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (node->index->type != DICT_FTS) {
+ dtuple_t* entry = row_build_index_entry_low(
+ node->row, NULL, node->index, heap);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+ mem_heap_empty(heap);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(row_purge_remove_clust_if_poss(node));
+}
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern_func(
+/*===============================*/
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ purge_node_t* node, /*!< in: row purge node */
+ trx_undo_rec_t* undo_rec) /*!< in: record to purge */
+{
+ mem_heap_t* heap;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_table_skip_corrupt_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, NULL, NULL)) {
+ /* Build the older version of the index entry */
+ dtuple_t* entry = row_build_index_entry_low(
+ node->row, NULL, node->index, heap);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+ mem_heap_empty(heap);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ /* Free possible externally stored fields */
+ for (ulint i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ const upd_field_t* ufield
+ = upd_get_nth_field(node->update, i);
+
+ if (dfield_is_ext(&ufield->new_val)) {
+ trx_rseg_t* rseg;
+ buf_block_t* block;
+ ulint internal_offset;
+ byte* data_field;
+ dict_index_t* index;
+ ibool is_insert;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ mtr_t mtr;
+
+			/* We use the fact that new_val points into
+			undo_rec, and thus get the offset of the
+			dfield data inside the undo record. Then we
+			can calculate from node->roll_ptr the file
+			address of the new_val data. */
+
+ internal_offset
+ = ((const byte*)
+ dfield_get_data(&ufield->new_val))
+ - undo_rec;
+
+ ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+
+ rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+ ut_a(rseg != NULL);
+ ut_a(rseg->id == rseg_id);
+
+ mtr_start(&mtr);
+
+			/* We have to acquire an X-latch on the clustered
+			index tree */
+
+ index = dict_table_get_first_index(node->table);
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+			/* NOTE: we must also acquire an X-latch on the
+			root page of the tree. We will need it when we
+			free pages from the tree. If the tree is of height 1,
+			the tree X-latch does NOT protect the root page,
+			because it is also a leaf page. Since we will have a
+			latch on an undo log page, we would break the
+			latching order if we latched the root page of
+			such a tree only afterwards! */
+
+ btr_root_get(index, &mtr);
+
+ block = buf_page_get(
+ rseg->space, 0, page_no, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ data_field = buf_block_get_frame(block)
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, NULL, 0, RB_NONE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+}
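+
+/* Note: in the loop above, the file address of an externally stored
+new_val is reconstructed in two steps: node->roll_ptr is decoded into
+the (rseg_id, page_no, offset) of the undo log record, and the offset
+of the column data within that record is recovered from the fact that
+new_val still points into undo_rec. The BTR_EXTERN_FIELD_REF_SIZE
+bytes at the end of the column value hold the field reference, which
+is why btr_free_externally_stored_field() is passed
+data_field + len - BTR_EXTERN_FIELD_REF_SIZE. */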
+
+#ifdef UNIV_DEBUG
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(thr,node,undo_rec)
+#else /* UNIV_DEBUG */
+# define row_purge_upd_exist_or_extern(thr,node,undo_rec) \
+ row_purge_upd_exist_or_extern_func(node,undo_rec)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record.
+@return true if purge operation required */
+static
+bool
+row_purge_parse_undo_rec(
+/*=====================*/
+ purge_node_t* node, /*!< in: row undo node */
+ trx_undo_rec_t* undo_rec, /*!< in: record to purge */
+ bool* updated_extern, /*!< out: true if an externally
+ stored field was updated */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ trx_t* trx;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+
+ ut_ad(node && thr);
+
+ ptr = trx_undo_rec_get_pars(
+ undo_rec, &type, &node->cmpl_info,
+ updated_extern, &undo_no, &table_id);
+
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC && !*updated_extern) {
+
+ return(false);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ /* Prevent DROP TABLE etc. from running when we are doing the purge
+ for this row */
+
+ rw_lock_s_lock_inline(&dict_operation_lock, 0, __FILE__, __LINE__);
+
+ node->table = dict_table_open_on_id(
+ table_id, FALSE, DICT_TABLE_OP_NORMAL);
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+ goto err_exit;
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip purge of missing .ibd files */
+
+ dict_table_close(node->table, FALSE, FALSE);
+
+ node->table = NULL;
+
+ goto err_exit;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index == NULL) {
+ /* The table was corrupt in the data dictionary.
+ dict_set_corrupted() works on an index, and
+ we do not have an index to call it with. */
+close_exit:
+ dict_table_close(node->table, FALSE, FALSE);
+err_exit:
+ rw_lock_s_unlock(&dict_operation_lock);
+ return(false);
+ }
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ && !*updated_extern) {
+
+ /* Purge requires no changes to indexes: we may return */
+ goto close_exit;
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx = thr_get_trx(thr);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+
+ /* Read to the partial row the fields that occur in indexes */
+
+ if (!(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ptr = trx_undo_rec_get_partial_row(
+ ptr, clust_index, &node->row,
+ type == TRX_UNDO_UPD_DEL_REC,
+ node->heap);
+ }
+
+ return(true);
+}
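+
+/* Note: when the function above returns true, it leaves
+dict_operation_lock S-latched and node->table open; the caller
+(row_purge()) releases the latch after row_purge_record() has closed
+the table. When it returns false, neither resource is held. */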
+
+/***********************************************************//**
+Purges the parsed record.
+@return true if purged, false if skipped */
+static __attribute__((nonnull, warn_unused_result))
+bool
+row_purge_record_func(
+/*==================*/
+ purge_node_t* node, /*!< in: row purge node */
+ trx_undo_rec_t* undo_rec, /*!< in: record to purge */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ bool updated_extern) /*!< in: whether external columns
+ were updated */
+{
+ dict_index_t* clust_index;
+ bool purged = true;
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ node->index = dict_table_get_next_index(clust_index);
+
+ switch (node->rec_type) {
+ case TRX_UNDO_DEL_MARK_REC:
+ purged = row_purge_del_mark(node);
+ if (!purged) {
+ break;
+ }
+ MONITOR_INC(MONITOR_N_DEL_ROW_PURGE);
+ break;
+ default:
+ if (!updated_extern) {
+ break;
+ }
+ /* fall through */
+ case TRX_UNDO_UPD_EXIST_REC:
+ row_purge_upd_exist_or_extern(thr, node, undo_rec);
+ MONITOR_INC(MONITOR_N_UPD_EXIST_EXTERN);
+ break;
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&node->pcur);
+ node->found_clust = FALSE;
+ }
+
+ if (node->table != NULL) {
+ dict_table_close(node->table, FALSE, FALSE);
+ node->table = NULL;
+ }
+
+ return(purged);
+}
+
+#ifdef UNIV_DEBUG
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,thr,updated_extern)
+#else /* UNIV_DEBUG */
+# define row_purge_record(node,undo_rec,thr,updated_extern) \
+ row_purge_record_func(node,undo_rec,updated_extern)
+#endif /* UNIV_DEBUG */
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none is left, or the current purge has completed, returns control to
+the parent node, which is always a query thread node. */
+static __attribute__((nonnull))
+void
+row_purge(
+/*======*/
+ purge_node_t* node, /*!< in: row purge node */
+ trx_undo_rec_t* undo_rec, /*!< in: record to purge */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if (undo_rec != &trx_purge_dummy_rec) {
+ bool updated_extern;
+
+ while (row_purge_parse_undo_rec(
+ node, undo_rec, &updated_extern, thr)) {
+
+ bool purged = row_purge_record(
+ node, undo_rec, thr, updated_extern);
+
+ rw_lock_s_unlock(&dict_operation_lock);
+
+ if (purged
+ || srv_shutdown_state != SRV_SHUTDOWN_NONE) {
+ return;
+ }
+
+ /* Retry the purge in a second. */
+ os_thread_sleep(1000000);
+ }
+ }
+}
+
+/***********************************************************//**
+Reset the purge query thread. */
+UNIV_INLINE
+void
+row_purge_end(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<purge_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ thr->run_node = que_node_get_parent(node);
+
+ node->undo_recs = NULL;
+
+ node->done = TRUE;
+
+ ut_a(thr->run_node != NULL);
+
+ mem_heap_empty(node->heap);
+}
+
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<purge_node_t*>(thr->run_node);
+
+ node->table = NULL;
+ node->row = NULL;
+ node->ref = NULL;
+ node->index = NULL;
+ node->update = NULL;
+ node->found_clust = FALSE;
+ node->rec_type = ULINT_UNDEFINED;
+ node->cmpl_info = ULINT_UNDEFINED;
+
+ ut_a(!node->done);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ if (!(node->undo_recs == NULL || ib_vector_is_empty(node->undo_recs))) {
+ trx_purge_rec_t*purge_rec;
+
+ purge_rec = static_cast<trx_purge_rec_t*>(
+ ib_vector_pop(node->undo_recs));
+
+ node->roll_ptr = purge_rec->roll_ptr;
+
+ row_purge(node, purge_rec->undo_rec, thr);
+
+ if (ib_vector_is_empty(node->undo_recs)) {
+ row_purge_end(thr);
+ } else {
+ thr->run_node = node;
+ }
+ } else {
+ row_purge_end(thr);
+ }
+
+ return(thr);
+}
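+
+/* Note: row_purge_step() consumes one parsed undo record per
+invocation from the node->undo_recs vector (filled by the purge
+subsystem) and hands control back to the parent query thread node
+via row_purge_end() once the vector is exhausted. */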
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
new file mode 100644
index 00000000000..ecd6f47947b
--- /dev/null
+++ b/storage/innobase/row/row0quiesce.cc
@@ -0,0 +1,702 @@
+/*****************************************************************************
+
+Copyright (c) 2012, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0quiesce.cc
+Quiesce a tablespace.
+
+Created 2012-02-08 by Sunny Bains.
+*******************************************************/
+
+#include "row0quiesce.h"
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0quiesce.ic"
+#endif
+
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+
+/*********************************************************************//**
+Write the meta data (index user fields) config file.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_index_fields(
+/*===========================*/
+ const dict_index_t* index, /*!< in: write the meta data for
+ this index */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte row[sizeof(ib_uint32_t) * 2];
+
+ for (ulint i = 0; i < index->n_fields; ++i) {
+ byte* ptr = row;
+ const dict_field_t* field = &index->fields[i];
+
+ mach_write_to_4(ptr, field->prefix_len);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, field->fixed_len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_9",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing index fields.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Include the NUL byte in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(field->name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_10",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(field->name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing index column.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file index information.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_indexes(
+/*======================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ {
+ byte row[sizeof(ib_uint32_t)];
+
+ /* Write the number of indexes in the table. */
+ mach_write_to_4(row, UT_LIST_GET_LEN(table->indexes));
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_11",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing index count.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ dberr_t err = DB_SUCCESS;
+
+ /* Write the index meta data. */
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0 && err == DB_SUCCESS;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ byte* ptr;
+ byte row[sizeof(index_id_t)
+ + sizeof(ib_uint32_t) * 8];
+
+ ptr = row;
+
+ ut_ad(sizeof(index_id_t) == 8);
+ mach_write_to_8(ptr, index->id);
+ ptr += sizeof(index_id_t);
+
+ mach_write_to_4(ptr, index->space);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->page);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->type);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->trx_id_offset);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_user_defined_cols);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_uniq);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_nullable);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, index->n_fields);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_12",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing index meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the length of the index name.
+ NUL byte is included in the length. */
+ ib_uint32_t len = static_cast<ib_uint32_t>(strlen(index->name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_1",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(index->name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing index name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ err = row_quiesce_write_index_fields(index, file, thd);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Write the meta data (table columns) config file. Serializes the contents
+of the dict_col_t structure, along with the column name. All fields are
+serialized as ib_uint32_t.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_table(
+/*====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ dict_col_t* col;
+ byte row[sizeof(ib_uint32_t) * 7];
+
+ col = table->cols;
+
+ for (ulint i = 0; i < table->n_cols; ++i, ++col) {
+ byte* ptr = row;
+
+ mach_write_to_4(ptr, col->prtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->mtype);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->len);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->mbminmaxlen);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ind);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->ord_part);
+ ptr += sizeof(ib_uint32_t);
+
+ mach_write_to_4(ptr, col->max_prefix);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_2",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing table column data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write out the column name as [len, byte array]. The len
+ includes the NUL byte. */
+ ib_uint32_t len;
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+ /* Include the NUL byte in the length. */
+ len = static_cast<ib_uint32_t>(strlen(col_name) + 1);
+ ut_a(len > 1);
+
+ mach_write_to_4(row, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_3",
+ close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(len), file) != sizeof(len)
+ || fwrite(col_name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing column name.");
+
+ return(DB_IO_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the meta data config file header.
+@return DB_SUCCESS or error code. */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_header(
+/*=====================*/
+ const dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ FILE* file, /*!< in: file to write to */
+ THD* thd) /*!< in/out: session */
+{
+ byte value[sizeof(ib_uint32_t)];
+
+ /* Write the meta-data version number. */
+ mach_write_to_4(value, IB_EXPORT_CFG_VERSION_V1);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_4", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing meta-data version number.");
+
+ return(DB_IO_ERROR);
+ }
+
+ /* Write the server hostname. */
+ ib_uint32_t len;
+ const char* hostname = server_get_hostname();
+
+ /* Play it safe and check for NULL. */
+ if (hostname == 0) {
+ static const char NullHostname[] = "Hostname unknown";
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Unable to determine server hostname.");
+
+ hostname = NullHostname;
+ }
+
+	/* The hostname length includes the NUL byte. */
+ len = static_cast<ib_uint32_t>(strlen(hostname) + 1);
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_5", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(hostname, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing hostname.");
+
+ return(DB_IO_ERROR);
+ }
+
+	/* The table name length includes the NUL byte. */
+ ut_a(table->name != 0);
+ len = static_cast<ib_uint32_t>(strlen(table->name) + 1);
+
+ /* Write the table name. */
+ mach_write_to_4(value, len);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_6", close(fileno(file)););
+
+ if (fwrite(&value, 1, sizeof(value), file) != sizeof(value)
+ || fwrite(table->name, 1, len, file) != len) {
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing table name.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte row[sizeof(ib_uint32_t) * 3];
+
+ /* Write the next autoinc value. */
+ mach_write_to_8(row, table->autoinc);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_7", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(ib_uint64_t), file) != sizeof(ib_uint64_t)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing table autoinc value.");
+
+ return(DB_IO_ERROR);
+ }
+
+ byte* ptr = row;
+
+ /* Write the system page size. */
+ mach_write_to_4(ptr, UNIV_PAGE_SIZE);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the table->flags. */
+ mach_write_to_4(ptr, table->flags);
+ ptr += sizeof(ib_uint32_t);
+
+ /* Write the number of columns in the table. */
+ mach_write_to_4(ptr, table->n_cols);
+
+ DBUG_EXECUTE_IF("ib_export_io_write_failure_8", close(fileno(file)););
+
+ if (fwrite(row, 1, sizeof(row), file) != sizeof(row)) {
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno),
+ "while writing table meta-data.");
+
+ return(DB_IO_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Write the table meta data after quiesce.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_quiesce_write_cfg(
+/*==================*/
+ dict_table_t* table, /*!< in: write the meta data for
+ this table */
+ THD* thd) /*!< in/out: session */
+{
+ dberr_t err;
+ char name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, name, sizeof(name));
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Writing table metadata to '%s'", name);
+
+ FILE* file = fopen(name, "w+b");
+
+ if (file == NULL) {
+ ib_errf(thd, IB_LOG_LEVEL_WARN, ER_CANT_CREATE_FILE,
+ name, errno, strerror(errno));
+
+ err = DB_IO_ERROR;
+ } else {
+ err = row_quiesce_write_header(table, file, thd);
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_table(table, file, thd);
+ }
+
+ if (err == DB_SUCCESS) {
+ err = row_quiesce_write_indexes(table, file, thd);
+ }
+
+ if (fflush(file) != 0) {
+
+ char msg[BUFSIZ];
+
+			ut_snprintf(msg, sizeof(msg), "%s fflush() failed",
+ name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno), msg);
+ }
+
+ if (fclose(file) != 0) {
+ char msg[BUFSIZ];
+
+			ut_snprintf(msg, sizeof(msg), "%s fclose() failed",
+ name);
+
+ ib_senderrf(
+ thd, IB_LOG_LEVEL_WARN, ER_IO_WRITE_ERROR,
+ errno, strerror(errno), msg);
+ }
+ }
+
+ return(err);
+}
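+
+/* Note: the resulting .cfg file is written in this order, all
+integers in the big-endian mach_write_to_*() format: the header
+(version, hostname, table name, autoinc value, page size, table
+flags, column count), then one record per column (seven ib_uint32_t
+values: prtype, mtype, len, mbminmaxlen, ind, ord_part, max_prefix),
+then the index count and one record per index with its fields.
+Every string is serialized as a 4-byte length (including the
+terminating NUL) followed by the bytes. */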
+
+/*********************************************************************//**
+Check whether a table has an FTS index defined on it.
+@return true if an FTS index exists on the table */
+static
+bool
+row_quiesce_table_has_fts_index(
+/*============================*/
+ const dict_table_t* table) /*!< in: quiesce this table */
+{
+ bool exists = false;
+
+ dict_mutex_enter_for_mysql();
+
+ for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
+ index != 0;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+
+ if (index->type & DICT_FTS) {
+ exists = true;
+ break;
+ }
+ }
+
+ dict_mutex_exit_for_mysql();
+
+ return(exists);
+}
+
+/*********************************************************************//**
+Quiesce the tablespace that the table resides in. */
+UNIV_INTERN
+void
+row_quiesce_table_start(
+/*====================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ut_a(trx->mysql_thd != 0);
+ ut_a(srv_n_purge_threads > 0);
+ ut_ad(!srv_read_only_mode);
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Sync to disk of '%s' started.", table_name);
+
+ if (trx_purge_state() != PURGE_STATE_DISABLED) {
+ trx_purge_stop();
+ }
+
+ ut_a(table->id > 0);
+
+ for (ulint count = 0;
+ ibuf_contract_in_background(table->id, TRUE) != 0
+ && !trx_is_interrupted(trx);
+ ++count) {
+ if (!(count % 20)) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Merging change buffer entries for '%s'",
+ table_name);
+ }
+ }
+
+ if (!trx_is_interrupted(trx)) {
+ buf_LRU_flush_or_remove_pages(
+ table->space, BUF_REMOVE_FLUSH_WRITE, trx);
+
+ if (trx_is_interrupted(trx)) {
+
+ ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!");
+
+ } else if (row_quiesce_write_cfg(table, trx->mysql_thd)
+ != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "There was an error writing to the "
+ "meta data file");
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Table '%s' flushed to disk", table_name);
+ }
+ } else {
+ ib_logf(IB_LOG_LEVEL_WARN, "Quiesce aborted!");
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_COMPLETE, trx);
+ ut_a(err == DB_SUCCESS);
+}
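+
+/* Note: the quiesce sequence above is, in order: stop the purge
+threads, merge all change buffer entries for the table by calling
+ibuf_contract_in_background() in a loop, flush or remove the table's
+pages from the buffer pool, and finally write the .cfg meta-data
+file. The merge and flush steps check trx_is_interrupted(), so a
+killed FLUSH TABLES statement aborts the quiesce cleanly. */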
+
+/*********************************************************************//**
+Cleanup after table quiesce. */
+UNIV_INTERN
+void
+row_quiesce_table_complete(
+/*=======================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ trx_t* trx) /*!< in/out: transaction/session */
+{
+ ulint count = 0;
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ ut_a(trx->mysql_thd != 0);
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ /* We need to wait for the operation to complete if the
+ transaction has been killed. */
+
+ while (table->quiesce != QUIESCE_COMPLETE) {
+
+ /* Print a warning after every minute. */
+ if (!(count % 60)) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Waiting for quiesce of '%s' to complete",
+ table_name);
+ }
+
+ /* Sleep for a second. */
+ os_thread_sleep(1000000);
+
+ ++count;
+ }
+
+ /* Remove the .cfg file now that the user has resumed
+ normal operations. Otherwise it will cause problems when
+ the user tries to drop the database (remove directory). */
+ char cfg_name[OS_FILE_MAX_PATH];
+
+ srv_get_meta_data_filename(table, cfg_name, sizeof(cfg_name));
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Deleting the meta-data file '%s'", cfg_name);
+
+	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+ if (trx_purge_state() != PURGE_STATE_DISABLED) {
+ trx_purge_run();
+ }
+
+ dberr_t err = row_quiesce_set_state(table, QUIESCE_NONE, trx);
+ ut_a(err == DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Set a table's quiesce state.
+@return DB_SUCCESS or error code. */
+UNIV_INTERN
+dberr_t
+row_quiesce_set_state(
+/*==================*/
+ dict_table_t* table, /*!< in: quiesce this table */
+ ib_quiesce_t state, /*!< in: quiesce state to set */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(srv_n_purge_threads > 0);
+
+ if (srv_read_only_mode) {
+
+ ib_senderrf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+
+ return(DB_UNSUPPORTED);
+
+ } else if (table->space == TRX_SYS_SPACE) {
+
+ char table_name[MAX_FULL_NAME_LEN + 1];
+
+ innobase_format_name(
+ table_name, sizeof(table_name), table->name, FALSE);
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
+
+ return(DB_UNSUPPORTED);
+ } else if (row_quiesce_table_has_fts_index(table)) {
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+ "FLUSH TABLES on tables that have an FTS index. "
+ "FTS auxiliary tables will not be flushed.");
+
+ } else if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ /* If this flag is set then the table may not have any active
+ FTS indexes but it will still have the auxiliary tables. */
+
+ ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ ER_NOT_SUPPORTED_YET,
+			"FLUSH TABLES on a table that had an FTS index "
+			"created on a hidden column; the "
+			"auxiliary tables have not been dropped yet. "
+			"FTS auxiliary tables will not be flushed.");
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+
+ dict_table_x_lock_indexes(table);
+
+ switch (state) {
+ case QUIESCE_START:
+ break;
+
+ case QUIESCE_COMPLETE:
+ ut_a(table->quiesce == QUIESCE_START);
+ break;
+
+ case QUIESCE_NONE:
+ ut_a(table->quiesce == QUIESCE_COMPLETE);
+ break;
+ }
+
+ table->quiesce = state;
+
+ dict_table_x_unlock_indexes(table);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ return(DB_SUCCESS);
+}
+
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
new file mode 100644
index 00000000000..be786f954fb
--- /dev/null
+++ b/storage/innobase/row/row0row.cc
@@ -0,0 +1,1252 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.cc
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "data0type.h"
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "ha_prototypes.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "ut0mem.h"
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged
+@retval NULL if the externally stored columns in the clustered index record
+are unavailable and ext != NULL, or row is missing some needed columns. */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry_low(
+/*======================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ const row_ext_t* ext, /*!< in: externally stored column
+ prefixes, or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory for the index entry
+ is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ ulint i;
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (dict_index_is_univ(index)) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ /* There may only be externally stored columns
+ in a clustered index B-tree of a user table. */
+ ut_a(!ext);
+ } else {
+ dtuple_set_n_fields_cmp(
+ entry, dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = ind_field->col;
+ ulint col_no
+ = dict_col_get_no(col);
+ dfield_t* dfield
+ = dtuple_get_nth_field(entry, i);
+ const dfield_t* dfield2
+ = dtuple_get_nth_field(row, col_no);
+ ulint len;
+
+#if DATA_MISSING != 0
+# error "DATA_MISSING != 0"
+#endif
+ if (UNIV_UNLIKELY(dfield_get_type(dfield2)->mtype
+ == DATA_MISSING)) {
+ /* The field has not been initialized in the row.
+ This should be from trx_undo_rec_get_partial_row(). */
+ return(NULL);
+ }
+
+ len = dfield_get_len(dfield2);
+
+ dfield_copy(dfield, dfield2);
+
+ if (dfield_is_null(dfield)) {
+ continue;
+ }
+
+ if (ind_field->prefix_len == 0
+ && (!dfield_is_ext(dfield)
+ || dict_index_is_clust(index))) {
+ /* The dfield_copy() above suffices for
+ columns that are stored in-page, or for
+ clustered index record columns that are not
+ part of a column prefix in the PRIMARY KEY. */
+ continue;
+ }
+
+ /* If the column is stored externally (off-page) in
+ the clustered index, it must be an ordering field in
+ the secondary index. In the Antelope format, only
+ prefix-indexed columns may be stored off-page in the
+ clustered index record. In the Barracuda format, also
+ fully indexed long CHAR or VARCHAR columns may be
+ stored off-page. */
+ ut_ad(col->ord_part);
+
+ if (ext) {
+ /* See if the column is stored externally. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ return(NULL);
+ }
+ dfield_set_data(dfield, buf, len);
+ }
+
+ if (ind_field->prefix_len == 0) {
+ /* In the Barracuda format
+ (ROW_FORMAT=DYNAMIC or
+ ROW_FORMAT=COMPRESSED), we can have a
+ secondary index on an entire column
+ that is stored off-page in the
+ clustered index. As this is not a
+ prefix index (prefix_len == 0),
+ include the entire off-page column in
+ the secondary index record. */
+ continue;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ /* This table is either in Antelope format
+ (ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT)
+ or a purge record where the ordered part of
+ the field is not external.
+ In Antelope, the maximum column prefix
+ index length is 767 bytes, and the clustered
+ index record contains a 768-byte prefix of
+ each off-page column. */
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ dfield_set_len(dfield, len);
+ }
+
+ /* If a column prefix index, take only the prefix. */
+ if (ind_field->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminmaxlen,
+ ind_field->prefix_len, len,
+ static_cast<char*>(dfield_get_data(dfield)));
+ dfield_set_len(dfield, len);
+ }
+ }
+
+ return(entry);
+}
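+
+/* Example of the prefix handling above: consider a secondary index
+on, say, a 100-character prefix of a column whose value is stored
+off-page in the clustered index. If ext != NULL, row_ext_lookup()
+supplies a cached prefix of the off-page value (or field_ref_zero if
+it is unavailable, in which case NULL is returned); then
+dtype_get_at_most_n_mbchars() trims the field to at most 100
+characters, respecting multi-byte character sets. In the Antelope
+format, the 768-byte prefix stored locally in the record is used
+instead, after the 20-byte field reference has been stripped. */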
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead */
+ const dtuple_t* add_cols,
+ /*!< in: default values of
+ added columns, or NULL */
+ const ulint* col_map,/*!< in: mapping of old column
+ numbers to new ones, or NULL */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ const byte* copy;
+ dtuple_t* row;
+ ulint n_ext_cols;
+ ulint* ext_cols = NULL; /* remove warning */
+ ulint len;
+ byte* buf;
+ ulint j;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!mutex_own(&trx_sys->mutex));
+ ut_ad(!col_map || col_table);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ if (rec_offs_any_null_extern(rec, offsets)) {
+ /* This condition can occur during crash recovery
+ before trx_rollback_active() has completed execution,
+ or when a concurrently executing
+ row_ins_index_entry_low() has committed the B-tree
+ mini-transaction but has not yet managed to restore
+ the cursor position for writing the big_rec. */
+ ut_a(trx_undo_roll_ptr_is_insert(
+ row_get_rec_roll_ptr(rec, index, offsets)));
+ }
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ copy = rec_copy(buf, rec, offsets);
+ } else {
+ copy = rec;
+ }
+
+ n_ext_cols = rec_offs_n_extern(offsets);
+ if (n_ext_cols) {
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols));
+ }
+
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(copy, index, const_cast<ulint*>(offsets));
+
+ if (!col_table) {
+ ut_ad(!col_map);
+ ut_ad(!add_cols);
+ col_table = index->table;
+ }
+
+ if (add_cols) {
+ ut_ad(col_map);
+ row = dtuple_copy(add_cols, heap);
+ /* dict_table_copy_types() would set the fields to NULL */
+ for (ulint i = 0; i < dict_table_get_n_cols(col_table); i++) {
+ dict_col_copy_type(
+ dict_table_get_nth_col(col_table, i),
+ dfield_get_type(dtuple_get_nth_field(row, i)));
+ }
+ } else {
+ row = dtuple_create(heap, dict_table_get_n_cols(col_table));
+ dict_table_copy_types(row, col_table);
+ }
+
+ dtuple_set_info_bits(row, rec_get_info_bits(
+ copy, rec_offs_comp(offsets)));
+
+ j = 0;
+
+ for (ulint i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+
+ if (ind_field->prefix_len) {
+ /* Column prefixes can only occur in key
+ fields, which cannot be stored externally. For
+ a column prefix, there should also be the full
+ field in the clustered index tuple. The row
+ tuple comprises full fields, not prefixes. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ continue;
+ }
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+ ulint col_no
+ = dict_col_get_no(col);
+
+ if (col_map) {
+ col_no = col_map[col_no];
+
+ if (col_no == ULINT_UNDEFINED) {
+ /* dropped column */
+ continue;
+ }
+ }
+
+ dfield_t* dfield = dtuple_get_nth_field(row, col_no);
+
+ const byte* field = rec_get_nth_field(
+ copy, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+
+ col = dict_table_get_nth_col(col_table, col_no);
+
+ if (col->ord_part) {
+ /* We will have to fetch prefixes of
+ externally stored columns that are
+ referenced by column prefixes. */
+ ext_cols[j++] = col_no;
+ }
+ }
+ }
+
+ rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets));
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (!ext) {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed.
+
+ During online table rebuild,
+ row_log_table_apply_delete_low()
+ may use a cache that was set up by
+ row_log_table_delete(). */
+
+ } else if (j) {
+ *ext = row_ext_create(j, ext_cols, index->table->flags, row,
+ heap);
+ } else {
+ *ext = NULL;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ const byte* field;
+ ulint len;
+ ulint rec_len;
+
+ ut_ad(rec && heap && index);
+	/* Because this function may be invoked by row0merge.cc
+	on a record whose header is in a different format, the check
+	rec_offs_validate(rec, index, offsets) must be avoided here. */
+ ut_ad(n_ext);
+ *n_ext = 0;
+
+ rec_len = rec_offs_n_fields(offsets);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ (*n_ext)++;
+ }
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ byte* buf;
+ const rec_t* copy_rec;
+
+ ut_ad(rec && heap && index);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* Take a copy of rec to heap */
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ copy_rec = rec_copy(buf, rec, offsets);
+
+ rec_offs_make_valid(copy_rec, index, const_cast<ulint*>(offsets));
+ entry = row_rec_to_index_entry_low(
+ copy_rec, index, offsets, n_ext, heap);
+ rec_offs_make_valid(rec, index, const_cast<ulint*>(offsets));
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(!dict_index_is_clust(index));
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx) /*!< in: transaction */
+{
+ const dict_index_t* clust_index;
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_a(ref);
+ ut_a(index);
+ ut_a(rec);
+ ut_ad(!dict_index_is_clust(index));
+
+ if (UNIV_UNLIKELY(!index->table)) {
+ fputs("InnoDB: table ", stderr);
+notfound:
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fputs(" for index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ fputs(" not found\n", stderr);
+ ut_error;
+ }
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (UNIV_UNLIKELY(!clust_index)) {
+ fputs("InnoDB: clust index for table ", stderr);
+ goto notfound;
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ return(clust_rec);
+}
+
+/***************************************************************//**
+Searches an index record.
+@return whether the record was found or buffered */
+UNIV_INTERN
+enum row_search_result
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+
+ switch (btr_pcur_get_btr_cur(pcur)->flag) {
+ case BTR_CUR_DELETE_REF:
+ ut_a(mode & BTR_DELETE);
+ return(ROW_NOT_DELETED_REF);
+
+ case BTR_CUR_DEL_MARK_IBUF:
+ case BTR_CUR_DELETE_IBUF:
+ case BTR_CUR_INSERT_TO_IBUF:
+ return(ROW_BUFFERED);
+
+ case BTR_CUR_HASH:
+ case BTR_CUR_HASH_FAIL:
+ case BTR_CUR_BINARY:
+ break;
+ }
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(ROW_NOT_FOUND);
+ } else if (low_match != n_fields) {
+
+ return(ROW_NOT_FOUND);
+ }
+
+ return(ROW_FOUND);
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint ret;
+
+ if (data_len <= sizeof(ib_uint64_t)) {
+
+ ib_uint64_t value;
+ ibool unsigned_type = prtype & DATA_UNSIGNED;
+
+ value = mach_read_int_type(
+ (const byte*) data, data_len, unsigned_type);
+
+ ret = ut_snprintf(
+ buf, buf_size,
+ unsigned_type ? UINT64PF : INT64PF, value) + 1;
+ } else {
+
+ *format_in_hex = TRUE;
+ ret = 0;
+ }
+
+ return(ut_min(ret, buf_size));
+}
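+
+/* A worked example (a sketch, assuming the usual InnoDB on-disk
+integer encoding, in which the sign bit of the most significant byte
+is stored inverted): the single byte 0x82 decodes to the signed value
+2, because 0x82 XOR 0x80 = 0x02, while 0x7E decodes to -2. Hence
+
+ char buf[32];
+ ibool hex = FALSE;
+
+ row_raw_format_int("\x82", 1, 0, buf, sizeof(buf), &hex);
+
+writes "2" to buf and returns 2 (one digit plus the terminating NUL),
+leaving hex == FALSE. A data_len greater than 8 bytes would instead
+set hex to TRUE and return 0. */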
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+ formatted in hex */
+{
+ ulint charset_coll;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ /* we assume system_charset_info is UTF-8 */
+
+ charset_coll = dtype_get_charset_coll(prtype);
+
+ if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+ return(ut_str_sql_format(data, data_len, buf, buf_size));
+ }
+ /* else */
+
+ if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) {
+
+ *format_in_hex = TRUE;
+ return(0);
+ }
+ /* else */
+
+ return(innobase_raw_format(data, data_len, charset_coll,
+ buf, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint ret;
+ ibool format_in_hex;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ if (data_len == UNIV_SQL_NULL) {
+
+ ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1;
+
+ return(ut_min(ret, buf_size));
+ }
+
+ mtype = dict_field->col->mtype;
+ prtype = dict_field->col->prtype;
+
+ format_in_hex = FALSE;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ret = row_raw_format_int(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+ break;
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+
+ ret = row_raw_format_str(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
+ break;
+ /* XXX support more data types */
+ default:
+ format_in_hex:
+
+ if (UNIV_LIKELY(buf_size > 2)) {
+
+ memcpy(buf, "0x", 2);
+ buf += 2;
+ buf_size -= 2;
+ ret = 2 + ut_raw_to_hex(data, data_len,
+ buf, buf_size);
+ } else {
+
+ buf[0] = '\0';
+ ret = 1;
+ }
+ }
+
+ return(ret);
+}
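+
+/* Usage sketch (hypothetical caller): for a DATA_INT column the
+function produces a decimal string, e.g. the four bytes 00 00 00 5C
+with an unsigned prtype yield "92" and a return value of 3; for a
+data type that is not handled above it falls back to a hex dump of
+the raw bytes prefixed with "0x". A SQL NULL value (data_len ==
+UNIV_SQL_NULL) is formatted as the string "NULL". */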
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include "ut0dbg.h"
+
+void
+test_row_raw_format_int()
+{
+ ulint ret;
+ char buf[128];
+ ibool format_in_hex;
+ speedo_t speedo;
+ ulint i;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+ ret_expected, buf_expected, format_in_hex_expected)\
+ do {\
+ ibool ok = TRUE;\
+ ulint i;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ format_in_hex = FALSE;\
+ fprintf(stderr, "TESTING \"\\x");\
+ for (i = 0; i < data_len; i++) {\
+ fprintf(stderr, "%02hhX", data[i]);\
+ }\
+ fprintf(stderr, "\", %lu, %lu, %lu\n",\
+ (ulint) data_len, (ulint) prtype,\
+ (ulint) buf_size);\
+ ret = row_raw_format_int(data, data_len, prtype,\
+ buf, buf_size, &format_in_hex);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (format_in_hex != format_in_hex_expected) {\
+ fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+ (int) format_in_hex_expected,\
+ (int) format_in_hex);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+ (ulint) ret, buf, (int) format_in_hex);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+#if 1
+ /* min values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, 0,
+ buf, sizeof(buf), 5, "-128", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, 0,
+ buf, sizeof(buf), 7, "-32768", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, 0,
+ buf, sizeof(buf), 9, "-8388608", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+ buf, sizeof(buf), 12, "-2147483648", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+ buf, sizeof(buf), 14, "-549755813888", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+ buf, sizeof(buf), 17, "-140737488355328", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+ buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+ buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+ /* min values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ /* max values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, 0,
+ buf, sizeof(buf), 4, "127", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, 0,
+ buf, sizeof(buf), 6, "32767", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+ buf, sizeof(buf), 8, "8388607", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+ buf, sizeof(buf), 11, "2147483647", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+ buf, sizeof(buf), 13, "549755813887", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+ buf, sizeof(buf), 16, "140737488355327", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+ buf, sizeof(buf), 18, "36028797018963967", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+ buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+ /* max values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 4, "255", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "65535", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 9, "16777215", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 11, "4294967295", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 14, "1099511627775", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 16, "281474976710655", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 18, "72057594037927935", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+ /* some random values */
+
+ CALL_AND_TEST("\x52", 1, 0,
+ buf, sizeof(buf), 4, "-46", 0);
+
+ CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "14", 0);
+
+ CALL_AND_TEST("\x62\xCE", 2, 0,
+ buf, sizeof(buf), 6, "-7474", 0);
+
+ CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "10710", 0);
+
+ CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+ buf, sizeof(buf), 5, "-112", 0);
+
+ CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "41238", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+ buf, sizeof(buf), 3, "-9", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "92", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+ buf, sizeof(buf), 6, "-9117", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "91234", 0);
+#endif
+
+ /* speed test */
+
+ speedo_reset(&speedo);
+
+ for (i = 0; i < 1000000; i++) {
+ row_raw_format_int("\x23", 1,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x23", 1,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+ }
+
+ speedo_show(&speedo);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
new file mode 100644
index 00000000000..a27f0ebfb80
--- /dev/null
+++ b/storage/innobase/row/row0sel.cc
@@ -0,0 +1,5385 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.cc
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+#include "ha_prototypes.h"
+#include "m_string.h" /* for my_sys.h */
+#include "my_sys.h" /* DEBUG_SYNC_C */
+
+#include "my_compare.h" /* enum icp_result */
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/********************************************************************//**
+Returns TRUE if the user-defined column in a secondary index record
+is alphabetically the same as the corresponding BLOB column in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the columns are equal */
+static
+ibool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint mbminmaxlen, /*!< in: minimum and maximum length of
+ a multi-byte character */
+ const byte* clust_field, /*!< in: the locally stored part of
+ the clustered index column, including
+ the BLOB pointer; the clustered
+ index record must be covered by
+ a lock or a page latch to protect it
+ against deletion (rollback or purge) */
+ ulint clust_len, /*!< in: length of clust_field */
+ const byte* sec_field, /*!< in: column in secondary index */
+ ulint sec_len, /*!< in: length of sec_field */
+ ulint prefix_len, /*!< in: index column prefix length
+ in bytes */
+ dict_table_t* table) /*!< in: table */
+{
+ ulint len;
+ byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
+ ulint zip_size = dict_tf_get_zip_size(table->flags);
+
+ /* This function should never be invoked on an Antelope format
+ table, because they should always contain enough prefix in the
+ clustered index record. */
+ ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
+ ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(prefix_len >= sec_len);
+ ut_ad(prefix_len > 0);
+ ut_a(prefix_len <= sizeof buf);
+
+ if (UNIV_UNLIKELY
+ (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return(FALSE);
+ }
+
+ len = btr_copy_externally_stored_field_prefix(buf, prefix_len,
+ zip_size,
+ clust_field, clust_len);
+
+ if (UNIV_UNLIKELY(len == 0)) {
+ /* The BLOB was being deleted as the server crashed.
+ There should not be any secondary index records
+ referring to this clustered index record, because
+ btr_free_externally_stored_field() is called after all
+ secondary index entries of the row have been purged. */
+ return(FALSE);
+ }
+
+ len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
+ prefix_len, len, (const char*) buf);
+
+ return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
+}
+
+/********************************************************************//**
+Returns TRUE if the user-defined column values in a secondary index record
+are alphabetically the same as the corresponding columns in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the secondary record is equal to the corresponding
+fields in the clustered record, when compared with collation;
+FALSE if not equal or if the clustered record has been marked for deletion */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+ const rec_t* sec_rec, /*!< in: secondary index record */
+ dict_index_t* sec_index, /*!< in: secondary index */
+ const rec_t* clust_rec, /*!< in: clustered index record;
+ must be protected by a lock or
+ a page latch against deletion
+ in rollback or purge */
+ dict_index_t* clust_index) /*!< in: clustered index */
+{
+ const byte* sec_field;
+ ulint sec_len;
+ const byte* clust_field;
+ ulint n;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
+ ulint* clust_offs = clust_offsets_;
+ ulint* sec_offs = sec_offsets_;
+ ibool is_equal = TRUE;
+
+ rec_offs_init(clust_offsets_);
+ rec_offs_init(sec_offsets_);
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(clust_index->table))) {
+
+ /* The clustered index record is delete-marked;
+ it is not visible in the read view. Besides,
+ if there are any externally stored columns,
+ some of them may have already been purged. */
+ return(FALSE);
+ }
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos;
+ ulint clust_len;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ clust_field = rec_get_nth_field(
+ clust_rec, clust_offs, clust_pos, &clust_len);
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ len = clust_len;
+
+ if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
+ && sec_len != UNIV_SQL_NULL) {
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)) {
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminmaxlen,
+ ifield->prefix_len, len, (char*) clust_field);
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)
+ && len < sec_len) {
+ if (!row_sel_sec_rec_is_for_blob(
+ col->mtype, col->prtype,
+ col->mbminmaxlen,
+ clust_field, clust_len,
+ sec_field, sec_len,
+ ifield->prefix_len,
+ clust_index->table)) {
+ goto inequal;
+ }
+
+ continue;
+ }
+ }
+
+ if (0 != cmp_data_data(col->mtype, col->prtype,
+ clust_field, len,
+ sec_field, sec_len)) {
+inequal:
+ is_equal = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(is_equal);
+}
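+
+/* For example, under a case-insensitive collation a secondary index
+record containing "abc" compares equal to the corresponding clustered
+index column containing "ABC", even though the two differ byte-wise;
+this is why cmp_data_data() is used above instead of a plain memcmp(). */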
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = static_cast<sel_node_t*>(
+ mem_heap_alloc(heap, sizeof(sel_node_t)));
+
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed;
+it does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /*!< in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /*!< in: first variable in a list of
+ variables */
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ for (exp = node->select_list;
+ var != 0;
+ var = static_cast<sym_node_t*>(que_node_get_next(var))) {
+
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ for (func_node = static_cast<func_node_t*>(node->select_list);
+ func_node != 0;
+ func_node = static_cast<func_node_t*>(
+ que_node_get_next(func_node))) {
+
+ eval_node_set_int_val(func_node, 0);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*********************************************************************//**
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /*!< in: record index */
+ const rec_t* rec, /*!< in: record in a clustered or non-clustered
+ index; must be protected by a page latch */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ mem_heap_t* heap = NULL;
+ ibool needs_copy;
+
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+ field_no))) {
+
+ /* Copy an externally stored field to the
+ temporary heap, if possible. */
+
+ heap = mem_heap_create(1);
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(index->table),
+ field_no, &len, heap);
+
+ /* data == NULL means that the
+ externally stored field was not
+ written yet. This record
+ should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED
+ transactions. The InnoDB SQL parser
+ (the sole caller of this function)
+ does not implement READ UNCOMMITTED,
+ and it is not involved during rollback. */
+ ut_a(data);
+ ut_a(len != UNIV_SQL_NULL);
+
+ needs_copy = TRUE;
+ } else {
+ data = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+
+ needs_copy = column->copy_val;
+ }
+
+ if (needs_copy) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Allocates a prefetch buffer for a column when prefetching is done for
+the first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /*!< in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = static_cast<sel_buf_t*>(
+ mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+ sel_buf->len = 0;
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+
+ mem_free(prefetch_buf);
+}
+
+/*********************************************************************//**
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them to the val fields in the column nodes. */
+static
+void
+sel_dequeue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+ ut_d(dfield_set_null(val));
+
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+ ut_ad(!dfield_is_ext(val));
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = static_cast<byte*>(dfield_get_data(val));
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_enqueue_prefetched_row(
+/*=======================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ for (column = UT_LIST_GET_FIRST(plan->columns);
+ column != 0;
+ column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+ if (!column->copy_val) {
+ /* There is no point in pushing pointers to database
+ page fields when we do not hold a latch on the page! */
+ continue;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = static_cast<byte*>(dfield_get_data(val));
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+ }
+}
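+
+/* Note on the pointer swap used in the two functions above: instead of
+copying a column value into the prefetch buffer and freeing the old
+buffer, the data pointer, the length and the value buffer size of the
+column value and of the prefetch slot are exchanged. Thus each
+heap-allocated buffer is always owned by exactly one of the two, and
+can later be freed exactly once, without copying the data itself. */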
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel_build_prev_vers(
+/*====================*/
+ read_view_t* read_view, /*!< in: read view */
+ dict_index_t* index, /*!< in: clustered index of rec */
+ rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, plan->index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+
+ if (*old_vers_heap) {
+ mem_heap_empty(*old_vers_heap);
+ } else {
+ *old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, index, offsets, read_view, offset_heap,
+ *old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read. */
+static __attribute__((nonnull))
+void
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ const rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(
+ rec_offs_size(*offsets));
+ }
+
+ row_vers_build_for_semi_consistent_read(
+ rec, mtr, clust_index, offsets, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ for (cond = UT_LIST_GET_FIRST(plan->end_conds);
+ cond != 0;
+ cond = UT_LIST_GET_NEXT(cond_list, cond)) {
+
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(static_cast<sym_node_t*>(cond->args));
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tests the other conditions.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel_get_clust_rec(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ plan_t* plan, /*!< in: plan node for table */
+ rec_t* rec, /*!< in: record in a non-clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a freshly
+ inserted version */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ dberr_t err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ *out_rec = NULL;
+
+ offsets = rec_get_offsets(rec,
+ btr_pcur_get_btr_cur(&plan->pcur)->index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &plan->clust_pcur,
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ ut_a(rec_get_deleted_flag(rec,
+ dict_table_is_comp(plan->table)));
+ ut_a(node->read_view);
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: this happens if
+ row_undo_mod_remove_clust_low() in row0umod.cc has already
+ removed the clust rec, while purge is still cleaning and
+ removing secondary index records associated with earlier
+ versions of the clustered index record. In that case we know
+ that the clustered index record did not exist in the read
+ view of trx. */
+
+ goto func_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&plan->clust_pcur),
+ clust_rec, index, offsets,
+ static_cast<enum lock_mode>(node->row_lock_mode),
+ lock_type,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ /* Declare the variable uninitialized in Valgrind.
+ It should be set to DB_SUCCESS at func_exit. */
+ UNIV_MEM_INVALID(&err, sizeof err);
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, clust_rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto func_exit;
+ }
+ }
+
+ /* If we had to go to an earlier version of the row, or the
+ secondary index record is delete-marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such a row, because in our snapshot rec would not have existed.
+ Remember that from rec we cannot directly see which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query fetching all rows where the secondary index
+ value is in some interval would return a wrong result if we
+ did not drop rows which we come to visit through secondary
+ index records that would not really exist in our snapshot. */
+
+ if ((old_vers
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ plan->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+ clust_rec, index)) {
+ goto func_exit;
+ }
+ }
+
+ /* Fetch the columns needed in test conditions. The clustered
+ index record is protected by a page latch that was acquired
+ when plan->clust_pcur was positioned. The latch will not be
+ released until mtr_commit(mtr). */
+
+ ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+func_exit:
+ err = DB_SUCCESS;
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on a record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+dberr_t
+sel_set_rec_lock(
+/*=============*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint mode, /*!< in: lock mode */
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ dberr_t err;
+
+ trx = thr_get_trx(thr);
+
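+ /* If the transaction already holds a very large number of
+ record locks and the buffer pool is running out of space,
+ refuse to create another lock and report DB_LOCK_TABLE_FULL
+ rather than risk exhausting memory. */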
+ if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<enum lock_mode>(mode), type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets,
+ static_cast<enum lock_mode>(mode), type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ plan_t* plan, /*!< in: table plan */
+ ibool search_latch_locked,
+ /*!< in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ BTR_SEARCH_LEAF, &plan->pcur,
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+ &(plan->pcur), false, 0, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
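+
+/* Summary of the case analysis above ("move" means that the caller
+should advance past the restored position before processing it):
+
+ relative_position      ascending    descending
+ BTR_PCUR_BEFORE        not allowed  do not move
+ BTR_PCUR_ON, unequal   move         do not move
+ BTR_PCUR_ON, equal     move iff the stored record was processed
+ BTR_PCUR_AFTER         do not move  move
+*/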
+
+/*********************************************************************//**
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /*!< in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always).
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ sel_node_t* node, /*!< in: select node for a consistent read */
+ plan_t* plan, /*!< in: plan for a unique search in clustered
+ index */
+ ibool search_latch_locked,
+ /*!< in: whether the search holds
+ btr_search_latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ulint ret;
+ rec_offs_init(offsets_);
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+ if (search_latch_locked) {
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ }
+#endif /* UNIV_SYNC_DEBUG */
+
+ row_sel_open_pcur(plan, search_latch_locked, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (dict_index_is_clust(index)) {
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
+
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+
+ /* Test the deleted flag. */
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ /* Fetch the columns needed in test conditions. The index
+ record is protected by a page latch that was acquired when
+ plan->pcur was positioned. The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ plan->n_rows_fetched++;
+ ret = SEL_FOUND;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/*********************************************************************//**
+Performs a select step.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel(
+/*====*/
+ sel_node_t* node, /*!< in: select node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ dberr_t err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+ /* In consistent reads, we try to make do with the adaptive
+ hash index and to avoid the buffer page get. This is to reduce
+ memory bus load resulting from semaphore operations. The search
+ latch will be s-locked when we access an index with a unique
+ search condition, but not locked when we access an index with
+ a less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the result set end: no more
+ rows to process for this table cursor, as also the prefetch
+ stack was empty */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust
+ && !plan->table->big_rows) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (rw_lock_get_writer(&btr_search_latch)
+ == RW_LOCK_WAIT_EX) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan,
+ search_latch_locked,
+ &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && !page_rec_is_supremum(rec)) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation
+ level, we lock only the record, i.e., next-key
+ locking is not used. */
+
+ rec_t* next_rec = page_rec_get_next(rec);
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ goto skip_lock;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ next_rec, index, offsets,
+ node->row_lock_mode,
+ lock_type, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
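+ /* fall through */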
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+skip_lock:
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+ trx_t* trx;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(rec)) {
+
+ goto next_rec;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
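+ /* fall through */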
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+ /* PHASE 2: Check a mixed index mix id if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (dict_index_is_clust(index)) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The record does not exist
+ in our read view. Skip it, but
+ first attempt to determine
+ whether the index segment we
+ are searching through has been
+ exhausted. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* Fetch the columns needed in
+ test conditions. The clustered
+ index record is protected by a
+ page latch that was acquired
+ by row_sel_open_pcur() or
+ row_sel_restore_pcur_pos().
+ The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(
+ index, rec, offsets,
+ UT_LIST_GET_FIRST(
+ plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions. The record is
+ protected by a page latch that was acquired by
+ row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
+ will not be released until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which are already present in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must also fetch
+ the clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(plan->table))) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch
+ || plan->table->big_rows) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
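+ /* Prefetch is in effect from here on: buffer the row so that
+ a later fetch can consume it without repositioning the
+ cursor. */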
+ sel_enqueue_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we accessed a different clustered
+ index page right away without releasing the previous one. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_dequeue_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+ } else {
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+ err = DB_SUCCESS;
+ goto func_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr were not committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_except_dict());
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<sel_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after a wait for a table intention lock), set
+ intention locks on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started_xa(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ sym_node_t* table_node;
+ enum lock_mode i_lock_mode;
+
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ for (table_node = node->table_list;
+ table_node != 0;
+ table_node = static_cast<sym_node_t*>(
+ que_node_get_next(table_node))) {
+
+ dberr_t err = lock_table(
+ 0, table_node->table, i_lock_mode,
+ thr);
+
+ if (err != DB_SUCCESS) {
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+ trx->error_state = err;
+
+ return(NULL);
+ }
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor
+ && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ dberr_t err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = static_cast<fetch_node_t*>(thr->run_node);
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ if (node->into_list) {
+ sel_assign_into_var_values(node->into_list,
+ sel_node);
+ } else {
+ ibool ret = (*node->func->func)(
+ sel_node, node->func->arg);
+
+ if (!ret) {
+ sel_node->state
+ = SEL_NODE_NO_MORE_ROWS;
+ }
+ }
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ fprintf(stderr,
+ "InnoDB: Error: fetch called on a closed cursor\n");
+
+ thr_get_trx(thr)->error_state = DB_ERROR;
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: not used */
+{
+ que_node_t* exp;
+ ulint i = 0;
+ sel_node_t* node = static_cast<sel_node_t*>(row);
+
+ UT_NOT_USED(user_arg);
+
+ fprintf(stderr, "row_fetch_print: row %p\n", row);
+
+ for (exp = node->select_list;
+ exp != 0;
+ exp = que_node_get_next(exp), i++) {
+
+ dfield_t* dfield = que_node_get_val(exp);
+ const dtype_t* type = dfield_get_type(dfield);
+
+ fprintf(stderr, " column %lu:\n", (ulong) i);
+
+ dtype_print(type);
+ putc('\n', stderr);
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
+ ut_print_buf(stderr, dfield_get_data(dfield),
+ dfield_get_len(dfield));
+ putc('\n', stderr);
+ } else {
+ fputs(" <NULL>;\n", stderr);
+ }
+ }
+
+ return((void*)42);
+}
+
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = static_cast<row_printf_node_t*>(thr->run_node);
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field len and print a warning if
+such appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions; NOTE that dtuple->data
+ may end up pointing inside buf so
+ do not discard that buffer while
+ the tuple is being used. See
+ row_mysql_store_col_in_innobase_format()
+ in the case of DATA_INT */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx) /*!< in: transaction */
+{
+ byte* original_buf = buf;
+ const byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ const byte* key_end;
+ ulint n_fields = 0;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+
+ ulint type = dfield_get_type(dfield)->mtype;
+ ut_a(field->col->mtype == type);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_null(dfield);
+
+ is_null = TRUE;
+ }
+ }
+
+ /* Calculate data length and data field total length */
+
+ if (type == DATA_BLOB) {
+ /* The key field is a column prefix of a BLOB or
+ TEXT */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length in the first 2
+ bytes after the optional SQL NULL marker byte. The
+ storage format is little-endian, that is, the most
+ significant byte at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer, even
+ though the actual value only takes data_len bytes
+ from the start. */
+
+ data_len = key_ptr[data_offset]
+ + 256 * key_ptr[data_offset + 1];
+ data_field_len = data_offset + 2 + field->prefix_len;
+
+ data_offset += 2;
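+
+ /* For example, an actual data length of 258 (0x0102) is
+ stored as the byte sequence 0x02 0x01, giving data_len
+ = 0x02 + 256 * 0x01 = 258. */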
+
+ /* Now that we know the length, we store the column
+ value as if it were a fixed char field */
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+ 0xff. TODO: in that case, does it do any harm to compare
+ with the full prefix_len bytes? How do characters
+ 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if (UNIV_UNLIKELY
+ (dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR)
+ && UNIV_LIKELY(type != DATA_INT)) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
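+
+ /* For illustration: a nullable true VARCHAR(5) latin1 key
+ part thus occupies 1 (NULL flag) + 2 (length) + 5 (padded
+ payload) = 8 bytes of the key buffer. */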
+
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (UNIV_LIKELY(!is_null)) {
+ buf = row_mysql_store_col_in_innobase_format(
+ dfield, buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset, data_len,
+ dict_table_is_comp(index->table));
+ ut_a(buf <= original_buf + buf_len);
+ }
+
+ key_ptr += data_field_len;
+
+ if (UNIV_UNLIKELY(key_ptr > key_end)) {
+ /* The last field in the key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+ trick to calculate LIKE 'abc%' type queries there
+ should never be partial-field prefixes in searches. */
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Warning: using a partial-field"
+ " key prefix in search.\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, ". Last data field length %lu bytes,\n"
+ "InnoDB: key ptr now exceeds"
+ " key end by %lu bytes.\n"
+ "InnoDB: Key value in the MySQL format:\n",
+ (ulong) data_field_len,
+ (ulong) (key_ptr - key_end));
+ fflush(stderr);
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ putc('\n', stderr);
+
+ if (!is_null) {
+ ulint len = dfield_get_len(dfield);
+ dfield_set_len(dfield, len
+ - (ulint) (key_ptr - key_end));
+ }
+ ut_ad(0);
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
+
+/**************************************************************//**
+Stores the row id to the prebuilt struct. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
+ const rec_t* index_rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const ulint* offsets) /*!< in: rec_get_offsets
+ (index_rec, index) */
+{
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+ data = rec_get_nth_field(
+ index_rec, offsets,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
+ fprintf(stderr,
+ "InnoDB: Error: Row id field is"
+ " wrong length %lu in ", (ulong) len);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: Field number %lu, record:\n",
+ (ulong) dict_index_get_sys_col_pos(index,
+ DATA_ROW_ID));
+ rec_print_new(stderr, index_rec, offsets);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+#ifdef UNIV_DEBUG
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+#else /* UNIV_DEBUG */
+/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
+# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
+ row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.cc. */
+static __attribute__((nonnull))
+void
+row_sel_field_store_in_mysql_format_func(
+/*=====================================*/
+ byte* dest, /*!< in/out: buffer where to store; NOTE
+ that BLOBs are not in themselves
+ stored here: the caller must allocate
+ and copy the BLOB into the buffer beforehand,
+ and pass the pointer to the BLOB in
+ 'data' */
+ const mysql_row_templ_t* templ,
+ /*!< in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len,
+ mbminlen, mbmaxlen */
+#ifdef UNIV_DEBUG
+ const dict_index_t* index,
+ /*!< in: InnoDB index */
+ ulint field_no,
+ /*!< in: templ->rec_field_no or
+ templ->clust_rec_field_no or
+ templ->icp_rec_field_no */
+#endif /* UNIV_DEBUG */
+ const byte* data, /*!< in: data to store */
+ ulint len) /*!< in: length of the data */
+{
+ byte* ptr;
+#ifdef UNIV_DEBUG
+ const dict_field_t* field
+ = dict_index_get_nth_field(index, field_no);
+#endif /* UNIV_DEBUG */
+
+ ut_ad(len != UNIV_SQL_NULL);
+ UNIV_MEM_ASSERT_RW(data, len);
+ UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
+ UNIV_MEM_INVALID(dest, templ->mysql_col_len);
+
+ switch (templ->type) {
+ const byte* field_end;
+ byte* pad;
+ case DATA_INT:
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
+
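+ /* For example, a 4-byte signed value 1 is stored by InnoDB
+ big-endian with the sign bit flipped, as 0x80 0x00 0x00 0x01;
+ the loop below reverses it to 0x01 0x00 0x00 0x80 and the XOR
+ afterwards restores the sign bit, yielding the little-endian
+ bytes 0x01 0x00 0x00 0x00. */
+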
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!templ->is_unsigned) {
+ dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+ }
+
+ ut_ad(templ->mysql_col_len == len);
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, templ->mysql_length_bytes);
+ /* Copy the actual data. Leave the rest of the
+ buffer uninitialized. */
+ memcpy(dest, data, len);
+ break;
+ }
+
+ /* Copy the actual data */
+ ut_memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. */
+
+ pad = dest + len;
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We treat some Unicode charset strings specially. */
+ switch (templ->mbminlen) {
+ case 4:
+ /* InnoDB should never have stripped partial
+ UTF-32 characters. */
+ ut_a(!(len & 3));
+ break;
+ case 2:
+ /* A space char is two bytes,
+ 0x0020 in UCS2 and UTF-16 */
+
+ if (UNIV_UNLIKELY(len & 1)) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad < field_end) {
+ *pad++ = 0x20;
+ }
+ }
+ }
+
+ row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ break;
+
+ case DATA_MYSQL:
+ memcpy(dest, data, len);
+
+ ut_ad(templ->mysql_col_len >= len);
+ ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+ /* If field_no equals templ->icp_rec_field_no,
+ we are examining a row pointed to by "icp_rec_field_no".
+ There is a possibility that icp_rec_field_no refers to
+ a field in a secondary index while templ->rec_field_no
+ points to a field in the primary index. The lengths
+ should still be equal, unless the field pointed to
+ by icp_rec_field_no has a prefix */
+ ut_ad(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0));
+
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len
+ || (field_no == templ->icp_rec_field_no
+ && field->prefix_len > 0));
+ ut_ad(!(field->prefix_len % templ->mbmaxlen));
+
+ if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.cc, function
+ row_mysql_store_col_in_innobase_format(). */
+
+ memset(dest + len, 0x20, templ->mysql_col_len - len);
+ }
+ break;
+
+ default:
+#ifdef UNIV_DEBUG
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+#endif /* UNIV_DEBUG */
+ ut_ad(field->prefix_len
+ ? field->prefix_len == len
+ : templ->mysql_col_len == len);
+ memcpy(dest, data, len);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+ row_sel_store_mysql_field_func(m,p,r,i,o,f,t)
+#else /* UNIV_DEBUG */
+/** Convert a field from Innobase format to MySQL format. */
+# define row_sel_store_mysql_field(m,p,r,i,o,f,t) \
+ row_sel_store_mysql_field_func(m,p,r,o,f,t)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Convert a field in the Innobase format to a field in the MySQL format. */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_field_func(
+/*===========================*/
+ byte* mysql_rec, /*!< out: record in the
+ MySQL format */
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */
+ const rec_t* rec, /*!< in: InnoDB record;
+ must be protected by
+ a page latch */
+#ifdef UNIV_DEBUG
+ const dict_index_t* index, /*!< in: index of rec */
+#endif
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint field_no, /*!< in: templ->rec_field_no or
+ templ->clust_rec_field_no or
+ templ->icp_rec_field_no */
+ const mysql_row_templ_t*templ) /*!< in: row template */
+{
+ const byte* data;
+ ulint len;
+
+ ut_ad(prebuilt->default_rec);
+ ut_ad(templ);
+ ut_ad(templ >= prebuilt->mysql_template);
+ ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
+ ut_ad(field_no == templ->clust_rec_field_no
+ || field_no == templ->rec_field_no
+ || field_no == templ->icp_rec_field_no);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
+
+ mem_heap_t* heap;
+ /* Copy an externally stored field to a temporary heap */
+
+ ut_a(!prebuilt->trx->has_search_latch);
+ ut_ad(field_no == templ->clust_rec_field_no);
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ heap = prebuilt->blob_heap;
+ } else {
+ heap = mem_heap_create(UNIV_PAGE_SIZE);
+ }
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(prebuilt->table),
+ field_no, &len, heap);
+
+ if (UNIV_UNLIKELY(!data)) {
+ /* The externally stored field was not written
+ yet. This record should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+
+ ut_a(prebuilt->trx->isolation_level
+ == TRX_ISO_READ_UNCOMMITTED);
+ return(FALSE);
+ }
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+
+ if (heap != prebuilt->blob_heap) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* Field is stored in the row. */
+
+ data = rec_get_nth_field(rec, offsets, field_no, &len);
+
+ if (len == UNIV_SQL_NULL) {
+ /* MySQL assumes that the field for an SQL
+ NULL value is set to the default value. */
+ ut_ad(templ->mysql_null_bit_mask);
+
+ UNIV_MEM_ASSERT_RW(prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ return(TRUE);
+ }
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+
+ /* It is a BLOB field locally stored in the
+ InnoDB record: we MUST copy its contents to
+ prebuilt->blob_heap here because
+ row_sel_field_store_in_mysql_format() stores a
+ pointer to the data, and the data passed to us
+ will be invalid as soon as the
+ mini-transaction is committed and the page
+ latch on the clustered index page is
+ released. */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ data = static_cast<byte*>(
+ mem_heap_dup(prebuilt->blob_heap, data, len));
+ }
+
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, index, field_no, data, len);
+ }
+
+ ut_ad(len != UNIV_SQL_NULL);
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= ~(byte) templ->mysql_null_bit_mask;
+ }
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+ columns to mysql_rec; the other columns are left blank, as not all
+ columns may be needed in the query.
+@return TRUE on success, FALSE if not all columns could be retrieved */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+ byte* mysql_rec, /*!< out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: Innobase record in the index
+ which was described in prebuilt's
+ template, or in the clustered index;
+ must be protected by a page latch */
+ ibool rec_clust, /*!< in: TRUE if rec is in the
+ clustered index instead of
+ prebuilt->index */
+ const dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets) /*!< in: array returned by
+ rec_get_offsets(rec) */
+{
+ ulint i;
+
+ ut_ad(rec_clust || index == prebuilt->index);
+ ut_ad(!rec_clust || dict_index_is_clust(index));
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+ const ulint field_no
+ = rec_clust
+ ? templ->clust_rec_field_no
+ : templ->rec_field_no;
+ /* We should never deliver column prefixes to MySQL,
+ except for evaluating innobase_index_cond(). */
+ ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
+ == 0);
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, index, offsets,
+ field_no, templ)) {
+ return(FALSE);
+ }
+ }
+
+ /* FIXME: We only need to read the doc_id if an FTS indexed
+ column is being updated.
+ NOTE: the record must be a clustered index record, since a
+ secondary index might not contain the Doc ID */
+ if (dict_table_has_fts_index(prebuilt->table)
+ && dict_index_is_clust(index)) {
+
+ prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
+ prebuilt->table, rec, NULL);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ read_view_t* read_view, /*!< in: read view */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets, read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/*!< in: secondary index where rec resides */
+ const rec_t* rec, /*!< in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /*!< in: query thread */
+ const rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ ulint** offsets,/*!< in: offsets returned by
+ rec_get_offsets(rec, sec_index);
+ out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ const rec_t* clust_rec;
+ rec_t* old_vers;
+ dberr_t err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+ sec_index, *offsets, trx);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ &prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur.trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.cc
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: error clustered record"
+ " for sec rec not found\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, sec_index);
+ fputs("\n"
+ "InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 600);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ ut_ad(0);
+ }
+
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&prebuilt->clust_pcur),
+ clust_rec, clust_index, *offsets,
+ static_cast<enum lock_mode>(prebuilt->select_lock_type),
+ LOCK_REC_NOT_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(
+ clust_rec, clust_index, *offsets,
+ trx->read_view)) {
+
+ /* The following call returns 'offsets' associated with
+ 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index, prebuilt,
+ clust_rec, offsets, offset_heap, &old_vers,
+ mtr);
+
+ if (err != DB_SUCCESS || old_vers == NULL) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of the row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such a row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we did not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index)) {
+ clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+ } else {
+ ut_a(clust_rec == NULL
+ || row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index));
+#endif
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ /* Store the current position if select_lock_type is not
+ LOCK_NONE or if we are scanning using InnoDB APIs */
+ if (prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->innodb_api) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(&prebuilt->clust_pcur, mtr);
+ }
+
+err_exit:
+ return(err);
+}
+
+/********************************************************************//**
+ Restores the cursor position after it has been stored. We have to take
+ into account that the record the cursor was positioned on may have been
+ deleted. Then we may have to move the cursor one step up or down.
+@return TRUE if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ ibool* same_user_rec, /*!< out: TRUE if we were able to restore
+ the cursor on a user record with the
+ same ordering prefix in the
+ B-tree index */
+ ulint latch_mode, /*!< in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ ibool moves_up, /*!< in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ *same_user_rec = success;
+
+ ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
+#ifdef UNIV_DEBUG
+ if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
+ ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
+ || pcur->rel_pos == BTR_PCUR_AFTER);
+ } else {
+ ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad((pcur->rel_pos == BTR_PCUR_ON)
+ == btr_pcur_is_on_user_rec(pcur));
+ }
+#endif
+
+ /* The position may need to be adjusted for rel_pos and moves_up. */
+
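+ /* A reading aid for the cases below: only when the cursor is
+ restored exactly onto the stored user record (rel_pos ==
+ BTR_PCUR_ON and the restore succeeded) do we return FALSE,
+ meaning the record was already processed; in all other cases
+ the cursor may first be nudged one step, and TRUE tells the
+ caller to examine the record now under the cursor. */
+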
+ switch (pcur->rel_pos) {
+ case BTR_PCUR_ON:
+ if (!success && moves_up) {
+next:
+ btr_pcur_move_to_next(pcur, mtr);
+ return(TRUE);
+ }
+ return(!success);
+ case BTR_PCUR_AFTER_LAST_IN_TREE:
+ case BTR_PCUR_BEFORE_FIRST_IN_TREE:
+ return(TRUE);
+ case BTR_PCUR_AFTER:
+ /* positioned to record after pcur->old_rec. */
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+prev:
+ if (btr_pcur_is_on_user_rec(pcur) && !moves_up) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+ return(TRUE);
+ case BTR_PCUR_BEFORE:
+ /* For non-optimistic restoration:
+ The position is now set to the record before pcur->old_rec.
+
+ For optimistic restoration:
+ The position also needs to take the previous search_mode into
+ consideration. */
+
+ switch (pcur->pos_state) {
+ case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+ if (pcur->search_mode == PAGE_CUR_GE) {
+ /* Positioned during Greater or Equal search
+ with BTR_PCUR_BEFORE. Optimistic restore to
+ the same record. If scanning for lower, then
+ we must move to the previous record.
+ This can happen with:
+ HANDLER READ idx a = (const);
+ HANDLER READ idx PREV; */
+ goto prev;
+ }
+ return(TRUE);
+ case BTR_PCUR_IS_POSITIONED:
+ if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+ goto next;
+ }
+ return(TRUE);
+ case BTR_PCUR_WAS_POSITIONED:
+ case BTR_PCUR_NOT_POSITIONED:
+ break;
+ }
+ }
+ ut_ad(0);
+ return(TRUE);
+}
+
+/********************************************************************//**
+Copies a cached field for MySQL from the fetch cache. */
+static
+void
+row_sel_copy_cached_field_for_mysql(
+/*================================*/
+ byte* buf, /*!< in/out: row buffer */
+ const byte* cache, /*!< in: cached row */
+ const mysql_row_templ_t*templ) /*!< in: column template */
+{
+ ulint len;
+
+ buf += templ->mysql_col_offset;
+ cache += templ->mysql_col_offset;
+
+ UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
+ && templ->type != DATA_INT) {
+ /* Check for != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR!
+ Find the actual length of the true VARCHAR field. */
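+ /* For example, with mysql_length_bytes == 2 and an
+ actual length of 3, we copy 2 + 3 = 5 bytes below
+ rather than the whole mysql_col_len. */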
+ row_mysql_read_true_varchar(
+ &len, cache, templ->mysql_length_bytes);
+ len += templ->mysql_length_bytes;
+ UNIV_MEM_INVALID(buf, templ->mysql_col_len);
+ } else {
+ len = templ->mysql_col_len;
+ }
+
+ ut_memcpy(buf, cache, len);
+}
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_dequeue_cached_row_for_mysql(
+/*=================================*/
+ byte* buf, /*!< in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
+{
+ ulint i;
+ const mysql_row_templ_t*templ;
+ const byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+ ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
+ UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
+
+ cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+ if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
+ /* Copy cache record field by field, don't touch fields that
+ are not covered by the current key */
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, templ);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
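+ /* The expression copies just the masked NULL bit
+ from cached_rec into buf, leaving the other bits
+ of the NULL byte in buf untouched. */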
+ if (templ->mysql_null_bit_mask) {
+ buf[templ->mysql_null_byte_offset]
+ ^= (buf[templ->mysql_null_byte_offset]
+ ^ cached_rec[templ->mysql_null_byte_offset])
+ & (byte) templ->mysql_null_bit_mask;
+ }
+ }
+ } else if (prebuilt->mysql_prefix_len > 63) {
+ /* The record is long. Copy it field by field, in case
+ there are some long VARCHAR columns of which only a
+ small length is being used. */
+ UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
+
+ /* First copy the NULL bits. */
+ ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
+ /* Then copy the requested fields. */
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ row_sel_copy_cached_field_for_mysql(
+ buf, cached_rec, prebuilt->mysql_template + i);
+ }
+ } else {
+ ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
+ }
+
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/********************************************************************//**
+Initialise the prefetch cache. */
+UNIV_INLINE
+void
+row_sel_prefetch_cache_init(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ulint i;
+ ulint sz;
+ byte* ptr;
+
+ /* Reserve space for the magic number. */
+ sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
+ ptr = static_cast<byte*>(mem_alloc(sz));
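+
+ /* Each slot is laid out by the loop below as
+ [4-byte magic | mysql_row_len bytes of row | 4-byte magic],
+ with prebuilt->fetch_cache[i] pointing just past the leading
+ magic number. */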
+
+ for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
+
+ /* A user has reported memory corruption in these
+ buffers on Linux. Put magic numbers there to help
+ track down a possible bug. */
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+
+ prebuilt->fetch_cache[i] = ptr;
+ ptr += prebuilt->mysql_row_len;
+
+ mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
+ ptr += 4;
+ }
+}
+
+/********************************************************************//**
+Get the last fetch cache buffer from the queue.
+@return pointer to buffer. */
+UNIV_INLINE
+byte*
+row_sel_fetch_last_buf(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ ut_ad(!prebuilt->templ_contains_blob);
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+ ut_ad(prebuilt->n_fetch_cached == 0);
+
+ row_sel_prefetch_cache_init(prebuilt);
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+ UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt->mysql_row_len);
+
+ return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. */
+UNIV_INLINE
+void
+row_sel_enqueue_cache_row_for_mysql(
+/*================================*/
+ byte* mysql_rec, /*!< in/out: MySQL record */
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ /* For the non-ICP code path the row should already exist in the
+ next fetch cache slot. */
+
+ if (prebuilt->idx_cond != NULL) {
+ byte* dest = row_sel_fetch_last_buf(prebuilt);
+
+ ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
+ }
+
+ ++prebuilt->n_fetch_cached;
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). We assume that the search
+ mode is PAGE_CUR_GE, that this is a consistent read with a read view in
+ trx, and that the btr search latch has been locked in S-mode if AHI is
+ enabled.
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ const rec_t** out_rec,/*!< out: record if found */
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
+ ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = &prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ const rec_t* rec;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!prebuilt->templ_contains_blob);
+
+#ifndef UNIV_SEARCH_DEBUG
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+ (trx->has_search_latch)
+ ? RW_S_LATCH
+ : 0,
+ mtr);
+#else /* UNIV_SEARCH_DEBUG */
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+ 0,
+ mtr);
+#endif /* UNIV_SEARCH_DEBUG */
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ ULINT_UNDEFINED, heap);
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ *offsets, trx->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ *out_rec = rec;
+
+ return(SEL_FOUND);
+}
+
+/*********************************************************************//**
+Check a pushed-down index condition.
+@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
+static
+enum icp_result
+row_search_idx_cond_check(
+/*======================*/
+ byte* mysql_rec, /*!< out: record
+ in MySQL format (invalid unless
+ prebuilt->idx_cond!=NULL and
+ we return ICP_MATCH) */
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ for the table handle */
+ const rec_t* rec, /*!< in: InnoDB record */
+ const ulint* offsets) /*!< in: rec_get_offsets() */
+{
+ enum icp_result result;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
+
+ if (!prebuilt->idx_cond) {
+ return(ICP_MATCH);
+ }
+
+ MONITOR_INC(MONITOR_ICP_ATTEMPTS);
+
+ /* Convert to MySQL format those fields that are needed for
+ evaluating the index condition. */
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_empty(prebuilt->blob_heap);
+ }
+
+ for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
+ const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
+
+ if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
+ rec, prebuilt->index, offsets,
+ templ->icp_rec_field_no,
+ templ)) {
+ return(ICP_NO_MATCH);
+ }
+ }
+
+ /* We assume that the index conditions on
+ case-insensitive columns are case-insensitive. The
+ case of such columns may be wrong in a secondary
+ index, if the case of the column has been updated in
+ the past, or a record has been deleted and a record
+ inserted in a different case. */
+ result = innobase_index_cond(prebuilt->idx_cond);
+ switch (result) {
+ case ICP_MATCH:
+ /* Convert the remaining fields to MySQL format.
+ If this is a secondary index record, we must defer
+ this until we have fetched the clustered index record. */
+ if (!prebuilt->need_to_access_clustered
+ || dict_index_is_clust(prebuilt->index)) {
+ if (!row_sel_store_mysql_rec(
+ mysql_rec, prebuilt, rec, FALSE,
+ prebuilt->index, offsets)) {
+ ut_ad(dict_index_is_clust(prebuilt->index));
+ return(ICP_NO_MATCH);
+ }
+ }
+ MONITOR_INC(MONITOR_ICP_MATCH);
+ return(result);
+ case ICP_NO_MATCH:
+ MONITOR_INC(MONITOR_ICP_NO_MATCH);
+ return(result);
+ case ICP_OUT_OF_RANGE:
+ MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
+ return(result);
+ }
+
+ ut_error;
+ return(result);
+}
+
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+ position, and fetch next or fetch prev must not be tried on the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+dberr_t
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with a stored position! When opening a
+ cursor, 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ ibool comp = dict_table_is_comp(index->table);
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = &prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ const rec_t* rec;
+ const rec_t* result_rec = NULL;
+ const rec_t* clust_rec;
+ dberr_t err = DB_SUCCESS;
+ ibool unique_search = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ ibool set_also_gap_locks = TRUE;
+ /* if the query is a plain locking SELECT, and the isolation level
+ is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
+ ibool did_semi_consistent_read = FALSE;
+ /* if the returned record was locked and we did a semi-consistent
+ read (fetch the newest committed version), then this is set to
+ TRUE */
+#ifdef UNIV_SEARCH_DEBUG
+ ulint cnt = 0;
+#endif /* UNIV_SEARCH_DEBUG */
+ ulint next_offs;
+ ibool same_user_rec;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool table_lock_waited = FALSE;
+ byte* next_buf = 0;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(index && pcur && search_tuple);
+
+ /* We don't support FTS queries from the HANDLER interfaces, because
+ we implemented FTS as a reversed inverted index with auxiliary
+ tables. So anything related to a traditional index query would not
+ apply to it. */
+ if (index->type & DICT_FTS) {
+ return(DB_END_OF_INDEX);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (dict_table_is_discarded(prebuilt->table)) {
+
+ return(DB_TABLESPACE_DELETED);
+
+ } else if (prebuilt->table->ibd_file_missing) {
+
+ return(DB_TABLESPACE_NOT_FOUND);
+
+ } else if (!prebuilt->index_usable) {
+
+ return(DB_MISSING_HISTORY);
+
+ } else if (dict_index_is_corrupted(index)) {
+
+ return(DB_CORRUPTION);
+
+ } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+#if 0
+ /* August 19, 2005 by Heikki: temporarily disable this error
+ print until the cursor lock count is done correctly.
+ See bugs #12263 and #12456!*/
+
+ if (trx->n_mysql_tables_in_use == 0
+ && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
+ /* Note that if MySQL uses an InnoDB temp table that it
+ created inside LOCK TABLES, then n_mysql_tables_in_use can
+ be zero; in that case select_lock_type is set to LOCK_X in
+ ::start_stmt. */
+
+ fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
+ "InnoDB: but it has not locked"
+ " any tables in ::external_lock()!\n",
+ stderr);
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ }
+#endif
+
+#if 0
+ fprintf(stderr, "Match mode %lu\n search tuple ",
+ (ulong) match_mode);
+ dtuple_print(search_tuple);
+ fprintf(stderr, "N tables locked %lu\n",
+ (ulong) trx->mysql_n_tables_locked);
+#endif
+ /*-------------------------------------------------------------*/
+ /* PHASE 0: Release a possible s-latch we are holding on the
+ adaptive hash index latch if there is someone waiting behind */
+
+ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
+ && trx->has_search_latch) {
+
+ /* There is an x-latch request on the adaptive hash index:
+ release the s-latch to reduce starvation and wait for
+		BTR_SEA_TIMEOUT rounds before trying again to keep it over
+		calls from MySQL */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+ }
+
+	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
+	is set or the session is using a READ COMMITTED isolation level.
+	Then we are able to remove the record locks set here on an
+	individual row. */
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+ if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+			/* Prevent wraparound */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a
+ non-delete-marked matching record.
+
+ Note that in a unique secondary index there may be different
+ delete-marked versions of a record where only the primary key
+ values differ: thus in a secondary index we must use next-key
+ locks when locking delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && dict_index_is_unique(index)
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (dict_index_is_clust(index)
+ || !dtuple_contains_null(search_tuple))) {
+
+		/* Note above that a UNIQUE secondary index can contain many
+		rows with the same key value if one of the columns is SQL
+		NULL. A clustered index under MySQL can never contain NULL
+		columns, because we demand that all the columns in the
+		primary key are non-NULL. */
+
+ unique_search = TRUE;
+
+		/* Even if the condition is unique, MySQL seems to try to
+		retrieve a second row as well if the primary key contains
+		more than one column. Return immediately if this is not a
+		HANDLER command. */
+
+ if (UNIV_UNLIKELY(direction != 0
+ && !prebuilt->used_in_HANDLER)) {
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+ }
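+
+	/* A hypothetical illustration, not part of the original patch:
+	for CREATE TABLE t (a INT NOT NULL, b INT NOT NULL,
+	PRIMARY KEY (a, b)), the search WHERE a = 1 AND b = 2 supplies
+	all the columns of the unique key, so unique_search above is
+	assumed to become TRUE, whereas WHERE a = 1 alone leaves it
+	FALSE and the gap-locking rules below still apply. */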
+
+ mtr_start(&mtr);
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+	/* Next, test whether this is the special case where we can use the
+	fast adaptive hash index to try the search. Since we must release the
+	search system latch when we retrieve an externally stored field, we
+	cannot use the adaptive hash index in a search where the row
+	may be long and there may be externally stored fields */
+
+ if (UNIV_UNLIKELY(direction == 0)
+ && unique_search
+ && dict_index_is_clust(index)
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
+ && !prebuilt->innodb_api) {
+
+ mode = PAGE_CUR_GE;
+
+ if (trx->mysql_n_tables_locked == 0
+ && prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index.
+ NOTE that we must also test that
+ mysql_n_tables_locked == 0, because this might
+ also be INSERT INTO ... SELECT ... or
+ CREATE TABLE ... SELECT ... . Our algorithm is
+			NOT prepared for inserts interleaved with the SELECT,
+ and if we try that, we can deadlock on the adaptive
+ hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG
+ if (!trx->has_search_latch) {
+ rw_lock_s_lock(&btr_search_latch);
+ trx->has_search_latch = TRUE;
+ }
+#endif
+ switch (row_sel_try_search_shortcut_for_mysql(
+ &rec, prebuilt, &offsets, &heap,
+ &mtr)) {
+ case SEL_FOUND:
+#ifdef UNIV_SEARCH_DEBUG
+ ut_a(0 == cmp_dtuple_rec(search_tuple,
+ rec, offsets));
+#endif
+ /* At this point, rec is protected by
+ a page latch that was acquired by
+ row_sel_try_search_shortcut_for_mysql().
+ The latch will not be released until
+ mtr_commit(&mtr). */
+ ut_ad(!rec_get_deleted_flag(rec, comp));
+
+ if (prebuilt->idx_cond) {
+ switch (row_search_idx_cond_check(
+ buf, prebuilt,
+ rec, offsets)) {
+ case ICP_NO_MATCH:
+ case ICP_OUT_OF_RANGE:
+ goto shortcut_mismatch;
+ case ICP_MATCH:
+ goto shortcut_match;
+ }
+ }
+
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt,
+ rec, FALSE, index, offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such
+ records do not exist. Such
+ records may only be accessed
+ at the READ UNCOMMITTED
+ isolation level or when
+ rolling back a recovered
+ transaction. Rollback happens
+ at a lower level, not here. */
+
+ /* Proceed as in case SEL_RETRY. */
+ break;
+ }
+
+ shortcut_match:
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" shortcut\n", stderr); */
+
+ err = DB_SUCCESS;
+ goto release_search_latch_if_needed;
+
+ case SEL_EXHAUSTED:
+ shortcut_mismatch:
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 2\n", stderr); */
+
+ err = DB_RECORD_NOT_FOUND;
+release_search_latch_if_needed:
+ if (trx->search_latch_timeout > 0
+ && trx->has_search_latch) {
+
+ trx->search_latch_timeout--;
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ goto func_exit;
+
+ case SEL_RETRY:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* The state of a running trx can only be changed by the
+ thread that is currently serving the transaction. Because we
+ are that thread, we can read trx->state without holding any
+ mutex. */
+ ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
+
+ ut_ad(trx->state == TRX_STATE_NOT_STARTED
+ || trx->state == TRX_STATE_ACTIVE);
+
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->select_lock_type != LOCK_NONE
+ || trx->read_view);
+
+ trx_start_if_not_started(trx);
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+ && trx->mysql_thd != NULL
+ && thd_is_select(trx->mysql_thd)) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ /* Do some start-of-statement preparations */
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ if (UNIV_UNLIKELY
+ (trx->read_view == NULL
+ && prebuilt->select_lock_type == LOCK_NONE)) {
+
+ fputs("InnoDB: Error: MySQL is trying to"
+ " perform a consistent read\n"
+ "InnoDB: but the read view is not assigned!\n",
+ stderr);
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ ut_error;
+ }
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+wait_table_again:
+ err = lock_table(0, index->table,
+ prebuilt->select_lock_type == LOCK_S
+ ? LOCK_IS : LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ table_lock_waited = TRUE;
+ goto lock_table_wait;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+ /* Open or restore index cursor position */
+
+ if (UNIV_LIKELY(direction != 0)) {
+ ibool need_to_process = sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr);
+
+ if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ /* We did a semi-consistent read,
+ but the record was removed in
+ the meantime. */
+ prebuilt->row_read_type
+ = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ } else if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+ /* The cursor was positioned on the record
+ that we returned previously. If we need
+ to repeat a semi-consistent read as a
+ pessimistic locking read, the record
+ cannot be skipped. */
+
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+
+ pcur->trx_if_known = trx;
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!moves_up
+ && !page_rec_is_supremum(rec)
+ && set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the next index record
+ to prevent phantoms in ORDER BY ... DESC queries */
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ next_rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(
+ mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
+ pcur, false, 0, &mtr);
+ }
+
+rec_loop:
+ DEBUG_SYNC_C("row_search_rec_loop");
+ if (trx_is_interrupted(trx)) {
+ btr_pcur_store_position(pcur, &mtr);
+ err = DB_INTERRUPTED;
+ goto normal_return;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(!!page_rec_is_comp(rec) == comp);
+#ifdef UNIV_SEARCH_DEBUG
+ /*
+ fputs("Using ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+ page_get_page_no(page_align(rec)));
+ rec_print(stderr, rec, index);
+ printf("delete-mark: %lu\n",
+ rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
+ */
+#endif /* UNIV_SEARCH_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a lock on the index record */
+
+			/* If the innodb_locks_unsafe_for_binlog option is
+			used or this session is using a READ COMMITTED
+			isolation level, we do not lock gaps. The supremum
+			record is really a gap and therefore we do not set
+			locks there. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ if (comp) {
+ next_offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ } else {
+ next_offs = rec_get_next_offs(rec, FALSE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ }
+
+ if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
+
+wrong_offs:
+ if (srv_force_recovery == 0 || moves_up == FALSE) {
+ ut_print_timestamp(stderr);
+ buf_page_print(page_align(rec), 0,
+ BUF_PAGE_PRINT_NO_CRASH);
+ fprintf(stderr,
+ "\nInnoDB: rec address %p,"
+ " buf block fix count %lu\n",
+ (void*) rec, (ulong)
+ btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
+ ->page.buf_fix_count);
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". Run CHECK TABLE. You may need to\n"
+ "InnoDB: restore from a backup, or"
+ " dump + drop + reimport the table.\n",
+ stderr);
+ ut_ad(0);
+ err = DB_CORRUPTION;
+
+ goto lock_wait_or_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the rest of the page.\n",
+ stderr);
+
+ btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+ goto next_rec;
+ }
+ }
+ /*-------------------------------------------------------------*/
+
+ /* Calculate the 'offsets' associated with 'rec' */
+
+ ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX);
+ ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the record.\n",
+ stderr);
+
+ goto next_rec;
+ }
+ }
+
+	/* Note that we cannot trust the up_match value in the cursor at this
+	point, because we can arrive here after moving the cursor! Thus
+	we have to recompare rec and search_tuple to determine whether they
+	match closely enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record matches the search_tuple in
+		prebuilt completely: if not, then we return DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if the innodb_locks_unsafe_for_binlog
+				option is not set and this session is not
+				using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+#if 0
+ ut_print_name(stderr, trx, FALSE, index->name);
+ fputs(" record not found 3\n", stderr);
+#endif
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if the innodb_locks_unsafe_for_binlog
+				option is not set and this session is not
+				using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ /* The found record was not a match, but may be used
+ as NEXT record (index_next). Set the relative position
+ to BTR_PCUR_BEFORE, to reflect that the position of
+ the persistent cursor is before the found/stored row
+ (pcur->old_rec). */
+ ut_ad(pcur->rel_pos == BTR_PCUR_ON);
+ pcur->rel_pos = BTR_PCUR_BEFORE;
+
+ err = DB_RECORD_NOT_FOUND;
+#if 0
+ ut_print_name(stderr, trx, FALSE, index->name);
+ fputs(" record not found 4\n", stderr);
+#endif
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+		/* If the innodb_locks_unsafe_for_binlog option is used
+		or this session is using a READ COMMITTED isolation
+		level, we lock only the record, i.e., next-key locking is
+		not used. */
+
+ ulint lock_type;
+
+ if (!set_also_gap_locks
+ || srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ || (unique_search && !rec_get_deleted_flag(rec, comp))) {
+
+ goto no_gap_lock;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+no_gap_lock:
+ lock_type = LOCK_REC_NOT_GAP;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr);
+
+ switch (err) {
+ const rec_t* old_vers;
+ case DB_SUCCESS_LOCKED_REC:
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ }
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ case DB_LOCK_WAIT:
+ /* Never unlock rows that were part of a conflict. */
+ prebuilt->new_rec_locks = 0;
+
+ if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_TRY_SEMI_CONSISTENT)
+ || unique_search
+ || index != clust_index) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers, &mtr);
+
+ /* Check whether it was a deadlock or not, if not
+ a deadlock and the transaction had to wait then
+ release the lock it is waiting on. */
+
+ err = lock_trx_handle_wait(trx);
+
+ switch (err) {
+ case DB_SUCCESS:
+ /* The lock was granted while we were
+ searching for the last committed version.
+ Do a normal locking read. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED,
+ &heap);
+ goto locks_ok;
+ case DB_DEADLOCK:
+ goto lock_wait_or_error;
+ case DB_LOCK_WAIT:
+ err = DB_SUCCESS;
+ break;
+ default:
+ ut_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row was not yet committed */
+
+ goto next_rec;
+ }
+
+ did_semi_consistent_read = TRUE;
+ rec = old_vers;
+ break;
+ default:
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ if (UNIV_LIKELY(srv_force_recovery < 5)
+ && !lock_clust_rec_cons_read_sees(
+ rec, index, offsets, trx->read_view)) {
+
+ rec_t* old_vers;
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec, &offsets, &heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!lock_sec_rec_cons_read_sees(
+ rec, trx->read_view)) {
+ /* We should look at the clustered index.
+ However, as this is a non-locking read,
+ we can skip the clustered index lookup if
+ the condition does not match the secondary
+ index entry. */
+ switch (row_search_idx_cond_check(
+ buf, prebuilt, rec, offsets)) {
+ case ICP_NO_MATCH:
+ goto next_rec;
+ case ICP_OUT_OF_RANGE:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case ICP_MATCH:
+ goto requires_clust_rec;
+ }
+
+ ut_error;
+ }
+ }
+ }
+
+locks_ok:
+ /* NOTE that at this point rec can be an old version of a clustered
+ index record built for a consistent read. We cannot assume after this
+ point that rec is on a buffer pool page. Functions like
+ page_rec_is_comp() cannot be used! */
+
+ if (rec_get_deleted_flag(rec, comp)) {
+
+ /* The record is delete-marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE
+ && !did_semi_consistent_read) {
+
+ /* No need to keep a lock on a delete-marked record
+ if we do not want to use next-key locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+		/* This is an optimization to skip setting the next key lock
+		on the record that follows this delete-marked record. This
+		optimization works because the unique search criterion
+		precludes the presence of a range lock between this
+		delete-marked record and the record following it.
+
+		For now this is applicable only to clustered indexes while
+		doing a unique search, except for HANDLER queries, because
+		HANDLER allows NEXT and PREV even in a unique search on a
+		clustered index. There is scope for further optimization
+		applicable to unique secondary indexes. The current behaviour
+		is to widen the scope of a lock on an already delete-marked
+		record if the same record is deleted twice by the same
+		transaction */
+ if (index == clust_index && unique_search
+ && !prebuilt->used_in_HANDLER) {
+
+ err = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ goto next_rec;
+ }
+
+ /* Check if the record matches the index condition. */
+ switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
+ case ICP_NO_MATCH:
+ if (did_semi_consistent_read) {
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+ goto next_rec;
+ case ICP_OUT_OF_RANGE:
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ case ICP_MATCH:
+ break;
+ }
+
+ /* Get the clustered index record if needed, if we did not do the
+ search using the clustered index. */
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+
+requires_clust_rec:
+ ut_ad(index != clust_index);
+ /* We use a 'goto' to the preceding label if a consistent
+ read of a secondary index record requires us to look up old
+ versions of the associated clustered index record. */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ /* The following call returns 'offsets' associated with
+ 'clust_rec'. Note that 'clust_rec' can be an old version
+ built for a consistent read. */
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap, &mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ ut_a(clust_rec != NULL);
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+ /* Note that the clustered index record
+ was locked. */
+ prebuilt->new_rec_locks = 2;
+ }
+ err = DB_SUCCESS;
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, comp)) {
+
+ /* The record is delete marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* No need to keep a lock on a delete-marked
+ record if we do not want to use next-key
+ locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ goto next_rec;
+ }
+
+ result_rec = clust_rec;
+ ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
+
+ if (prebuilt->idx_cond) {
+ /* Convert the record to MySQL format. We were
+ unable to do this in row_search_idx_cond_check(),
+ because the condition is on the secondary index
+ and the requested column is in the clustered index.
+ We convert all fields, including those that
+ may have been used in ICP, because the
+ secondary index may contain a column prefix
+ rather than the full column. Also, as noted
+ in Bug #56680, the column in the secondary
+ index may be in the wrong case, and the
+ authoritative case is in result_rec, the
+ appropriate version of the clustered index record. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec,
+ TRUE, clust_index, offsets)) {
+ goto next_rec;
+ }
+ }
+ } else {
+ result_rec = rec;
+ }
+
+ /* We found a qualifying record 'result_rec'. At this point,
+ 'offsets' are associated with 'result_rec'. */
+
+ ut_ad(rec_offs_validate(result_rec,
+ result_rec != rec ? clust_index : index,
+ offsets));
+ ut_ad(!rec_get_deleted_flag(result_rec, comp));
+
+ /* At this point, the clustered index record is protected
+ by a page latch that was acquired when pcur was positioned.
+ The latch will not be released until mtr_commit(&mtr). */
+
+ if ((match_mode == ROW_SEL_EXACT
+ || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && !prebuilt->innodb_api
+ && prebuilt->template_type
+ != ROW_MYSQL_DUMMY_TEMPLATE
+ && !prebuilt->in_fts_query) {
+
+		/* Inside an update, for example, we do not cache rows,
+		since we may use the cursor position to do the actual
+		update; that is why we require ...lock_type == LOCK_NONE.
+		Since we keep space in prebuilt only for the BLOBs of
+		a single row, we cannot cache rows when there are BLOBs
+		in the fields to be fetched. In HANDLER we do not cache
+		rows, because there the cursor is a scrollable cursor. */
+
+ ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+
+ /* We only convert from InnoDB row format to MySQL row
+ format when ICP is disabled. */
+
+ if (!prebuilt->idx_cond) {
+
+ /* We use next_buf to track the allocation of buffers
+ where we store and enqueue the buffers for our
+ pre-fetch optimisation.
+
+ If next_buf == 0 then we store the converted record
+ directly into the MySQL record buffer (buf). If it is
+ != 0 then we allocate a pre-fetch buffer and store the
+ converted record there.
+
+ If the conversion fails and the MySQL record buffer
+ was not written to then we reset next_buf so that
+ we can re-use the MySQL record buffer in the next
+ iteration. */
+
+ next_buf = next_buf
+ ? row_sel_fetch_last_buf(prebuilt) : buf;
+
+ if (!row_sel_store_mysql_rec(
+ next_buf, prebuilt, result_rec,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+
+ if (next_buf == buf) {
+ ut_a(prebuilt->n_fetch_cached == 0);
+ next_buf = 0;
+ }
+
+ /* Only fresh inserts may contain incomplete
+ externally stored columns. Pretend that such
+ records do not exist. Such records may only be
+ accessed at the READ UNCOMMITTED isolation
+ level or when rolling back a recovered
+ transaction. Rollback happens at a lower
+ level, not here. */
+ goto next_rec;
+ }
+
+ if (next_buf != buf) {
+ row_sel_enqueue_cache_row_for_mysql(
+ next_buf, prebuilt);
+ }
+ } else {
+ row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
+ }
+
+ if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
+ goto next_rec;
+ }
+
+ } else {
+ if (UNIV_UNLIKELY
+ (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
+ /* CHECK TABLE: fetch the row */
+
+ if (result_rec != rec
+ && !prebuilt->need_to_access_clustered) {
+ /* We used 'offsets' for the clust
+ rec, recalculate them for 'rec' */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED,
+ &heap);
+ result_rec = rec;
+ }
+
+ memcpy(buf + 4, result_rec
+ - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets));
+ mach_write_to_4(buf,
+ rec_offs_extra_size(offsets) + 4);
+ } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) {
+ /* The record was not yet converted to MySQL format. */
+ if (!row_sel_store_mysql_rec(
+ buf, prebuilt, result_rec,
+ result_rec != rec,
+ result_rec != rec ? clust_index : index,
+ offsets)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such records do
+ not exist. Such records may only be
+ accessed at the READ UNCOMMITTED
+ isolation level or when rolling back a
+ recovered transaction. Rollback
+ happens at a lower level, not here. */
+ goto next_rec;
+ }
+ }
+
+ if (prebuilt->clust_index_was_generated) {
+ row_sel_store_row_id_to_prebuilt(
+ prebuilt, result_rec,
+ result_rec == rec ? index : clust_index,
+ offsets);
+ }
+ }
+
+ /* From this point on, 'offsets' are invalid. */
+
+	/* We have an optimization to save CPU time: if this is a consistent
+	read on a unique condition on the clustered index, then we do not
+	store the pcur position, because any fetch next or prev would
+	return 'end of file' anyway. Exceptions are locking reads and the
+	MySQL HANDLER command, where the user can move the cursor with PREV
+	or NEXT even after a unique search. */
+
+ err = DB_SUCCESS;
+
+idx_cond_failed:
+ if (!unique_search
+ || !dict_index_is_clust(index)
+ || direction != 0
+ || prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->used_in_HANDLER
+ || prebuilt->innodb_api) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (prebuilt->innodb_api) {
+ prebuilt->innodb_api_rec = result_rec;
+ }
+ }
+
+ goto normal_return;
+
+next_rec:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ /* NOTE: For moves_up==FALSE, the mini-transaction will be
+ committed and restarted every time when switching b-tree
+ pages. For moves_up==TRUE in index condition pushdown, we can
+ scan an entire secondary index tree within a single
+ mini-transaction. As long as the prebuilt->idx_cond does not
+ match, we do not need to consult the clustered index or
+ return records to MySQL, and thus we can avoid repositioning
+ the cursor. What prevents us from buffer-fixing all leaf pages
+ within the mini-transaction is the btr_leaf_page_release()
+ call in btr_pcur_move_to_next_page(). Only the leaf page where
+ the cursor is positioned will remain buffer-fixed. */
+
+ if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
+		/* We must commit mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we accessed a different clustered
+		index page right away without releasing the previous one. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ if (sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr)) {
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
+not_moved:
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ err = DB_RECORD_NOT_FOUND;
+ } else {
+ err = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
+ goto not_moved;
+ }
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+
+lock_wait_or_error:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+
+ /*-------------------------------------------------------------*/
+
+ btr_pcur_store_position(pcur, &mtr);
+
+lock_table_wait:
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ /* It was a lock wait, and it ended */
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr_start(&mtr);
+
+ /* Table lock waited, go try to obtain table lock
+ again */
+ if (table_lock_waited) {
+ table_lock_waited = FALSE;
+
+ goto wait_table_again;
+ }
+
+ sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && !same_user_rec) {
+
+ /* Since we were not able to restore the cursor
+ on the same user record, we cannot use
+ row_unlock_for_mysql() to unlock any records, and
+ we must thus reset the new rec lock info. Since
+ in lock0lock.cc we have blocked the inheriting of gap
+ X-locks, we actually do not have any new record locks
+ set in this case.
+
+ Note that if we were able to restore on the 'same'
+ user record, it is still possible that we were actually
+ waiting on a delete-marked record, and meanwhile
+ it was removed by purge and inserted again by some
+ other user. But that is no problem, because in
+ rec_loop we will again try to set a lock, and
+ new_rec_lock_info in trx will be right at the end. */
+
+ prebuilt->new_rec_locks = 0;
+ }
+
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+ goto func_exit;
+
+normal_return:
+ /*-------------------------------------------------------------*/
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->idx_cond != 0) {
+
+ /* When ICP is active we don't write to the MySQL buffer
+ directly, only to buffers that are enqueued in the pre-fetch
+ queue. We need to dequeue the first buffer and copy the contents
+ to the record buffer that was passed in by MySQL. */
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
+ err = DB_SUCCESS;
+ }
+
+ } else if (next_buf != 0) {
+
+		/* We may or may not have enqueued some buffers to the
+		pre-fetch queue, but we definitely wrote to the record
+		buffer passed to us by MySQL. */
+
+ DEBUG_SYNC_C("row_search_cached_row");
+ err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+
+func_exit:
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Set or reset the "did semi-consistent read" flag on return.
+ The flag did_semi_consistent_read is set if and only if
+ the record being returned was fetched with a semi-consistent read. */
+ ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+ || !did_semi_consistent_read);
+
+ if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
+ if (UNIV_UNLIKELY(did_semi_consistent_read)) {
+ prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
+
+ return(err);
+}
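+
+/* A hedged usage sketch, not part of the original patch: a handler
+method is assumed to open the cursor with direction == 0 and to fetch
+the following rows with ROW_SEL_NEXT. In the real server,
+ha_innobase::index_read() and ha_innobase::general_fetch() are believed
+to map the handler calls onto these arguments; see ha_innodb.cc for the
+authoritative use. */
+#if 0
+	dberr_t	ret;
+
+	/* open the cursor: direction == 0, plain range match */
+	ret = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt, 0, 0);
+
+	/* fetch the rest of the range */
+	while (ret == DB_SUCCESS) {
+		ret = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
+					   0, ROW_SEL_NEXT);
+	}
+#endif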
+
+/*******************************************************************//**
+Checks whether MySQL is currently allowed to retrieve a consistent read
+result for this table, or to store one in the query cache.
+@return TRUE if storing to or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name) /*!< in: concatenation of database name,
+ '/' char, table name */
+{
+ dict_table_t* table;
+ ibool ret = FALSE;
+
+	/* Disable the query cache altogether for all tables if recovered
+	XA transactions in prepared state exist. This is because we do not
+	restore the table locks for those transactions and we may wrongly
+	set ret=TRUE below if "lock_table_get_n_locks(table) == 0". See
+	"Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH
+	QUERY CACHE ENABLED".
+	We read trx_sys->n_prepared_recovered_trx without mutex protection;
+	a torn read is not possible, because n_prepared_recovered_trx is
+	word-sized. */
+ if (trx_sys->n_prepared_recovered_trx > 0) {
+
+ return(FALSE);
+ }
+
+ table = dict_table_open_on_name(norm_name, FALSE, FALSE,
+ DICT_ERR_IGNORE_NONE);
+
+ if (table == NULL) {
+
+ return(FALSE);
+ }
+
+ /* Start the transaction if it is not started yet */
+
+ trx_start_if_not_started(trx);
+
+ /* If there are locks on the table or some trx has invalidated the
+ cache up to our trx id, then ret = FALSE.
+ We do not check what type locks there are on the table, though only
+ IX type locks actually would require ret = FALSE. */
+
+ if (lock_table_get_n_locks(table) == 0
+ && trx->id >= table->query_cache_inv_trx_id) {
+
+ ret = TRUE;
+
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !trx->read_view) {
+
+ trx->read_view = read_view_open_now(
+ trx->id, trx->global_read_view_heap);
+
+ trx->global_read_view = trx->read_view;
+ }
+ }
+
+ dict_table_close(table, FALSE, FALSE);
+
+ return(ret);
+}
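+
+/* A hedged sketch of the expected call path, not part of the original
+patch: the caller passes the normalized "db/table" name, e.g.
+"test/t1". In the actual code, innobase_query_caching_of_table_permitted()
+in ha_innodb.cc is believed to perform this call. */
+#if 0
+	if (row_search_check_if_query_cache_permitted(trx, "test/t1")) {
+		/* the result may be stored in, or served from,
+		the query cache */
+	}
+#endif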
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. If the value is less than
+0 and the type is signed, then we reset the value to 0.
+@return value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+ dict_index_t* index, /*!< in: index to read from */
+ const rec_t* rec, /*!< in: current rec */
+ ulint col_no, /*!< in: column number */
+ ulint mtype, /*!< in: column main type */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ ulint len;
+ const byte* data;
+ ib_uint64_t value;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap);
+
+ if (rec_offs_nth_sql_null(offsets, col_no)) {
+ /* There is no non-NULL value in the auto-increment column. */
+ value = 0;
+ goto func_exit;
+ }
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ switch (mtype) {
+ case DATA_INT:
+ ut_a(len <= sizeof value);
+ value = mach_read_int_type(data, len, unsigned_type);
+ break;
+
+ case DATA_FLOAT:
+ ut_a(len == sizeof(float));
+ value = (ib_uint64_t) mach_float_read(data);
+ break;
+
+ case DATA_DOUBLE:
+ ut_a(len == sizeof(double));
+ value = (ib_uint64_t) mach_double_read(data);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (!unsigned_type && (ib_int64_t) value < 0) {
+ value = 0;
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(value);
+}
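+
+/* A hedged illustration, not part of the original patch: InnoDB is
+assumed to store signed integers big-endian with the sign bit inverted,
+so that the byte strings sort in numeric order, and
+mach_read_int_type() is assumed to undo that transformation. For a
+signed 4-byte INT the decoding would be roughly: */
+#if 0
+	const byte	data[4] = {0x80, 0x00, 0x00, 0x05};	/* stores 5 */
+	ib_uint64_t	value;
+
+	value = ((ulint) (data[0] ^ 0x80) << 24)
+		| ((ulint) data[1] << 16)
+		| ((ulint) data[2] << 8)
+		| (ulint) data[3];	/* value == 5 */
+#endif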
+
+/** Get the maximum non-delete-marked record in an index.
+@param[in] index index tree
+@param[in,out] mtr mini-transaction (may be committed and restarted)
+@return maximum record, page s-latched in mtr
+@retval NULL if there are no records, or if all of them are delete-marked */
+static
+const rec_t*
+row_search_get_max_rec(
+ dict_index_t* index,
+ mtr_t* mtr)
+{
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ /* Open at the high/right end (false), and init cursor */
+ btr_pcur_open_at_index_side(
+ false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
+
+ do {
+ const page_t* page;
+
+ page = btr_pcur_get_page(&pcur);
+ rec = page_find_rec_max_not_deleted(page);
+
+ if (page_rec_is_user_rec(rec)) {
+ break;
+ } else {
+ rec = NULL;
+ }
+ btr_pcur_move_before_first_on_page(&pcur);
+ } while (btr_pcur_move_to_prev(&pcur, mtr));
+
+ btr_pcur_close(&pcur);
+
+ return(rec);
+}
+
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all is OK, else an error code; DB_RECORD_NOT_FOUND
+if the column name cannot be found in the index */
+UNIV_INTERN
+dberr_t
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: name of autoinc column */
+ ib_uint64_t* value) /*!< out: AUTOINC value read */
+{
+ dict_field_t* dfield = dict_index_get_nth_field(index, 0);
+ dberr_t error = DB_SUCCESS;
+ *value = 0;
+
+ if (strcmp(col_name, dfield->name) != 0) {
+ error = DB_RECORD_NOT_FOUND;
+ } else {
+ mtr_t mtr;
+ const rec_t* rec;
+
+ mtr_start(&mtr);
+
+ rec = row_search_get_max_rec(index, &mtr);
+
+ if (rec != NULL) {
+ ibool unsigned_type = (
+ dfield->col->prtype & DATA_UNSIGNED);
+
+ *value = row_search_autoinc_read_column(
+ index, rec, 0,
+ dfield->col->mtype, unsigned_type);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ return(error);
+}
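+
+/* A hedged usage sketch, not part of the original patch: on first
+table open, the handler is assumed to seed the in-memory autoinc
+counter from the index whose first field is the AUTO_INCREMENT column;
+ha_innobase::innobase_initialize_autoinc() is believed to hold the
+authoritative logic. The column name "id" is hypothetical. */
+#if 0
+	ib_uint64_t	read_auto_inc = 0;
+
+	if (row_search_max_autoinc(index, "id", &read_auto_inc)
+	    == DB_SUCCESS) {
+		/* the next value to hand out is the maximum plus one */
+		dict_table_autoinc_initialize(table, read_auto_inc + 1);
+	}
+#endif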
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
new file mode 100644
index 00000000000..849bf096492
--- /dev/null
+++ b/storage/innobase/row/row0uins.cc
@@ -0,0 +1,475 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.cc
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "row0log.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check beforehand
+that there is enough space in the redo log for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST NOT hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***************************************************************//**
+Removes a clustered index record. The pcur in node was positioned on the
+record; now it is detached.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ undo_node_t* node) /*!< in: undo node */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ dberr_t err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+ dict_index_t* index = node->pcur.btr_cur.index;
+ bool online;
+
+ ut_ad(dict_index_is_clust(index));
+
+ mtr_start(&mtr);
+
+ /* This is similar to row_undo_mod_clust(). The DDL thread may
+ already have copied this row from the log to the new table.
+ We must log the removal, so that the row will be correctly
+ purged. However, we can log the removal out of sync with the
+ B-tree modification. */
+
+ online = dict_index_is_online_ddl(index);
+ if (online) {
+ ut_ad(node->trx->dict_operation_lock_mode
+ != RW_X_LATCH);
+ ut_ad(node->table->id != DICT_INDEXES_ID);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ success = btr_pcur_restore_position(
+ online
+ ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ : BTR_MODIFY_LEAF, &node->pcur, &mtr);
+ ut_a(success);
+
+ btr_cur = btr_pcur_get_btr_cur(&node->pcur);
+
+ ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur), btr_cur->index)
+ == node->trx->id);
+
+ if (online && dict_index_is_online_ddl(index)) {
+ const rec_t* rec = btr_cur_get_rec(btr_cur);
+ mem_heap_t* heap = NULL;
+ const ulint* offsets = rec_get_offsets(
+ rec, index, NULL, ULINT_UNDEFINED, &heap);
+ row_log_table_delete(rec, index, offsets, NULL);
+ mem_heap_free(heap);
+ }
+
+ if (node->table->id == DICT_INDEXES_ID) {
+ ut_ad(!online);
+ ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(
+ BTR_MODIFY_LEAF, &node->pcur, &mtr);
+ ut_a(success);
+ }
+
+ if (btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+retry:
+	/* If the optimistic delete did not succeed, try a pessimistic
+	descent down the tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ trx_is_recv(node->trx)
+ ? RB_RECOVERY
+ : RB_NORMAL, &mtr);
+
+	/* The delete operation may fail if we have little
+	file space left. TODO: it is easiest to crash the database
+	and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+func_exit:
+ btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_low(
+/*========================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry to remove */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ enum row_search_result search_result;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ if (mode == BTR_MODIFY_LEAF) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, 0)) {
+ goto func_exit_no_pcur;
+ }
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ case ROW_NOT_FOUND:
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (mode != BTR_MODIFY_TREE) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
+ ? DB_SUCCESS : DB_FAIL;
+ } else {
+ /* No need to distinguish RB_RECOVERY here, because we
+ are deleting a secondary index record: the distinction
+ between RB_NORMAL and RB_RECOVERY only matters when
+ deleting a record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ RB_NORMAL, &mtr);
+ }
+func_exit:
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+ dberr_t err;
+ ulint n_tries = 0;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+	/* The delete operation may fail if we have little
+	file space left. TODO: it is easiest to crash the database
+	and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /*!< in/out: row undo node */
+ ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint type;
+ ulint dummy;
+ bool dummy_extern;
+
+ ut_ad(node);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ node->update = NULL;
+ node->table = dict_table_open_on_id(
+ table_id, dict_locked, DICT_TABLE_OP_NORMAL);
+
+ /* Skip the UNDO if we can't find the table or the .ibd file. */
+ if (UNIV_UNLIKELY(node->table == NULL)) {
+ } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+close_table:
+ dict_table_close(node->table, dict_locked, FALSE);
+ node->table = NULL;
+ } else {
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index != NULL) {
+ trx_undo_rec_get_row_ref(
+ ptr, clust_index, &node->ref, node->heap);
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+ goto close_table;
+ }
+
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: table ");
+ ut_print_name(stderr, node->trx, TRUE,
+ node->table->name);
+ fprintf(stderr, " has no indexes, "
+ "ignoring the table\n");
+ goto close_table;
+ }
+ }
+}
+
+/***************************************************************//**
+Removes secondary index records.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_ins_remove_sec_rec(
+/*========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+{
+ dberr_t err = DB_SUCCESS;
+ dict_index_t* index = node->index;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(1024);
+
+ while (index != NULL) {
+ dtuple_t* entry;
+
+ if (index->type & DICT_FTS) {
+ dict_table_next_uncorrupted_index(index);
+ continue;
+ }
+
+ /* An insert undo record TRX_UNDO_INSERT_REC will
+ always contain all fields of the index. It does not
+ matter if any indexes were created afterwards; all
+ index entries can be reconstructed from the row. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record, or a statement is being rolled
+ back because an error occurred while storing
+ off-page columns.
+
+ Because secondary index entries are inserted
+ after the clustered index record, we may
+ assume that the secondary index record does
+ not exist. */
+ } else {
+ err = row_undo_ins_remove_sec(index, entry);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto func_exit;
+ }
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(index);
+ }
+
+func_exit:
+ node->index = index;
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even a
+delete-marked one, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+dberr_t
+row_undo_ins(
+/*=========*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ dberr_t err;
+ ibool dict_locked;
+
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
+
+ row_undo_ins_parse_undo_rec(node, dict_locked);
+
+ if (node->table == NULL) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+
+ /* Iterate over all the indexes and undo the insert.*/
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+
+ dict_table_skip_corrupt_index(node->index);
+
+ err = row_undo_ins_remove_sec_rec(node);
+
+ if (err == DB_SUCCESS) {
+
+ log_free_check();
+
+ if (node->table->id == DICT_INDEXES_ID) {
+
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+ }
+
+ // FIXME: We need to update the dict_index_t::space and
+ // page number fields too.
+ err = row_undo_ins_remove_clust_rec(node);
+
+ if (node->table->id == DICT_INDEXES_ID
+ && !dict_locked) {
+
+ mutex_exit(&dict_sys->mutex);
+ }
+ }
+
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
new file mode 100644
index 00000000000..29252c7834a
--- /dev/null
+++ b/storage/innobase/row/row0umod.cc
@@ -0,0 +1,1160 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.cc
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "row0log.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may have delete mark already FALSE, if the delete mark operation was
+stopped underway, or if the undo operation ended prematurely because of a
+system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
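+
+/* How these three cases map onto the undo record types dispatched in
+row_undo_mod() at the end of this file (the TRX_UNDO_* constants are
+defined in trx0rec.h):
+
+ (1) delete marking -> TRX_UNDO_DEL_MARK_REC
+                    -> row_undo_mod_del_mark_sec()
+ (2) update of a delete-unmarked record
+                    -> TRX_UNDO_UPD_EXIST_REC
+                    -> row_undo_mod_upd_exist_sec()
+ (3) update of a delete-marked record
+                    -> TRX_UNDO_UPD_DEL_REC
+                    -> row_undo_mod_upd_del_sec()
+
+In every case the secondary indexes are processed first, after which
+row_undo_mod_clust() undoes the change in the clustered index. */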
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This
+is done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
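+
+/* The pattern, as used by the functions in this file:
+
+ log_free_check();	(called while holding no latches)
+ mtr_start(&mtr);
+ ... latch and modify pages ...
+ mtr_commit(&mtr);
+
+See for example row_undo_mod_clust() and
+row_undo_mod_del_mark_or_remove_sec_low() below. */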
+
+/***********************************************************//**
+Undoes a modify in a clustered index record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust_low(
+/*===================*/
+ undo_node_t* node, /*!< in: row undo node */
+ ulint** offsets,/*!< out: rec_get_offsets() on the record */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap that can be emptied */
+ mem_heap_t* heap, /*!< in/out: memory heap */
+ const dtuple_t**rebuilt_old_pk,
+ /*!< out: row_log_table_get_pk()
+ before the update, or NULL if
+ the table is not being rebuilt online or
+ the PRIMARY KEY definition does not change */
+ byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR
+ for row_log_table_delete() */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr; must be committed before
+ latching any further pages */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+#ifdef UNIV_DEBUG
+ ibool success;
+#endif /* UNIV_DEBUG */
+
+ pcur = &node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+#ifdef UNIV_DEBUG
+ success =
+#endif /* UNIV_DEBUG */
+ btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+ ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur),
+ btr_cur_get_index(btr_cur))
+ == thr_get_trx(thr)->id);
+
+ if (mode != BTR_MODIFY_LEAF
+ && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) {
+ *rebuilt_old_pk = row_log_table_get_pk(
+ btr_cur_get_rec(btr_cur),
+ btr_cur_get_index(btr_cur), NULL, sys, &heap);
+ } else {
+ *rebuilt_old_pk = NULL;
+ }
+
+ if (mode != BTR_MODIFY_TREE) {
+ ut_ad((mode & ~BTR_ALREADY_S_LATCHED) == BTR_MODIFY_LEAF);
+
+ err = btr_cur_optimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ } else {
+ big_rec_t* dummy_big_rec;
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, offsets, offsets_heap, heap,
+ &dummy_big_rec, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+
+ ut_a(!dummy_big_rec);
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Purges a clustered index record after undo if possible.
+This is attempted when the record was inserted by updating a
+delete-marked record and there no longer exist transactions
+that would see the delete-marked record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_remove_clust_low(
+/*==========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_cur_t* btr_cur;
+ dberr_t err;
+ ulint trx_id_offset;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+
+ /* Find out if the record has been purged already
+ or if we can remove it. */
+
+ if (!btr_pcur_restore_position(mode, &node->pcur, mtr)
+ || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ return(DB_SUCCESS);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&node->pcur);
+
+ trx_id_offset = btr_cur_get_index(btr_cur)->trx_id_offset;
+
+ if (!trx_id_offset) {
+ mem_heap_t* heap = NULL;
+ ulint trx_id_col;
+ const ulint* offsets;
+ ulint len;
+
+ trx_id_col = dict_index_get_sys_col_pos(
+ btr_cur_get_index(btr_cur), DATA_TRX_ID);
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(btr_cur), btr_cur_get_index(btr_cur),
+ NULL, trx_id_col + 1, &heap);
+
+ trx_id_offset = rec_get_nth_field_offs(
+ offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ mem_heap_free(heap);
+ }
+
+ if (trx_read_trx_id(btr_cur_get_rec(btr_cur) + trx_id_offset)
+ != node->new_trx_id) {
+ /* The record must have been purged and then replaced
+ with a different one. */
+ return(DB_SUCCESS);
+ }
+
+ /* We are about to remove an old, delete-marked version of the
+ record that may have been delete-marked by a different transaction
+ than the rolling-back one. */
+ ut_ad(rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(node->table)));
+
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_delete(btr_cur, 0, mtr)
+ ? DB_SUCCESS
+ : DB_FAIL;
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* This operation is analogous to purge, we can free also
+ inherited externally stored fields */
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ thr_is_recv(thr)
+ ? RB_RECOVERY_PURGE_REC
+ : RB_NONE, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo.
+@return DB_SUCCESS or error code: we may run out of file space */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_clust(
+/*===============*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ dberr_t err;
+ dict_index_t* index;
+ bool online;
+
+ ut_ad(thr_get_trx(thr) == node->trx);
+ ut_ad(node->trx->dict_operation_lock_mode);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED)
+ || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ log_free_check();
+ pcur = &node->pcur;
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(pcur));
+
+ mtr_start(&mtr);
+
+ online = dict_index_is_online_ddl(index);
+ if (online) {
+ ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH);
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ mem_heap_t* heap = mem_heap_create(1024);
+ mem_heap_t* offsets_heap = NULL;
+ ulint* offsets = NULL;
+ const dtuple_t* rebuilt_old_pk;
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+ heap, &rebuilt_old_pk, sys,
+ thr, &mtr, online
+ ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ : BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(
+ node, &offsets, &offsets_heap,
+ heap, &rebuilt_old_pk, sys,
+ thr, &mtr, BTR_MODIFY_TREE);
+ ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Online rebuild cannot be initiated while we are holding
+ dict_operation_lock and index->lock. (It can be aborted.) */
+ ut_ad(online || !dict_index_is_online_ddl(index));
+
+ if (err == DB_SUCCESS && online) {
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&index->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&index->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ switch (node->rec_type) {
+ case TRX_UNDO_DEL_MARK_REC:
+ row_log_table_insert(
+ btr_pcur_get_rec(pcur), index, offsets);
+ break;
+ case TRX_UNDO_UPD_EXIST_REC:
+ row_log_table_update(
+ btr_pcur_get_rec(pcur), index, offsets,
+ rebuilt_old_pk);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ row_log_table_delete(
+ btr_pcur_get_rec(pcur), index, offsets, sys);
+ break;
+ default:
+ ut_ad(0);
+ break;
+ }
+ }
+
+ ut_ad(rec_get_trx_id(btr_pcur_get_rec(pcur), index)
+ == node->new_trx_id);
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ mtr_start(&mtr);
+
+ /* It is not necessary to call row_log_table,
+ because the record is delete-marked and would thus
+ be omitted from the rebuilt copy of the table. */
+ err = row_undo_mod_remove_clust_low(
+ node, thr, &mtr, BTR_MODIFY_LEAF);
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a
+ pessimistic descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_TREE);
+
+ ut_ad(err == DB_SUCCESS
+ || err == DB_OUT_OF_FILE_SPACE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+ mem_heap_free(heap);
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+ enum row_search_result search_result;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* The index->online_status may change if the
+ index->name starts with TEMP_INDEX_PREFIX (meaning
+ that the index is or was being created online). It is
+ protected by index->lock. */
+ if (mode == BTR_MODIFY_LEAF) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, 0)) {
+ goto func_exit_no_pcur;
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE unless
+ index->name starts with TEMP_INDEX_PREFIX. */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (UNIV_EXPECT(search_result, ROW_FOUND)) {
+ case ROW_NOT_FOUND:
+ /* In crash recovery, the secondary index record may
+ be missing if the UPDATE did not have time to insert
+ the secondary index records before the crash. When we
+ are undoing that UPDATE in crash recovery, the record
+ may be missing.
+
+ In normal processing, if an update ends in a deadlock
+ before it has inserted all updated secondary index
+ records, then the undo will not find those records. */
+ goto func_exit;
+ case ROW_FOUND:
+ break;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ }
+
+ /* We should remove the index record if no prior version of the row,
+ which cannot be purged yet, requires its existence. If any such
+ version requires it, we should delete-mark the record instead. */
+
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_a(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode != BTR_MODIFY_TREE) {
+ success = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ /* No need to distinguish RB_RECOVERY_PURGE here,
+ because we are deleting a secondary index record:
+ the distinction between RB_NORMAL and
+ RB_RECOVERY_PURGE only matters when deleting a
+ record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur, 0,
+ RB_NORMAL, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+func_exit:
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.cc, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ dberr_t err;
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
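+
+/* The two-phase pattern used above, in outline (op_low is a
+placeholder for the corresponding *_low function):
+
+ err = op_low(BTR_MODIFY_LEAF, ...);	(cheap, page-local)
+ if (err != DB_SUCCESS) {
+  err = op_low(BTR_MODIFY_TREE, ...);	(may reorganize the tree)
+ }
+
+The same pattern appears in row_undo_ins_remove_sec() in row0uins.cc
+and in row_undo_mod_clust() above. */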
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does no harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+@retval DB_SUCCESS on success
+@retval DB_FAIL if BTR_MODIFY_TREE should be tried
+@retval DB_OUT_OF_FILE_SPACE when running out of tablespace
+@retval DB_DUPLICATE_KEY if the value was missing
+ and an insert would lead to a duplicate */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+ upd_t* update;
+ dberr_t err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ const ulint flags
+ = BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG;
+ enum row_search_result search_result;
+
+ ut_ad(trx->id);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* The index->online_status may change if the
+ index->name starts with TEMP_INDEX_PREFIX (meaning
+ that the index is or was being created online). It is
+ protected by index->lock. */
+ if (mode == BTR_MODIFY_LEAF) {
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+ }
+
+ if (row_log_online_op_try(index, entry, trx->id)) {
+ goto func_exit_no_pcur;
+ }
+ } else {
+ /* For secondary indexes,
+ index->online_status==ONLINE_INDEX_COMPLETE unless
+ index->name starts with TEMP_INDEX_PREFIX. */
+ ut_ad(!dict_index_is_online_ddl(index));
+ }
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ switch (search_result) {
+ mem_heap_t* heap;
+ mem_heap_t* offsets_heap;
+ ulint* offsets;
+ case ROW_BUFFERED:
+ case ROW_NOT_DELETED_REF:
+ /* These are invalid outcomes, because the mode passed
+ to row_search_index_entry() did not include any of the
+ flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
+ ut_error;
+ case ROW_NOT_FOUND:
+ if (*index->name != TEMP_INDEX_PREFIX) {
+ /* During online secondary index creation, it
+ is possible that MySQL is waiting for a
+ meta-data lock upgrade before invoking
+ ha_innobase::commit_inplace_alter_table()
+ while this ROLLBACK is executing. InnoDB has
+ finished building the index, but it does not
+ yet exist in MySQL. In that case (while the index
+ name still starts with TEMP_INDEX_PREFIX), the
+ printout to the error log is suppressed. */
+ fputs("InnoDB: error in sec index entry del undo in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 0);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "record in index %s was not found"
+ " on rollback, trying to insert",
+ index->name);
+ }
+
+ if (btr_cur->up_match >= dict_index_get_n_unique(index)
+ || btr_cur->low_match >= dict_index_get_n_unique(index)) {
+ if (*index->name != TEMP_INDEX_PREFIX) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "record in index %s was not found on"
+ " rollback, and a duplicate exists",
+ index->name);
+ }
+ err = DB_DUPLICATE_KEY;
+ break;
+ }
+
+ /* Insert the missing record that we were trying to
+ delete-unmark. */
+ big_rec_t* big_rec;
+ rec_t* insert_rec;
+ offsets = NULL;
+ offsets_heap = NULL;
+
+ err = btr_cur_optimistic_insert(
+ flags, btr_cur, &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ ut_ad(!big_rec);
+
+ if (err == DB_FAIL && mode == BTR_MODIFY_TREE) {
+ err = btr_cur_pessimistic_insert(
+ flags, btr_cur,
+ &offsets, &offsets_heap,
+ entry, &insert_rec, &big_rec,
+ 0, thr, &mtr);
+ /* There are no off-page columns in
+ secondary indexes. */
+ ut_ad(!big_rec);
+ }
+
+ if (err == DB_SUCCESS) {
+ page_update_max_trx_id(
+ btr_cur_get_block(btr_cur),
+ btr_cur_get_page_zip(btr_cur),
+ trx->id, &mtr);
+ }
+
+ if (offsets_heap) {
+ mem_heap_free(offsets_heap);
+ }
+
+ break;
+ case ROW_FOUND:
+ err = btr_cur_del_mark_set_sec_rec(
+ BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_a(err == DB_SUCCESS);
+ heap = mem_heap_create(
+ sizeof(upd_t)
+ + dtuple_get_n_fields(entry) * sizeof(upd_field_t));
+ offsets_heap = NULL;
+ offsets = rec_get_offsets(
+ btr_cur_get_rec(btr_cur),
+ index, NULL, ULINT_UNDEFINED, &offsets_heap);
+ update = row_upd_build_sec_rec_difference_binary(
+ btr_cur_get_rec(btr_cur), index, offsets, entry, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode != BTR_MODIFY_TREE) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ /* TODO: pass offsets, not &offsets */
+ err = btr_cur_optimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ default:
+ break;
+ }
+ } else {
+ err = btr_cur_pessimistic_update(
+ flags, btr_cur, &offsets, &offsets_heap,
+ heap, &dummy_big_rec,
+ update, 0, thr, thr_get_trx(thr)->id, &mtr);
+ ut_a(!dummy_big_rec);
+ }
+
+ mem_heap_free(heap);
+ mem_heap_free(offsets_heap);
+ }
+
+ btr_pcur_close(&pcur);
+func_exit_no_pcur:
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Flags a secondary index as corrupted. */
+static __attribute__((nonnull))
+void
+row_undo_mod_sec_flag_corrupted(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_index_t* index) /*!< in: secondary index */
+{
+ ut_ad(!dict_index_is_clust(index));
+
+ switch (trx->dict_operation_lock_mode) {
+ case RW_S_LATCH:
+ /* Because row_undo() is holding an S-latch
+ on the data dictionary during normal rollback,
+ we can only mark the index corrupted in the
+ data dictionary cache. TODO: fix this somehow.*/
+ mutex_enter(&dict_sys->mutex);
+ dict_set_corrupted_index_cache_only(index, index->table);
+ mutex_exit(&dict_sys->mutex);
+ break;
+ default:
+ ut_ad(0);
+ /* fall through */
+ case RW_X_LATCH:
+ /* This should be the rollback of a data dictionary
+ transaction. */
+ dict_set_corrupted(index, trx, "rollback");
+ }
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (index->type & DICT_FTS) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+
+ /* During online index creation,
+ HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should
+ guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(thr_is_recv(thr));
+ } else {
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_del_mark_sec(
+/*======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(!node->undo_row);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (index->type == DICT_FTS) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+
+ /* During online index creation,
+ HA_ALTER_INPLACE_NO_LOCK_AFTER_PREPARE should
+ guarantee that any active transaction has not modified
+ indexed columns such that col->ord_part was 0 at the
+ time when the undo log record was written. When we get
+ to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
+ it should always cover all affected indexes. */
+ entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
+
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ row_undo_mod_sec_flag_corrupted(
+ thr_get_trx(thr), index);
+ err = DB_SUCCESS;
+ /* Do not return any error to the caller. The
+ duplicate will be reported by ALTER TABLE or
+ CREATE UNIQUE INDEX. Unfortunately we cannot
+ report the duplicate key value to the DDL
+ thread, because the altered_table object is
+ private to its call stack. */
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dberr_t err = DB_SUCCESS;
+
+ if (node->index == NULL
+ || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
+ /* No change in secondary indexes */
+
+ return(err);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ dict_index_t* index = node->index;
+ dtuple_t* entry;
+
+ if (index->type == DICT_FTS
+ || !row_upd_changes_ord_field_binary(
+ index, node->update, thr, node->row, node->ext)) {
+ dict_table_next_uncorrupted_index(node->index);
+ continue;
+ }
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The server must have crashed in
+ row_upd_clust_rec_by_insert() before
+ the updated externally stored columns (BLOBs)
+ of the new clustered index entry were written. */
+
+ /* The table must be in DYNAMIC or COMPRESSED
+ format. REDUNDANT and COMPACT formats
+ store a local 768-byte prefix of each
+ externally stored column. */
+ ut_a(dict_table_get_format(index->table)
+ >= UNIV_FORMAT_B);
+
+ /* This is only legitimate when
+ rolling back an incomplete transaction
+ after crash recovery. */
+ ut_a(thr_get_trx(thr)->is_recovered);
+
+ /* The server must have crashed before
+ completing the insert of the new
+ clustered index entry and before
+ inserting to the secondary indexes.
+ Because node->row was not yet written
+ to this index, we can ignore it. But
+ we must restore node->undo_row. */
+ } else {
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the
+ original values because we do not know them.
+ But this should not cause problems because
+ in row0sel.cc, in queries we always retrieve
+ the clustered index record or an earlier
+ version of it, if the secondary index record
+ through which we do the search is
+ delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+ entry = row_build_index_entry(node->undo_row,
+ node->undo_ext,
+ index, heap);
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err == DB_DUPLICATE_KEY) {
+ row_undo_mod_sec_flag_corrupted(
+ thr_get_trx(thr), index);
+ err = DB_SUCCESS;
+ } else if (err != DB_SUCCESS) {
+ break;
+ }
+
+ mem_heap_empty(heap);
+ dict_table_next_uncorrupted_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record. */
+static __attribute__((nonnull))
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ ibool dict_locked) /*!< in: TRUE if own dict_sys->mutex */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+ bool dummy_extern;
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ node->table = dict_table_open_on_id(
+ table_id, dict_locked, DICT_TABLE_OP_NORMAL);
+
+ /* TODO: other fixes associated with DROP TABLE + rollback in the
+ same table by another user */
+
+ if (node->table == NULL) {
+ /* Table was dropped */
+ return;
+ }
+
+ if (node->table->ibd_file_missing) {
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, node->trx,
+ node->heap, &(node->update));
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+
+ if (!row_undo_search_clust_to_pcur(node)) {
+
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ node->table = NULL;
+ }
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ibool dict_locked;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ dict_locked = thr_get_trx(thr)->dict_operation_lock_mode == RW_X_LATCH;
+
+ ut_ad(thr_get_trx(thr) == node->trx);
+
+ row_undo_mod_parse_undo_rec(node, dict_locked);
+
+ if (node->table == NULL) {
+ /* It is already undone, or will be undone by another query
+ thread, or table was dropped */
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_first_index(node->table);
+ ut_ad(dict_index_is_clust(node->index));
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(node->index);
+
+ /* Skip all corrupted secondary indexes */
+ dict_table_skip_corrupt_index(node->index);
+
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ err = row_undo_mod_upd_exist_sec(node, thr);
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ err = row_undo_mod_del_mark_sec(node, thr);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ err = row_undo_mod_upd_del_sec(node, thr);
+ break;
+ default:
+ ut_error;
+ err = DB_ERROR;
+ }
+
+ if (err == DB_SUCCESS) {
+
+ err = row_undo_mod_clust(node, thr);
+ }
+
+ dict_table_close(node->table, dict_locked, FALSE);
+
+ node->table = NULL;
+
+ return(err);
+}
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
new file mode 100644
index 00000000000..9977a1e8f04
--- /dev/null
+++ b/storage/innobase/row/row0undo.cc
@@ -0,0 +1,375 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.cc
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes, and so does the roll ptr. What if the row id was
+not part of the ordering fields in the clustered index? Maybe we would
+have to write it to the undo log. Well, maybe not, because if we order the
+row id and trx id in descending order, then the only undeleted copy is the
+first in the index. Our searches in row operations always position the
+cursor before the first record in the result set. But, if there is no key
+defined for a table, then it would be desirable that the row id is in
+ascending order. So, let's store the row id in descending order only if it
+is not an ordering field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it does not update the secondary index and not the clustered index
+ord field. Then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
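+
+/* An illustration of the version list described above, for a row that
+was inserted and then updated twice:
+
+ clustered index record (newest version)
+     |  DB_ROLL_PTR
+     v
+ undo log rec (2nd update) -> undo log rec (1st update) -> insert
+                              (roll ptr flagged "insert": no older
+                              version exists)
+
+Rollback pops undo log records from the newest end of this chain; purge
+removes old versions from the oldest end once no transaction needs them. */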
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = static_cast<undo_node_t*>(
+ mem_heap_alloc(heap, sizeof(undo_node_t)));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
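+
+/* A usage sketch (the actual rollback query graph is built in
+trx0roll.cc; the surrounding code here is illustrative only):
+
+ undo_node_t* undo = row_undo_node_create(trx, thr, heap);
+ thr->child = undo;
+
+after which the query graph executor drives the node through
+row_undo_step() until the rollback is complete. */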
+
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!found || node->roll_ptr
+ != row_get_rec_roll_ptr(rec, clust_index, offsets)) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* fputs("--------------------undoing a previous version\n",
+ stderr); */
+
+ ret = FALSE;
+ } else {
+ row_ext_t** ext;
+
+ if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
+ /* In DYNAMIC or COMPRESSED format, there is
+ no prefix of externally stored columns in the
+ clustered index record. Build a cache of
+ column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, NULL,
+ NULL, NULL, ext, node->heap);
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+ node->undo_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->undo_row, &node->undo_ext,
+ clust_index, node->update, node->heap);
+ } else {
+ node->undo_row = NULL;
+ node->undo_ext = NULL;
+ }
+
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_undo(
+/*=====*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ trx_t* trx;
+ roll_ptr_t roll_ptr;
+ ibool locked_data_dict;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ /* Prevent DROP TABLE etc. while we are rolling back this row.
+ If we are doing a CREATE TABLE or some other dictionary operation,
+ then we already have dict_operation_lock locked in x-mode. Do not
+ try to lock again, because that would cause a hang. */
+
+ locked_data_dict = (trx->dict_operation_lock_mode == 0);
+
+ if (locked_data_dict) {
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ if (locked_data_dict) {
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
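+
+/* State transitions driven by row_undo(), for reference:
+
+ UNDO_NODE_FETCH_NEXT -> UNDO_NODE_INSERT (insert undo rec popped)
+ UNDO_NODE_FETCH_NEXT -> UNDO_NODE_MODIFY (update undo rec popped)
+ UNDO_NODE_INSERT -> UNDO_NODE_FETCH_NEXT (row_undo_ins() done)
+ UNDO_NODE_MODIFY -> UNDO_NODE_FETCH_NEXT (row_undo_mod() done)
+
+When no undo log record remains, or the partial rollback limit
+trx->roll_limit is reached, control returns to the parent query thread
+node. */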
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_inc_activity_count();
+
+ trx = thr_get_trx(thr);
+
+ node = static_cast<undo_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ fprintf(stderr, "InnoDB: Fatal error (%s) in rollback.\n",
+ ut_strerr(err));
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fprintf(stderr,
+ "InnoDB: Out of tablespace.\n"
+ "InnoDB: Consider increasing"
+ " your tablespace.\n");
+
+ exit(1);
+ }
+
+ ut_error;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
new file mode 100644
index 00000000000..fcd54332a47
--- /dev/null
+++ b/storage/innobase/row/row0upd.cc
@@ -0,0 +1,2698 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.cc
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0log.h"
+#include "row0row.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+#include <algorithm>
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they will get an implicit
+x-lock anyway when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
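+
+/* In outline, the delete marking described above amounts to (a sketch
+only; the real work is done in btr_cur_del_mark_set_clust_rec() and
+btr_cur_del_mark_set_sec_rec() in btr0cur.cc):
+
+ set the delete mark bit of the record;
+ row_upd_rec_sys_fields(rec, ..., trx, roll_ptr);
+  (stamps the new DB_TRX_ID and DB_ROLL_PTR; the old values
+  have already been saved in the undo log)
+
+The physical removal of the record happens only later, in purge. */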
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This
+is done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: old value of index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n); /*!< in: how many first fields to check */
+
+
+/*********************************************************************//**
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint.
+
+NOTE that since we do not hold dict_operation_lock when leaving the
+function, it may be that the referencing table has been dropped by the
+time we return: this function is only for heuristic use!
+
+@return TRUE if referenced */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+ ibool froze_data_dict = FALSE;
+ ibool is_referenced = FALSE;
+
+ if (table->referenced_set.empty()) {
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ dict_foreign_set::iterator it
+ = std::find_if(table->referenced_set.begin(),
+ table->referenced_set.end(),
+ dict_foreign_with_index(index));
+
+ is_referenced = (it != table->referenced_set.end());
+
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(is_referenced);
+}
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return DB_SUCCESS or an error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_check_references_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ ulint n_ext;
+ dberr_t err;
+ ibool got_s_lock = FALSE;
+
+ if (table->referenced_set.empty()) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(rec, index, offsets, &n_ext, heap);
+
+ mtr_commit(mtr);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update");
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+run_again:
+
+ for (dict_foreign_set::iterator it = table->referenced_set.begin();
+ it != table->referenced_set.end();
+ ++it) {
+
+ foreign = *it;
+
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+ dict_table_t* foreign_table = foreign->foreign_table;
+
+ dict_table_t* ref_table = NULL;
+
+ if (foreign_table == NULL) {
+
+ ref_table = dict_table_open_on_name(
+ foreign->foreign_table_name_lookup,
+ FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ }
+
+ if (foreign_table) {
+ os_inc_counter(dict_sys->mutex,
+ foreign_table
+ ->n_foreign_key_checks_running);
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ FALSE, foreign, table, entry, thr);
+
+ if (foreign_table) {
+ os_dec_counter(dict_sys->mutex,
+ foreign_table
+ ->n_foreign_key_checks_running);
+ }
+
+ if (ref_table != NULL) {
+ dict_table_close(ref_table, FALSE, FALSE);
+ }
+
+			/* A foreign key of the table was dropped;
+			try again */
+ if (err == DB_DICT_CHANGED) {
+ goto run_again;
+ } else if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ }
+ }
+
+ err = DB_SUCCESS;
+
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ DEBUG_SYNC_C("foreign_constraint_check_for_update_done");
+
+ return(err);
+}
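+
+/* The counter protocol used above, shown in isolation (sketch; error
+handling elided):
+
+	os_inc_counter(dict_sys->mutex,
+		       t->n_foreign_key_checks_running);
+	// 't' cannot be dropped while its counter is nonzero, so
+	// 'foreign' stays valid even if dict_operation_lock is
+	// released during a lock wait
+	err = row_ins_check_foreign_constraint(
+		FALSE, foreign, table, entry, thr);
+	os_dec_counter(dict_sys->mutex,
+		       t->n_foreign_key_checks_running);
+*/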
+
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = static_cast<upd_node_t*>(
+ mem_heap_alloc(heap, sizeof(upd_node_t)));
+
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ node->index = NULL;
+ node->update = NULL;
+
+ node->foreign = NULL;
+ node->cascade_heap = NULL;
+ node->cascade_node = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Updates the trx id and roll ptr fields in a clustered index record
+during database recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (page_zip) {
+ page_zip_write_trx_id_and_roll_ptr(
+ page_zip, rec, offsets, pos, trx_id, roll_ptr);
+ } else {
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ trx_write_trx_id(field, trx_id);
+ trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ dtuple_t* entry, /*!< in/out: index entry, where the memory
+ buffers for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ ib_uint64_t val) /*!< in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(dict_index_is_clust(index));
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = static_cast<byte*>(dfield_get_data(dfield));
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = dfield_get_len(new_val);
+
+ if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+ /* A bug fixed on Dec 31st, 2004: we looked at the
+ SQL NULL size from the wrong field! We may backport
+ this fix also to 4.0. The merge to 5.0 will be made
+ manually immediately after we commit this to 4.1. */
+
+ new_len = dict_col_get_sql_null_size(
+ dict_index_get_nth_col(index,
+ upd_field->field_no),
+ 0);
+ }
+
+ old_len = rec_offs_nth_size(offsets, upd_field->field_no);
+
+ if (rec_offs_comp(offsets)
+ && rec_offs_nth_sql_null(offsets,
+ upd_field->field_no)) {
+ /* Note that in the compact table format, for a
+ variable length field, an SQL NULL will use zero
+ bytes in the offset array at the start of the physical
+ record, but a zero-length value (empty string) will
+ use one byte! Thus, we cannot use update-in-place
+ if we update an SQL NULL varchar to an empty string! */
+
+ old_len = UNIV_SQL_NULL;
+ }
+
+ if (dfield_is_ext(new_val) || old_len != new_len
+ || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
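+
+/* Worked example of the compact-format corner case above: updating a
+nullable VARCHAR from SQL NULL to the empty string '' changes the
+stored size from zero bytes to a one-byte length entry, so old_len
+(forced to UNIV_SQL_NULL above) differs from new_len (0) and the
+function returns TRUE, ruling out update-in-place. */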
+
+/***********************************************************//**
+Returns true if row update contains disowned external fields.
+@return true if the update contains disowned external fields. */
+UNIV_INTERN
+bool
+row_upd_changes_disowned_external(
+/*==============================*/
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ const byte* field_ref;
+
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+ new_len = dfield_get_len(new_val);
+
+ if (!dfield_is_ext(new_val)) {
+ continue;
+ }
+
+ ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ field_ref = static_cast<const byte*>(dfield_get_data(new_val))
+ + new_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Replaces the column values of the given record with the new values
+stored in the update vector. No field size changes are allowed. This
+function is usually invoked on a clustered index. The only use case for
+a secondary index is row_ins_sec_index_entry_by_modify() or its
+counterpart in ibuf_insert_to_index_page(). */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip)/*!< in: compressed page with enough space
+ available, or NULL */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (rec_offs_comp(offsets)) {
+ rec_set_info_bits_new(rec, update->info_bits);
+ } else {
+ rec_set_info_bits_old(rec, update->info_bits);
+ }
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+#ifdef UNIV_BLOB_DEBUG
+ btr_blob_dbg_t b;
+ const byte* field_ref = NULL;
+#endif /* UNIV_BLOB_DEBUG */
+
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+ ut_ad(!dfield_is_ext(new_val) ==
+ !rec_offs_nth_extern(offsets, upd_field->field_no));
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ ulint len;
+ field_ref = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b.ref_page_no = page_get_page_no(page_align(rec));
+ b.ref_heap_no = page_rec_get_heap_no(rec);
+ b.ref_field_no = i;
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ ut_a(b.ref_field_no >= index->n_uniq);
+ btr_blob_dbg_rbt_delete(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
+
+ rec_set_nth_field(rec, offsets, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+
+#ifdef UNIV_BLOB_DEBUG
+ if (dfield_is_ext(new_val)) {
+ b.blob_page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ b.always_owner = b.owner = !(field_ref[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG);
+ b.del = rec_get_deleted_flag(
+ rec, rec_offs_comp(offsets));
+
+ btr_blob_dbg_rbt_insert(index, &b, "upd_in_place");
+ }
+#endif /* UNIV_BLOB_DEBUG */
+ }
+
+ if (page_zip) {
+ page_zip_write_rec(page_zip, rec, index, offsets, 0);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mtr */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_ull_write_compressed(log_ptr, trx_id);
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr)/*!< out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_ull_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
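+
+/* Layout of the log data written by row_upd_write_sys_vals_to_log()
+and consumed by the parser above (order and widths as implied by the
+two functions):
+
+	pos		mach_write_compressed()		variable length
+	roll_ptr	fixed				DATA_ROLL_PTR_LEN
+	trx_id		mach_ull_write_compressed()	variable length
+*/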
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr) /*!< in: mtr into whose log to write */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+#if MLOG_BUF_MARGIN <= 30
+# error "MLOG_BUF_MARGIN <= 30"
+#endif
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = dfield_get_len(new_val);
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ memcpy(log_ptr, dfield_get_data(new_val), len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(
+ mtr,
+ static_cast<byte*>(
+ dfield_get_data(new_val)),
+ len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out)/*!< out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ ulint field_no;
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &field_no);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ upd_field->field_no = field_no;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ dfield_set_data(new_val,
+ mem_heap_dup(heap, ptr, len), len);
+ ptr += len;
+ } else {
+ dfield_set_null(new_val);
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
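+
+/* Log record format shared by row_upd_index_write_log() and the parser
+above:
+
+	info_bits	1 byte
+	n_fields	compressed
+	then, n_fields times:
+		field_no	compressed
+		len		compressed
+		data		len bytes, absent when len == UNIV_SQL_NULL
+*/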
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint i;
+
+ /* This function is used only for a secondary index */
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_n_fields(offsets) == dtuple_get_n_fields(entry));
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.cc line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, NULL);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
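+
+/* Example of the binary-comparison caveat above: under a padding
+collation the strings 'abc' and 'abc ' compare equal alphabetically,
+yet they are different binary strings, so a field changing between the
+two is still reported as a difference by this function. */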
+
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+const upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index), or NULL */
+ bool no_sys, /*!< in: skip the system columns
+ DB_TRX_ID and DB_ROLL_PTR */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint trx_id_pos;
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a clustered index */
+ ut_a(dict_index_is_clust(index));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+ ut_ad(dict_index_get_sys_col_pos(index, DATA_ROLL_PTR)
+ == trx_id_pos + 1);
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (no_sys && (i == trx_id_pos || i == trx_id_pos + 1)) {
+
+ continue;
+ }
+
+ if (!dfield_is_ext(dfield)
+ != !rec_offs_nth_extern(offsets, i)
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***********************************************************//**
+Fetch a prefix of an externally stored column. This is similar
+to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+@return BLOB prefix */
+static
+byte*
+row_upd_ext_fetch(
+/*==============*/
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part */
+ ulint local_len, /*!< in: length of data, in bytes */
+ ulint zip_size, /*!< in: nonzero=compressed BLOB
+ page size, zero for uncompressed
+ BLOBs */
+ ulint* len, /*!< in: length of prefix to fetch;
+ out: fetched length of the prefix */
+ mem_heap_t* heap) /*!< in: heap where to allocate */
+{
+ byte* buf = static_cast<byte*>(mem_heap_alloc(heap, *len));
+
+ *len = btr_copy_externally_stored_field_prefix(
+ buf, *len, zip_size, data, local_len);
+
+ /* We should never update records containing a half-deleted BLOB. */
+ ut_a(*len);
+
+ return(buf);
+}
+
+/***********************************************************//**
+Replaces the new column value stored in the update vector in
+the given index entry field. */
+static
+void
+row_upd_index_replace_new_col_val(
+/*==============================*/
+ dfield_t* dfield, /*!< in/out: data field
+ of the index entry */
+ const dict_field_t* field, /*!< in: index field */
+ const dict_col_t* col, /*!< in: field->col */
+ const upd_field_t* uf, /*!< in: update field */
+ mem_heap_t* heap, /*!< in: memory heap for allocating
+ and copying the new value */
+ ulint zip_size)/*!< in: compressed page
+ size of the table, or 0 */
+{
+ ulint len;
+ const byte* data;
+
+ dfield_copy_data(dfield, &uf->new_val);
+
+ if (dfield_is_null(dfield)) {
+ return;
+ }
+
+ len = dfield_get_len(dfield);
+ data = static_cast<const byte*>(dfield_get_data(dfield));
+
+ if (field->prefix_len > 0) {
+ ibool fetch_ext = dfield_is_ext(dfield)
+ && len < (ulint) field->prefix_len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (fetch_ext) {
+ ulint l = len;
+
+ len = field->prefix_len;
+
+ data = row_upd_ext_fetch(data, l, zip_size,
+ &len, heap);
+ }
+
+ len = dtype_get_at_most_n_mbchars(col->prtype,
+ col->mbminmaxlen,
+ field->prefix_len, len,
+ (const char*) data);
+
+ dfield_set_data(dfield, data, len);
+
+ if (!fetch_ext) {
+ dfield_dup(dfield, heap);
+ }
+
+ return;
+ }
+
+ switch (uf->orig_len) {
+ byte* buf;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(dfield,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(dfield);
+ /* fall through */
+ case 0:
+ dfield_dup(dfield, heap);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+ buf = static_cast<byte*>(mem_heap_alloc(heap, uf->orig_len));
+
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, buf, uf->orig_len);
+ dfield_set_ext(dfield);
+ break;
+ }
+}
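+
+/* Buffer layout produced by the default case of the switch above
+(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE), as built by the two
+memcpy() calls:
+
+	buf[0 .. orig_len - BTR_EXTERN_FIELD_REF_SIZE - 1]	local prefix
+	buf[orig_len - BTR_EXTERN_FIELD_REF_SIZE ..
+	    orig_len - 1]					BLOB pointer
+*/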
+
+/***********************************************************//**
+Replaces the column values of the given index entry with the new values
+stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ ulint n_fields;
+ const ulint zip_size = dict_table_zip_size(index->table);
+
+ ut_ad(index);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ if (order_only) {
+ n_fields = dict_index_get_n_unique(index);
+ } else {
+ n_fields = dict_index_get_n_fields(index);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(update, i);
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the column values of the given index entry with the new values
+stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ const dict_index_t* clust_index
+ = dict_table_get_first_index(index->table);
+ const ulint zip_size
+ = dict_table_zip_size(index->table);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(
+ update, dict_col_get_clust_pos(col, clust_index));
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the column values of a row with the new values stored in the
+update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+ ulint n_ext_cols;
+ ulint* ext_cols;
+ const dict_table_t* table;
+
+ ut_ad(row);
+ ut_ad(ext);
+ ut_ad(index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(update);
+ ut_ad(heap);
+
+ n_cols = dtuple_get_n_fields(row);
+ table = index->table;
+ ut_ad(n_cols == dict_table_get_n_cols(table));
+
+ ext_cols = static_cast<ulint*>(
+ mem_heap_alloc(heap, n_cols * sizeof *ext_cols));
+
+ n_ext_cols = 0;
+
+ dtuple_set_info_bits(row, update->info_bits);
+
+ for (col_no = 0; col_no < n_cols; col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+ const ulint clust_pos
+ = dict_col_get_clust_pos(col, index);
+ dfield_t* dfield;
+
+ if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no != clust_pos) {
+
+ continue;
+ }
+
+ dfield_copy_data(dfield, &upd_field->new_val);
+ break;
+ }
+
+ if (dfield_is_ext(dfield) && col->ord_part) {
+ ext_cols[n_ext_cols++] = col_no;
+ }
+ }
+
+ if (n_ext_cols) {
+ *ext = row_ext_create(n_ext_cols, ext_cols, table->flags, row,
+ heap);
+ } else {
+ *ext = NULL;
+ }
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary_func(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update, /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+#ifdef UNIV_DEBUG
+ const que_thr_t*thr, /*!< in: query thread */
+#endif /* UNIV_DEBUG */
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ const row_ext_t*ext) /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+{
+ ulint n_unique;
+ ulint i;
+ const dict_index_t* clust_index;
+
+ ut_ad(index);
+ ut_ad(update);
+ ut_ad(thr);
+ ut_ad(thr->graph);
+ ut_ad(thr->graph->trx);
+
+ n_unique = dict_index_get_n_unique(index);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n_unique; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_no;
+ const upd_field_t* upd_field;
+ const dfield_t* dfield;
+ dfield_t dfield_ext;
+ ulint dfield_len;
+ const byte* buf;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_no = dict_col_get_no(col);
+
+ upd_field = upd_get_field_by_field_no(
+ update, dict_col_get_clust_pos(col, clust_index));
+
+ if (upd_field == NULL) {
+ continue;
+ }
+
+ if (row == NULL) {
+ ut_ad(ext == NULL);
+ return(TRUE);
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ /* This treatment of column prefix indexes is loosely
+ based on row_build_index_entry(). */
+
+ if (UNIV_LIKELY(ind_field->prefix_len == 0)
+ || dfield_is_null(dfield)) {
+ /* do nothing special */
+ } else if (ext) {
+ /* Silence a compiler warning without
+ silencing a Valgrind error. */
+ dfield_len = 0;
+ UNIV_MEM_INVALID(&dfield_len, sizeof dfield_len);
+ /* See if the column is stored externally. */
+ buf = row_ext_lookup(ext, col_no, &dfield_len);
+
+ ut_ad(col->ord_part);
+
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ /* The externally stored field
+ was not written yet. This
+ record should only be seen by
+ recv_recovery_rollback_active(),
+ when the server had crashed before
+ storing the field. */
+ ut_ad(thr->graph->trx->is_recovered);
+ ut_ad(trx_is_recv(thr->graph->trx));
+ return(TRUE);
+ }
+
+ goto copy_dfield;
+ }
+ } else if (dfield_is_ext(dfield)) {
+ dfield_len = dfield_get_len(dfield);
+ ut_a(dfield_len > BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(dict_index_is_clust(index)
+ || ind_field->prefix_len <= dfield_len);
+
+ buf = static_cast<byte*>(dfield_get_data(dfield));
+copy_dfield:
+ ut_a(dfield_len > 0);
+ dfield_copy(&dfield_ext, dfield);
+ dfield_set_data(&dfield_ext, buf, dfield_len);
+ dfield = &dfield_ext;
+ }
+
+ if (!dfield_datas_are_binary_equal(
+ dfield, &upd_field->new_val,
+ ind_field->prefix_len)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update) /*!< in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (dict_field_get_col(dict_index_get_nth_field(
+ index, upd_field->field_no))
+ ->ord_part) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an FTS Doc ID column is affected by an UPDATE.
+@return whether the Doc ID column is changed */
+UNIV_INTERN
+bool
+row_upd_changes_doc_id(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+ return(col_no == fts->doc_col);
+}
+
+/***********************************************************//**
+Checks if an FTS indexed column is affected by an UPDATE.
+@return offset within fts_t::indexes if an FTS indexed column is
+updated, else ULINT_UNDEFINED */
+UNIV_INTERN
+ulint
+row_upd_changes_fts_column(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ upd_field_t* upd_field) /*!< in: field to check */
+{
+ ulint col_no;
+ dict_index_t* clust_index;
+ fts_t* fts = table->fts;
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Convert from index-specific column number to table-global
+ column number. */
+ col_no = dict_index_get_nth_col_no(clust_index, upd_field->field_no);
+
+ return(dict_table_is_fts_column(fts->indexes, col_no));
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that the index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n) /*!< in: how many first fields to check */
+{
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+ ut_ad(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &upd_field->new_val, 0)) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /*!< in: record in a clustered index */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /*!< in/out: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***********************************************************//**
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /*!< in: row update node */
+{
+ dict_index_t* clust_index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ row_ext_t** ext;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_table_get_format(node->table) >= UNIV_FORMAT_B) {
+ /* In DYNAMIC or COMPRESSED format, there is no prefix
+ of externally stored columns in the clustered index
+ record. Build a cache of column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored column.
+ No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ NULL, NULL, NULL, ext, node->heap);
+ if (node->is_delete) {
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ } else {
+ node->upd_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->upd_row, &node->upd_ext,
+ clust_index, node->update, node->heap);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
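+
+/* Illustration of the format split above: in REDUNDANT or COMPACT row
+format an externally stored column keeps a 768-byte local prefix in the
+clustered index record, so node->ext can stay NULL; in DYNAMIC or
+COMPRESSED format the record holds only a field reference, so the
+prefix cache built through 'ext' is the only local copy available for
+evaluating ordering-field prefixes. */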
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_index_entry(
+/*====================*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mtr_t mtr;
+ const rec_t* rec;
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ btr_cur_t* btr_cur;
+ ibool referenced;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx = thr_get_trx(thr);
+ ulint mode;
+ enum row_search_result search_result;
+
+ ut_ad(trx->id);
+
+ index = node->index;
+
+ referenced = row_upd_index_is_referenced(index, trx);
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, node->ext, index, heap);
+ ut_a(entry);
+
+ log_free_check();
+
+#ifdef UNIV_DEBUG
+ /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+ Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+ if (!trx->ddl) {
+ DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
+ "before_row_upd_sec_index_entry");
+ }
+#endif /* UNIV_DEBUG */
+
+ mtr_start(&mtr);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* The index->online_status may change if the
+ index->name starts with TEMP_INDEX_PREFIX (meaning
+ that the index is or was being created online). It is
+ protected by index->lock. */
+
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ /* This is a normal index. Do not log anything.
+ Perform the update on the index tree directly. */
+ break;
+ case ONLINE_INDEX_CREATION:
+ /* Log a DELETE and optionally INSERT. */
+ row_log_online_op(index, entry, 0);
+
+ if (!node->is_delete) {
+ mem_heap_empty(heap);
+ entry = row_build_index_entry(
+ node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+ row_log_online_op(index, entry, trx->id);
+ }
+ /* fall through */
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ mtr_commit(&mtr);
+ goto func_exit;
+ }
+
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index. */
+ mode = referenced
+ ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ : BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
+ | BTR_DELETE_MARK;
+ } else {
+		/* For secondary indexes,
+		index->online_status==ONLINE_INDEX_COMPLETE unless
+		index->name starts with TEMP_INDEX_PREFIX. */
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index. */
+ mode = referenced
+ ? BTR_MODIFY_LEAF
+ : BTR_MODIFY_LEAF | BTR_DELETE_MARK;
+ }
+
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+
+ search_result = row_search_index_entry(index, entry, mode,
+ &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ switch (search_result) {
+ case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
+ ut_error;
+ break;
+ case ROW_BUFFERED:
+ /* Entry was delete marked already. */
+ break;
+
+ case ROW_NOT_FOUND:
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ /* When online CREATE INDEX copied the update
+ that we already made to the clustered index,
+ and completed the secondary index creation
+ before we got here, the old secondary index
+ record would not exist. The CREATE INDEX
+ should be waiting for a MySQL meta-data lock
+ upgrade at least until this UPDATE
+ returns. After that point, the
+ TEMP_INDEX_PREFIX would be dropped from the
+ index name in commit_inplace_alter_table(). */
+ break;
+ }
+
+ fputs("InnoDB: error in sec index entry update in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 0);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ ut_ad(0);
+ break;
+ case ROW_FOUND:
+ /* Delete mark the old index record; it can already be
+ delete marked if we return after a lock wait in
+ row_ins_sec_index_entry() below */
+ if (!rec_get_deleted_flag(
+ rec, dict_table_is_comp(index->table))) {
+ err = btr_cur_del_mark_set_sec_rec(
+ 0, btr_cur, TRUE, thr, &mtr);
+
+ if (err == DB_SUCCESS && referenced) {
+
+ ulint* offsets;
+
+ offsets = rec_get_offsets(
+ rec, index, NULL, ULINT_UNDEFINED,
+ &heap);
+
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ }
+ }
+ break;
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ mem_heap_empty(heap);
+
+ /* Build a new index entry */
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ /* Insert new index entry */
+ err = row_ins_sec_index_entry(index, entry, thr);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
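+
+/* Note on the function above: a secondary index entry is never updated
+in place. The old entry is delete-marked (or, while the index is being
+created online, the DELETE and optional INSERT are written to the row
+log via row_log_online_op() instead) and a new entry is inserted with
+row_ins_sec_index_entry(). */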
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update or
+deletes it if this is a delete.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_sec_step(
+/*=============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!dict_index_is_clust(node->index));
+
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->index, node->update,
+ thr, node->row, node->ext)) {
+ return(row_upd_sec_index_entry(node, thr));
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifdef UNIV_DEBUG
+# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(rec,offsets,entry,update)
+#else /* UNIV_DEBUG */
+# define row_upd_clust_rec_by_insert_inherit(rec,offsets,entry,update) \
+ row_upd_clust_rec_by_insert_inherit_func(entry,update)
+#endif /* UNIV_DEBUG */
+/*******************************************************************//**
+Mark non-updated off-page columns inherited when the primary key is
+updated. We must mark them as inherited in entry, so that they are not
+freed in a rollback. A limited version of this function used to be
+called btr_cur_mark_dtuple_inherited_extern().
+@return TRUE if any columns were inherited */
+static __attribute__((warn_unused_result))
+ibool
+row_upd_clust_rec_by_insert_inherit_func(
+/*=====================================*/
+#ifdef UNIV_DEBUG
+ const rec_t* rec, /*!< in: old record, or NULL */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec), or NULL */
+#endif /* UNIV_DEBUG */
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted into the clustered index */
+ const upd_t* update) /*!< in: update vector */
+{
+ ibool inherit = FALSE;
+ ulint i;
+
+ ut_ad(!rec == !offsets);
+ ut_ad(!rec || rec_offs_any_extern(offsets));
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+ byte* data;
+ ulint len;
+
+ ut_ad(!offsets
+ || !rec_offs_nth_extern(offsets, i)
+ == !dfield_is_ext(dfield)
+ || upd_get_field_by_field_no(update, i));
+ if (!dfield_is_ext(dfield)
+ || upd_get_field_by_field_no(update, i)) {
+ continue;
+ }
+
+#ifdef UNIV_DEBUG
+ if (UNIV_LIKELY(rec != NULL)) {
+ const byte* rec_data
+ = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len == dfield_get_len(dfield));
+ ut_ad(len != UNIV_SQL_NULL);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ rec_data += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* The pointer must not be zero. */
+ ut_ad(memcmp(rec_data, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE));
+ /* The BLOB must be owned. */
+ ut_ad(!(rec_data[BTR_EXTERN_LEN]
+ & BTR_EXTERN_OWNER_FLAG));
+ }
+#endif /* UNIV_DEBUG */
+
+ len = dfield_get_len(dfield);
+ ut_a(len != UNIV_SQL_NULL);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = static_cast<byte*>(dfield_get_data(dfield));
+
+ data += len - BTR_EXTERN_FIELD_REF_SIZE;
+ /* The pointer must not be zero. */
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+ data[BTR_EXTERN_LEN] &= ~BTR_EXTERN_OWNER_FLAG;
+ data[BTR_EXTERN_LEN] |= BTR_EXTERN_INHERITED_FLAG;
+ /* The BTR_EXTERN_INHERITED_FLAG only matters in
+ rollback. Purge will always free the extern fields of
+ a delete-marked row. */
+
+ inherit = TRUE;
+ }
+
+ return(inherit);
+}
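+
+/* On the flag manipulation above: BTR_EXTERN_OWNER_FLAG is set when a
+field does NOT own the BLOB (note how the owner bit is negated when it
+is read in the UNIV_BLOB_DEBUG code of row_upd_rec_in_place()), so
+clearing it marks the entry as the owner, while setting
+BTR_EXTERN_INHERITED_FLAG makes a rollback of the insert leave the
+inherited columns in place instead of freeing them. */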
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec_by_insert(
+/*========================*/
+ upd_node_t* node, /*!< in/out: row update node */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool referenced,/*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in/out: mtr; gets committed here */
+{
+ mem_heap_t* heap;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ dberr_t err;
+ ibool change_ownership = FALSE;
+ rec_t* rec;
+ ulint* offsets = NULL;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ heap = mem_heap_create(1000);
+
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ switch (node->state) {
+ default:
+ ut_error;
+ case UPD_NODE_INSERT_BLOB:
+ /* A lock wait occurred in row_ins_clust_index_entry() in
+ the previous invocation of this function. Mark the
+ off-page columns in the entry inherited. */
+
+ change_ownership = row_upd_clust_rec_by_insert_inherit(
+ NULL, NULL, entry, node->update);
+ ut_a(change_ownership);
+ /* fall through */
+ case UPD_NODE_INSERT_CLUSTERED:
+ /* A lock wait occurred in row_ins_clust_index_entry() in
+ the previous invocation of this function. */
+ break;
+ case UPD_NODE_UPDATE_CLUSTERED:
+ /* This is the first invocation of the function where
+ we update the primary key. Delete-mark the old record
+ in the clustered index and prepare to insert a new entry. */
+ rec = btr_cur_get_rec(btr_cur);
+ offsets = rec_get_offsets(rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), rec, index, offsets,
+ thr, mtr);
+ if (err != DB_SUCCESS) {
+err_exit:
+ mtr_commit(mtr);
+ mem_heap_free(heap);
+ return(err);
+ }
+
+		/* If the new row inherits externally stored
+ fields (off-page columns a.k.a. BLOBs) from the
+ delete-marked old record, mark them disowned by the
+ old record and owned by the new entry. */
+
+ if (rec_offs_any_extern(offsets)) {
+ change_ownership = row_upd_clust_rec_by_insert_inherit(
+ rec, offsets, entry, node->update);
+
+ if (change_ownership) {
+ /* The blobs are disowned here, expecting the
+ insert down below to inherit them. But if the
+ insert fails, then this disown will be undone
+ when the operation is rolled back. */
+ btr_cur_disown_inherited_fields(
+ btr_cur_get_page_zip(btr_cur),
+ rec, index, offsets, node->update, mtr);
+ }
+ }
+
+ if (referenced) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
+ }
+ }
+
+ mtr_commit(mtr);
+
+ err = row_ins_clust_index_entry(
+ index, entry, thr,
+ node->upd_ext ? node->upd_ext->n_ext : 0);
+ node->state = change_ownership
+ ? UPD_NODE_INSERT_BLOB
+ : UPD_NODE_INSERT_CLUSTERED;
+
+ mem_heap_free(heap);
+
+ return(err);
+}
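+
+/* node->state transitions driven by the function above (sketch):
+
+	UPD_NODE_UPDATE_CLUSTERED	first invocation: delete-mark the
+					old record, disown any inherited
+					off-page columns
+	UPD_NODE_INSERT_CLUSTERED	re-invocation after a lock wait in
+					row_ins_clust_index_entry()
+	UPD_NODE_INSERT_BLOB		as above, but the off-page columns
+					must be re-marked inherited in the
+					rebuilt entry
+*/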
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_rec(
+/*==============*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in: rec_get_offsets() on node->pcur */
+ mem_heap_t** offsets_heap,
+ /*!< in/out: memory heap, can be emptied */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+ const dtuple_t* rebuilt_old_pk = NULL;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(btr_cur_get_index(btr_cur) == index);
+ ut_ad(!rec_get_deleted_flag(btr_cur_get_rec(btr_cur),
+ dict_table_is_comp(index->table)));
+ ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets));
+
+ if (dict_index_is_online_ddl(index)) {
+ rebuilt_old_pk = row_log_table_get_pk(
+ btr_cur_get_rec(btr_cur), index, offsets, NULL, &heap);
+ }
+
+ /* Try optimistic updating of the record, keeping changes within
+	the page; we do not check locks because we assume that we have an
+	x-lock on the record to be updated */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(
+ BTR_NO_LOCKING_FLAG, btr_cur,
+ offsets, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ } else {
+ err = btr_cur_optimistic_update(
+ BTR_NO_LOCKING_FLAG, btr_cur,
+ &offsets, offsets_heap, node->update,
+ node->cmpl_info, thr, thr_get_trx(thr)->id, mtr);
+ }
+
+ if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+ row_log_table_update(btr_cur_get_rec(btr_cur),
+ index, offsets, rebuilt_old_pk);
+ }
+
+ mtr_commit(mtr);
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+
+ goto func_exit;
+ }
+
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+ goto func_exit;
+ }
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ if (!heap) {
+ heap = mem_heap_create(1024);
+ }
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, btr_cur,
+ &offsets, offsets_heap, heap, &big_rec,
+ node->update, node->cmpl_info,
+ thr, thr_get_trx(thr)->id, mtr);
+ if (big_rec) {
+ ut_a(err == DB_SUCCESS);
+ /* Write out the externally stored
+ columns while still x-latching
+ index->lock and block->lock. Allocate
+ pages for big_rec in the mtr that
+ modified the B-tree, but be sure to skip
+ any pages that were freed in mtr. We will
+ write out the big_rec pages before
+ committing the B-tree mini-transaction. If
+ the system crashes so that crash recovery
+ will not replay the mtr_commit(&mtr), the
+ big_rec pages will be left orphaned until
+ the pages are allocated for something else.
+
+ TODO: If the allocation extends the tablespace, it
+ will not be redo logged, in either mini-transaction.
+ Tablespace extension should be redo-logged in the
+ big_rec mini-transaction, so that recovery will not
+ fail when the big_rec was written to the extended
+ portion of the file, in case the file was somehow
+ truncated in the crash. */
+
+ DEBUG_SYNC_C("before_row_upd_extern");
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(btr_cur),
+ btr_cur_get_rec(btr_cur), offsets,
+ big_rec, mtr, BTR_STORE_UPDATE);
+ DEBUG_SYNC_C("after_row_upd_extern");
+ /* If writing big_rec fails (for example, because of
+ DB_OUT_OF_FILE_SPACE), the record will be corrupted.
+ Even if we did not update any externally stored
+ columns, our update could cause the record to grow so
+ that a non-updated column was selected for external
+ storage. This non-update would not have been written
+ to the undo log, and thus the record cannot be rolled
+ back.
+
+ However, because we have not executed mtr_commit(mtr)
+ yet, the update will not be replayed in crash
+ recovery, and the following assertion failure will
+ effectively "roll back" the operation. */
+ ut_a(err == DB_SUCCESS);
+ }
+
+ if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
+ row_log_table_update(btr_cur_get_rec(btr_cur),
+ index, offsets, rebuilt_old_pk);
+ }
+
+ mtr_commit(mtr);
+func_exit:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
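+
+/* Update strategy of the function above, summarized: first try
+btr_cur_update_in_place() or btr_cur_optimistic_update(), which keep
+all changes within the leaf page; only if that fails, and the buffer
+pool is not running out, restart the mtr, restore the cursor with
+BTR_MODIFY_TREE and fall back to btr_cur_pessimistic_update(), writing
+any big_rec columns before the final mtr_commit(). */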
+
+/***********************************************************//**
+Delete marks a clustered index record.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in/out: rec_get_offsets() for the
+ record under the cursor */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool referenced,
+ /*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ dberr_t err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Store row because we have to build also the secondary index
+ entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(
+ btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur),
+ index, offsets, thr, mtr);
+ if (err == DB_SUCCESS && referenced) {
+ /* NOTE that the following call loses the position of pcur! */
+
+ err = row_upd_check_references_constraints(
+ node, pcur, index->table, index, offsets, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the clustered index record.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd_clust_step(
+/*===============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ dberr_t err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ ibool referenced;
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ referenced = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+
+ mtr_start(&mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ ulint mode;
+
+#ifdef UNIV_DEBUG
+ /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+ Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+ if (!thr_get_trx(thr)->ddl) {
+ DEBUG_SYNC_C_IF_THD(
+ thr_get_trx(thr)->mysql_thd,
+ "innodb_row_upd_clust_step_enter");
+ }
+#endif /* UNIV_DEBUG */
+
+ if (dict_index_is_online_ddl(index)) {
+ ut_ad(node->table->id != DICT_INDEXES_ID);
+ mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+ } else {
+ mode = BTR_MODIFY_LEAF;
+ }
+
+ success = btr_pcur_restore_position(mode, pcur, &mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(&mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (node->is_delete && node->table->id == DICT_INDEXES_ID) {
+
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ &mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(&mtr);
+
+ return(err);
+ }
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(
+ 0, btr_pcur_get_block(pcur),
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(&mtr);
+ goto exit_func;
+ }
+ }
+
+ ut_ad(lock_trx_has_rec_x_lock(thr_get_trx(thr), index->table,
+ btr_pcur_get_block(pcur),
+ page_rec_get_heap_no(rec)));
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(
+ node, index, offsets, thr, referenced, &mtr);
+
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+ }
+
+ goto exit_func;
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(
+ node, index, offsets, &heap, thr, &mtr);
+ goto exit_func;
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field_binary(index, node->update, thr,
+ node->row, node->ext)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+ TODO! What to do about the 'Halloween problem', where an
+ update moves the record forward in the index so that it is
+ updated again when the cursor arrives there? Solution: the
+ read operation must check the undo number of the undo record
+ when choosing records to update. MySQL currently solves the
+ problem externally! */
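+
+ /* For illustration (hypothetical schema): with a clustered
+ index (PRIMARY KEY) on column k, a statement such as
+
+ UPDATE t SET k = k + 10 WHERE k < 100;
+
+ can move a row forward within the index, so that an ascending
+ index scan could meet and update the same row a second time
+ unless the undo numbers are checked. */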
+
+ err = row_upd_clust_rec_by_insert(
+ node, index, thr, referenced, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto exit_func;
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(
+ node, index, offsets, &heap, thr, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto exit_func;
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+exit_func:
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***********************************************************//**
+Updates the affected index records of a row. When control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and that the position has been stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+row_upd(
+/*====*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(node && thr);
+
+ if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete
+ || row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ switch (node->state) {
+ case UPD_NODE_UPDATE_CLUSTERED:
+ case UPD_NODE_INSERT_CLUSTERED:
+ case UPD_NODE_INSERT_BLOB:
+ log_free_check();
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ if (node->index == NULL
+ || (!node->is_delete
+ && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
+
+ return(DB_SUCCESS);
+ }
+
+#ifdef UNIV_DEBUG
+ /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC().
+ Once it is fixed, remove the 'ifdef', 'if' and this comment. */
+ if (!thr_get_trx(thr)->ddl) {
+ DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
+ "after_row_upd_clust");
+ }
+#endif /* UNIV_DEBUG */
+
+ DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;);
+
+ do {
+ /* Skip corrupted index */
+ dict_table_skip_corrupt_index(node->index);
+
+ if (!node->index) {
+ break;
+ }
+
+ if (node->index->type != DICT_FTS) {
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ } while (node->index != NULL);
+
+ ut_ad(err == DB_SUCCESS);
+
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ dberr_t err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started_xa(trx);
+
+ node = static_cast<upd_node_t*>(thr->run_node);
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc
new file mode 100644
index 00000000000..9dd7b63bcab
--- /dev/null
+++ b/storage/innobase/row/row0vers.cc
@@ -0,0 +1,773 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.cc
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INLINE
+trx_id_t
+row_vers_impl_x_locked_low(
+/*=======================*/
+ const rec_t* clust_rec, /*!< in: clustered index record */
+ dict_index_t* clust_index, /*!< in: the clustered index */
+ const rec_t* rec, /*!< in: secondary index record */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_id_t trx_id;
+ ibool corrupt;
+ ulint comp;
+ ulint rec_del;
+ const rec_t* version;
+ rec_t* prev_version = NULL;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("row_vers_impl_x_locked_low");
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(1024);
+
+ clust_offsets = rec_get_offsets(
+ clust_rec, clust_index, NULL, ULINT_UNDEFINED, &heap);
+
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+ corrupt = FALSE;
+
+ if (!trx_rw_is_active(trx_id, &corrupt)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active, or it is corrupt: no implicit lock on rec */
+ if (corrupt) {
+ lock_report_trx_id_insanity(
+ trx_id, clust_rec, clust_index, clust_offsets,
+ trx_sys_get_max_trx_id());
+ }
+ mem_heap_free(heap);
+ DBUG_RETURN(0);
+ }
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ rec_del = rec_get_deleted_flag(rec, comp);
+
+ /* We look up if some earlier version, which was modified by
+ the trx_id transaction, of the clustered index record would
+ require rec to be in a different state (delete marked or
+ unmarked, or have different field values, or not existing). If
+ there is such a version, then rec was modified by the trx_id
+ transaction, and it has an implicit x-lock on rec. Note that
+ if clust_rec itself would require rec to be in a different
+ state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock
+ on rec. */
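+
+ /* In outline, the loop below does the following (a sketch only;
+ the checks in the code are authoritative):
+
+ version = clust_rec;
+ loop:
+ prev_version = version rolled back by one undo record;
+ if (prev_version == NULL): locked, unless rec is delete marked;
+ if (prev_version requires rec to be in a different state
+ (delete mark or value)): locked;
+ if (prev_version was created by a different transaction):
+ not locked;
+ version = prev_version; goto loop; */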
+
+ for (version = clust_rec;; version = prev_version) {
+ row_ext_t* ext;
+ const dtuple_t* row;
+ dtuple_t* entry;
+ ulint vers_del;
+ trx_id_t prev_trx_id;
+ mem_heap_t* old_heap = heap;
+
+ /* We keep the semaphore in mtr on the clust_rec page, so
+ that no other transaction can update it and get an
+ implicit x-lock on rec until mtr_commit(mtr). */
+
+ heap = mem_heap_create(1024);
+
+ trx_undo_prev_version_build(
+ clust_rec, mtr, version, clust_index, clust_offsets,
+ heap, &prev_version);
+
+ /* The oldest visible clustered index version must not be
+ delete-marked, because we never start a transaction by
+ inserting a delete-marked record. */
+ ut_ad(prev_version
+ || !rec_get_deleted_flag(version, comp)
+ || !trx_rw_is_active(trx_id, NULL));
+
+ /* Free version and clust_offsets. */
+ mem_heap_free(old_heap);
+
+ if (prev_version == NULL) {
+
+ /* We reached the oldest visible version without
+ finding an older version of clust_rec that would
+ match the secondary index record. If the secondary
+ index record is not delete marked, then clust_rec
+ is considered the correct match of the secondary
+ index record and hence holds the implicit lock. */
+
+ if (rec_del) {
+ /* The secondary index record is del marked.
+ So, the implicit lock holder of clust_rec
+ did not modify the secondary index record yet,
+ and is not holding an implicit lock on it.
+
+ This assumes that whenever a row is inserted
+ or updated, the leaf page record always is
+ created with a clear delete-mark flag.
+ (We never insert a delete-marked record.) */
+ trx_id = 0;
+ }
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(
+ prev_version, clust_index, NULL, ULINT_UNDEFINED,
+ &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* entry may be NULL if a record was inserted in place
+ of a deleted record, and the BLOB pointers of the new
+ record were not initialized yet. But in that case,
+ prev_version should be NULL. */
+
+ ut_a(entry != NULL);
+
+ /* If we get here, we know that the trx_id transaction
+ modified prev_version. Let us check if prev_version
+ would require rec to be in a different state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because clust_rec was not a fresh insert.
+ There is no guarantee that the transaction is still
+ active. */
+
+ /* We check whether entry and rec compare as equal in the
+ alphabetical ordering */
+
+ if (!trx_rw_is_active(trx_id, &corrupt)) {
+ /* Transaction no longer active: no implicit
+ x-lock. This situation should only be possible
+ because we are not holding lock_sys->mutex. */
+ ut_ad(!lock_mutex_own());
+ if (corrupt) {
+ lock_report_trx_id_insanity(
+ trx_id,
+ prev_version, clust_index,
+ clust_offsets,
+ trx_sys_get_max_trx_id());
+ }
+ trx_id = 0;
+ break;
+ } else if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+
+ break;
+ }
+
+ /* It is possible that the row was updated so that the
+ secondary index record remained the same in the
+ alphabetical ordering, but the field values still
+ changed: for example, 'abc' -> 'ABC'. Check for that
+ as well. */
+
+ dtuple_set_types_binary(
+ entry, dtuple_get_n_fields(entry));
+
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ break;
+ }
+
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ break;
+ }
+
+ if (trx_id != prev_trx_id) {
+ /* prev_version was the first version modified by
+ the trx_id transaction: no implicit x-lock */
+
+ trx_id = 0;
+ break;
+ }
+ }
+
+ DBUG_PRINT("info", ("Implicit lock is held by trx:%lu",
+ static_cast<unsigned long>(trx_id)));
+
+ mem_heap_free(heap);
+ DBUG_RETURN(trx_id);
+}
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record.
+@return 0 if committed, else the active transaction id;
+NOTE that this function can return false positives but never false
+negatives. The caller must confirm all positive results by calling
+trx_is_active() while holding lock_sys->mutex. */
+UNIV_INTERN
+trx_id_t
+row_vers_impl_x_locked(
+/*===================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ dict_index_t* clust_index;
+ const rec_t* clust_rec;
+ trx_id_t trx_id;
+ mtr_t mtr;
+
+ ut_ad(!lock_mutex_own());
+ ut_ad(!mutex_own(&trx_sys->mutex));
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record. The latch on the
+ page of clust_rec locks the top of the stack of versions. The
+ bottom of the version stack is not locked; oldest versions may
+ disappear by the fact that transactions may be committed and
+ collected by the purge. This is not a problem, because we are
+ only interested in active transactions. */
+
+ clust_rec = row_get_clust_rec(
+ BTR_SEARCH_LEAF, rec, index, &clust_index, &mtr);
+
+ if (UNIV_UNLIKELY(!clust_rec)) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.cc
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ trx_id = 0;
+ } else {
+ trx_id = row_vers_impl_x_locked_low(
+ clust_rec, clust_index, rec, index, offsets, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ return(trx_id);
+}
+
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr) /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ return(!read_view_sees_trx_id(purge_sys->view, trx_id));
+}
+
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any version of the record, not delete marked, whose trx
+id >= the purge view and whose secondary index entry compares as equal
+to ientry in the alphabetical ordering; exactly in this case we return
+TRUE.
+@return TRUE if some earlier version should have the entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry) /*!< in: the secondary index entry */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ const dtuple_t* row;
+ const dtuple_t* entry;
+ ulint comp;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The top of the stack of versions is locked by the
+ mtr holding a latch on the page containing the
+ clustered index record. The bottom of the stack is
+ locked by the fact that the purge_sys->view must
+ 'overtake' any read view of an active transaction.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records to secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields, because the row may be being modified so that
+ the clustered index record has already been updated to
+ a different binary value in a char field, while the
+ collation still treats the old and new values as
+ equal! */
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (!prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, NULL, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields, because the secondary index record may have
+ already been updated to a different binary value in
+ a char field, while the collation still treats the
+ old and new values as equal! */
+
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL
+ if the history is missing or the record
+ does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ dberr_t err;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!read_view_sees_trx_id(view, trx_id));
+
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* heap2 = heap;
+ trx_undo_rec_t* undo_rec;
+ roll_ptr_t roll_ptr;
+ undo_no_t undo_no;
+ heap = mem_heap_create(1024);
+
+ /* If we have a high-granularity consistent read view and
+ the creating transaction of the view is the same as trx_id
+ in the record, we see this record only when the undo_no of
+ the record is < the undo_no in the view. */
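+
+ /* For example (hypothetical numbers): if the view was created
+ after the transaction had written 5 undo records
+ (view->undo_no == 5), the view sees only those of the
+ transaction's own versions whose undo_no is 0..4; any version
+ written later is invisible to the view. */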
+
+ if (view->type == VIEW_HIGH_GRANULARITY
+ && view->creator_trx_id == trx_id) {
+
+ roll_ptr = row_get_rec_roll_ptr(version, index,
+ *offsets);
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ mem_heap_empty(heap);
+
+ if (view->undo_no > undo_no) {
+ /* The view already sees this version: we can
+ copy it to in_heap and return */
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(
+ version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ buf = static_cast<byte*>(mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index,
+ *offsets);
+ err = DB_SUCCESS;
+ break;
+ }
+ }
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version)
+ ? DB_SUCCESS : DB_MISSING_HISTORY;
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ break;
+ }
+
+ *offsets = rec_get_offsets(prev_version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (read_view_sees_trx_id(view, trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+
+ return(err);
+}
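+
+ /* In outline (a sketch only; the code above is authoritative):
+ starting from rec, keep building the previous version from the
+ undo log until either read_view_sees_trx_id() accepts the
+ version's trx id (copy that version to in_heap and return
+ DB_SUCCESS), no previous version exists (the record was freshly
+ inserted: *old_vers = NULL), or the needed undo log has been
+ purged (DB_MISSING_HISTORY). The high-granularity branch adds an
+ extra visibility test for the view's own transaction. */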
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+UNIV_INTERN
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ trx_id_t rec_trx_id = 0;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ version = rec;
+
+ for (;;) {
+ const trx_t* version_trx;
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ mutex_enter(&trx_sys->mutex);
+ version_trx = trx_get_rw_trx_by_id(version_trx_id);
+ /* Because version_trx is a read-write transaction,
+ its state cannot change from or to NOT_STARTED while
+ we are holding the trx_sys->mutex. It may change from
+ ACTIVE to PREPARED or COMMITTED. */
+ if (version_trx
+ && trx_state_eq(version_trx,
+ TRX_STATE_COMMITTED_IN_MEMORY)) {
+ version_trx = NULL;
+ }
+ mutex_exit(&trx_sys->mutex);
+
+ if (!version_trx) {
+committed_version_trx:
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+ if (rec == version) {
+ *old_vers = rec;
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_STATE_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (rec_trx_id == version_trx_id) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(version,
+ index, *offsets,
+ ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ in_heap, rec_offs_size(*offsets)));
+
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ break;
+ }
+
+ DEBUG_SYNC_C("after_row_vers_check_trx_active");
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ if (!trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version)) {
+ mem_heap_free(heap);
+ heap = heap2;
+ heap2 = NULL;
+ goto committed_version_trx;
+ }
+
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(version, *offsets));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+}
diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc
new file mode 100644
index 00000000000..dc3c0b1dd88
--- /dev/null
+++ b/storage/innobase/srv/srv0conc.cc
@@ -0,0 +1,597 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0conc.cc
+
+InnoDB concurrency manager
+
+Created 2011/04/18 Sunny Bains
+*******************************************************/
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "trx0trx.h"
+
+#include "mysql/plugin.h"
+
+/** Number of times a thread is allowed to enter InnoDB within the same
+SQL query once it has obtained a ticket. */
+UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/** Maximum sleep delay (in microseconds); a value of 0 disables it. */
+UNIV_INTERN ulong srv_adaptive_max_sleep_delay = 150000;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+UNIV_INTERN ulong srv_thread_sleep_delay = 10000;
+
+
+/** We are prepared for a situation in which we have this many threads
+waiting for a semaphore inside InnoDB. innobase_start_or_create_for_mysql()
+sets the value. */
+
+UNIV_INTERN ulint srv_max_n_threads = 0;
+
+/** The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number, because otherwise
+we could get a deadlock. A value of 0 disables the concurrency check. */
+
+UNIV_INTERN ulong srv_thread_concurrency = 0;
+
+#ifndef HAVE_ATOMIC_BUILTINS
+
+/** This mutex protects srv_conc data structures */
+static os_fast_mutex_t srv_conc_mutex;
+
+/** Concurrency list node */
+typedef UT_LIST_NODE_T(struct srv_conc_slot_t) srv_conc_node_t;
+
+/** Slot for a thread waiting in the concurrency control queue. */
+struct srv_conc_slot_t{
+ os_event_t event; /*!< event to wait on */
+ ibool reserved; /*!< TRUE if slot
+ reserved */
+ ibool wait_ended; /*!< TRUE when another thread has
+ already set the event and the thread
+ in this slot is free to proceed; but
+ reserved may still be TRUE at that
+ point */
+ srv_conc_node_t srv_conc_queue; /*!< queue node */
+};
+
+/** Queue of threads waiting to get in */
+typedef UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue_t;
+
+static srv_conc_queue_t srv_conc_queue;
+
+/** Array of wait slots */
+static srv_conc_slot_t* srv_conc_slots;
+
+#if defined(UNIV_PFS_MUTEX)
+/* Key to register srv_conc_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_conc_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+/** Variables tracking the active and waiting threads. */
+struct srv_conc_t {
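+ /* Padding intended to keep the counters below on their own
+ cache line, so that concurrent updates do not false-share it
+ (64 bytes is the assumed cache line size). */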
+ char pad[64 - (sizeof(ulint) + sizeof(lint))];
+
+ /** Number of transactions that have declared_to_be_inside_innodb set.
+ It used to be a non-error for this value to drop below zero
+ temporarily. This is no longer true. However, we keep the lint
+ datatype so that assertions can catch any corner cases that we
+ may have missed. */
+
+ volatile lint n_active;
+
+ /** Number of OS threads waiting in the FIFO for permission to
+ enter InnoDB */
+ volatile lint n_waiting;
+};
+
+/* Control variables for tracking concurrency. */
+static srv_conc_t srv_conc;
+
+/*********************************************************************//**
+Initialise the concurrency management data structures */
+void
+srv_conc_init(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+ ulint i;
+
+ /* Init the server concurrency restriction data structures */
+
+ os_fast_mutex_init(srv_conc_mutex_key, &srv_conc_mutex);
+
+ UT_LIST_INIT(srv_conc_queue);
+
+ srv_conc_slots = static_cast<srv_conc_slot_t*>(
+ mem_zalloc(OS_THREAD_MAX_N * sizeof(*srv_conc_slots)));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ srv_conc_slot_t* conc_slot = &srv_conc_slots[i];
+
+ conc_slot->event = os_event_create();
+ ut_a(conc_slot->event);
+ }
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+Free the concurrency management data structures */
+void
+srv_conc_free(void)
+/*===============*/
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+ os_fast_mutex_free(&srv_conc_mutex);
+ mem_free(srv_conc_slots);
+ srv_conc_slots = NULL;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/*********************************************************************//**
+Note that a user thread is entering InnoDB. */
+static
+void
+srv_enter_innodb_with_tickets(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction that wants
+ to enter InnoDB */
+{
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+}
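+
+ /* For example, with the default srv_n_free_tickets_to_enter = 500,
+ an admitted thread may re-enter InnoDB up to 500 more times within
+ the same SQL statement; the caller (innobase_srv_conc_enter_innodb()
+ in ha_innodb.cc) consumes one ticket per such re-entry before the
+ thread must pass the admission check here again. */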
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB. Setting
+srv_adaptive_max_sleep_delay > 0 switches the adaptive sleep calibration to
+ON. When set, we want to wait in the queue for as little time as possible.
+However, very short waits will result in a lot of context switches and that
+is also not desirable. When threads need to sleep multiple times we increment
+srv_thread_sleep_delay by one. When we see threads getting a slot without
+waiting and there are no other threads waiting in the queue, we try to reduce
+the wait as much as we can. Currently we halve it each time. If the
+thread only had to wait for one turn before it was able to enter InnoDB we
+decrement it by one. This is to try to keep the sleep time stable around the
+"optimum" sleep time. */
+static
+void
+srv_conc_enter_innodb_with_atomics(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction that wants
+ to enter InnoDB */
+{
+ ulint n_sleeps = 0;
+ ibool notified_mysql = FALSE;
+
+ ut_a(!trx->declared_to_be_inside_innodb);
+
+ for (;;) {
+ ulint sleep_in_us;
+
+ if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+ ulint n_active;
+
+ /* Check if there are any free tickets. */
+ n_active = os_atomic_increment_lint(
+ &srv_conc.n_active, 1);
+
+ if (n_active <= srv_thread_concurrency) {
+
+ srv_enter_innodb_with_tickets(trx);
+
+ if (notified_mysql) {
+
+ (void) os_atomic_decrement_lint(
+ &srv_conc.n_waiting, 1);
+
+ thd_wait_end(trx->mysql_thd);
+ }
+
+ if (srv_adaptive_max_sleep_delay > 0) {
+ if (srv_thread_sleep_delay > 20
+ && n_sleeps == 1) {
+
+ --srv_thread_sleep_delay;
+ }
+
+ if (srv_conc.n_waiting == 0) {
+ srv_thread_sleep_delay >>= 1;
+ }
+ }
+
+ return;
+ }
+
+ /* Since there were no free seats, we relinquish
+ the overbooked ticket. */
+
+ (void) os_atomic_decrement_lint(
+ &srv_conc.n_active, 1);
+ }
+
+ if (!notified_mysql) {
+ (void) os_atomic_increment_lint(
+ &srv_conc.n_waiting, 1);
+
+ /* Release possible search system latch this
+ thread has */
+
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
+
+ notified_mysql = TRUE;
+ }
+
+ trx->op_info = "sleeping before entering InnoDB";
+
+ sleep_in_us = srv_thread_sleep_delay;
+
+ /* Guard against overflow when adaptive sleep delay is on. */
+
+ if (srv_adaptive_max_sleep_delay > 0
+ && sleep_in_us > srv_adaptive_max_sleep_delay) {
+
+ sleep_in_us = srv_adaptive_max_sleep_delay;
+ srv_thread_sleep_delay = static_cast<ulong>(sleep_in_us);
+ }
+
+ os_thread_sleep(sleep_in_us);
+
+ trx->op_info = "";
+
+ ++n_sleeps;
+
+ if (srv_adaptive_max_sleep_delay > 0 && n_sleeps > 1) {
+ ++srv_thread_sleep_delay;
+ }
+ }
+}
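+
+ /* A minimal standalone model of the adaptive sleep calibration
+ implemented above, kept under "#if 0" purely as an illustration;
+ the model_* names are hypothetical stand-ins for
+ srv_thread_sleep_delay, srv_adaptive_max_sleep_delay and
+ srv_conc.n_waiting. */
+#if 0
+static ulong model_sleep_delay = 10000; /* srv_thread_sleep_delay */
+static ulong model_max_delay = 150000; /* srv_adaptive_max_sleep_delay */
+
+/* Called when a thread gets a free ticket on its first or a later try. */
+static void
+model_calibrate_on_entry(ulint n_sleeps, lint n_waiting)
+{
+ if (model_max_delay == 0) {
+ return; /* adaptive calibration is switched off */
+ }
+
+ if (model_sleep_delay > 20 && n_sleeps == 1) {
+ --model_sleep_delay; /* entered after one turn: shorten */
+ }
+
+ if (n_waiting == 0) {
+ model_sleep_delay >>= 1; /* empty queue: halve the delay */
+ }
+}
+
+/* Called before each sleep in the wait loop. */
+static void
+model_calibrate_on_sleep(ulint n_sleeps)
+{
+ if (model_max_delay == 0) {
+ return; /* adaptive calibration is switched off */
+ }
+
+ if (model_sleep_delay > model_max_delay) {
+ model_sleep_delay = model_max_delay; /* clamp */
+ }
+
+ if (n_sleeps > 1) {
+ ++model_sleep_delay; /* repeated sleeps: lengthen */
+ }
+}
+#endif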
+
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_with_atomics(
+/*==============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->n_tickets_to_enter_innodb = 0;
+ trx->declared_to_be_inside_innodb = FALSE;
+
+ (void) os_atomic_decrement_lint(&srv_conc.n_active, 1);
+}
+#else
+/*********************************************************************//**
+Note that a user thread is leaving InnoDB code. */
+static
+void
+srv_conc_exit_innodb_without_atomics(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ srv_conc_slot_t* slot;
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ ut_ad(srv_conc.n_active > 0);
+ srv_conc.n_active--;
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ slot = NULL;
+
+ if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+ /* Look for a slot where a thread is waiting and which no
+ other thread has yet released */
+
+ for (slot = UT_LIST_GET_FIRST(srv_conc_queue);
+ slot != NULL && slot->wait_ended == TRUE;
+ slot = UT_LIST_GET_NEXT(srv_conc_queue, slot)) {
+
+ /* No op */
+ }
+
+ if (slot != NULL) {
+ slot->wait_ended = TRUE;
+
+ /* We increment the count on behalf of the released
+ thread */
+
+ srv_conc.n_active++;
+ }
+ }
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ if (slot != NULL) {
+ os_event_set(slot->event);
+ }
+}
+
+/*********************************************************************//**
+Handle the scheduling of a user thread that wants to enter InnoDB. */
+static
+void
+srv_conc_enter_innodb_without_atomics(
+/*==================================*/
+ trx_t* trx) /*!< in/out: transaction that wants
+ to enter InnoDB */
+{
+ ulint i;
+ srv_conc_slot_t* slot = NULL;
+ ibool has_slept = FALSE;
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+ if (UNIV_UNLIKELY(trx->declared_to_be_inside_innodb)) {
+ os_fast_mutex_unlock(&srv_conc_mutex);
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: trying to declare trx"
+ " to enter InnoDB, but\n"
+ "InnoDB: it already is declared.\n", stderr);
+ trx_print(stderr, trx, 0);
+ putc('\n', stderr);
+ return;
+ }
+
+ ut_ad(srv_conc.n_active >= 0);
+
+ if (srv_conc.n_active < (lint) srv_thread_concurrency) {
+
+ srv_conc.n_active++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* If the transaction is not holding resources, let it sleep
+ for srv_thread_sleep_delay microseconds, and try again then */
+
+ if (!has_slept && !trx->has_search_latch
+ && NULL == UT_LIST_GET_FIRST(trx->lock.trx_locks)) {
+
+ has_slept = TRUE; /* We let it sleep only once to avoid
+ starvation */
+
+ srv_conc.n_waiting++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ trx->op_info = "sleeping before joining InnoDB queue";
+
+ /* Peter Zaitsev suggested that we take the sleep away
+ altogether. But the sleep may be good in pathological
+ situations of lots of thread switches. Simply put some
+ threads aside for a while to reduce the number of thread
+ switches. */
+ if (srv_thread_sleep_delay > 0) {
+ os_thread_sleep(srv_thread_sleep_delay);
+ }
+
+ trx->op_info = "";
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc.n_waiting--;
+
+ goto retry;
+ }
+
+ /* Too many threads inside: put the current thread to a queue */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_conc_slots + i;
+
+ if (!slot->reserved) {
+
+ break;
+ }
+ }
+
+ if (i == OS_THREAD_MAX_N) {
+ /* Could not find a free wait slot, we must let the
+ thread enter */
+
+ srv_conc.n_active++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* Release possible search system latch this thread has */
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ /* Add to the queue */
+ slot->reserved = TRUE;
+ slot->wait_ended = FALSE;
+
+ UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+ os_event_reset(slot->event);
+
+ srv_conc.n_waiting++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ /* Go to wait for the event; when a thread leaves InnoDB it will
+ release this thread */
+
+ ut_ad(!trx->has_search_latch);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+ trx->op_info = "waiting in InnoDB queue";
+
+ thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
+
+ os_event_wait(slot->event);
+ thd_wait_end(trx->mysql_thd);
+
+ trx->op_info = "";
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc.n_waiting--;
+
+ /* NOTE that the thread which released this thread already
+ incremented the thread counter on behalf of this thread */
+
+ slot->reserved = FALSE;
+
+ UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+}
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ srv_conc_enter_innodb_with_atomics(trx);
+#else
+ srv_conc_enter_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!srv_thread_concurrency) {
+
+ return;
+ }
+
+ ut_ad(srv_conc.n_active >= 0);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ (void) os_atomic_increment_lint(&srv_conc.n_active, 1);
+#else
+ os_fast_mutex_lock(&srv_conc_mutex);
+ ++srv_conc.n_active;
+ os_fast_mutex_unlock(&srv_conc_mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ trx->n_tickets_to_enter_innodb = 1;
+ trx->declared_to_be_inside_innodb = TRUE;
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+ if ((trx->mysql_thd != NULL
+ && thd_is_replication_slave_thread(trx->mysql_thd))
+ || trx->declared_to_be_inside_innodb == FALSE) {
+
+ return;
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ srv_conc_exit_innodb_with_atomics(trx);
+#else
+ srv_conc_exit_innodb_without_atomics(trx);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/*********************************************************************//**
+Get the count of threads waiting inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_waiting_threads(void)
+/*==============================*/
+{
+ return(srv_conc.n_waiting);
+}
+
+/*********************************************************************//**
+Get the count of threads active inside InnoDB. */
+UNIV_INTERN
+ulint
+srv_conc_get_active_threads(void)
+/*==============================*/
+{
+ return(srv_conc.n_active);
+}
+
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
new file mode 100644
index 00000000000..80c8f7fadbc
--- /dev/null
+++ b/storage/innobase/srv/srv0mon.cc
@@ -0,0 +1,1931 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0mon.cc
+Database monitor counter interfaces
+
+Created 12/9/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "os0file.h"
+#include "mach0data.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "buf0buf.h"
+#include "trx0sys.h"
+#include "trx0rseg.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#ifdef UNIV_NONINL
+#include "srv0mon.ic"
+#endif
+
+/* Macro to standardize the counter names for counters in the
+"monitor_buf_page" module as they have very structured defines */
+#define MONITOR_BUF_PAGE(name, description, code, op, op_code) \
+ {"buffer_page_" op "_" name, "buffer_page_io", \
+ "Number of " description " Pages " op, \
+ MONITOR_GROUP_MODULE, MONITOR_DEFAULT_START, \
+ MONITOR_##code##_##op_code}
+
+#define MONITOR_BUF_PAGE_READ(name, description, code) \
+ MONITOR_BUF_PAGE(name, description, code, "read", PAGE_READ)
+
+#define MONITOR_BUF_PAGE_WRITTEN(name, description, code) \
+ MONITOR_BUF_PAGE(name, description, code, "written", PAGE_WRITTEN)
+
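+/* For example (expansion sketch),
+MONITOR_BUF_PAGE_READ("index_leaf", "Index Leaf", INDEX_LEAF)
+expands, via string-literal concatenation and token pasting, to
+{"buffer_page_read_index_leaf", "buffer_page_io",
+ "Number of Index Leaf Pages read", MONITOR_GROUP_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_LEAF_PAGE_READ}. */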
+
+/** This array defines basic static information of monitor counters,
+including each monitor's name, module it belongs to, a short
+description and its property/type and corresponding monitor_id.
+Please note: if you add a monitor here, please add its corresponding
+monitor_id to the "enum monitor_id_value" structure in srv0mon.h. */
+
+static monitor_info_t innodb_counter_info[] =
+{
+ /* A dummy item to mark the module start; this is
+ to accommodate the default value (0) set for the
+ global variables with the control system. */
+ {"module_start", "module_start", "module_start",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_DEFAULT_START},
+
+ /* ========== Counters for Server Metadata ========== */
+ {"module_metadata", "metadata", "Server Metadata",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_METADATA},
+
+ {"metadata_table_handles_opened", "metadata",
+ "Number of table handles opened",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
+
+ {"metadata_table_handles_closed", "metadata",
+ "Number of table handles closed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE},
+
+ {"metadata_table_reference_count", "metadata",
+ "Table reference counter",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE},
+
+ {"metadata_mem_pool_size", "metadata",
+ "Size of a memory pool InnoDB uses to store data dictionary"
+ " and internal data structures in bytes",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_META_MEM_POOL},
+
+ /* ========== Counters for Lock Module ========== */
+ {"module_lock", "lock", "Lock Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK},
+
+ {"lock_deadlocks", "lock", "Number of deadlocks",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_DEADLOCK},
+
+ {"lock_timeouts", "lock", "Number of lock timeouts",
+ MONITOR_DEFAULT_ON,
+ MONITOR_DEFAULT_START, MONITOR_TIMEOUT},
+
+ {"lock_rec_lock_waits", "lock",
+ "Number of times enqueued into record lock wait queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LOCKREC_WAIT},
+
+ {"lock_table_lock_waits", "lock",
+ "Number of times enqueued into table lock wait queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_WAIT},
+
+ {"lock_rec_lock_requests", "lock",
+ "Number of record locks requested",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK_REQ},
+
+ {"lock_rec_lock_created", "lock", "Number of record locks created",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_RECLOCK_CREATED},
+
+ {"lock_rec_lock_removed", "lock",
+ "Number of record locks removed from the lock queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_RECLOCK_REMOVED},
+
+ {"lock_rec_locks", "lock",
+ "Current number of record locks on tables",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_RECLOCK},
+
+ {"lock_table_lock_created", "lock", "Number of table locks created",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_CREATED},
+
+ {"lock_table_lock_removed", "lock",
+ "Number of table locks removed from the lock queue",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TABLELOCK_REMOVED},
+
+ {"lock_table_locks", "lock",
+ "Current number of table locks on tables",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_TABLELOCK},
+
+ {"lock_row_lock_current_waits", "lock",
+ "Number of row locks currently being waited for"
+ " (innodb_row_lock_current_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT},
+
+ {"lock_row_lock_time", "lock",
+ "Time spent in acquiring row locks, in milliseconds"
+ " (innodb_row_lock_time)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_WAIT_TIME},
+
+ {"lock_row_lock_time_max", "lock",
+ "The maximum time to acquire a row lock, in milliseconds"
+ " (innodb_row_lock_time_max)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_MAX_WAIT_TIME},
+
+ {"lock_row_lock_waits", "lock",
+ "Number of times a row lock had to be waited for"
+ " (innodb_row_lock_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ROW_LOCK_WAIT},
+
+ {"lock_row_lock_time_avg", "lock",
+ "The average time to acquire a row lock, in milliseconds"
+ " (innodb_row_lock_time_avg)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOCK_AVG_WAIT_TIME},
+
+ /* ========== Counters for Buffer Manager and I/O ========== */
+ {"module_buffer", "buffer", "Buffer Manager Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_BUFFER},
+
+ {"buffer_pool_size", "server",
+ "Server buffer pool size (all buffer pools) in bytes",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUFFER_POOL_SIZE},
+
+ {"buffer_pool_reads", "buffer",
+ "Number of reads directly from disk (innodb_buffer_pool_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READS},
+
+ {"buffer_pool_read_requests", "buffer",
+ "Number of logical read requests (innodb_buffer_pool_read_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_REQUESTS},
+
+ {"buffer_pool_write_requests", "buffer",
+ "Number of write requests (innodb_buffer_pool_write_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WRITE_REQUEST},
+
+ {"buffer_pool_wait_free", "buffer",
+ "Number of times waited for free buffer"
+ " (innodb_buffer_pool_wait_free)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_WAIT_FREE},
+
+ {"buffer_pool_read_ahead", "buffer",
+ "Number of pages read as read ahead (innodb_buffer_pool_read_ahead)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD},
+
+ {"buffer_pool_read_ahead_evicted", "buffer",
+ "Read-ahead pages evicted without being accessed"
+ " (innodb_buffer_pool_read_ahead_evicted)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED},
+
+ {"buffer_pool_pages_total", "buffer",
+ "Total buffer pool size in pages (innodb_buffer_pool_pages_total)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_TOTAL},
+
+ {"buffer_pool_pages_misc", "buffer",
+ "Buffer pages for misc use such as row locks or the adaptive"
+ " hash index (innodb_buffer_pool_pages_misc)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGE_MISC},
+
+ {"buffer_pool_pages_data", "buffer",
+ "Buffer pages containing data (innodb_buffer_pool_pages_data)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DATA},
+
+ {"buffer_pool_bytes_data", "buffer",
+ "Buffer bytes containing data (innodb_buffer_pool_bytes_data)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DATA},
+
+ {"buffer_pool_pages_dirty", "buffer",
+ "Buffer pages currently dirty (innodb_buffer_pool_pages_dirty)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_DIRTY},
+
+ {"buffer_pool_bytes_dirty", "buffer",
+ "Buffer bytes currently dirty (innodb_buffer_pool_bytes_dirty)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_BYTES_DIRTY},
+
+ {"buffer_pool_pages_free", "buffer",
+ "Buffer pages currently free (innodb_buffer_pool_pages_free)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_POOL_PAGES_FREE},
+
+ {"buffer_pages_created", "buffer",
+ "Number of pages created (innodb_pages_created)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_CREATED},
+
+ {"buffer_pages_written", "buffer",
+ "Number of pages written (innodb_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+
+ {"buffer_pages_read", "buffer",
+ "Number of pages read (innodb_pages_read)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
+
+ {"buffer_data_reads", "buffer",
+ "Amount of data read in bytes (innodb_data_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_READ},
+
+ {"buffer_data_written", "buffer",
+ "Amount of data written in bytes (innodb_data_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BYTE_WRITTEN},
+
+ /* Cumulative counter for scanning in flush batches */
+ {"buffer_flush_batch_scanned", "buffer",
+ "Total pages scanned as part of flush batch",
+ MONITOR_SET_OWNER,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
+ MONITOR_FLUSH_BATCH_SCANNED},
+
+ {"buffer_flush_batch_num_scan", "buffer",
+ "Number of times buffer flush list flush is called",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL},
+
+ {"buffer_flush_batch_scanned_per_call", "buffer",
+ "Pages scanned per flush batch scan",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_SCANNED,
+ MONITOR_FLUSH_BATCH_SCANNED_PER_CALL},
+
+ {"buffer_flush_batch_rescan", "buffer",
+ "Number of times rescan of flush list forced",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_HP_RESCAN},
+
+ /* Cumulative counter for pages flushed in flush batches */
+ {"buffer_flush_batch_total_pages", "buffer",
+ "Total pages flushed as part of flush batch",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_TOTAL_PAGE},
+
+ {"buffer_flush_batches", "buffer",
+ "Number of flush batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT},
+
+ {"buffer_flush_batch_pages", "buffer",
+ "Pages queued as a flush batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_PAGES},
+
+ /* Cumulative counter for flush batches because of neighbor */
+ {"buffer_flush_neighbor_total_pages", "buffer",
+ "Total neighbors flushed as part of neighbor flush",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_NEIGHBOR_COUNT,
+ MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE},
+
+ {"buffer_flush_neighbor", "buffer",
+ "Number of times neighbors flushing is invoked",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_COUNT},
+
+ {"buffer_flush_neighbor_pages", "buffer",
+ "Pages queued as a neighbor batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
+ MONITOR_FLUSH_NEIGHBOR_PAGES},
+
+ {"buffer_flush_n_to_flush_requested", "buffer",
+	 "Number of pages requested for flushing",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_N_TO_FLUSH_REQUESTED},
+
+ {"buffer_flush_avg_page_rate", "buffer",
+ "Average number of pages at which flushing is happening",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_AVG_PAGE_RATE},
+
+ {"buffer_flush_lsn_avg_rate", "buffer",
+ "Average redo generation rate",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_LSN_AVG_RATE},
+
+ {"buffer_flush_pct_for_dirty", "buffer",
+ "Percent of IO capacity used to avoid max dirty page limit",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_DIRTY},
+
+ {"buffer_flush_pct_for_lsn", "buffer",
+ "Percent of IO capacity used to avoid reusable redo space limit",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_PCT_FOR_LSN},
+
+ {"buffer_flush_sync_waits", "buffer",
+ "Number of times a wait happens due to sync flushing",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_FLUSH_SYNC_WAITS},
+
+ /* Cumulative counter for flush batches for adaptive flushing */
+ {"buffer_flush_adaptive_total_pages", "buffer",
+ "Total pages flushed as part of adaptive flushing",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE},
+
+ {"buffer_flush_adaptive", "buffer",
+ "Number of adaptive batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT},
+
+ {"buffer_flush_adaptive_pages", "buffer",
+ "Pages queued as an adaptive batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_PAGES},
+
+ /* Cumulative counter for flush batches because of sync */
+ {"buffer_flush_sync_total_pages", "buffer",
+ "Total pages flushed as part of sync batches",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_SYNC_COUNT,
+ MONITOR_FLUSH_SYNC_TOTAL_PAGE},
+
+ {"buffer_flush_sync", "buffer",
+ "Number of sync batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_COUNT},
+
+ {"buffer_flush_sync_pages", "buffer",
+ "Pages queued as a sync batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+ MONITOR_FLUSH_SYNC_PAGES},
+
+ /* Cumulative counter for flush batches because of background */
+ {"buffer_flush_background_total_pages", "buffer",
+ "Total pages flushed as part of background batches",
+ MONITOR_SET_OWNER, MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE},
+
+ {"buffer_flush_background", "buffer",
+ "Number of background batches",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT},
+
+ {"buffer_flush_background_pages", "buffer",
+ "Pages queued as a background batch",
+ MONITOR_SET_MEMBER, MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_PAGES},
+
+ /* Cumulative counter for LRU batch scan */
+ {"buffer_LRU_batch_scanned", "buffer",
+ "Total pages scanned as part of LRU batch",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_BATCH_SCANNED},
+
+ {"buffer_LRU_batch_num_scan", "buffer",
+ "Number of times LRU batch is called",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_batch_scanned_per_call", "buffer",
+ "Pages scanned per LRU batch call",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_SCANNED,
+ MONITOR_LRU_BATCH_SCANNED_PER_CALL},
+
+ /* Cumulative counter for LRU batch pages flushed */
+ {"buffer_LRU_batch_total_pages", "buffer",
+ "Total pages flushed as part of LRU batches",
+ MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT,
+ MONITOR_LRU_BATCH_TOTAL_PAGE},
+
+ {"buffer_LRU_batches", "buffer",
+ "Number of LRU batches",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_COUNT},
+
+ {"buffer_LRU_batch_pages", "buffer",
+ "Pages queued as an LRU batch",
+ MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE,
+ MONITOR_LRU_BATCH_PAGES},
+
+ /* Cumulative counter for single page LRU scans */
+ {"buffer_LRU_single_flush_scanned", "buffer",
+ "Total pages scanned as part of single page LRU flush",
+ MONITOR_SET_OWNER,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED},
+
+ {"buffer_LRU_single_flush_num_scan", "buffer",
+ "Number of times single page LRU flush is called",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_single_flush_scanned_per_call", "buffer",
+	 "Pages scanned per single page LRU flush",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SINGLE_FLUSH_SCANNED,
+ MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL},
+
+	{"buffer_LRU_single_flush_failure_count", "buffer",
+ "Number of times attempt to flush a single page from LRU failed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT},
+
+	{"buffer_LRU_get_free_search", "buffer",
+ "Number of searches performed for a clean page",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LRU_GET_FREE_SEARCH},
+
+ /* Cumulative counter for LRU search scans */
+ {"buffer_LRU_search_scanned", "buffer",
+ "Total pages scanned as part of LRU search",
+ MONITOR_SET_OWNER,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_SEARCH_SCANNED},
+
+ {"buffer_LRU_search_num_scan", "buffer",
+ "Number of times LRU search is performed",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_search_scanned_per_call", "buffer",
+	 "Pages scanned per single LRU search",
+ MONITOR_SET_MEMBER, MONITOR_LRU_SEARCH_SCANNED,
+ MONITOR_LRU_SEARCH_SCANNED_PER_CALL},
+
+ /* Cumulative counter for LRU unzip search scans */
+ {"buffer_LRU_unzip_search_scanned", "buffer",
+ "Total pages scanned as part of LRU unzip search",
+ MONITOR_SET_OWNER,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED},
+
+ {"buffer_LRU_unzip_search_num_scan", "buffer",
+ "Number of times LRU unzip search is performed",
+ MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL},
+
+ {"buffer_LRU_unzip_search_scanned_per_call", "buffer",
+	 "Pages scanned per single LRU unzip search",
+ MONITOR_SET_MEMBER, MONITOR_LRU_UNZIP_SEARCH_SCANNED,
+ MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL},
+
+ /* ========== Counters for Buffer Page I/O ========== */
+ {"module_buffer_page", "buffer_page_io", "Buffer Page I/O Module",
+ static_cast<monitor_type_t>(
+ MONITOR_MODULE | MONITOR_GROUP_MODULE),
+ MONITOR_DEFAULT_START, MONITOR_MODULE_BUF_PAGE},
+
+	MONITOR_BUF_PAGE_READ("index_leaf", "Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_READ("index_non_leaf", "Index Non-leaf",
+			      INDEX_NON_LEAF),
+
+ MONITOR_BUF_PAGE_READ("index_ibuf_leaf", "Insert Buffer Index Leaf",
+ INDEX_IBUF_LEAF),
+
+ MONITOR_BUF_PAGE_READ("index_ibuf_non_leaf",
+ "Insert Buffer Index Non-Leaf",
+ INDEX_IBUF_NON_LEAF),
+
+ MONITOR_BUF_PAGE_READ("undo_log", "Undo Log", UNDO_LOG),
+
+ MONITOR_BUF_PAGE_READ("index_inode", "Index Inode", INODE),
+
+ MONITOR_BUF_PAGE_READ("ibuf_free_list", "Insert Buffer Free List",
+ IBUF_FREELIST),
+
+ MONITOR_BUF_PAGE_READ("ibuf_bitmap", "Insert Buffer Bitmap",
+ IBUF_BITMAP),
+
+ MONITOR_BUF_PAGE_READ("system_page", "System", SYSTEM),
+
+ MONITOR_BUF_PAGE_READ("trx_system", "Transaction System", TRX_SYSTEM),
+
+ MONITOR_BUF_PAGE_READ("fsp_hdr", "File Space Header", FSP_HDR),
+
+ MONITOR_BUF_PAGE_READ("xdes", "Extent Descriptor", XDES),
+
+ MONITOR_BUF_PAGE_READ("blob", "Uncompressed BLOB", BLOB),
+
+ MONITOR_BUF_PAGE_READ("zblob", "First Compressed BLOB", ZBLOB),
+
+ MONITOR_BUF_PAGE_READ("zblob2", "Subsequent Compressed BLOB", ZBLOB2),
+
+ MONITOR_BUF_PAGE_READ("other", "other/unknown (old version of InnoDB)",
+ OTHER),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_leaf", "Index Leaf", INDEX_LEAF),
+
+	MONITOR_BUF_PAGE_WRITTEN("index_non_leaf", "Index Non-leaf",
+				 INDEX_NON_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_ibuf_leaf", "Insert Buffer Index Leaf",
+ INDEX_IBUF_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_ibuf_non_leaf",
+ "Insert Buffer Index Non-Leaf",
+ INDEX_IBUF_NON_LEAF),
+
+ MONITOR_BUF_PAGE_WRITTEN("undo_log", "Undo Log", UNDO_LOG),
+
+ MONITOR_BUF_PAGE_WRITTEN("index_inode", "Index Inode", INODE),
+
+ MONITOR_BUF_PAGE_WRITTEN("ibuf_free_list", "Insert Buffer Free List",
+ IBUF_FREELIST),
+
+ MONITOR_BUF_PAGE_WRITTEN("ibuf_bitmap", "Insert Buffer Bitmap",
+ IBUF_BITMAP),
+
+ MONITOR_BUF_PAGE_WRITTEN("system_page", "System", SYSTEM),
+
+ MONITOR_BUF_PAGE_WRITTEN("trx_system", "Transaction System",
+ TRX_SYSTEM),
+
+ MONITOR_BUF_PAGE_WRITTEN("fsp_hdr", "File Space Header", FSP_HDR),
+
+ MONITOR_BUF_PAGE_WRITTEN("xdes", "Extent Descriptor", XDES),
+
+ MONITOR_BUF_PAGE_WRITTEN("blob", "Uncompressed BLOB", BLOB),
+
+ MONITOR_BUF_PAGE_WRITTEN("zblob", "First Compressed BLOB", ZBLOB),
+
+ MONITOR_BUF_PAGE_WRITTEN("zblob2", "Subsequent Compressed BLOB",
+ ZBLOB2),
+
+	MONITOR_BUF_PAGE_WRITTEN("other",
+				 "other/unknown (old version of InnoDB)",
+				 OTHER),
+
+ /* ========== Counters for OS level operations ========== */
+ {"module_os", "os", "OS Level Operation",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_OS},
+
+ {"os_data_reads", "os",
+ "Number of reads initiated (innodb_data_reads)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_READ},
+
+ {"os_data_writes", "os",
+ "Number of writes initiated (innodb_data_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FILE_WRITE},
+
+ {"os_data_fsyncs", "os",
+ "Number of fsync() calls (innodb_data_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_FSYNC},
+
+ {"os_pending_reads", "os", "Number of reads pending",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OS_PENDING_READS},
+
+ {"os_pending_writes", "os", "Number of writes pending",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OS_PENDING_WRITES},
+
+ {"os_log_bytes_written", "os",
+ "Bytes of log written (innodb_os_log_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_WRITTEN},
+
+ {"os_log_fsyncs", "os",
+ "Number of fsync log writes (innodb_os_log_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_FSYNC},
+
+ {"os_log_pending_fsyncs", "os",
+	 "Number of pending fsync writes (innodb_os_log_pending_fsyncs)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_FSYNC},
+
+ {"os_log_pending_writes", "os",
+ "Number of pending log file writes (innodb_os_log_pending_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_OS_LOG_PENDING_WRITES},
+
+ /* ========== Counters for Transaction Module ========== */
+ {"module_trx", "transaction", "Transaction Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_TRX},
+
+ {"trx_rw_commits", "transaction", "Number of read-write transactions "
+ "committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RW_COMMIT},
+
+ {"trx_ro_commits", "transaction", "Number of read-only transactions "
+ "committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_RO_COMMIT},
+
+ {"trx_nl_ro_commits", "transaction", "Number of non-locking "
+ "auto-commit read-only transactions committed",
+ MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_TRX_NL_RO_COMMIT},
+
+ {"trx_commits_insert_update", "transaction",
+ "Number of transactions committed with inserts and updates",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_COMMIT_UNDO},
+
+ {"trx_rollbacks", "transaction",
+ "Number of transactions rolled back",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK},
+
+ {"trx_rollbacks_savepoint", "transaction",
+ "Number of transactions rolled back to savepoint",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT},
+
+ {"trx_rollback_active", "transaction",
+ "Number of resurrected active transactions rolled back",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_ACTIVE},
+
+ {"trx_active_transactions", "transaction",
+ "Number of active transactions",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE},
+
+ {"trx_rseg_history_len", "transaction",
+ "Length of the TRX_RSEG_HISTORY list",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_RSEG_HISTORY_LEN},
+
+ {"trx_undo_slots_used", "transaction", "Number of undo slots used",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_USED},
+
+ {"trx_undo_slots_cached", "transaction",
+ "Number of undo slots cached",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_UNDO_SLOT_CACHED},
+
+ {"trx_rseg_current_size", "transaction",
+ "Current rollback segment size in pages",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_RSEG_CUR_SIZE},
+
+ /* ========== Counters for Purge Module ========== */
+ {"module_purge", "purge", "Purge Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_PURGE},
+
+ {"purge_del_mark_records", "purge",
+ "Number of delete-marked rows purged",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_N_DEL_ROW_PURGE},
+
+ {"purge_upd_exist_or_extern_records", "purge",
+	 "Number of purges on updates of existing records and"
+	 " updates on delete-marked records with externally stored fields",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_N_UPD_EXIST_EXTERN},
+
+ {"purge_invoked", "purge",
+ "Number of times purge was invoked",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_INVOKED},
+
+ {"purge_undo_log_pages", "purge",
+ "Number of undo log pages handled by the purge",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_N_PAGE_HANDLED},
+
+ {"purge_dml_delay_usec", "purge",
+	 "Microseconds DML operations are delayed due to purge lagging",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_DML_PURGE_DELAY},
+
+ {"purge_stop_count", "purge",
+ "Number of times purge was stopped",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_STOP_COUNT},
+
+ {"purge_resume_count", "purge",
+ "Number of times purge was resumed",
+ MONITOR_DISPLAY_CURRENT,
+ MONITOR_DEFAULT_START, MONITOR_PURGE_RESUME_COUNT},
+
+ /* ========== Counters for Recovery Module ========== */
+ {"module_log", "recovery", "Recovery Module",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_RECOVERY},
+
+ {"log_checkpoints", "recovery", "Number of checkpoints",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_NUM_CHECKPOINT},
+
+ {"log_lsn_last_flush", "recovery", "LSN of Last flush",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_FLUSHDISK},
+
+ {"log_lsn_last_checkpoint", "recovery", "LSN at last checkpoint",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CHECKPOINT},
+
+ {"log_lsn_current", "recovery", "Current LSN value",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LSN_CURRENT},
+
+ {"log_lsn_checkpoint_age", "recovery",
+ "Current LSN value minus LSN at last checkpoint",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LSN_CHECKPOINT_AGE},
+
+ {"log_lsn_buf_pool_oldest", "recovery",
+ "The oldest modified block LSN in the buffer pool",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_BUF_OLDEST_LSN},
+
+ {"log_max_modified_age_async", "recovery",
+ "Maximum LSN difference; when exceeded, start asynchronous preflush",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_ASYNC},
+
+ {"log_max_modified_age_sync", "recovery",
+ "Maximum LSN difference; when exceeded, start synchronous preflush",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_MAX_AGE_SYNC},
+
+ {"log_pending_log_writes", "recovery", "Pending log writes",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PENDING_LOG_WRITE},
+
+ {"log_pending_checkpoint_writes", "recovery", "Pending checkpoints",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PENDING_CHECKPOINT_WRITE},
+
+ {"log_num_log_io", "recovery", "Number of log I/Os",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_LOG_IO},
+
+ {"log_waits", "recovery",
+ "Number of log waits due to small log buffer (innodb_log_waits)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WAITS},
+
+ {"log_write_requests", "recovery",
+ "Number of log write requests (innodb_log_write_requests)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITE_REQUEST},
+
+ {"log_writes", "recovery",
+ "Number of log writes (innodb_log_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_LOG_WRITES},
+
+ /* ========== Counters for Page Compression ========== */
+ {"module_compress", "compression", "Page Compression Info",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_PAGE},
+
+ {"compress_pages_compressed", "compression",
+ "Number of pages compressed", MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAGE_COMPRESS},
+
+ {"compress_pages_decompressed", "compression",
+ "Number of pages decompressed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAGE_DECOMPRESS},
+
+ {"compression_pad_increments", "compression",
+ "Number of times padding is incremented to avoid compression failures",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAD_INCREMENTS},
+
+ {"compression_pad_decrements", "compression",
+ "Number of times padding is decremented due to good compressibility",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
+
+ /* ========== Counters for Index ========== */
+ {"module_index", "index", "Index Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
+
+ {"index_page_splits", "index", "Number of index page splits",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
+
+ {"index_page_merge_attempts", "index",
+ "Number of index page merge attempts",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_ATTEMPTS},
+
+ {"index_page_merge_successful", "index",
+ "Number of successful index page merges",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_MERGE_SUCCESSFUL},
+
+ {"index_page_reorg_attempts", "index",
+ "Number of index page reorganization attempts",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_ATTEMPTS},
+
+ {"index_page_reorg_successful", "index",
+ "Number of successful index page reorganizations",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_REORG_SUCCESSFUL},
+
+ {"index_page_discards", "index", "Number of index pages discarded",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_INDEX_DISCARD},
+
+ /* ========== Counters for Adaptive Hash Index ========== */
+	{"module_adaptive_hash", "adaptive_hash_index", "Adaptive Hash Index",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_ADAPTIVE_HASH},
+
+ {"adaptive_hash_searches", "adaptive_hash_index",
+ "Number of successful searches using Adaptive Hash Index",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH},
+
+ {"adaptive_hash_searches_btree", "adaptive_hash_index",
+ "Number of searches using B-tree on an index search",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE},
+
+ {"adaptive_hash_pages_added", "adaptive_hash_index",
+ "Number of index pages on which the Adaptive Hash Index is built",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_ADDED},
+
+ {"adaptive_hash_pages_removed", "adaptive_hash_index",
+ "Number of index pages whose corresponding Adaptive Hash Index"
+ " entries were removed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_PAGE_REMOVED},
+
+ {"adaptive_hash_rows_added", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows added",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_ADDED},
+
+ {"adaptive_hash_rows_removed", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows removed",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVED},
+
+ {"adaptive_hash_rows_deleted_no_hash_entry", "adaptive_hash_index",
+ "Number of rows deleted that did not have corresponding Adaptive Hash"
+ " Index entries",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND},
+
+ {"adaptive_hash_rows_updated", "adaptive_hash_index",
+ "Number of Adaptive Hash Index rows updated",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ADAPTIVE_HASH_ROW_UPDATED},
+
+ /* ========== Counters for tablespace ========== */
+ {"module_file", "file_system", "Tablespace and File System Manager",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_FIL_SYSTEM},
+
+ {"file_num_open_files", "file_system",
+ "Number of files currently open (innodb_num_open_files)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DISPLAY_CURRENT | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_N_FILE_OPENED},
+
+ /* ========== Counters for Change Buffer ========== */
+ {"module_ibuf_system", "change_buffer", "InnoDB Change Buffer",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_IBUF_SYSTEM},
+
+ {"ibuf_merges_insert", "change_buffer",
+ "Number of inserted records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_INSERT},
+
+ {"ibuf_merges_delete_mark", "change_buffer",
+ "Number of deleted records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DELETE},
+
+ {"ibuf_merges_delete", "change_buffer",
+ "Number of purge records merged by change buffering",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_PURGE},
+
+ {"ibuf_merges_discard_insert", "change_buffer",
+	 "Number of buffered insert operations discarded without being merged",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT},
+
+ {"ibuf_merges_discard_delete_mark", "change_buffer",
+	 "Number of buffered delete-mark operations discarded without being"
+	 " merged",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE},
+
+ {"ibuf_merges_discard_delete", "change_buffer",
+	 "Number of buffered purge operations discarded without being merged",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE},
+
+ {"ibuf_merges", "change_buffer", "Number of change buffer merges",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_MERGES},
+
+ {"ibuf_size", "change_buffer", "Change buffer size in pages",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_IBUF_SIZE},
+
+ /* ========== Counters for server operations ========== */
+ {"module_innodb", "innodb",
+	 "Counters for general InnoDB server-wide operations and properties",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_SERVER},
+
+ {"innodb_master_thread_sleeps", "server",
+ "Number of times (seconds) master thread sleeps",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_THREAD_SLEEP},
+
+ {"innodb_activity_count", "server", "Current server activity count",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SERVER_ACTIVITY},
+
+ {"innodb_master_active_loops", "server",
+ "Number of times master thread performs its tasks when"
+ " server is active",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_ACTIVE_LOOPS},
+
+ {"innodb_master_idle_loops", "server",
+ "Number of times master thread performs its tasks when server is idle",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS},
+
+ {"innodb_background_drop_table_usec", "server",
+ "Time (in microseconds) spent to process drop table list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
+
+ {"innodb_ibuf_merge_usec", "server",
+ "Time (in microseconds) spent to process change buffer merge",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_IBUF_MERGE_MICROSECOND},
+
+ {"innodb_log_flush_usec", "server",
+ "Time (in microseconds) spent to flush log records",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_LOG_FLUSH_MICROSECOND},
+
+ {"innodb_mem_validate_usec", "server",
+ "Time (in microseconds) spent to do memory validation",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_MEM_VALIDATE_MICROSECOND},
+
+ {"innodb_master_purge_usec", "server",
+ "Time (in microseconds) spent by master thread to purge records",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_PURGE_MICROSECOND},
+
+ {"innodb_dict_lru_usec", "server",
+ "Time (in microseconds) spent to process DICT LRU list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_DICT_LRU_MICROSECOND},
+
+ {"innodb_checkpoint_usec", "server",
+ "Time (in microseconds) spent by master thread to do checkpoint",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_SRV_CHECKPOINT_MICROSECOND},
+
+ {"innodb_dblwr_writes", "server",
+ "Number of doublewrite operations that have been performed"
+ " (innodb_dblwr_writes)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_WRITES},
+
+ {"innodb_dblwr_pages_written", "server",
+ "Number of pages that have been written for doublewrite operations"
+ " (innodb_dblwr_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN},
+
+ {"innodb_page_size", "server",
+ "InnoDB page size in bytes (innodb_page_size)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE},
+
+ {"innodb_rwlock_s_spin_waits", "server",
+ "Number of rwlock spin waits due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS},
+
+ {"innodb_rwlock_x_spin_waits", "server",
+ "Number of rwlock spin waits due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS},
+
+ {"innodb_rwlock_s_spin_rounds", "server",
+ "Number of rwlock spin loop rounds due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS},
+
+ {"innodb_rwlock_x_spin_rounds", "server",
+ "Number of rwlock spin loop rounds due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS},
+
+ {"innodb_rwlock_s_os_waits", "server",
+ "Number of OS waits due to shared latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS},
+
+ {"innodb_rwlock_x_os_waits", "server",
+ "Number of OS waits due to exclusive latch request",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_OS_WAITS},
+
+ /* ========== Counters for DML operations ========== */
+ {"module_dml", "dml", "Statistics for DMLs",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_DML_STATS},
+
+ {"dml_reads", "dml", "Number of rows read",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_READ},
+
+ {"dml_inserts", "dml", "Number of rows inserted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_INSERTED},
+
+ {"dml_deletes", "dml", "Number of rows deleted",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_DELETED},
+
+ {"dml_updates", "dml", "Number of rows updated",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OLVD_ROW_UPDTATED},
+
+ /* ========== Counters for DDL operations ========== */
+ {"module_ddl", "ddl", "Statistics for DDLs",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_DDL_STATS},
+
+ {"ddl_background_drop_indexes", "ddl",
+ "Number of indexes waiting to be dropped after failed index creation",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX},
+
+ {"ddl_background_drop_tables", "ddl",
+ "Number of tables in background drop table list",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE},
+
+ {"ddl_online_create_index", "ddl",
+ "Number of indexes being created online",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ONLINE_CREATE_INDEX},
+
+ {"ddl_pending_alter_table", "ddl",
+ "Number of ALTER TABLE, CREATE INDEX, DROP INDEX in progress",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_PENDING_ALTER_TABLE},
+
+ /* ===== Counters for ICP (Index Condition Pushdown) Module ===== */
+ {"module_icp", "icp", "Index Condition Pushdown",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_MODULE_ICP},
+
+ {"icp_attempts", "icp",
+ "Number of attempts for index push-down condition checks",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_ATTEMPTS},
+
+ {"icp_no_match", "icp", "Index push-down condition does not match",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_NO_MATCH},
+
+ {"icp_out_of_range", "icp", "Index push-down condition out of range",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_OUT_OF_RANGE},
+
+ {"icp_match", "icp", "Index push-down condition matches",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_ICP_MATCH},
+
+ /* ========== To turn on/off reset all counters ========== */
+ {"all", "All Counters", "Turn on/off and reset all counters",
+ MONITOR_MODULE,
+ MONITOR_DEFAULT_START, MONITOR_ALL_COUNTER}
+};
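+
+/* Illustrative sketch only (an assumption to help read the table
+above; srv0mon.h holds the authoritative monitor_info_t definition):
+each initializer row appears to fill fields in this order. */
+struct monitor_info_sketch_t {
+	const char*	monitor_name;	/* e.g. "lock_deadlocks" */
+	const char*	monitor_module;	/* e.g. "lock" */
+	const char*	monitor_desc;	/* human-readable description */
+	monitor_type_t	monitor_type;	/* MONITOR_NONE, MONITOR_MODULE,
+					MONITOR_EXISTING | ..., etc. */
+	monitor_id_t	monitor_related_id;
+					/* owner counter for
+					MONITOR_SET_MEMBER rows, else
+					MONITOR_DEFAULT_START */
+	monitor_id_t	monitor_id;	/* index into innodb_counter_value */
+};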
+
+/* The "innodb_counter_value" array stores actual counter values */
+UNIV_INTERN monitor_value_t innodb_counter_value[NUM_MONITOR];
+
+/* monitor_set_tbl is used to record and determine whether a monitor
+has been turned on/off. */
+UNIV_INTERN ulint monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT
+ - 1) / NUM_BITS_ULINT];
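+
+/* Sketch (an assumption; the real MONITOR_ON()/MONITOR_IS_ON() macros
+are defined in srv0mon.h): the "+ NUM_BITS_ULINT - 1" above is a
+round-up division, so the table packs one on/off bit per counter,
+NUM_BITS_ULINT bits per ulint word. A bit test then factors as: */
+static inline ibool
+monitor_bit_is_set_sketch(ulint id)
+{
+	return((monitor_set_tbl[id / NUM_BITS_ULINT]
+		& ((ulint) 1 << (id % NUM_BITS_ULINT))) != 0);
+}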
+
+#ifndef HAVE_ATOMIC_BUILTINS_64
+/** Mutex protecting atomic operations on platforms that lack
+built-in operations for atomic memory access */
+ib_mutex_t monitor_mutex;
+
+/** Key to register monitor_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t monitor_mutex_key;
+
+/****************************************************************//**
+Initialize the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_create(void)
+/*================*/
+{
+ mutex_create(monitor_mutex_key, &monitor_mutex, SYNC_ANY_LATCH);
+}
+/****************************************************************//**
+Close the monitor subsystem. */
+UNIV_INTERN
+void
+srv_mon_free(void)
+/*==============*/
+{
+ mutex_free(&monitor_mutex);
+}
+#endif /* !HAVE_ATOMIC_BUILTINS_64 */
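+
+/* Usage sketch (an assumption based on the comment above, not code
+from this patch): without 64-bit atomic builtins, a 64-bit monitor
+update would be bracketed by the mutex, e.g.
+
+	mutex_enter(&monitor_mutex);
+	MONITOR_VALUE(monitor_id) += delta;
+	mutex_exit(&monitor_mutex);
+
+whereas with HAVE_ATOMIC_BUILTINS_64 the update can be a single atomic
+add, and the mutex together with srv_mon_create()/srv_mon_free() is
+compiled out, as the #ifndef above shows. */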
+
+/****************************************************************//**
+Get a monitor's "monitor_info" by its monitor id (an index into the
+innodb_counter_info array).
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+UNIV_INTERN
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+ monitor_id_t monitor_id) /*!< id indexing into the
+ innodb_counter_info array */
+{
+ ut_a(monitor_id < NUM_MONITOR);
+
+ return((monitor_id < NUM_MONITOR)
+ ? &innodb_counter_info[monitor_id]
+ : NULL);
+}
+
+/****************************************************************//**
+Get a monitor's name by its monitor id (an index into the
+innodb_counter_info array).
+@return the corresponding monitor name, or NULL if no such
+monitor */
+UNIV_INTERN
+const char*
+srv_mon_get_name(
+/*=============*/
+ monitor_id_t monitor_id) /*!< id index into the
+ innodb_counter_info array */
+{
+ ut_a(monitor_id < NUM_MONITOR);
+
+ return((monitor_id < NUM_MONITOR)
+ ? innodb_counter_info[monitor_id].monitor_name
+ : NULL);
+}
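+
+/* Usage sketch: both lookups above are plain array reads, e.g.
+
+	const char*	name = srv_mon_get_name(MONITOR_DEADLOCK);
+
+which, per the innodb_counter_info table above, yields
+"lock_deadlocks". */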
+
+/****************************************************************//**
+Turn on/off or reset the monitor counters in a module. If module_id
+is MONITOR_ALL_COUNTER, the operation is applied to all monitor
+counters. A counter that is already on is not turned on again;
+a note is printed instead. */
+UNIV_INTERN
+void
+srv_mon_set_module_control(
+/*=======================*/
+ monitor_id_t module_id, /*!< in: Module ID as in
+ monitor_counter_id. If it is
+ set to MONITOR_ALL_COUNTER, this means
+ we shall turn on all the counters */
+ mon_option_t set_option) /*!< in: Turn on/off reset the
+ counter */
+{
+ ulint ix;
+ ulint start_id;
+ ibool set_current_module = FALSE;
+
+ ut_a(module_id <= NUM_MONITOR);
+ ut_a(UT_ARR_SIZE(innodb_counter_info) == NUM_MONITOR);
+
+ /* The module_id must be an ID of MONITOR_MODULE type */
+ ut_a(innodb_counter_info[module_id].monitor_type & MONITOR_MODULE);
+
+ /* start with the first monitor in the module. If module_id
+ is MONITOR_ALL_COUNTER, this means we need to turn on all
+ monitor counters. */
+ if (module_id == MONITOR_ALL_COUNTER) {
+ start_id = 1;
+ } else if (innodb_counter_info[module_id].monitor_type
+ & MONITOR_GROUP_MODULE) {
+ /* Counters in this module are set as a group together
+ and cannot be turned on/off individually. Need to set
+ the on/off bit in the module counter */
+ start_id = module_id;
+ set_current_module = TRUE;
+
+ } else {
+ start_id = module_id + 1;
+ }
+
+ for (ix = start_id; ix < NUM_MONITOR; ix++) {
+		/* If we hit the next module counter, we continue
+		if we want to turn on all monitor counters, and
+		break if we only operate on the counters in the
+		current module. */
+ if (innodb_counter_info[ix].monitor_type & MONITOR_MODULE) {
+
+ if (set_current_module) {
+ /* Continue to set on/off bit on current
+ module */
+ set_current_module = FALSE;
+ } else if (module_id == MONITOR_ALL_COUNTER) {
+ continue;
+ } else {
+ /* Hitting the next module, stop */
+ break;
+ }
+ }
+
+		/* Cannot turn on a monitor that has already been
+		turned on. The user should be aware that some counters
+		are already on before turning them on again (which
+		could reset the counter value) */
+ if (MONITOR_IS_ON(ix) && (set_option == MONITOR_TURN_ON)) {
+ fprintf(stderr, "Monitor '%s' is already enabled.\n",
+ srv_mon_get_name((monitor_id_t) ix));
+ continue;
+ }
+
+		/* For some existing counters (server status variables),
+		we fetch their counter values at start/stop time so that
+		the actual value over the interval can be calculated. */
+ if (innodb_counter_info[ix].monitor_type & MONITOR_EXISTING) {
+ srv_mon_process_existing_counter(
+ static_cast<monitor_id_t>(ix), set_option);
+ }
+
+ /* Currently support 4 operations on the monitor counters:
+ turn on, turn off, reset and reset all operations. */
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ MONITOR_ON(ix);
+ MONITOR_INIT(ix);
+ MONITOR_SET_START(ix);
+ break;
+
+ case MONITOR_TURN_OFF:
+ MONITOR_OFF(ix);
+ MONITOR_SET_OFF(ix);
+ break;
+
+ case MONITOR_RESET_VALUE:
+ srv_mon_reset(static_cast<monitor_id_t>(ix));
+ break;
+
+ case MONITOR_RESET_ALL_VALUE:
+ srv_mon_reset_all(static_cast<monitor_id_t>(ix));
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+}
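+
+/* Usage sketch (illustrative; the ids are from srv0mon.h and the
+table above): turning on, and later resetting, every counter in the
+lock module could look like
+
+	srv_mon_set_module_control(MONITOR_MODULE_LOCK, MONITOR_TURN_ON);
+	...
+	srv_mon_set_module_control(MONITOR_MODULE_LOCK, MONITOR_RESET_VALUE);
+
+Passing MONITOR_ALL_COUNTER instead walks the whole array, skipping
+only the MONITOR_MODULE marker rows. */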
+
+/****************************************************************//**
+Get transaction system's rollback segment size in pages
+@return size in pages */
+static
+ulint
+srv_mon_get_rseg_size(void)
+/*=======================*/
+{
+ ulint i;
+ ulint value = 0;
+
+	/* rseg_array is a static array, so we can go through it without
+	mutex protection. In addition, we only provide an estimate of
+	the total rollback segment size, and to avoid mutex contention
+	we do not acquire rseg->mutex. */
+ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ const trx_rseg_t* rseg = trx_sys->rseg_array[i];
+
+ if (rseg != NULL) {
+ value += rseg->curr_size;
+ }
+ }
+
+ return(value);
+}
+
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not
+have a mechanism to start/stop and reset the counters, so we simulate
+these controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and doing the
+appropriate arithmetic to deduce the actual value. Please also refer to
+srv_export_innodb_status() for the related global counters used by
+the existing status variables. */
+UNIV_INTERN
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+ monitor_id_t monitor_id, /*!< in: the monitor's ID as in
+ monitor_counter_id */
+ mon_option_t set_option) /*!< in: Turn on/off reset the
+ counter */
+{
+ mon_type_t value;
+ monitor_info_t* monitor_info;
+ ibool update_min = FALSE;
+ buf_pool_stat_t stat;
+ buf_pools_list_size_t buf_pools_list_size;
+ ulint LRU_len;
+ ulint free_len;
+ ulint flush_list_len;
+
+ monitor_info = srv_mon_get_info(monitor_id);
+
+ ut_a(monitor_info->monitor_type & MONITOR_EXISTING);
+ ut_a(monitor_id < NUM_MONITOR);
+
+ /* Get the value from corresponding global variable */
+ switch (monitor_id) {
+ case MONITOR_OVLD_META_MEM_POOL:
+ value = srv_mem_pool_size;
+ break;
+
+	/* export_vars.innodb_buffer_pool_reads, the number of reads
+	directly from disk (page not found in the buffer pool) */
+ case MONITOR_OVLD_BUF_POOL_READS:
+ value = srv_stats.buf_pool_reads;
+ break;
+
+ /* innodb_buffer_pool_read_requests, the number of logical
+ read requests */
+ case MONITOR_OVLD_BUF_POOL_READ_REQUESTS:
+ buf_get_total_stat(&stat);
+ value = stat.n_page_gets;
+ break;
+
+	/* innodb_buffer_pool_write_requests, the number of
+	write requests */
+ case MONITOR_OVLD_BUF_POOL_WRITE_REQUEST:
+ value = srv_stats.buf_pool_write_requests;
+ break;
+
+ /* innodb_buffer_pool_wait_free */
+ case MONITOR_OVLD_BUF_POOL_WAIT_FREE:
+ value = srv_stats.buf_pool_wait_free;
+ break;
+
+ /* innodb_buffer_pool_read_ahead */
+ case MONITOR_OVLD_BUF_POOL_READ_AHEAD:
+ buf_get_total_stat(&stat);
+ value = stat.n_ra_pages_read;
+ break;
+
+ /* innodb_buffer_pool_read_ahead_evicted */
+ case MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED:
+ buf_get_total_stat(&stat);
+ value = stat.n_ra_pages_evicted;
+ break;
+
+ /* innodb_buffer_pool_pages_total */
+ case MONITOR_OVLD_BUF_POOL_PAGE_TOTAL:
+ value = buf_pool_get_n_pages();
+ break;
+
+ /* innodb_buffer_pool_pages_misc */
+ case MONITOR_OVLD_BUF_POOL_PAGE_MISC:
+ buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+ value = buf_pool_get_n_pages() - LRU_len - free_len;
+ break;
+
+ /* innodb_buffer_pool_pages_data */
+ case MONITOR_OVLD_BUF_POOL_PAGES_DATA:
+ buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+ value = LRU_len;
+ break;
+
+ /* innodb_buffer_pool_bytes_data */
+ case MONITOR_OVLD_BUF_POOL_BYTES_DATA:
+ buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+ value = buf_pools_list_size.LRU_bytes
+ + buf_pools_list_size.unzip_LRU_bytes;
+ break;
+
+ /* innodb_buffer_pool_pages_dirty */
+ case MONITOR_OVLD_BUF_POOL_PAGES_DIRTY:
+ buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+ value = flush_list_len;
+ break;
+
+ /* innodb_buffer_pool_bytes_dirty */
+ case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
+ buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+ value = buf_pools_list_size.flush_list_bytes;
+ break;
+
+ /* innodb_buffer_pool_pages_free */
+ case MONITOR_OVLD_BUF_POOL_PAGES_FREE:
+ buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+ value = free_len;
+ break;
+
+ /* innodb_pages_created, the number of pages created */
+ case MONITOR_OVLD_PAGE_CREATED:
+ buf_get_total_stat(&stat);
+ value = stat.n_pages_created;
+ break;
+
+	/* innodb_pages_written, the number of pages written */
+ case MONITOR_OVLD_PAGES_WRITTEN:
+ buf_get_total_stat(&stat);
+ value = stat.n_pages_written;
+ break;
+
+ /* innodb_pages_read */
+ case MONITOR_OVLD_PAGES_READ:
+ buf_get_total_stat(&stat);
+ value = stat.n_pages_read;
+ break;
+
+	/* innodb_data_reads, the total amount of data read in bytes */
+ case MONITOR_OVLD_BYTE_READ:
+ value = srv_stats.data_read;
+ break;
+
+	/* innodb_data_written, the total amount of data written in bytes. */
+ case MONITOR_OVLD_BYTE_WRITTEN:
+ value = srv_stats.data_written;
+ break;
+
+ /* innodb_data_reads, the total number of data reads. */
+ case MONITOR_OVLD_OS_FILE_READ:
+ value = os_n_file_reads;
+ break;
+
+ /* innodb_data_writes, the total number of data writes*/
+ case MONITOR_OVLD_OS_FILE_WRITE:
+ value = os_n_file_writes;
+ break;
+
+ /* innodb_data_fsyncs, number of fsync() operations so far. */
+ case MONITOR_OVLD_OS_FSYNC:
+ value = os_n_fsyncs;
+ break;
+
+ /* innodb_os_log_written */
+ case MONITOR_OVLD_OS_LOG_WRITTEN:
+ value = (mon_type_t) srv_stats.os_log_written;
+ break;
+
+ /* innodb_os_log_fsyncs */
+ case MONITOR_OVLD_OS_LOG_FSYNC:
+ value = fil_n_log_flushes;
+ break;
+
+ /* innodb_os_log_pending_fsyncs */
+ case MONITOR_OVLD_OS_LOG_PENDING_FSYNC:
+ value = fil_n_pending_log_flushes;
+ update_min = TRUE;
+ break;
+
+ /* innodb_os_log_pending_writes */
+ case MONITOR_OVLD_OS_LOG_PENDING_WRITES:
+ value = srv_stats.os_log_pending_writes;
+ update_min = TRUE;
+ break;
+
+ /* innodb_log_waits */
+ case MONITOR_OVLD_LOG_WAITS:
+ value = srv_stats.log_waits;
+ break;
+
+ /* innodb_log_write_requests */
+ case MONITOR_OVLD_LOG_WRITE_REQUEST:
+ value = srv_stats.log_write_requests;
+ break;
+
+ /* innodb_log_writes */
+ case MONITOR_OVLD_LOG_WRITES:
+ value = srv_stats.log_writes;
+ break;
+
+ /* innodb_dblwr_writes */
+ case MONITOR_OVLD_SRV_DBLWR_WRITES:
+ value = srv_stats.dblwr_writes;
+ break;
+
+ /* innodb_dblwr_pages_written */
+ case MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN:
+ value = srv_stats.dblwr_pages_written;
+ break;
+
+ /* innodb_page_size */
+ case MONITOR_OVLD_SRV_PAGE_SIZE:
+ value = UNIV_PAGE_SIZE;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS:
+ value = rw_lock_stats.rw_s_spin_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS:
+ value = rw_lock_stats.rw_x_spin_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS:
+ value = rw_lock_stats.rw_s_spin_round_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS:
+ value = rw_lock_stats.rw_x_spin_round_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_S_OS_WAITS:
+ value = rw_lock_stats.rw_s_os_wait_count;
+ break;
+
+ case MONITOR_OVLD_RWLOCK_X_OS_WAITS:
+ value = rw_lock_stats.rw_x_os_wait_count;
+ break;
+
+ case MONITOR_OVLD_BUFFER_POOL_SIZE:
+ value = srv_buf_pool_size;
+ break;
+
+ /* innodb_rows_read */
+ case MONITOR_OLVD_ROW_READ:
+ value = srv_stats.n_rows_read;
+ break;
+
+ /* innodb_rows_inserted */
+ case MONITOR_OLVD_ROW_INSERTED:
+ value = srv_stats.n_rows_inserted;
+ break;
+
+ /* innodb_rows_deleted */
+ case MONITOR_OLVD_ROW_DELETED:
+ value = srv_stats.n_rows_deleted;
+ break;
+
+ /* innodb_rows_updated */
+ case MONITOR_OLVD_ROW_UPDTATED:
+ value = srv_stats.n_rows_updated;
+ break;
+
+ /* innodb_row_lock_current_waits */
+ case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT:
+ value = srv_stats.n_lock_wait_current_count;
+ break;
+
+ /* innodb_row_lock_time */
+ case MONITOR_OVLD_LOCK_WAIT_TIME:
+ value = srv_stats.n_lock_wait_time / 1000;
+ break;
+
+ /* innodb_row_lock_time_max */
+ case MONITOR_OVLD_LOCK_MAX_WAIT_TIME:
+ value = lock_sys->n_lock_max_wait_time / 1000;
+ break;
+
+ /* innodb_row_lock_time_avg */
+ case MONITOR_OVLD_LOCK_AVG_WAIT_TIME:
+ if (srv_stats.n_lock_wait_count > 0) {
+ value = srv_stats.n_lock_wait_time / 1000
+ / srv_stats.n_lock_wait_count;
+ } else {
+ value = 0;
+ }
+ break;
+
+ /* innodb_row_lock_waits */
+ case MONITOR_OVLD_ROW_LOCK_WAIT:
+ value = srv_stats.n_lock_wait_count;
+ break;
+
+ case MONITOR_RSEG_HISTORY_LEN:
+ value = trx_sys->rseg_history_len;
+ break;
+
+ case MONITOR_RSEG_CUR_SIZE:
+ value = srv_mon_get_rseg_size();
+ break;
+
+ case MONITOR_OVLD_N_FILE_OPENED:
+ value = fil_n_file_opened;
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_INSERT:
+ value = ibuf->n_merged_ops[IBUF_OP_INSERT];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DELETE:
+ value = ibuf->n_merged_ops[IBUF_OP_DELETE_MARK];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_PURGE:
+ value = ibuf->n_merged_ops[IBUF_OP_DELETE];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT:
+ value = ibuf->n_discarded_ops[IBUF_OP_INSERT];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE:
+ value = ibuf->n_discarded_ops[IBUF_OP_DELETE_MARK];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE:
+ value = ibuf->n_discarded_ops[IBUF_OP_DELETE];
+ break;
+
+ case MONITOR_OVLD_IBUF_MERGES:
+ value = ibuf->n_merges;
+ break;
+
+ case MONITOR_OVLD_IBUF_SIZE:
+ value = ibuf->size;
+ break;
+
+ case MONITOR_OVLD_SERVER_ACTIVITY:
+ value = srv_get_activity_count();
+ break;
+
+ case MONITOR_OVLD_LSN_FLUSHDISK:
+ value = (mon_type_t) log_sys->flushed_to_disk_lsn;
+ break;
+
+ case MONITOR_OVLD_LSN_CURRENT:
+ value = (mon_type_t) log_sys->lsn;
+ break;
+
+ case MONITOR_OVLD_BUF_OLDEST_LSN:
+ value = (mon_type_t) buf_pool_get_oldest_modification();
+ break;
+
+ case MONITOR_OVLD_LSN_CHECKPOINT:
+ value = (mon_type_t) log_sys->last_checkpoint_lsn;
+ break;
+
+ case MONITOR_OVLD_MAX_AGE_ASYNC:
+ value = log_sys->max_modified_age_async;
+ break;
+
+ case MONITOR_OVLD_MAX_AGE_SYNC:
+ value = log_sys->max_modified_age_sync;
+ break;
+
+ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
+ value = btr_cur_n_sea;
+ break;
+
+ case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE:
+ value = btr_cur_n_non_sea;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ switch (set_option) {
+ case MONITOR_TURN_ON:
+ /* Save the initial counter value in mon_start_value
+ field */
+ MONITOR_SAVE_START(monitor_id, value);
+ return;
+
+ case MONITOR_TURN_OFF:
+		/* Save the counter value to mon_last_value when we
+		turn off the monitor but have not yet reset it. Note
+		that the counter has not yet been set to off in the
+		bitmap table for a normal turn-off. We need to check
+		the counter status (on/off) to avoid resetting the
+		value of an already-off counter */
+ if (MONITOR_IS_ON(monitor_id)) {
+ srv_mon_process_existing_counter(monitor_id,
+ MONITOR_GET_VALUE);
+ MONITOR_SAVE_LAST(monitor_id);
+ }
+ return;
+
+ case MONITOR_GET_VALUE:
+ if (MONITOR_IS_ON(monitor_id)) {
+
+			/* If the MONITOR_DISPLAY_CURRENT bit is on, we
+			only record the current value, rather than the
+			incremental value over a period. Most counters
+			of this type are resource-related counters,
+			such as the number of buffer pages etc. */
+ if (monitor_info->monitor_type
+ & MONITOR_DISPLAY_CURRENT) {
+ MONITOR_SET(monitor_id, value);
+ } else {
+				/* Most status counters are monotonically
+				increasing, so there is no need to update
+				their minimum values. Only do so if
+				"update_min" is set to TRUE */
+ MONITOR_SET_DIFF(monitor_id, value);
+
+ if (update_min
+ && (MONITOR_VALUE(monitor_id)
+ < MONITOR_MIN_VALUE(monitor_id))) {
+ MONITOR_MIN_VALUE(monitor_id) =
+ MONITOR_VALUE(monitor_id);
+ }
+ }
+ }
+ return;
+
+ case MONITOR_RESET_VALUE:
+ if (!MONITOR_IS_ON(monitor_id)) {
+ MONITOR_LAST_VALUE(monitor_id) = 0;
+ }
+ return;
+
+	/* Nothing special to do for the reset-all operation on these
+	existing counters */
+ case MONITOR_RESET_ALL_VALUE:
+ return;
+ }
+}
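+
+/* The start/stop simulation above reduces to simple arithmetic. As a
+sketch (the macro internals live in srv0mon.h; the names below are
+only illustrative, and any reset baseline from srv_mon_reset() is
+subtracted as well):
+
+	TURN_ON:   start = raw			(MONITOR_SAVE_START)
+	GET_VALUE: shown = raw - start + last	(MONITOR_SET_DIFF)
+	TURN_OFF:  last  = shown		(MONITOR_SAVE_LAST)
+
+so a status variable that can never itself be stopped still reports
+only the delta accumulated while its monitor was switched on. */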
+
+/*************************************************************//**
+Reset a monitor, creating a new baseline with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+UNIV_INTERN
+void
+srv_mon_reset(
+/*==========*/
+ monitor_id_t monitor) /*!< in: monitor id */
+{
+ ibool monitor_was_on;
+
+ monitor_was_on = MONITOR_IS_ON(monitor);
+
+ if (monitor_was_on) {
+ /* Temporarily turn off the counter for the resetting
+ operation */
+ MONITOR_OFF(monitor);
+ }
+
+ /* Before resetting the current monitor value, first
+ calculate and set the max/min value since monitor
+ start */
+ srv_mon_calc_max_since_start(monitor);
+ srv_mon_calc_min_since_start(monitor);
+
+	/* Monitors with the MONITOR_DISPLAY_CURRENT bit
+	are not incremental, so there is no need to remember
+	the reset value. */
+ if (innodb_counter_info[monitor].monitor_type
+ & MONITOR_DISPLAY_CURRENT) {
+ MONITOR_VALUE_RESET(monitor) = 0;
+ } else {
+ /* Remember the new baseline */
+ MONITOR_VALUE_RESET(monitor) = MONITOR_VALUE_RESET(monitor)
+ + MONITOR_VALUE(monitor);
+ }
+
+ /* Reset the counter value */
+ MONITOR_VALUE(monitor) = 0;
+ MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;
+ MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;
+
+ MONITOR_FIELD((monitor), mon_reset_time) = time(NULL);
+
+ if (monitor_was_on) {
+ MONITOR_ON(monitor);
+ }
+}
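+
+/* A worked example of the baseline arithmetic above (illustrative
+values only, not from the source): suppose an incremental counter has
+counted 100 events, is then reset, and counts 40 more events:
+
+	before reset:	MONITOR_VALUE = 100, MONITOR_VALUE_RESET = 0
+	at reset:	MONITOR_VALUE_RESET = 0 + 100, MONITOR_VALUE = 0
+	afterwards:	MONITOR_VALUE = 40
+
+The value since monitor start can thus be recovered as
+MONITOR_VALUE_RESET + MONITOR_VALUE = 140, while MONITOR_VALUE alone
+reports only the 40 events seen since the reset. */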
+
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+UNIV_INTERN
+void
+srv_mon_default_on(void)
+/*====================*/
+{
+ ulint ix;
+
+ for (ix = 0; ix < NUM_MONITOR; ix++) {
+ if (innodb_counter_info[ix].monitor_type
+ & MONITOR_DEFAULT_ON) {
+ /* Turn on monitor counters that are default on */
+ MONITOR_ON(ix);
+ MONITOR_INIT(ix);
+ MONITOR_SET_START(ix);
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
new file mode 100644
index 00000000000..f4ea8895d2f
--- /dev/null
+++ b/storage/innobase/srv/srv0srv.cc
@@ -0,0 +1,2880 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.cc
+The database server main program
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "que0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "dict0stats_bg.h" /* dict_stats_event */
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#include "srv0mon.h"
+#include "ut0crc32.h"
+
+#include "mysql/plugin.h"
+#include "mysql/service_thd_wait.h"
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
+
+/* How long data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+UNIV_INTERN ulint srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool srv_monitor_active = FALSE;
+UNIV_INTERN ibool srv_error_monitor_active = FALSE;
+
+UNIV_INTERN ibool srv_buf_dump_thread_active = FALSE;
+
+UNIV_INTERN ibool srv_dict_stats_thread_active = FALSE;
+
+UNIV_INTERN const char* srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+const char srv_mysql50_table_name_prefix[10] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char* srv_data_home = NULL;
+
+/** Rollback files directory, can be absolute. */
+UNIV_INTERN char* srv_undo_dir = NULL;
+
+/** The number of tablespaces to use for rollback segments. */
+UNIV_INTERN ulong srv_undo_tablespaces = 8;
+
+/** The number of UNDO tablespaces that are open and ready to use. */
+UNIV_INTERN ulint srv_undo_tablespaces_open = 8;
+
+/* The number of rollback segments to use */
+UNIV_INTERN ulong srv_undo_logs = 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char* srv_arch_dir = NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+UNIV_INTERN my_bool srv_read_only_mode;
+/** store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool srv_file_per_table;
+/** The file format to use on new *.ibd files. */
+UNIV_INTERN ulint srv_file_format = 0;
+/** Whether to check the file format during startup. A value of
+UNIV_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint srv_max_file_format_at_startup = UNIV_FORMAT_MAX;
+
+#if UNIV_FORMAT_A
+# error "UNIV_FORMAT_A must be 0!"
+#endif
+
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
+/** Sort buffer size in index creation */
+UNIV_INTERN ulong srv_sort_buf_size = 1048576;
+/** Maximum modification log file size for online index creation */
+UNIV_INTERN unsigned long long srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use the simulated aio that we build below with threads.
+Currently we support native aio on Windows and Linux. */
+UNIV_INTERN my_bool srv_use_native_aio = TRUE;
+
+#ifdef __WIN__
+/* Windows native condition variables. We use runtime loading / function
+pointers, because they are not available on Windows Server 2003 and
+Windows XP/2000.
+
+We use condition variables for events on Windows if possible, even
+though os_event resembles the Windows kernel event object well
+API-wise. The reason is performance: kernel objects are heavyweight,
+and WaitForSingleObject() is a performance killer causing the calling
+thread to context switch. Besides, InnoDB preallocates a large number
+(often millions) of os_events. With kernel event objects this takes a
+big chunk out of the non-paged pool, which is better suited for tasks
+like IO than for storing idle event objects. */
+UNIV_INTERN ibool srv_use_native_conditions = FALSE;
+#endif /* __WIN__ */
+
+UNIV_INTERN ulint srv_n_data_files = 0;
+UNIV_INTERN char** srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint* srv_data_file_sizes = NULL;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool srv_auto_extend_last_data_file = FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint srv_last_file_size_max = 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong srv_auto_extend_increment = 8;
+UNIV_INTERN ulint* srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword in my.cnf */
+
+UNIV_INTERN ibool srv_created_new_raw = FALSE;
+
+UNIV_INTERN char* srv_log_group_home_dir = NULL;
+
+UNIV_INTERN ulong srv_n_log_files = SRV_N_LOG_FILES_MAX;
+/* size in database pages */
+UNIV_INTERN ib_uint64_t srv_log_file_size = IB_UINT64_MAX;
+UNIV_INTERN ib_uint64_t srv_log_file_size_requested;
+/* size in database pages */
+UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
+UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
+UNIV_INTERN uint srv_flush_log_at_timeout = 1;
+UNIV_INTERN ulong srv_page_size = UNIV_PAGE_SIZE_DEF;
+UNIV_INTERN ulong srv_page_size_shift = UNIV_PAGE_SIZE_SHIFT_DEF;
+
+/* Try to flush dirty pages so as to avoid IO bursts at
+the checkpoints. */
+UNIV_INTERN char srv_adaptive_flushing = TRUE;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT 20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
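+
+/* Illustrative usage sketch (a reader's summary; it mirrors the
+pattern in srv_monitor_thread() later in this file): the caller keeps
+a local mutex_skipped count and passes MUTEX_NOWAIT(mutex_skipped) as
+the "nowait" argument, so the first MAX_MUTEX_NOWAIT attempts are
+nonblocking and only then does the call block on the mutex:
+
+	ulint	mutex_skipped = 0;
+
+	if (!srv_printf_innodb_monitor(stderr,
+				       MUTEX_NOWAIT(mutex_skipped),
+				       NULL, NULL)) {
+		mutex_skipped++;	(mutex was busy, output truncated)
+	} else {
+		mutex_skipped = 0;	(got the mutex, reset the count)
+	}
+*/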
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte* srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool srv_use_sys_malloc = TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX;
+/* requested number of buffer pool instances */
+UNIV_INTERN ulint srv_buf_pool_instances = 1;
+/* number of locks to protect buf_pool->page_hash */
+UNIV_INTERN ulong srv_n_page_hash_locks = 16;
+/** Scan depth for LRU flush batch, i.e. the number of blocks scanned */
+UNIV_INTERN ulong srv_LRU_scan_depth = 1024;
+/** whether or not to flush neighbors of a block */
+UNIV_INTERN ulong srv_flush_neighbors = 1;
+/* previously requested size */
+UNIV_INTERN ulint srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
+/* size in bytes */
+UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
+UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
+
+/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
+instead. */
+UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
+UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX;
+UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX;
+
+/* Switch to enable random read ahead. */
+UNIV_INTERN my_bool srv_random_read_ahead = FALSE;
+/* User settable value of the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+UNIV_INTERN ulong srv_read_ahead_threshold = 56;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool srv_log_archive_on = FALSE;
+UNIV_INTERN ibool srv_archive_recovery = 0;
+UNIV_INTERN ib_uint64_t srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load); on a busy system
+the parameter is scaled down by a factor of 4. This is to avoid putting
+a heavier load on the I/O subsystem. */
+
+UNIV_INTERN ulong srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char* srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint srv_max_n_open_files = 300;
+
+/* Number of IO operations per second the server can do */
+UNIV_INTERN ulong srv_io_capacity = 200;
+UNIV_INTERN ulong srv_max_io_capacity = 400;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
+UNIV_INTERN ulong srv_max_dirty_pages_pct_lwm = 50;
+
+/* This is the percentage of log capacity at which adaptive flushing,
+if enabled, will kick in. */
+UNIV_INTERN ulong srv_adaptive_flushing_lwm = 10;
+
+/* Number of iterations over which adaptive flushing is averaged. */
+UNIV_INTERN ulong srv_flushing_avg_loops = 30;
+
+/* The number of purge threads to use.*/
+UNIV_INTERN ulong srv_n_purge_threads = 1;
+
+/* the number of pages to purge in one batch */
+UNIV_INTERN ulong srv_purge_batch_size = 20;
+
+/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
+NULL values when collecting statistics. By default, it is set to
+SRV_STATS_NULLS_EQUAL(0), i.e. all NULL values are treated as equal. */
+UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;
+
+UNIV_INTERN srv_stats_t srv_stats;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_var_t export_vars;
+
+/** Normally 0. When nonzero, skip some phases of crash recovery,
+starting from SRV_FORCE_IGNORE_CORRUPT, so that data can be recovered
+by SELECT or mysqldump. When this is nonzero, we do not allow any user
+modifications to the data. */
+UNIV_INTERN ulong srv_force_recovery;
+#ifndef DBUG_OFF
+/** Inject a crash at different steps of the recovery process.
+This is for testing and debugging only. */
+UNIV_INTERN ulong srv_force_recovery_crash;
+#endif /* !DBUG_OFF */
+
+/** Print all user-level transactions deadlocks to mysqld stderr */
+
+UNIV_INTERN my_bool srv_print_all_deadlocks = FALSE;
+
+/** Enable INFORMATION_SCHEMA.innodb_cmp_per_index */
+UNIV_INTERN my_bool srv_cmp_per_index_enabled = FALSE;
+
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, we do not even
+flush the buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint srv_fast_shutdown = 0;
+
+/* Generate an innodb_status.<pid> file */
+UNIV_INTERN ibool srv_innodb_status = FALSE;
+
+/* When estimating the number of different key values in an index, sample
+this many index pages. There are two ways to calculate statistics:
+* persistent stats that are calculated by ANALYZE TABLE and saved
+  in the innodb database.
+* quick transient stats, that are used if persistent stats for the given
+  table/index are not found in the innodb database */
+UNIV_INTERN unsigned long long srv_stats_transient_sample_pages = 8;
+UNIV_INTERN my_bool srv_stats_persistent = TRUE;
+UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20;
+UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE;
+
+UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE;
+
+/** The doublewrite buffer can hold 128 16K pages (two 64-page blocks).
+The following parameter is the number of those page slots that are used
+for batch flushing, i.e. LRU flushing and flush_list flushing. The rest
+of the pages are used for single page flushing. */
+UNIV_INTERN ulong srv_doublewrite_batch_size = 120;
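+
+/* Worked arithmetic for the above (illustrative): of the 128 page
+slots in the doublewrite buffer, srv_doublewrite_batch_size = 120 are
+reserved for batch flushing, so 128 - 120 = 8 slots remain for single
+page flushes. */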
+
+UNIV_INTERN ulong srv_replication_delay = 0;
+
+/*-------------------------------------------*/
+UNIV_INTERN ulong srv_n_spin_wait_rounds = 30;
+UNIV_INTERN ulong srv_spin_wait_delay = 6;
+UNIV_INTERN ibool srv_priority_boost = TRUE;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool srv_print_thread_releases = FALSE;
+UNIV_INTERN ibool srv_print_lock_waits = FALSE;
+UNIV_INTERN ibool srv_print_buf_io = FALSE;
+UNIV_INTERN ibool srv_print_log_io = FALSE;
+UNIV_INTERN ibool srv_print_latch_waits = FALSE;
+#endif /* UNIV_DEBUG */
+
+static ulint srv_n_rows_inserted_old = 0;
+static ulint srv_n_rows_updated_old = 0;
+static ulint srv_n_rows_deleted_old = 0;
+static ulint srv_n_rows_read_old = 0;
+
+UNIV_INTERN ulint srv_truncated_status_writes = 0;
+UNIV_INTERN ulint srv_available_undo_logs = 0;
+
+/* Set the following to 0 if you want InnoDB to write messages on
+stderr on startup/shutdown. */
+UNIV_INTERN ibool srv_print_verbose_log = TRUE;
+UNIV_INTERN my_bool srv_print_innodb_monitor = FALSE;
+UNIV_INTERN my_bool srv_print_innodb_lock_monitor = FALSE;
+UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE;
+UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+UNIV_INTERN time_t srv_last_monitor_time;
+
+UNIV_INTERN ib_mutex_t srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
+UNIV_INTERN ib_mutex_t srv_monitor_file_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+# ifndef HAVE_ATOMIC_BUILTINS
+/* Key to register server_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t server_mutex_key;
+# endif /* !HAVE_ATOMIC_BUILTINS */
+/** Key to register srv_innodb_monitor_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+/** Key to register srv_monitor_file_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_monitor_file_mutex_key;
+/** Key to register srv_dict_tmpfile_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_dict_tmpfile_mutex_key;
+/** Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+/** Key to register srv_sys_t::mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_sys_mutex_key;
+/** Key to register srv_sys_t::tasks_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t srv_sys_tasks_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/** Temporary file for innodb monitor output */
+UNIV_INTERN FILE* srv_monitor_file;
+/** Mutex for locking srv_dict_tmpfile. Not created if srv_read_only_mode.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+UNIV_INTERN ib_mutex_t srv_dict_tmpfile_mutex;
+/** Temporary file for output from the data dictionary */
+UNIV_INTERN FILE* srv_dict_tmpfile;
+/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+UNIV_INTERN ib_mutex_t srv_misc_tmpfile_mutex;
+/** Temporary file for miscellaneous diagnostic output */
+UNIV_INTERN FILE* srv_misc_tmpfile;
+
+UNIV_INTERN ulint srv_main_thread_process_no = 0;
+UNIV_INTERN ulint srv_main_thread_id = 0;
+
+/* The following counts are used by the srv_master_thread. */
+
+/** Iterations of the loop bounded by 'srv_active' label. */
+static ulint srv_main_active_loops = 0;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+static ulint srv_main_idle_loops = 0;
+/** Iterations of the loop bounded by the 'srv_shutdown' label. */
+static ulint srv_main_shutdown_loops = 0;
+/** Log writes involving flush. */
+static ulint srv_log_writes_and_flush = 0;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of the log files happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t srv_last_log_flush_time;
+
+/* Interval in seconds at which various tasks are performed by the
+master thread when the server is active. In order to balance the
+workload, we should try to keep the intervals such that they are not
+multiples of each other. For example, if we have intervals for various
+tasks defined as 5, 10, 15, 60 then all tasks will be performed when
+current_time % 60 == 0 and no tasks will be performed when
+current_time % 5 != 0. */
+
+# define SRV_MASTER_CHECKPOINT_INTERVAL (7)
+# define SRV_MASTER_PURGE_INTERVAL (10)
+#ifdef MEM_PERIODIC_CHECK
+# define SRV_MASTER_MEM_VALIDATE_INTERVAL (13)
+#endif /* MEM_PERIODIC_CHECK */
+# define SRV_MASTER_DICT_LRU_INTERVAL (47)
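+
+/* A worked example of the scheduling note above (illustrative): tasks
+whose intervals are pairwise coprime coincide only every lcm(a, b)
+seconds, e.g. lcm(7, 10) = 70 and lcm(7, 47) = 329, so the checkpoint,
+purge and dict LRU work rarely lands on the same master thread
+iteration. With intervals 5, 10, 15, 60 instead, all tasks would fire
+together whenever current_time % 60 == 0. */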
+
+/** Acquire the system_mutex. */
+#define srv_sys_mutex_enter() do { \
+ mutex_enter(&srv_sys->mutex); \
+} while (0)
+
+/** Test if the system mutex is owned. */
+#define srv_sys_mutex_own() (mutex_own(&srv_sys->mutex) \
+ && !srv_read_only_mode)
+
+/** Release the system mutex. */
+#define srv_sys_mutex_exit() do { \
+ mutex_exit(&srv_sys->mutex); \
+} while (0)
+
+#define fetch_lock_wait_timeout(trx) \
+ ((trx)->lock.allowed_to_wait \
+ ? thd_lock_wait_timeout((trx)->mysql_thd) \
+ : 0)
+
+/*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+ =========================================
+
+There is the following analogy between this database
+server and an operating system kernel:
+
+DB concept equivalent OS concept
+---------- ---------------------
+transaction -- process;
+
+query thread -- thread;
+
+lock -- semaphore;
+
+kernel -- kernel;
+
+query thread execution:
+(a) without lock mutex
+reserved -- process executing in user mode;
+(b) with lock mutex reserved
+ -- process executing in kernel mode;
+
+The server has several background threads all running at the same
+priority as user threads. It periodically checks if there is anything
+happening in the server which requires intervention of the master
+thread. Such situations may be, for example, when flushing of dirty
+blocks is needed in the buffer pool or old versions of database rows
+have to be cleaned away (purged). The user can configure separate
+dedicated purge thread(s) too, in which case the master thread does not
+do any purging.
+
+The threads which we call user threads serve the queries of the MySQL
+server. They run at normal priority.
+
+When there is no activity in the system, the master thread also
+suspends itself to wait for an event, making the server totally silent.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to keep record
+of which thread owns which resource and in the above case boost the
+priority of the background thread so that it will be scheduled and it
+can release the resource. This solution is called priority inheritance
+in real-time programming. A drawback of this solution is that the overhead
+of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100
+MHz Pentium, because the thread has to call os_thread_get_curr_id. This may
+be compared to the 0.5 microsecond overhead of a mutex lock-unlock pair. Note
+that the thread cannot store the information in the resource, say a mutex,
+itself, because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards, the
+information is outdated for the time of at least one machine instruction.
+(To be precise, the information could be stored in the lock_word of the
+mutex if the machine supports atomic swap.)
+
+The above solution with priority inheritance may become relevant in the
+future; currently we do not implement any priority twiddling solution.
+Our general aim is to reduce the contention of all mutexes by making
+them more fine grained.
+
+The thread table contains information on the current status of each
+thread existing in the system, and also the event semaphores used in
+suspending the master thread and utility threads when they have nothing
+to do. The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation. */
+
+/** The server system struct */
+struct srv_sys_t{
+ ib_mutex_t tasks_mutex; /*!< variable protecting the
+ tasks queue */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ tasks; /*!< task queue */
+
+ ib_mutex_t mutex; /*!< variable protecting the
+ fields below. */
+ ulint n_sys_threads; /*!< size of the sys_threads
+ array */
+
+ srv_slot_t* sys_threads; /*!< server thread table */
+
+ ulint n_threads_active[SRV_MASTER + 1];
+ /*!< number of threads active
+ in a thread class */
+
+ srv_stats_t::ulint_ctr_1_t
+ activity_count; /*!< For tracking server
+ activity */
+};
+
+#ifndef HAVE_ATOMIC_BUILTINS
+/** Mutex protecting some server global variables. */
+UNIV_INTERN ib_mutex_t server_mutex;
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+static srv_sys_t* srv_sys = NULL;
+
+/** Event to signal the monitor thread. */
+UNIV_INTERN os_event_t srv_monitor_event;
+
+/** Event to signal the error thread */
+UNIV_INTERN os_event_t srv_error_event;
+
+/** Event to signal the buffer pool dump/load thread */
+UNIV_INTERN os_event_t srv_buf_dump_event;
+
+/** The buffer pool dump/load file name */
+UNIV_INTERN char* srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+UNIV_INTERN char srv_buffer_pool_dump_at_shutdown = FALSE;
+UNIV_INTERN char srv_buffer_pool_load_at_startup = FALSE;
+
+/** Slot index in the srv_sys->sys_threads array for the purge thread. */
+static const ulint SRV_PURGE_SLOT = 1;
+
+/** Slot index in the srv_sys->sys_threads array for the master thread. */
+static const ulint SRV_MASTER_SLOT = 0;
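+
+/* The sys_threads array layout implied by the two constants above and
+by srv_reserve_slot() below (a reader's summary):
+
+	index 0			master thread slot
+	index 1			purge coordinator slot
+	index 2 .. n - 1	purge worker slots */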
+
+/*********************************************************************//**
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+ FILE *file) /* in: output stream */
+{
+ fprintf(file, "srv_master_thread loops: %lu srv_active, "
+ "%lu srv_shutdown, %lu srv_idle\n",
+ srv_main_active_loops,
+ srv_main_shutdown_loops,
+ srv_main_idle_loops);
+ fprintf(file, "srv_master_thread log flush and writes: %lu\n",
+ srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Sets the info describing an i/o thread's current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /*!< in: the 'segment' of the i/o thread */
+ const char* str) /*!< in: constant char string describing the
+ state */
+{
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+
+ srv_io_thread_op_info[i] = str;
+}
+
+/*********************************************************************//**
+Resets the info describing an i/o thread's current state. */
+UNIV_INTERN
+void
+srv_reset_io_thread_op_info()
+/*=========================*/
+{
+ for (ulint i = 0; i < UT_ARR_SIZE(srv_io_thread_op_info); ++i) {
+ srv_io_thread_op_info[i] = "not started yet";
+ }
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates the type of a thread table slot.
+@return TRUE if ok */
+static
+ibool
+srv_thread_type_validate(
+/*=====================*/
+ srv_thread_type type) /*!< in: thread type */
+{
+ switch (type) {
+ case SRV_NONE:
+ break;
+ case SRV_WORKER:
+ case SRV_PURGE:
+ case SRV_MASTER:
+ return(TRUE);
+ }
+ ut_error;
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the type of a thread table slot.
+@return thread type */
+static
+srv_thread_type
+srv_slot_get_type(
+/*==============*/
+ const srv_slot_t* slot) /*!< in: thread slot */
+{
+ srv_thread_type type = slot->type;
+ ut_ad(srv_thread_type_validate(type));
+ return(type);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current thread.
+@return reserved slot */
+static
+srv_slot_t*
+srv_reserve_slot(
+/*=============*/
+ srv_thread_type type) /*!< in: type of the thread */
+{
+ srv_slot_t* slot = 0;
+
+ srv_sys_mutex_enter();
+
+ ut_ad(srv_thread_type_validate(type));
+
+ switch (type) {
+ case SRV_MASTER:
+ slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+ break;
+
+ case SRV_PURGE:
+ slot = &srv_sys->sys_threads[SRV_PURGE_SLOT];
+ break;
+
+ case SRV_WORKER:
+ /* Find an empty slot, skip the master and purge slots. */
+ for (slot = &srv_sys->sys_threads[2];
+ slot->in_use;
+ ++slot) {
+
+ ut_a(slot < &srv_sys->sys_threads[
+ srv_sys->n_sys_threads]);
+ }
+ break;
+
+ case SRV_NONE:
+ ut_error;
+ }
+
+ ut_a(!slot->in_use);
+
+ slot->in_use = TRUE;
+ slot->suspended = FALSE;
+ slot->type = type;
+
+ ut_ad(srv_slot_get_type(slot) == type);
+
+ ++srv_sys->n_threads_active[type];
+
+ srv_sys_mutex_exit();
+
+ return(slot);
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread_low(
+/*===================*/
+ srv_slot_t* slot) /*!< in/out: thread slot */
+{
+
+ ut_ad(!srv_read_only_mode);
+ ut_ad(srv_sys_mutex_own());
+
+ ut_ad(slot->in_use);
+
+ srv_thread_type type = srv_slot_get_type(slot);
+
+ switch (type) {
+ case SRV_NONE:
+ ut_error;
+
+ case SRV_MASTER:
+ /* We have only one master thread and it
+ should be the first entry always. */
+ ut_a(srv_sys->n_threads_active[type] == 1);
+ break;
+
+ case SRV_PURGE:
+ /* We have only one purge coordinator thread
+ and it should be the second entry always. */
+ ut_a(srv_sys->n_threads_active[type] == 1);
+ break;
+
+ case SRV_WORKER:
+ ut_a(srv_n_purge_threads > 1);
+ ut_a(srv_sys->n_threads_active[type] > 0);
+ break;
+ }
+
+ ut_a(!slot->suspended);
+ slot->suspended = TRUE;
+
+ ut_a(srv_sys->n_threads_active[type] > 0);
+
+ srv_sys->n_threads_active[type]--;
+
+ return(os_event_reset(slot->event));
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+@return the current signal count of the event. */
+static
+ib_int64_t
+srv_suspend_thread(
+/*===============*/
+ srv_slot_t* slot) /*!< in/out: thread slot */
+{
+ srv_sys_mutex_enter();
+
+ ib_int64_t sig_count = srv_suspend_thread_low(slot);
+
+ srv_sys_mutex_exit();
+
+ return(sig_count);
+}
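+
+/* Illustrative wait pattern (assumed usage; the actual worker loops
+live elsewhere in this file): the signal count returned above enables
+a lost-wakeup-free wait. The event is reset while the slot is marked
+suspended, and the saved count is handed to the wait call, so an
+os_event_set() that arrives in between is not missed:
+
+	ib_int64_t	sig_count = srv_suspend_thread(slot);
+
+	(re-check here for work that arrived before the reset)
+
+	os_event_wait_low(slot->event, sig_count);
+*/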
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+ enough threads were suspended at the moment. */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+ srv_thread_type type, /*!< in: thread type */
+ ulint n) /*!< in: number of threads to release */
+{
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(srv_thread_type_validate(type));
+ ut_ad(n > 0);
+
+ srv_sys_mutex_enter();
+
+ for (i = 0; i < srv_sys->n_sys_threads; i++) {
+ srv_slot_t* slot;
+
+ slot = &srv_sys->sys_threads[i];
+
+ if (slot->in_use
+ && srv_slot_get_type(slot) == type
+ && slot->suspended) {
+
+ switch (type) {
+ case SRV_NONE:
+ ut_error;
+
+ case SRV_MASTER:
+ /* We have only one master thread and it
+ should be the first entry always. */
+ ut_a(n == 1);
+ ut_a(i == SRV_MASTER_SLOT);
+ ut_a(srv_sys->n_threads_active[type] == 0);
+ break;
+
+ case SRV_PURGE:
+ /* We have only one purge coordinator thread
+ and it should be the second entry always. */
+ ut_a(n == 1);
+ ut_a(i == SRV_PURGE_SLOT);
+ ut_a(srv_n_purge_threads > 0);
+ ut_a(srv_sys->n_threads_active[type] == 0);
+ break;
+
+ case SRV_WORKER:
+ ut_a(srv_n_purge_threads > 1);
+ ut_a(srv_sys->n_threads_active[type]
+ < srv_n_purge_threads - 1);
+ break;
+ }
+
+ slot->suspended = FALSE;
+
+ ++srv_sys->n_threads_active[type];
+
+ os_event_set(slot->event);
+
+ if (++count == n) {
+ break;
+ }
+ }
+ }
+
+ srv_sys_mutex_exit();
+
+ return(count);
+}
+
+/*********************************************************************//**
+Release a thread's slot. */
+static
+void
+srv_free_slot(
+/*==========*/
+ srv_slot_t* slot) /*!< in/out: thread slot */
+{
+ srv_sys_mutex_enter();
+
+ if (!slot->suspended) {
+ /* Mark the thread as inactive. */
+ srv_suspend_thread_low(slot);
+ }
+
+ /* Free the slot for reuse. */
+ ut_ad(slot->in_use);
+ slot->in_use = FALSE;
+
+ srv_sys_mutex_exit();
+}
+
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void)
+/*==========*/
+{
+ ulint n_sys_threads = 0;
+ ulint srv_sys_sz = sizeof(*srv_sys);
+
+#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_create(server_mutex_key, &server_mutex, SYNC_ANY_LATCH);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+ mutex_create(srv_innodb_monitor_mutex_key,
+ &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+
+ if (!srv_read_only_mode) {
+
+ /* Number of purge threads + master thread */
+ n_sys_threads = srv_n_purge_threads + 1;
+
+ srv_sys_sz += n_sys_threads * sizeof(*srv_sys->sys_threads);
+ }
+
+ srv_sys = static_cast<srv_sys_t*>(mem_zalloc(srv_sys_sz));
+
+ srv_sys->n_sys_threads = n_sys_threads;
+
+ if (!srv_read_only_mode) {
+
+ mutex_create(srv_sys_mutex_key, &srv_sys->mutex, SYNC_THREADS);
+
+ mutex_create(srv_sys_tasks_mutex_key,
+ &srv_sys->tasks_mutex, SYNC_ANY_LATCH);
+
+ srv_sys->sys_threads = (srv_slot_t*) &srv_sys[1];
+
+ for (ulint i = 0; i < srv_sys->n_sys_threads; ++i) {
+ srv_slot_t* slot = &srv_sys->sys_threads[i];
+
+ slot->event = os_event_create();
+
+ ut_a(slot->event);
+ }
+
+ srv_error_event = os_event_create();
+
+ srv_monitor_event = os_event_create();
+
+ srv_buf_dump_event = os_event_create();
+
+ UT_LIST_INIT(srv_sys->tasks);
+ }
+
+ /* page_zip_stat_per_index_mutex is acquired from:
+ 1. page_zip_compress() (after SYNC_FSP)
+ 2. page_zip_decompress()
+ 3. i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired)
+ 4. innodb_cmp_per_index_update(), no other latches
+	Since we do not acquire any other latches while holding this mutex,
+	it can have a very low level. We pick SYNC_ANY_LATCH for it. */
+
+ mutex_create(
+ page_zip_stat_per_index_mutex_key,
+ &page_zip_stat_per_index_mutex, SYNC_ANY_LATCH);
+
+ /* Create dummy indexes for infimum and supremum records */
+
+ dict_ind_init();
+
+ srv_conc_init();
+
+ /* Initialize some INFORMATION SCHEMA internal structures */
+ trx_i_s_cache_init(trx_i_s_cache);
+
+ ut_crc32_init();
+
+ dict_mem_init();
+}
+
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+ srv_conc_free();
+
+ /* The mutexes srv_sys->mutex and srv_sys->tasks_mutex should have
+ been freed by sync_close() already. */
+ mem_free(srv_sys);
+ srv_sys = NULL;
+
+ trx_i_s_cache_free(trx_i_s_cache);
+
+ if (!srv_read_only_mode) {
+ os_event_free(srv_buf_dump_event);
+ srv_buf_dump_event = NULL;
+ }
+}
+
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+ ut_mem_init();
+ /* Reset the system variables in the recovery module. */
+ recv_sys_var_init();
+ os_sync_init();
+ sync_init();
+ mem_init(srv_mem_pool_size);
+ que_init();
+ row_mysql_init();
+}
+
+/*********************************************************************//**
+Normalizes init parameter values to use units we use inside InnoDB. */
+static
+void
+srv_normalize_init_values(void)
+/*===========================*/
+{
+ ulint n;
+ ulint i;
+
+ n = srv_n_data_files;
+
+ for (i = 0; i < n; i++) {
+ srv_data_file_sizes[i] = srv_data_file_sizes[i]
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+ }
+
+ srv_last_file_size_max = srv_last_file_size_max
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+ srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+
+ srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+ srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+}
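+
+/* Worked example of the normalization above (illustrative): with the
+default 16KB page size there are (1024 * 1024) / UNIV_PAGE_SIZE = 64
+pages per megabyte, so a data file configured as 100 megabytes is
+normalized to 100 * 64 = 6400 database pages. */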
+
+/*********************************************************************//**
+Boots the InnoDB server. */
+UNIV_INTERN
+void
+srv_boot(void)
+/*==========*/
+{
+ /* Transform the init parameter values given by MySQL to
+ use units we use inside InnoDB: */
+
+ srv_normalize_init_values();
+
+ /* Initialize synchronization primitives, memory management, and thread
+ local storage */
+
+ srv_general_init();
+
+ /* Initialize this module */
+
+ srv_init();
+ srv_mon_create();
+}
+
+/******************************************************************//**
+Refreshes the values used to calculate per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ srv_last_monitor_time = time(NULL);
+
+ os_aio_refresh_stats();
+
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ log_refresh_stats();
+
+ buf_refresh_io_stats_all();
+
+ srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+ srv_n_rows_updated_old = srv_stats.n_rows_updated;
+ srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+ srv_n_rows_read_old = srv_stats.n_rows_read;
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for the
+ lock_sys_t:: mutex */
+ ulint* trx_start_pos, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end) /*!< out: file position of the end of
+ the list of active transactions */
+{
+ double time_elapsed;
+ time_t current_time;
+ ulint n_reserved;
+ ibool ret;
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+
+ /* We add 0.001 seconds to time_elapsed to prevent division
+ by zero if two users happen to call SHOW ENGINE INNODB STATUS at the
+ same time */
+
+ time_elapsed = difftime(current_time, srv_last_monitor_time)
+ + 0.001;
+
+ srv_last_monitor_time = time(NULL);
+
+ fputs("\n=====================================\n", file);
+
+ ut_print_timestamp(file);
+ fprintf(file,
+ " INNODB MONITOR OUTPUT\n"
+ "=====================================\n"
+ "Per second averages calculated from the last %lu seconds\n",
+ (ulong) time_elapsed);
+
+ fputs("-----------------\n"
+ "BACKGROUND THREAD\n"
+ "-----------------\n", file);
+ srv_print_master_thread_info(file);
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
+ sync_print(file);
+
+ /* Conceptually, srv_innodb_monitor_mutex has a very high latching
+ order level in sync0sync.h, while dict_foreign_err_mutex has a very
+ low level 135. Therefore we can reserve the latter mutex here without
+ a danger of a deadlock of threads. */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) {
+ fputs("------------------------\n"
+ "LATEST FOREIGN KEY ERROR\n"
+ "------------------------\n", file);
+ ut_copy_file(file, dict_foreign_err_file);
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+	/* Only if lock_print_info_summary() proceeds correctly do we
+	call lock_print_info_all_transactions() below to print all the
+	lock information. IMPORTANT NOTE: This function acquires the
+	lock mutex on success. */
+ ret = lock_print_info_summary(file, nowait);
+
+ if (ret) {
+ if (trx_start_pos) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_start_pos = ULINT_UNDEFINED;
+ } else {
+ *trx_start_pos = (ulint) t;
+ }
+ }
+
+ /* NOTE: If we get here then we have the lock mutex. This
+ function will release the lock mutex that we acquired when
+ we called the lock_print_info_summary() function earlier. */
+
+ lock_print_info_all_transactions(file);
+
+ if (trx_end) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_end = ULINT_UNDEFINED;
+ } else {
+ *trx_end = (ulint) t;
+ }
+ }
+ }
+
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+ os_aio_print(file);
+
+ fputs("-------------------------------------\n"
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+ "-------------------------------------\n", file);
+ ibuf_print(file);
+
+ ha_print_info(file, btr_search_sys->hash_index);
+
+ fprintf(file,
+ "%.2f hash searches/s, %.2f non-hash searches/s\n",
+ (btr_cur_n_sea - btr_cur_n_sea_old)
+ / time_elapsed,
+ (btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ fputs("---\n"
+ "LOG\n"
+ "---\n", file);
+ log_print(file);
+
+ fputs("----------------------\n"
+ "BUFFER POOL AND MEMORY\n"
+ "----------------------\n", file);
+ fprintf(file,
+ "Total memory allocated " ULINTPF
+ "; in additional pool allocated " ULINTPF "\n",
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+ fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
+ dict_sys->size);
+
+ buf_print_io(file);
+
+ fputs("--------------\n"
+ "ROW OPERATIONS\n"
+ "--------------\n", file);
+ fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+ (long) srv_conc_get_active_threads(),
+ srv_conc_get_waiting_threads());
+
+ /* This is a dirty read, without holding trx_sys->mutex. */
+ fprintf(file, "%lu read views open inside InnoDB\n",
+ UT_LIST_GET_LEN(trx_sys->view_list));
+
+ n_reserved = fil_space_get_n_reserved_extents(0);
+ if (n_reserved > 0) {
+ fprintf(file,
+ "%lu tablespace extents now reserved for"
+ " B-tree split operations\n",
+ (ulong) n_reserved);
+ }
+
+#ifdef UNIV_LINUX
+ fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+ (ulong) srv_main_thread_process_no,
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#else
+ fprintf(file, "Main thread id %lu, state: %s\n",
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#endif
+ fprintf(file,
+ "Number of rows inserted " ULINTPF
+ ", updated " ULINTPF ", deleted " ULINTPF
+ ", read " ULINTPF "\n",
+ (ulint) srv_stats.n_rows_inserted,
+ (ulint) srv_stats.n_rows_updated,
+ (ulint) srv_stats.n_rows_deleted,
+ (ulint) srv_stats.n_rows_read);
+ fprintf(file,
+ "%.2f inserts/s, %.2f updates/s,"
+ " %.2f deletes/s, %.2f reads/s\n",
+ ((ulint) srv_stats.n_rows_inserted - srv_n_rows_inserted_old)
+ / time_elapsed,
+ ((ulint) srv_stats.n_rows_updated - srv_n_rows_updated_old)
+ / time_elapsed,
+ ((ulint) srv_stats.n_rows_deleted - srv_n_rows_deleted_old)
+ / time_elapsed,
+ ((ulint) srv_stats.n_rows_read - srv_n_rows_read_old)
+ / time_elapsed);
+
+ srv_n_rows_inserted_old = srv_stats.n_rows_inserted;
+ srv_n_rows_updated_old = srv_stats.n_rows_updated;
+ srv_n_rows_deleted_old = srv_stats.n_rows_deleted;
+ srv_n_rows_read_old = srv_stats.n_rows_read;
+
+ fputs("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n", file);
+ mutex_exit(&srv_innodb_monitor_mutex);
+ fflush(file);
+
+ return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+ buf_pool_stat_t stat;
+ buf_pools_list_size_t buf_pools_list_size;
+ ulint LRU_len;
+ ulint free_len;
+ ulint flush_list_len;
+
+ buf_get_total_stat(&stat);
+ buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
+ buf_get_total_list_size_in_bytes(&buf_pools_list_size);
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ export_vars.innodb_data_pending_reads =
+ os_n_pending_reads;
+
+ export_vars.innodb_data_pending_writes =
+ os_n_pending_writes;
+
+ export_vars.innodb_data_pending_fsyncs =
+ fil_n_pending_log_flushes
+ + fil_n_pending_tablespace_flushes;
+
+ export_vars.innodb_data_fsyncs = os_n_fsyncs;
+
+ export_vars.innodb_data_read = srv_stats.data_read;
+
+ export_vars.innodb_data_reads = os_n_file_reads;
+
+ export_vars.innodb_data_writes = os_n_file_writes;
+
+ export_vars.innodb_data_written = srv_stats.data_written;
+
+ export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
+
+ export_vars.innodb_buffer_pool_write_requests =
+ srv_stats.buf_pool_write_requests;
+
+ export_vars.innodb_buffer_pool_wait_free =
+ srv_stats.buf_pool_wait_free;
+
+ export_vars.innodb_buffer_pool_pages_flushed =
+ srv_stats.buf_pool_flushed;
+
+ export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
+
+ export_vars.innodb_buffer_pool_read_ahead_rnd =
+ stat.n_ra_pages_read_rnd;
+
+ export_vars.innodb_buffer_pool_read_ahead =
+ stat.n_ra_pages_read;
+
+ export_vars.innodb_buffer_pool_read_ahead_evicted =
+ stat.n_ra_pages_evicted;
+
+ export_vars.innodb_buffer_pool_pages_data = LRU_len;
+
+ export_vars.innodb_buffer_pool_bytes_data =
+ buf_pools_list_size.LRU_bytes
+ + buf_pools_list_size.unzip_LRU_bytes;
+
+ export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
+
+ export_vars.innodb_buffer_pool_bytes_dirty =
+ buf_pools_list_size.flush_list_bytes;
+
+ export_vars.innodb_buffer_pool_pages_free = free_len;
+
+#ifdef UNIV_DEBUG
+ export_vars.innodb_buffer_pool_pages_latched =
+ buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+ export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();
+
+ export_vars.innodb_buffer_pool_pages_misc =
+ buf_pool_get_n_pages() - LRU_len - free_len;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ export_vars.innodb_have_atomic_builtins = 1;
+#else
+ export_vars.innodb_have_atomic_builtins = 0;
+#endif
+ export_vars.innodb_page_size = UNIV_PAGE_SIZE;
+
+ export_vars.innodb_log_waits = srv_stats.log_waits;
+
+ export_vars.innodb_os_log_written = srv_stats.os_log_written;
+
+ export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+
+ export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+
+ export_vars.innodb_os_log_pending_writes =
+ srv_stats.os_log_pending_writes;
+
+ export_vars.innodb_log_write_requests = srv_stats.log_write_requests;
+
+ export_vars.innodb_log_writes = srv_stats.log_writes;
+
+ export_vars.innodb_dblwr_pages_written =
+ srv_stats.dblwr_pages_written;
+
+ export_vars.innodb_dblwr_writes = srv_stats.dblwr_writes;
+
+ export_vars.innodb_pages_created = stat.n_pages_created;
+
+ export_vars.innodb_pages_read = stat.n_pages_read;
+
+ export_vars.innodb_pages_written = stat.n_pages_written;
+
+ export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count;
+
+ export_vars.innodb_row_lock_current_waits =
+ srv_stats.n_lock_wait_current_count;
+
+ export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000;
+
+ if (srv_stats.n_lock_wait_count > 0) {
+
+ export_vars.innodb_row_lock_time_avg = (ulint)
+ (srv_stats.n_lock_wait_time
+ / 1000 / srv_stats.n_lock_wait_count);
+
+ } else {
+ export_vars.innodb_row_lock_time_avg = 0;
+ }
+
+ export_vars.innodb_row_lock_time_max =
+ lock_sys->n_lock_max_wait_time / 1000;
+
+ export_vars.innodb_rows_read = srv_stats.n_rows_read;
+
+ export_vars.innodb_rows_inserted = srv_stats.n_rows_inserted;
+
+ export_vars.innodb_rows_updated = srv_stats.n_rows_updated;
+
+ export_vars.innodb_rows_deleted = srv_stats.n_rows_deleted;
+
+ export_vars.innodb_num_open_files = fil_n_file_opened;
+
+ export_vars.innodb_truncated_status_writes =
+ srv_truncated_status_writes;
+
+ export_vars.innodb_available_undo_logs = srv_available_undo_logs;
+
+#ifdef UNIV_DEBUG
+ rw_lock_s_lock(&purge_sys->latch);
+ trx_id_t done_trx_no = purge_sys->done.trx_no;
+ trx_id_t up_limit_id = purge_sys->view
+ ? purge_sys->view->up_limit_id
+ : 0;
+ rw_lock_s_unlock(&purge_sys->latch);
+
+ mutex_enter(&trx_sys->mutex);
+ trx_id_t max_trx_id = trx_sys->rw_max_trx_id;
+ mutex_exit(&trx_sys->mutex);
+
+ if (!done_trx_no || max_trx_id < done_trx_no - 1) {
+ export_vars.innodb_purge_trx_id_age = 0;
+ } else {
+ export_vars.innodb_purge_trx_id_age =
+ (ulint) (max_trx_id - done_trx_no + 1);
+ }
+
+ if (!up_limit_id
+ || max_trx_id < up_limit_id) {
+ export_vars.innodb_purge_view_trx_id_age = 0;
+ } else {
+ export_vars.innodb_purge_view_trx_id_age =
+ (ulint) (max_trx_id - up_limit_id);
+ }
+#endif /* UNIV_DEBUG */
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_monitor_thread)(
+/*===============================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ib_int64_t sig_count;
+ double time_elapsed;
+ time_t current_time;
+ time_t last_table_monitor_time;
+ time_t last_tablespace_monitor_time;
+ time_t last_monitor_time;
+ ulint mutex_skipped;
+ ibool last_srv_print_monitor;
+
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Lock timeout thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ srv_monitor_active = TRUE;
+
+ UT_NOT_USED(arg);
+ srv_last_monitor_time = ut_time();
+ last_table_monitor_time = ut_time();
+ last_tablespace_monitor_time = ut_time();
+ last_monitor_time = ut_time();
+ mutex_skipped = 0;
+ last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+ /* Wake up every 5 seconds to see if we need to print
+ monitor information or if signalled at shutdown. */
+
+ sig_count = os_event_reset(srv_monitor_event);
+
+ os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
+
+ current_time = ut_time();
+
+ time_elapsed = difftime(current_time, last_monitor_time);
+
+ if (time_elapsed > 15) {
+ last_monitor_time = ut_time();
+
+ if (srv_print_innodb_monitor) {
+			/* Reset the mutex_skipped counter every time
+			srv_print_innodb_monitor changes. This is to
+			ensure we will not be blocked by lock_sys->mutex
+			for short duration information printing,
+			such as requested by sync_array_print_long_waits() */
+ if (!last_srv_print_monitor) {
+ mutex_skipped = 0;
+ last_srv_print_monitor = TRUE;
+ }
+
+ if (!srv_printf_innodb_monitor(stderr,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ /* Reset the counter */
+ mutex_skipped = 0;
+ }
+ } else {
+ last_srv_print_monitor = FALSE;
+ }
+
+
+		/* We don't create the temp files or the associated
+		mutexes in read-only mode */
+
+ if (!srv_read_only_mode && srv_innodb_status) {
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ if (!srv_printf_innodb_monitor(srv_monitor_file,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ mutex_skipped = 0;
+ }
+
+ os_file_set_eof(srv_monitor_file);
+ mutex_exit(&srv_monitor_file_mutex);
+ }
+
+ if (srv_print_innodb_tablespace_monitor
+ && difftime(current_time,
+ last_tablespace_monitor_time) > 60) {
+ last_tablespace_monitor_time = ut_time();
+
+ fputs("========================"
+ "========================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+ "========================"
+ "========================\n",
+ stderr);
+
+ fsp_print(0);
+ fputs("Validating tablespace\n", stderr);
+ fsp_validate(0);
+ fputs("Validation ok\n"
+ "---------------------------------------\n"
+ "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+ "=======================================\n",
+ stderr);
+ }
+
+ if (srv_print_innodb_table_monitor
+ && difftime(current_time, last_table_monitor_time) > 60) {
+
+ last_table_monitor_time = ut_time();
+
+ fprintf(stderr, "Warning: %s\n",
+ DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+
+ fputs("===========================================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLE MONITOR OUTPUT\n"
+ "===========================================\n",
+ stderr);
+ dict_print();
+
+ fputs("-----------------------------------\n"
+ "END OF INNODB TABLE MONITOR OUTPUT\n"
+ "==================================\n",
+ stderr);
+
+ fprintf(stderr, "Warning: %s\n",
+ DEPRECATED_MSG_INNODB_TABLE_MONITOR);
+ }
+ }
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ goto loop;
+
+exit_func:
+ srv_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_error_monitor_thread)(
+/*=====================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ /* number of successive fatal timeouts observed */
+ ulint fatal_cnt = 0;
+ lsn_t old_lsn;
+ lsn_t new_lsn;
+ ib_int64_t sig_count;
+ /* longest waiting thread for a semaphore */
+ os_thread_id_t waiter = os_thread_get_curr_id();
+ os_thread_id_t old_waiter = waiter;
+ /* the semaphore that is being waited for */
+ const void* sema = NULL;
+ const void* old_sema = NULL;
+
+ ut_ad(!srv_read_only_mode);
+
+ old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Error monitor thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_error_monitor_thread_key);
+#endif /* UNIV_PFS_THREAD */
+ srv_error_monitor_active = TRUE;
+
+loop:
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ new_lsn = log_get_lsn();
+
+ if (new_lsn < old_lsn) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: old log sequence number " LSN_PF
+ " was greater\n"
+ "InnoDB: than the new log sequence number " LSN_PF "!\n"
+ "InnoDB: Please submit a bug report"
+ " to http://bugs.mysql.com\n",
+ old_lsn, new_lsn);
+ ut_ad(0);
+ }
+
+ old_lsn = new_lsn;
+
+ if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+		/* We refresh the InnoDB Monitor values so that the
+		averages are printed from at most the last 60 seconds */
+
+ srv_refresh_innodb_monitor_stats();
+ }
+
+ /* Update the statistics collected for deciding LRU
+ eviction policy. */
+ buf_LRU_stat_update();
+
+ /* In case mutex_exit is not a memory barrier, it is
+ theoretically possible some threads are left waiting though
+ the semaphore is already released. Wake up those threads: */
+
+ sync_arr_wake_threads_if_sema_free();
+
+ if (sync_array_print_long_waits(&waiter, &sema)
+ && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
+ fatal_cnt++;
+ if (fatal_cnt > 10) {
+
+ fprintf(stderr,
+ "InnoDB: Error: semaphore wait has lasted"
+ " > %lu seconds\n"
+ "InnoDB: We intentionally crash the server,"
+ " because it appears to be hung.\n",
+ (ulong) srv_fatal_semaphore_wait_threshold);
+
+ ut_error;
+ }
+ } else {
+ fatal_cnt = 0;
+ old_waiter = waiter;
+ old_sema = sema;
+ }
+
+	/* Flush stderr so that a database user gets the output
+	to a possible MySQL error file */
+
+ fflush(stderr);
+
+ sig_count = os_event_reset(srv_error_event);
+
+ os_event_wait_time_low(srv_error_event, 1000000, sig_count);
+
+ if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+ goto loop;
+ }
+
+ srv_error_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/******************************************************************//**
+Increment the server activity count. */
+UNIV_INTERN
+void
+srv_inc_activity_count(void)
+/*========================*/
+{
+ srv_sys->activity_count.inc();
+}
+
+/**********************************************************************//**
+Check whether any background thread is active. If so return the thread
+type.
+@return SRV_NONE if all are suspended or have exited, thread
+type if any are still active. */
+UNIV_INTERN
+srv_thread_type
+srv_get_active_thread_type(void)
+/*============================*/
+{
+ srv_thread_type ret = SRV_NONE;
+
+ if (srv_read_only_mode) {
+ return(SRV_NONE);
+ }
+
+ srv_sys_mutex_enter();
+
+ for (ulint i = SRV_WORKER; i <= SRV_MASTER; ++i) {
+ if (srv_sys->n_threads_active[i] != 0) {
+ ret = static_cast<srv_thread_type>(i);
+ break;
+ }
+ }
+
+ srv_sys_mutex_exit();
+
+ /* Check only on shutdown. */
+ if (ret == SRV_NONE
+ && srv_shutdown_state != SRV_SHUTDOWN_NONE
+ && trx_purge_state() != PURGE_STATE_DISABLED
+ && trx_purge_state() != PURGE_STATE_EXIT) {
+
+ ret = SRV_PURGE;
+ }
+
+ return(ret);
+}
+
+/**********************************************************************//**
+Check whether any background threads are active. If so, return the name
+of the thread that is active. Send the threads a wakeup signal.
+@return name of thread that is active or NULL */
+UNIV_INTERN
+const char*
+srv_any_background_threads_are_active(void)
+/*=======================================*/
+{
+ const char* thread_active = NULL;
+
+ if (srv_read_only_mode) {
+ return(NULL);
+ } else if (srv_error_monitor_active) {
+ thread_active = "srv_error_monitor_thread";
+ } else if (lock_sys->timeout_thread_active) {
+ thread_active = "srv_lock_timeout thread";
+ } else if (srv_monitor_active) {
+ thread_active = "srv_monitor_thread";
+ } else if (srv_buf_dump_thread_active) {
+ thread_active = "buf_dump_thread";
+ } else if (srv_dict_stats_thread_active) {
+ thread_active = "dict_stats_thread";
+ }
+
+ os_event_set(srv_error_event);
+ os_event_set(srv_monitor_event);
+ os_event_set(srv_buf_dump_event);
+ os_event_set(lock_sys->timeout_event);
+ os_event_set(dict_stats_event);
+
+ return(thread_active);
+}
+
+/*******************************************************************//**
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the
+srv_sys_t::mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+ if (srv_read_only_mode) {
+ return;
+ }
+
+ ut_ad(!srv_sys_mutex_own());
+
+ srv_inc_activity_count();
+
+ if (srv_sys->n_threads_active[SRV_MASTER] == 0) {
+ srv_slot_t* slot;
+
+ srv_sys_mutex_enter();
+
+ slot = &srv_sys->sys_threads[SRV_MASTER_SLOT];
+
+ /* Only if the master thread has been started. */
+
+ if (slot->in_use) {
+ ut_a(srv_slot_get_type(slot) == SRV_MASTER);
+
+ if (slot->suspended) {
+
+ slot->suspended = FALSE;
+
+ ++srv_sys->n_threads_active[SRV_MASTER];
+
+ os_event_set(slot->event);
+ }
+ }
+
+ srv_sys_mutex_exit();
+ }
+}
+
+/*******************************************************************//**
+Tells the purge thread that there has been activity in the database
+and wakes up the purge thread if it is suspended (not sleeping). Note
+that there is a small chance that the purge thread stays suspended
+(we do not protect our check with the srv_sys_t::mutex and the
+purge_sys->latch, for performance reasons). */
+UNIV_INTERN
+void
+srv_wake_purge_thread_if_not_active(void)
+/*=====================================*/
+{
+ ut_ad(!srv_sys_mutex_own());
+
+ if (purge_sys->state == PURGE_STATE_RUN
+ && srv_sys->n_threads_active[SRV_PURGE] == 0) {
+
+ srv_release_threads(SRV_PURGE, 1);
+ }
+}
+
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+ ut_ad(!srv_sys_mutex_own());
+
+ srv_inc_activity_count();
+
+ srv_release_threads(SRV_MASTER, 1);
+}
+
+/*******************************************************************//**
+Get current server activity count. We don't hold srv_sys::mutex while
+reading this value as it is only used in heuristics.
+@return activity count. */
+UNIV_INTERN
+ulint
+srv_get_activity_count(void)
+/*========================*/
+{
+ return(srv_sys->activity_count);
+}
+
+/*******************************************************************//**
+Check if there has been any activity.
+@return FALSE if no change in activity counter. */
+UNIV_INTERN
+ibool
+srv_check_activity(
+/*===============*/
+ ulint old_activity_count) /*!< in: old activity count */
+{
+ return(srv_sys->activity_count != old_activity_count);
+}
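+
+/* Illustration of the activity-counter idiom (a sketch only; it
+mirrors what srv_master_thread does further below):
+
+	ulint count = srv_get_activity_count();
+
+	srv_master_sleep();
+
+	if (srv_check_activity(count)) {
+		count = srv_get_activity_count();
+		srv_master_do_active_tasks();
+	} else {
+		srv_master_do_idle_tasks();
+	}
+*/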
+
+/********************************************************************//**
+The master thread is tasked with ensuring that a flush of the log file
+happens once every second in the background. This is to ensure that not
+more than one second of transactions is lost in case of a crash when
+innodb_flush_log_at_trx_commit != 1. */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+ time_t current_time = time(NULL);
+
+ srv_main_thread_op_info = "flushing log";
+ if (difftime(current_time, srv_last_log_flush_time)
+ >= srv_flush_log_at_timeout) {
+ log_buffer_sync_in_background(TRUE);
+ srv_last_log_flush_time = current_time;
+ srv_log_writes_and_flush++;
+ }
+}
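+
+/* Note: srv_flush_log_at_timeout corresponds to the setting
+innodb_flush_log_at_timeout (1 second by default), so with defaults
+this amounts to one background log sync per second; a larger timeout
+reduces the number of flushes at the cost of potentially losing that
+many more seconds of transactions in a crash. */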
+
+/********************************************************************//**
+Make room in the table cache by evicting an unused table.
+@return number of tables evicted. */
+static
+ulint
+srv_master_evict_from_table_cache(
+/*==============================*/
+ ulint pct_check) /*!< in: max percent to check */
+{
+ ulint n_tables_evicted = 0;
+
+ rw_lock_x_lock(&dict_operation_lock);
+
+ dict_mutex_enter_for_mysql();
+
+ n_tables_evicted = dict_make_room_in_cache(
+ innobase_get_table_cache_size(), pct_check);
+
+ dict_mutex_exit_for_mysql();
+
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ return(n_tables_evicted);
+}
+
+/*********************************************************************//**
+This function prints a progress message approximately every 60 seconds
+during server shutdown, for any activities that the master thread is
+still waiting on. */
+static
+void
+srv_shutdown_print_master_pending(
+/*==============================*/
+ ib_time_t* last_print_time, /*!< last time the function
+ printed the message */
+ ulint n_tables_to_drop, /*!< number of tables to
+ be dropped */
+ ulint n_bytes_merged) /*!< number of change buffer
+ bytes just merged */
+{
+ ib_time_t current_time;
+ double time_elapsed;
+
+ current_time = ut_time();
+ time_elapsed = ut_difftime(current_time, *last_print_time);
+
+ if (time_elapsed > 60) {
+ *last_print_time = ut_time();
+
+ if (n_tables_to_drop) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Waiting for "
+ "%lu table(s) to be dropped\n",
+ (ulong) n_tables_to_drop);
+ }
+
+ /* Check change buffer merge, we only wait for change buffer
+ merge if it is a slow shutdown */
+ if (!srv_fast_shutdown && n_bytes_merged) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Waiting for change "
+ "buffer merge to complete\n"
+ " InnoDB: number of bytes of change buffer "
+ "just merged: %lu\n",
+ (ulong) n_bytes_merged);
+ }
+ }
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do when the
+server is active. There are two types of tasks. The first category
+consists of tasks that are performed at each invocation of this
+function. We assume that this function is called roughly every second
+when the server is active. The second category consists of tasks that
+are performed at some fixed interval, e.g. purge and dict_LRU cleanup. */
+static
+void
+srv_master_do_active_tasks(void)
+/*============================*/
+{
+ ib_time_t cur_time = ut_time();
+ ullint counter_time = ut_time_us(NULL);
+
+ /* First do the tasks that we are supposed to do at each
+ invocation of this function. */
+
+ ++srv_main_active_loops;
+
+ MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there are no longer any SELECT
+ queries on them. */
+ srv_main_thread_op_info = "doing background drop tables";
+ row_drop_tables_for_mysql_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
+
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ /* make sure that there is enough reusable space in the redo
+ log files */
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+
+ /* Do an ibuf merge */
+ srv_main_thread_op_info = "doing insert buffer merge";
+ counter_time = ut_time_us(NULL);
+ ibuf_contract_in_background(0, FALSE);
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+ /* Flush logs if needed */
+ srv_main_thread_op_info = "flushing log";
+ srv_sync_log_buffer_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+ /* Now see if various tasks that are performed at defined
+ intervals need to be performed. */
+
+#ifdef MEM_PERIODIC_CHECK
+ /* Check magic numbers of every allocated mem block once in
+ SRV_MASTER_MEM_VALIDATE_INTERVAL seconds */
+ if (cur_time % SRV_MASTER_MEM_VALIDATE_INTERVAL == 0) {
+ mem_validate_all_blocks();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_MEM_VALIDATE_MICROSECOND, counter_time);
+ }
+#endif
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ if (cur_time % SRV_MASTER_DICT_LRU_INTERVAL == 0) {
+ srv_main_thread_op_info = "enforcing dict cache limit";
+ srv_master_evict_from_table_cache(50);
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+ }
+
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ /* Make a new checkpoint */
+ if (cur_time % SRV_MASTER_CHECKPOINT_INTERVAL == 0) {
+ srv_main_thread_op_info = "making checkpoint";
+ log_checkpoint(TRUE, FALSE);
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_CHECKPOINT_MICROSECOND, counter_time);
+ }
+}
+
+/*********************************************************************//**
+Perform the tasks that the master thread is supposed to do whenever the
+server is idle. We do check for the server state during this function
+and if the server has entered the shutdown phase we may return from
+the function without completing the required tasks.
+Note that the server can move to the active state while we are executing
+this function, but we don't check for that as we are supposed to perform
+more or less the same tasks when the server is active. */
+static
+void
+srv_master_do_idle_tasks(void)
+/*==========================*/
+{
+ ullint counter_time;
+
+ ++srv_main_idle_loops;
+
+ MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
+
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there are no longer any SELECT
+ queries on them. */
+ counter_time = ut_time_us(NULL);
+ srv_main_thread_op_info = "doing background drop tables";
+ row_drop_tables_for_mysql_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
+ counter_time);
+
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ /* make sure that there is enough reusable space in the redo
+ log files */
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+
+ /* Do an ibuf merge */
+ counter_time = ut_time_us(NULL);
+ srv_main_thread_op_info = "doing insert buffer merge";
+ ibuf_contract_in_background(0, TRUE);
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_IBUF_MERGE_MICROSECOND, counter_time);
+
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ srv_main_thread_op_info = "enforcing dict cache limit";
+ srv_master_evict_from_table_cache(100);
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(
+ MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
+
+ if (srv_shutdown_state > 0) {
+ return;
+ }
+
+ /* Make a new checkpoint */
+ srv_main_thread_op_info = "making checkpoint";
+ log_checkpoint(TRUE, FALSE);
+ MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_CHECKPOINT_MICROSECOND,
+ counter_time);
+}
+
+/*********************************************************************//**
+Perform the tasks during shutdown. The tasks that we do at shutdown
+depend on srv_fast_shutdown:
+2 => very fast shutdown => do no bookkeeping
+1 => normal shutdown => clear the drop table queue and make a checkpoint
+0 => slow shutdown => in addition to the above, do a complete purge and
+ibuf merge
+@return TRUE if some work was done. FALSE otherwise */
+static
+ibool
+srv_master_do_shutdown_tasks(
+/*=========================*/
+ ib_time_t* last_print_time)/*!< last time the function
+ printed the message */
+{
+ ulint n_bytes_merged = 0;
+ ulint n_tables_to_drop = 0;
+
+ ut_ad(!srv_read_only_mode);
+
+ ++srv_main_shutdown_loops;
+
+ ut_a(srv_shutdown_state > 0);
+
+ /* In very fast shutdown none of the following is necessary */
+ if (srv_fast_shutdown == 2) {
+ return(FALSE);
+ }
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there are no longer any SELECT
+ queries on them. */
+ srv_main_thread_op_info = "doing background drop tables";
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+ /* make sure that there is enough reusable space in the redo
+ log files */
+ srv_main_thread_op_info = "checking free log space";
+ log_free_check();
+
+ /* In case of normal shutdown we don't do ibuf merge or purge */
+ if (srv_fast_shutdown == 1) {
+ goto func_exit;
+ }
+
+ /* Do an ibuf merge */
+ srv_main_thread_op_info = "doing insert buffer merge";
+ n_bytes_merged = ibuf_contract_in_background(0, TRUE);
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+func_exit:
+ /* Make a new checkpoint about once in 10 seconds */
+ srv_main_thread_op_info = "making checkpoint";
+ log_checkpoint(TRUE, FALSE);
+
+ /* Print progress message every 60 seconds during shutdown */
+ if (srv_shutdown_state > 0 && srv_print_verbose_log) {
+ srv_shutdown_print_master_pending(
+ last_print_time, n_tables_to_drop, n_bytes_merged);
+ }
+
+ return(n_bytes_merged || n_tables_to_drop);
+}
+
+/*********************************************************************//**
+Puts the master thread to sleep. At this point we are using polling to
+service various activities. The master thread sleeps for one second
+before checking the state of the server again. */
+static
+void
+srv_master_sleep(void)
+/*==================*/
+{
+ srv_main_thread_op_info = "sleeping";
+ os_thread_sleep(1000000);
+ srv_main_thread_op_info = "";
+}
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_master_thread)(
+/*==============================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ ulint old_activity_count = srv_get_activity_count();
+ ib_time_t last_print_time;
+
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_master_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+ slot = srv_reserve_slot(SRV_MASTER);
+ ut_a(slot == srv_sys->sys_threads);
+
+ last_print_time = ut_time();
+loop:
+ if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+ goto suspend_thread;
+ }
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+
+ srv_master_sleep();
+
+ MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+
+ if (srv_check_activity(old_activity_count)) {
+ old_activity_count = srv_get_activity_count();
+ srv_master_do_active_tasks();
+ } else {
+ srv_master_do_idle_tasks();
+ }
+ }
+
+ while (srv_master_do_shutdown_tasks(&last_print_time)) {
+
+ /* Shouldn't loop here in case of very fast shutdown */
+ ut_ad(srv_fast_shutdown < 2);
+ }
+
+suspend_thread:
+ srv_main_thread_op_info = "suspending";
+
+ srv_suspend_thread(slot);
+
+ /* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
+ waits for database activity to die down when converting < 4.1.x
+ databases, and relies on this string being exactly as it is. InnoDB
+ manual also mentions this string in several places. */
+ srv_main_thread_op_info = "waiting for server activity";
+
+ os_event_wait(slot->event);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_thread_exit(NULL);
+ }
+
+ goto loop;
+
+ OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Check if purge should stop.
+@return true if it should shut down. */
+static
+bool
+srv_purge_should_exit(
+/*==================*/
+ ulint n_purged) /*!< in: pages purged in last batch */
+{
+ switch (srv_shutdown_state) {
+ case SRV_SHUTDOWN_NONE:
+ /* Normal operation. */
+ break;
+
+ case SRV_SHUTDOWN_CLEANUP:
+ case SRV_SHUTDOWN_EXIT_THREADS:
+ /* Exit unless slow shutdown requested or all done. */
+ return(srv_fast_shutdown != 0 || n_purged == 0);
+
+ case SRV_SHUTDOWN_LAST_PHASE:
+ case SRV_SHUTDOWN_FLUSH_PHASE:
+ ut_error;
+ }
+
+ return(false);
+}
+
+/*********************************************************************//**
+Fetch and execute a task from the work queue.
+@return true if a task was executed */
+static
+bool
+srv_task_execute(void)
+/*==================*/
+{
+ que_thr_t* thr = NULL;
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+ mutex_enter(&srv_sys->tasks_mutex);
+
+ if (UT_LIST_GET_LEN(srv_sys->tasks) > 0) {
+
+ thr = UT_LIST_GET_FIRST(srv_sys->tasks);
+
+ ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
+
+ UT_LIST_REMOVE(queue, srv_sys->tasks, thr);
+ }
+
+ mutex_exit(&srv_sys->tasks_mutex);
+
+ if (thr != NULL) {
+
+ que_run_threads(thr);
+
+ os_atomic_inc_ulint(
+ &purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+ }
+
+ return(thr != NULL);
+}
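+
+/* The tasks consumed here are purge query threads that the purge
+coordinator enqueues via srv_que_task_enqueue_low() (see below). Each
+completed task increments purge_sys->n_completed, which the coordinator
+uses in trx_purge_wait_for_workers_to_complete() to detect that all
+submitted work is done. */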
+
+/*********************************************************************//**
+Worker thread that reads tasks from the work queue and executes them.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_worker_thread)(
+/*==============================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ srv_slot_t* slot;
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: worker thread starting, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ slot = srv_reserve_slot(SRV_WORKER);
+
+ ut_a(srv_n_purge_threads > 1);
+
+ srv_sys_mutex_enter();
+
+ ut_a(srv_sys->n_threads_active[SRV_WORKER] < srv_n_purge_threads);
+
+ srv_sys_mutex_exit();
+
+ /* We need to ensure that the worker threads exit after the
+ purge coordinator thread. Otherwise the purge coordinator can
+ end up waiting forever in trx_purge_wait_for_workers_to_complete() */
+
+ do {
+ srv_suspend_thread(slot);
+
+ os_event_wait(slot->event);
+
+ if (srv_task_execute()) {
+
+ /* If there are tasks in the queue, wakeup
+ the purge coordinator thread. */
+
+ srv_wake_purge_thread_if_not_active();
+ }
+
+ /* Note: we are checking the state without holding the
+ purge_sys->latch here. */
+ } while (purge_sys->state != PURGE_STATE_EXIT);
+
+ srv_free_slot(slot);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ ut_a(!purge_sys->running);
+ ut_a(purge_sys->state == PURGE_STATE_EXIT);
+ ut_a(srv_shutdown_state > SRV_SHUTDOWN_NONE);
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Purge worker thread exiting, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
+}
+
+/*********************************************************************//**
+Do the actual purge operation.
+@return length of history list before the last purge batch. */
+static
+ulint
+srv_do_purge(
+/*=========*/
+ ulint n_threads, /*!< in: number of threads to use */
+ ulint* n_total_purged) /*!< in/out: total pages purged */
+{
+ ulint n_pages_purged;
+
+ static ulint count = 0;
+ static ulint n_use_threads = 0;
+ static ulint rseg_history_len = 0;
+ ulint old_activity_count = srv_get_activity_count();
+
+ ut_a(n_threads > 0);
+ ut_ad(!srv_read_only_mode);
+
+ /* Purge until there are no more records to purge and there is
+ no change in configuration or server state. If the user has
+ configured more than one purge thread then we treat that as a
+ pool of threads and only use the extra threads if purge can't
+ keep up with updates. */
+
+ if (n_use_threads == 0) {
+ n_use_threads = n_threads;
+ }
+
+ do {
+ if (trx_sys->rseg_history_len > rseg_history_len
+ || (srv_max_purge_lag > 0
+ && rseg_history_len > srv_max_purge_lag)) {
+
+ /* History length is now longer than what it was
+ when we took the last snapshot. Use more threads. */
+
+ if (n_use_threads < n_threads) {
+ ++n_use_threads;
+ }
+
+ } else if (srv_check_activity(old_activity_count)
+ && n_use_threads > 1) {
+
+ /* History length same or smaller since last snapshot,
+ use fewer threads. */
+
+ --n_use_threads;
+
+ old_activity_count = srv_get_activity_count();
+ }
+
+ /* Ensure that the number of threads in use never
+ exceeds what was configured. */
+
+ ut_a(n_use_threads > 0);
+ ut_a(n_use_threads <= n_threads);
+
+ /* Take a snapshot of the history list before purge. */
+ if ((rseg_history_len = trx_sys->rseg_history_len) == 0) {
+ break;
+ }
+
+ n_pages_purged = trx_purge(
+ n_use_threads, srv_purge_batch_size, false);
+
+ if (!(count++ % TRX_SYS_N_RSEGS)) {
+ /* Force a truncate of the history list. */
+ n_pages_purged += trx_purge(
+ 1, srv_purge_batch_size, true);
+ }
+
+ *n_total_purged += n_pages_purged;
+
+ } while (!srv_purge_should_exit(n_pages_purged)
+ && n_pages_purged > 0
+ && purge_sys->state == PURGE_STATE_RUN);
+
+ return(rseg_history_len);
+}
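+
+/* Worked example of the ramp logic above (illustrative): with
+srv_n_purge_threads = 4, purge starts with all 4 threads. If user
+activity is detected between batches while more than one thread is in
+use, the count is stepped down one thread per batch; if the history
+list keeps growing, or exceeds srv_max_purge_lag, it is stepped back up
+towards the configured maximum, never below 1 and never above 4. */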
+
+/*********************************************************************//**
+Suspend the purge coordinator thread. */
+static
+void
+srv_purge_coordinator_suspend(
+/*==========================*/
+ srv_slot_t* slot, /*!< in/out: Purge coordinator
+ thread slot */
+ ulint rseg_history_len) /*!< in: history list length
+ before last purge */
+{
+ ut_ad(!srv_read_only_mode);
+ ut_a(slot->type == SRV_PURGE);
+
+ bool stop = false;
+
+ /** Maximum wait time on the purge event, in micro-seconds. */
+ static const ulint SRV_PURGE_MAX_TIMEOUT = 10000;
+
+ ib_int64_t sig_count = srv_suspend_thread(slot);
+
+ do {
+ ulint ret;
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ purge_sys->running = false;
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ /* We don't wait right away on the non-timed wait because
+ we want to signal the thread that wants to suspend purge. */
+
+ if (stop) {
+ os_event_wait_low(slot->event, sig_count);
+ ret = 0;
+ } else if (rseg_history_len <= trx_sys->rseg_history_len) {
+ ret = os_event_wait_time_low(
+ slot->event, SRV_PURGE_MAX_TIMEOUT, sig_count);
+ } else {
+ /* We don't want to waste time waiting if the
+ history list grew by the time we got here,
+ unless purge has been stopped. */
+ ret = 0;
+ }
+
+ srv_sys_mutex_enter();
+
+ /* The thread can be in state !suspended after the timeout
+ but before this check if another thread sent a wakeup signal. */
+
+ if (slot->suspended) {
+ slot->suspended = FALSE;
+ ++srv_sys->n_threads_active[slot->type];
+ ut_a(srv_sys->n_threads_active[slot->type] == 1);
+ }
+
+ srv_sys_mutex_exit();
+
+ sig_count = srv_suspend_thread(slot);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ stop = (srv_shutdown_state == SRV_SHUTDOWN_NONE
+ && purge_sys->state == PURGE_STATE_STOP);
+
+ if (!stop) {
+ ut_a(purge_sys->n_stop == 0);
+ purge_sys->running = true;
+ } else {
+ ut_a(purge_sys->n_stop > 0);
+
+ /* Signal that we are suspended. */
+ os_event_set(purge_sys->event);
+ }
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ if (ret == OS_SYNC_TIME_EXCEEDED) {
+
+ /* If no new records were added since the wait started,
+ simply wait for new records. The magic number 5000 is an
+ approximation for the case where we have cached UNDO
+ log records which prevent truncation of the UNDO
+ segments. */
+
+ if (rseg_history_len == trx_sys->rseg_history_len
+ && trx_sys->rseg_history_len < 5000) {
+
+ stop = true;
+ }
+ }
+
+ } while (stop);
+
+ srv_sys_mutex_enter();
+
+ if (slot->suspended) {
+ slot->suspended = FALSE;
+ ++srv_sys->n_threads_active[slot->type];
+ ut_a(srv_sys->n_threads_active[slot->type] == 1);
+ }
+
+ srv_sys_mutex_exit();
+}
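+
+/* Summary of the stop protocol above: a thread that wants purge halted
+raises purge_sys->n_stop and moves the state to PURGE_STATE_STOP. The
+coordinator then acknowledges by setting purge_sys->event ("I am
+suspended") and blocks on its slot event without a timeout until it is
+explicitly released again. */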
+
+/*********************************************************************//**
+Purge coordinator thread that schedules the purge tasks.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(srv_purge_coordinator_thread)(
+/*=========================================*/
+ void* arg __attribute__((unused))) /*!< in: a dummy parameter
+ required by os_thread_create */
+{
+ srv_slot_t* slot;
+ ulint n_total_purged = ULINT_UNDEFINED;
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(srv_n_purge_threads >= 1);
+ ut_a(trx_purge_state() == PURGE_STATE_INIT);
+ ut_a(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ purge_sys->running = true;
+ purge_sys->state = PURGE_STATE_RUN;
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(srv_purge_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Purge coordinator thread created, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ slot = srv_reserve_slot(SRV_PURGE);
+
+ ulint rseg_history_len = trx_sys->rseg_history_len;
+
+ do {
+ /* If there are no records to purge or the last
+ purge didn't purge any records then wait for activity. */
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_NONE
+ && (purge_sys->state == PURGE_STATE_STOP
+ || n_total_purged == 0)) {
+
+ srv_purge_coordinator_suspend(slot, rseg_history_len);
+ }
+
+ if (srv_purge_should_exit(n_total_purged)) {
+ ut_a(!slot->suspended);
+ break;
+ }
+
+ n_total_purged = 0;
+
+ rseg_history_len = srv_do_purge(
+ srv_n_purge_threads, &n_total_purged);
+
+ } while (!srv_purge_should_exit(n_total_purged));
+
+ /* Ensure that we don't jump out of the loop unless the
+ exit condition is satisfied. */
+
+ ut_a(srv_purge_should_exit(n_total_purged));
+
+ ulint n_pages_purged = ULINT_MAX;
+
+ /* Ensure that all records are purged if it is not a fast shutdown.
+ This covers the case where a record can be added after we exit the
+ loop above. */
+ while (srv_fast_shutdown == 0 && n_pages_purged > 0) {
+ n_pages_purged = trx_purge(1, srv_purge_batch_size, false);
+ }
+
+ /* Force a truncate of the history list. */
+ n_pages_purged = trx_purge(1, srv_purge_batch_size, true);
+ ut_a(n_pages_purged == 0 || srv_fast_shutdown != 0);
+
+ /* The task queue should always be empty, independent of fast
+ shutdown state. */
+ ut_a(srv_get_task_queue_length() == 0);
+
+ srv_free_slot(slot);
+
+ /* Note that we are shutting down. */
+ rw_lock_x_lock(&purge_sys->latch);
+
+ purge_sys->state = PURGE_STATE_EXIT;
+
+ purge_sys->running = false;
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Purge coordinator exiting, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+
+ /* Ensure that all the worker threads quit. */
+ if (srv_n_purge_threads > 1) {
+ srv_release_threads(SRV_WORKER, srv_n_purge_threads - 1);
+ }
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
+}
+
+/**********************************************************************//**
+Enqueues a task to the server task queue and releases a worker thread,
+if there is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(!srv_read_only_mode);
+ mutex_enter(&srv_sys->tasks_mutex);
+
+ UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
+
+ mutex_exit(&srv_sys->tasks_mutex);
+
+ srv_release_threads(SRV_WORKER, 1);
+}
+
+/**********************************************************************//**
+Get count of tasks in the queue.
+@return number of tasks in queue */
+UNIV_INTERN
+ulint
+srv_get_task_queue_length(void)
+/*===========================*/
+{
+ ulint n_tasks;
+
+ ut_ad(!srv_read_only_mode);
+
+ mutex_enter(&srv_sys->tasks_mutex);
+
+ n_tasks = UT_LIST_GET_LEN(srv_sys->tasks);
+
+ mutex_exit(&srv_sys->tasks_mutex);
+
+ return(n_tasks);
+}
+
+/**********************************************************************//**
+Wake up the purge threads. */
+UNIV_INTERN
+void
+srv_purge_wakeup(void)
+/*==================*/
+{
+ ut_ad(!srv_read_only_mode);
+
+ if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+
+ srv_release_threads(SRV_PURGE, 1);
+
+ if (srv_n_purge_threads > 1) {
+ ulint n_workers = srv_n_purge_threads - 1;
+
+ srv_release_threads(SRV_WORKER, n_workers);
+ }
+ }
+}
+
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
new file mode 100644
index 00000000000..fde0d1552be
--- /dev/null
+++ b/storage/innobase/srv/srv0start.cc
@@ -0,0 +1,3219 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file srv/srv0start.cc
+Starts the InnoDB database server
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "mysqld.h"
+#include "pars0pars.h"
+#include "row0ftsort.h"
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "buf0dump.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "rem0rec.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0rseg.h"
+# include "os0proc.h"
+# include "sync0sync.h"
+# include "buf0flu.h"
+# include "buf0rea.h"
+# include "dict0boot.h"
+# include "dict0load.h"
+# include "dict0stats_bg.h"
+# include "que0que.h"
+# include "usr0sess.h"
+# include "lock0lock.h"
+# include "trx0roll.h"
+# include "trx0purge.h"
+# include "lock0lock.h"
+# include "pars0pars.h"
+# include "btr0sea.h"
+# include "rem0cmp.h"
+# include "dict0crea.h"
+# include "row0ins.h"
+# include "row0sel.h"
+# include "row0upd.h"
+# include "row0row.h"
+# include "row0mysql.h"
+# include "btr0pcur.h"
+# include "os0sync.h"
+# include "zlib.h"
+# include "ut0crc32.h"
+
+/** Log sequence number immediately after startup */
+UNIV_INTERN lsn_t srv_start_lsn;
+/** Log sequence number at shutdown */
+UNIV_INTERN lsn_t srv_shutdown_lsn;
+
+#ifdef HAVE_DARWIN_THREADS
+# include <sys/utsname.h>
+/** TRUE if the F_FULLFSYNC option is available */
+UNIV_INTERN ibool srv_have_fullfsync = FALSE;
+#endif
+
+/** TRUE if a raw partition is in use */
+UNIV_INTERN ibool srv_start_raw_disk_in_use = FALSE;
+
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+UNIV_INTERN ibool srv_startup_is_before_trx_rollback_phase = FALSE;
+/** TRUE if the server is being started */
+UNIV_INTERN ibool srv_is_being_started = FALSE;
+/** TRUE if the server was successfully started */
+UNIV_INTERN ibool srv_was_started = FALSE;
+/** TRUE if innobase_start_or_create_for_mysql() has been called */
+static ibool srv_start_has_been_called = FALSE;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+UNIV_INTERN enum srv_shutdown_state srv_shutdown_state = SRV_SHUTDOWN_NONE;
+
+/** Files comprising the system tablespace */
+static os_file_t files[1000];
+
+/** io_handler_thread parameters for thread identification */
+static ulint n[SRV_MAX_N_IO_THREADS + 6];
+/** io_handler_thread identifiers, 32 is the maximum number of purge threads */
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32];
+
+/** We use this mutex to test the return value of pthread_mutex_trylock
+ on successful locking. HP-UX does NOT return 0, though Linux et al do. */
+static os_fast_mutex_t srv_os_test_mutex;
+
+/** Name of srv_monitor_file */
+static char* srv_monitor_file_name;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Default undo tablespace size in UNIV_PAGE units (10MB). */
+static const ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES =
+ ((1024 * 1024) * 10) / UNIV_PAGE_SIZE_DEF;
+
+/** Limits on pending i/o operations */
+#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
+#define SRV_MAX_N_PENDING_SYNC_IOS 100
+
+#ifdef UNIV_PFS_THREAD
+/* Keys to register InnoDB threads with performance schema */
+UNIV_INTERN mysql_pfs_key_t io_handler_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_lock_timeout_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_error_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_monitor_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_master_thread_key;
+UNIV_INTERN mysql_pfs_key_t srv_purge_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/*********************************************************************//**
+Convert a numeric string that optionally ends in G, M, or K to a number
+of megabytes.
+@return next character in string */
+static
+char*
+srv_parse_megabytes(
+/*================*/
+ char* str, /*!< in: string containing a quantity in bytes */
+ ulint* megs) /*!< out: the number in megabytes */
+{
+ char* endp;
+ ulint size;
+
+ size = strtoul(str, &endp, 10);
+
+ str = endp;
+
+ switch (*str) {
+ case 'G': case 'g':
+ size *= 1024;
+ /* fall through */
+ case 'M': case 'm':
+ str++;
+ break;
+ case 'K': case 'k':
+ size /= 1024;
+ str++;
+ break;
+ default:
+ size /= 1024 * 1024;
+ break;
+ }
+
+ *megs = size;
+ return(str);
+}
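+
+/* Examples (derived from the logic above): "10G" yields *megs = 10240,
+"512M" yields 512, "2048K" yields 2, and a bare byte count such as
+"1048576" yields 1. The returned pointer is positioned just past any
+recognized G/M/K suffix. */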
+
+/*********************************************************************//**
+Check if a file can be opened in read-write mode.
+@return true if it doesn't exist or can be opened in rw mode. */
+static
+bool
+srv_file_check_mode(
+/*================*/
+ const char* name) /*!< in: filename to check */
+{
+ os_file_stat_t stat;
+
+ memset(&stat, 0x0, sizeof(stat));
+
+ dberr_t err = os_file_get_status(name, &stat, true);
+
+ if (err == DB_FAIL) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "os_file_get_status() failed on '%s'. Can't determine "
+ "file permissions", name);
+
+ return(false);
+
+ } else if (err == DB_SUCCESS) {
+
+ /* Note: stat.rw_perm is only valid for files */
+
+ if (stat.type == OS_FILE_TYPE_FILE
+ || stat.type == OS_FILE_TYPE_BLOCK) {
+ if (!stat.rw_perm) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "%s can't be opened in %s mode",
+ name,
+ srv_read_only_mode
+ ? "read" : "read-write");
+
+ return(false);
+ }
+ } else {
+ /* Not a regular file, bail out. */
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "'%s' not a regular file.", name);
+
+ return(false);
+ }
+ } else {
+
+ /* This is OK. If the file create fails on RO media, there
+ is nothing we can do. */
+
+ ut_a(err == DB_NOT_FOUND);
+ }
+
+ return(true);
+}
+
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+ char* str) /*!< in/out: the data file path string */
+{
+ char* input_str;
+ char* path;
+ ulint size;
+ ulint i = 0;
+
+ srv_auto_extend_last_data_file = FALSE;
+ srv_last_file_size_max = 0;
+ srv_data_file_names = NULL;
+ srv_data_file_sizes = NULL;
+ srv_data_file_is_raw_partition = NULL;
+
+ input_str = str;
+
+ /* First calculate the number of data files and check syntax:
+ path:size[M | G];path:size[M | G]... . Note that a Windows path may
+ contain a drive name and a ':'. */
+
+ while (*str != '\0') {
+ path = str;
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == '\0') {
+ return(FALSE);
+ }
+
+ str++;
+
+ str = srv_parse_megabytes(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = srv_parse_megabytes(str, &size);
+ }
+
+ if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ if (strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+ str += 3;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+ }
+
+ if (size == 0) {
+ return(FALSE);
+ }
+
+ i++;
+
+ if (*str == ';') {
+ str++;
+ } else if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ if (i == 0) {
+ /* If innodb_data_file_path was defined it must contain
+ at least one data file definition */
+
+ return(FALSE);
+ }
+
+ srv_data_file_names = static_cast<char**>(
+ malloc(i * sizeof *srv_data_file_names));
+
+ srv_data_file_sizes = static_cast<ulint*>(
+ malloc(i * sizeof *srv_data_file_sizes));
+
+ srv_data_file_is_raw_partition = static_cast<ulint*>(
+ malloc(i * sizeof *srv_data_file_is_raw_partition));
+
+ srv_n_data_files = i;
+
+ /* Then store the actual values to our arrays */
+
+ str = input_str;
+ i = 0;
+
+ while (*str != '\0') {
+ path = str;
+
+ /* Note that we must step over the ':' in a Windows path;
+ a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+ a Windows raw partition may have a specification like
+ \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == ':') {
+ /* Make path a null-terminated string */
+ *str = '\0';
+ str++;
+ }
+
+ str = srv_parse_megabytes(str, &size);
+
+ srv_data_file_names[i] = path;
+ srv_data_file_sizes[i] = size;
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ srv_auto_extend_last_data_file = TRUE;
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = srv_parse_megabytes(
+ str, &srv_last_file_size_max);
+ }
+
+ if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ (srv_data_file_is_raw_partition)[i] = 0;
+
+ if (strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+ str += 3;
+ (srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+
+ if ((srv_data_file_is_raw_partition)[i] == 0) {
+ (srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW;
+ }
+ }
+
+ i++;
+
+ if (*str == ';') {
+ str++;
+ }
+ }
+
+ return(TRUE);
+}
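+
+/* Example innodb_data_file_path values accepted by this parser
+(illustrative): "ibdata1:10M:autoextend" defines a single 10MB
+auto-extending file; "ibdata1:2000M;ibdata2:2000M:autoextend:max:4000M"
+defines two 2000MB files of which the last may grow to at most 4000MB.
+The newraw/raw suffixes mark raw partitions, e.g. "/dev/hdd1:3Gnewraw". */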
+
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void)
+/*==========================*/
+{
+ free(srv_data_file_names);
+ srv_data_file_names = NULL;
+ free(srv_data_file_sizes);
+ srv_data_file_sizes = NULL;
+ free(srv_data_file_is_raw_partition);
+ srv_data_file_is_raw_partition = NULL;
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+I/O handler thread function.
+@return OS_THREAD_DUMMY_RETURN */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(io_handler_thread)(
+/*==============================*/
+ void* arg) /*!< in: pointer to the number of the segment in
+ the aio array */
+{
+ ulint segment;
+
+ segment = *((ulint*) arg);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(io_handler_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
+ fil_aio_wait(segment);
+ }
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit.
+ The thread actually never comes here because it is exited in an
+ os_event_wait(). */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+ char* str __attribute__((unused))) /*!< in/out: null-terminated
+ character string */
+{
+#ifdef __WIN__
+ for (; *str; str++) {
+
+ if (*str == '/') {
+ *str = '\\';
+ }
+ }
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Creates a log file.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+create_log_file(
+/*============*/
+ os_file_t* file, /*!< out: file handle */
+ const char* name) /*!< in: log file name */
+{
+ ibool ret;
+
+ *file = os_file_create(
+ innodb_file_log_key, name,
+ OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
+ OS_LOG_FILE, &ret);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name);
+ return(DB_ERROR);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Setting log file %s size to %lu MB",
+ name, (ulong) srv_log_file_size
+ >> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+ ret = os_file_set_size(name, *file,
+ (os_offset_t) srv_log_file_size
+ << UNIV_PAGE_SIZE_SHIFT);
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Cannot set log file"
+ " %s to size %lu MB", name, (ulong) srv_log_file_size
+ >> (20 - UNIV_PAGE_SIZE_SHIFT));
+ return(DB_ERROR);
+ }
+
+ ret = os_file_close(*file);
+ ut_a(ret);
+
+ return(DB_SUCCESS);
+}
+
+/** Initial number of the first redo log file */
+#define INIT_LOG_FILE0 (SRV_N_LOG_FILES_MAX + 1)
+
+#ifdef DBUG_OFF
+# define RECOVERY_CRASH(x) do {} while(0)
+#else
+# define RECOVERY_CRASH(x) do { \
+ if (srv_force_recovery_crash == x) { \
+ fprintf(stderr, "innodb_force_recovery_crash=%lu\n", \
+ srv_force_recovery_crash); \
+ fflush(stderr); \
+ exit(3); \
+ } \
+} while (0)
+#endif
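+
+/* RECOVERY_CRASH is a debug-build hook (it compiles away under
+DBUG_OFF): when innodb_force_recovery_crash equals the given value, the
+server prints the setting and exits with code 3 at that point. This
+lets crash-recovery tests kill the server deterministically between the
+log file creation and rename steps below. */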
+
+/*********************************************************************//**
+Creates all log files.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+create_log_files(
+/*=============*/
+ bool create_new_db, /*!< in: TRUE if new database is being
+ created */
+ char* logfilename, /*!< in/out: buffer for log file name */
+ size_t dirnamelen, /*!< in: length of the directory path */
+ lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
+ char*& logfile0) /*!< out: name of the first log file */
+{
+ if (srv_read_only_mode) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create log files in read-only mode");
+ return(DB_READ_ONLY);
+ }
+
+ /* We prevent system tablespace creation when there are existing
+ files in the data directory, so we do not delete log files when
+ creating a new system tablespace. */
+ if (!create_new_db) {
+ /* Remove any old log files. */
+ for (unsigned i = 0; i <= INIT_LOG_FILE0; i++) {
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
+
+ /* Ignore errors about non-existent files or files
+ that cannot be removed. The create_log_file() will
+ return an error when the file exists. */
+#ifdef __WIN__
+ DeleteFile((LPCTSTR) logfilename);
+#else
+ unlink(logfilename);
+#endif
+ /* Crashing after deleting the first
+ file should be recoverable. The buffer
+ pool was clean, and we can simply create
+ all log files from scratch. */
+ RECOVERY_CRASH(6);
+ }
+ }
+
+ ut_ad(!buf_pool_check_no_pending_io());
+
+ RECOVERY_CRASH(7);
+
+ for (unsigned i = 0; i < srv_n_log_files; i++) {
+ sprintf(logfilename + dirnamelen,
+ "ib_logfile%u", i ? i : INIT_LOG_FILE0);
+
+ dberr_t err = create_log_file(&files[i], logfilename);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ RECOVERY_CRASH(8);
+
+ /* We did not create the first log file initially as
+ ib_logfile0, so that crash recovery cannot find it until it
+ has been completed and renamed. */
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", INIT_LOG_FILE0);
+
+ fil_space_create(
+ logfilename, SRV_LOG_SPACE_FIRST_ID,
+ fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
+ FIL_LOG);
+ ut_a(fil_validate());
+
+ logfile0 = fil_node_create(
+ logfilename, (ulint) srv_log_file_size,
+ SRV_LOG_SPACE_FIRST_ID, FALSE);
+ ut_a(logfile0);
+
+ for (unsigned i = 1; i < srv_n_log_files; i++) {
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", i);
+
+ if (!fil_node_create(
+ logfilename,
+ (ulint) srv_log_file_size,
+ SRV_LOG_SPACE_FIRST_ID, FALSE)) {
+ ut_error;
+ }
+ }
+
+ log_group_init(0, srv_n_log_files,
+ srv_log_file_size * UNIV_PAGE_SIZE,
+ SRV_LOG_SPACE_FIRST_ID,
+ SRV_LOG_SPACE_FIRST_ID + 1);
+
+ fil_open_log_and_system_tablespace_files();
+
+ /* Create a log checkpoint. */
+ mutex_enter(&log_sys->mutex);
+ ut_d(recv_no_log_write = FALSE);
+ recv_reset_logs(lsn);
+ mutex_exit(&log_sys->mutex);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Renames the first log file. */
+static
+void
+create_log_files_rename(
+/*====================*/
+ char* logfilename, /*!< in/out: buffer for log file name */
+ size_t dirnamelen, /*!< in: length of the directory path */
+ lsn_t lsn, /*!< in: FIL_PAGE_FILE_FLUSH_LSN value */
+ char* logfile0) /*!< in/out: name of the first log file */
+{
+ /* If innodb_flush_method=O_DSYNC,
+ we need to explicitly flush the log buffers. */
+ fil_flush(SRV_LOG_SPACE_FIRST_ID);
+ /* Close the log files, so that we can rename
+ the first one. */
+ fil_close_log_files(false);
+
+ /* Rename the first log file, now that a log
+ checkpoint has been created. */
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
+
+ RECOVERY_CRASH(9);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Renaming log file %s to %s", logfile0, logfilename);
+
+ mutex_enter(&log_sys->mutex);
+ ut_ad(strlen(logfile0) == 2 + strlen(logfilename));
+ ibool success = os_file_rename(
+ innodb_file_log_key, logfile0, logfilename);
+ ut_a(success);
+
+ RECOVERY_CRASH(10);
+
+ /* Replace the first file with ib_logfile0. */
+ strcpy(logfile0, logfilename);
+ mutex_exit(&log_sys->mutex);
+
+ fil_open_log_and_system_tablespace_files();
+
+ ib_logf(IB_LOG_LEVEL_WARN, "New log files created, LSN=" LSN_PF, lsn);
+}
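+
+/* Putting create_log_files() and create_log_files_rename() together
+(illustrative): with srv_n_log_files = 2, creation first produces
+ib_logfile<INIT_LOG_FILE0> and ib_logfile1 and writes a checkpoint;
+only then is the first file renamed to ib_logfile0. Since crash
+recovery looks for ib_logfile0, a crash at any point before the rename
+cannot leave a half-initialized ib_logfile0 behind. */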
+
+/*********************************************************************//**
+Opens a log file.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+open_log_file(
+/*==========*/
+ os_file_t* file, /*!< out: file handle */
+ const char* name, /*!< in: log file name */
+ os_offset_t* size) /*!< out: file size */
+{
+ ibool ret;
+
+ *file = os_file_create(innodb_file_log_key, name,
+ OS_FILE_OPEN, OS_FILE_AIO,
+ OS_LOG_FILE, &ret);
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name);
+ return(DB_ERROR);
+ }
+
+ *size = os_file_get_size(*file);
+
+ ret = os_file_close(*file);
+ ut_a(ret);
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Creates or opens database data files and closes them.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+open_or_create_data_files(
+/*======================*/
+ ibool* create_new_db, /*!< out: TRUE if new database should be
+ created */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no,/*!< out: min of archived log
+ numbers in data files */
+ ulint* max_arch_log_no,/*!< out: max of archived log
+ numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+ lsn_t* min_flushed_lsn,/*!< out: min of flushed lsn
+ values in data files */
+ lsn_t* max_flushed_lsn,/*!< out: max of flushed lsn
+ values in data files */
+ ulint* sum_of_new_sizes)/*!< out: sum of sizes of the
+ new files added */
+{
+ ibool ret;
+ ulint i;
+ ibool one_opened = FALSE;
+ ibool one_created = FALSE;
+ os_offset_t size;
+ ulint flags;
+ ulint space;
+ ulint rounded_size_pages;
+ char name[10000];
+
+ if (srv_n_data_files >= 1000) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Can only have < 1000 data files, you have "
+ "defined %lu", (ulong) srv_n_data_files);
+
+ return(DB_ERROR);
+ }
+
+ *sum_of_new_sizes = 0;
+
+ *create_new_db = FALSE;
+
+ srv_normalize_path_for_win(srv_data_home);
+
+ for (i = 0; i < srv_n_data_files; i++) {
+ ulint dirnamelen;
+
+ srv_normalize_path_for_win(srv_data_file_names[i]);
+ dirnamelen = strlen(srv_data_home);
+
+ ut_a(dirnamelen + strlen(srv_data_file_names[i])
+ < (sizeof name) - 1);
+
+ memcpy(name, srv_data_home, dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ strcpy(name + dirnamelen, srv_data_file_names[i]);
+
+ /* Note: It will return true if the file doesn't exist. */
+
+ if (!srv_file_check_mode(name)) {
+
+ return(DB_FAIL);
+
+ } else if (srv_data_file_is_raw_partition[i] == 0) {
+
+ /* First we try to create the file: if it already
+ exists, ret will get value FALSE */
+
+ files[i] = os_file_create(
+ innodb_file_data_key, name, OS_FILE_CREATE,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+ if (srv_read_only_mode) {
+
+ if (ret) {
+ goto size_check;
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Opening %s failed!", name);
+
+ return(DB_ERROR);
+
+ } else if (!ret
+ && os_file_get_last_error(false)
+ != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our
+ function to return 100; work around that
+ AIX problem */
+ && os_file_get_last_error(false) != 100
+#endif /* UNIV_AIX */
+ ) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Creating or opening %s failed!",
+ name);
+
+ return(DB_ERROR);
+ }
+
+ } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+
+ ut_a(!srv_read_only_mode);
+
+ /* The partition is opened, not created; then it is
+ written over */
+
+ srv_start_raw_disk_in_use = TRUE;
+ srv_created_new_raw = TRUE;
+
+ files[i] = os_file_create(
+ innodb_file_data_key, name, OS_FILE_OPEN_RAW,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in opening %s", name);
+
+ return(DB_ERROR);
+ }
+ } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ srv_start_raw_disk_in_use = TRUE;
+
+ ret = FALSE;
+ } else {
+ ut_a(0);
+ }
+
+ if (ret == FALSE) {
+ const char* check_msg;
+ /* We open the data file */
+
+ if (one_created) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Data files can only be added at "
+ "the end of a tablespace, but "
+ "data file %s existed beforehand.",
+ name);
+ return(DB_ERROR);
+ }
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ ut_a(!srv_read_only_mode);
+ files[i] = os_file_create(
+ innodb_file_data_key,
+ name, OS_FILE_OPEN_RAW,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ } else if (i == 0) {
+ files[i] = os_file_create(
+ innodb_file_data_key,
+ name, OS_FILE_OPEN_RETRY,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ } else {
+ files[i] = os_file_create(
+ innodb_file_data_key,
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ }
+
+ if (!ret) {
+
+ os_file_get_last_error(true);
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Can't open '%s'", name);
+
+ return(DB_ERROR);
+ }
+
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+
+ goto skip_size_check;
+ }
+
+size_check:
+ size = os_file_get_size(files[i]);
+ ut_a(size != (os_offset_t) -1);
+ /* Round size downward to megabytes */
+
+ rounded_size_pages = (ulint)
+ (size >> UNIV_PAGE_SIZE_SHIFT);
+
+ if (i == srv_n_data_files - 1
+ && srv_auto_extend_last_data_file) {
+
+ if (srv_data_file_sizes[i] > rounded_size_pages
+ || (srv_last_file_size_max > 0
+ && srv_last_file_size_max
+ < rounded_size_pages)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "auto-extending "
+ "data file %s is "
+ "of a different size "
+ "%lu pages (rounded "
+ "down to MB) than specified "
+ "in the .cnf file: "
+ "initial %lu pages, "
+ "max %lu (relevant if "
+ "non-zero) pages!",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i],
+ (ulong)
+ srv_last_file_size_max);
+
+ return(DB_ERROR);
+ }
+
+ srv_data_file_sizes[i] = rounded_size_pages;
+ }
+
+ if (rounded_size_pages != srv_data_file_sizes[i]) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Data file %s is of a different "
+ "size %lu pages (rounded down to MB) "
+ "than specified in the .cnf file "
+ "%lu pages!",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i]);
+
+ return(DB_ERROR);
+ }
+skip_size_check:
+
+ /* This is the earliest location where we can load
+ the double write buffer. */
+ if (i == 0) {
+ buf_dblwr_init_or_load_pages(
+ files[i], srv_data_file_names[i], true);
+ }
+
+ bool retry = true;
+check_first_page:
+ check_msg = fil_read_first_page(
+ files[i], one_opened, &flags, &space,
+#ifdef UNIV_LOG_ARCHIVE
+ min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ min_flushed_lsn, max_flushed_lsn);
+
+ if (check_msg) {
+
+ if (retry) {
+ fsp_open_info fsp;
+ const ulint page_no = 0;
+
+ retry = false;
+ fsp.id = 0;
+ fsp.filepath = srv_data_file_names[i];
+ fsp.file = files[i];
+
+ if (fil_user_tablespace_restore_page(
+ &fsp, page_no)) {
+ goto check_first_page;
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "%s in data file %s",
+ check_msg, name);
+ return(DB_ERROR);
+ }
+
+ /* The first file of the system tablespace must
+ have space ID = TRX_SYS_SPACE. The FSP_SPACE_ID
+ field in files beyond ibdata1 is unreliable. */
+ ut_a(one_opened || space == TRX_SYS_SPACE);
+
+ /* Check the flags for the first system tablespace
+ file only. */
+ if (!one_opened
+ && UNIV_PAGE_SIZE
+ != fsp_flags_get_page_size(flags)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Data file \"%s\" uses page size %lu,"
+ "but the start-up parameter "
+ "is --innodb-page-size=%lu",
+ name,
+ fsp_flags_get_page_size(flags),
+ UNIV_PAGE_SIZE);
+
+ return(DB_ERROR);
+ }
+
+ one_opened = TRUE;
+ } else if (!srv_read_only_mode) {
+ /* We created the data file and now write it full of
+ zeros */
+
+ one_created = TRUE;
+
+ if (i > 0) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Data file %s did not"
+ " exist: new to be created",
+ name);
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The first specified "
+ "data file %s did not exist: "
+ "a new database to be created!",
+ name);
+
+ *create_new_db = TRUE;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Setting file %s size to %lu MB",
+ name,
+ (ulong) (srv_data_file_sizes[i]
+ >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Database physically writes the"
+ " file full: wait...");
+
+ ret = os_file_set_size(
+ name, files[i],
+ (os_offset_t) srv_data_file_sizes[i]
+ << UNIV_PAGE_SIZE_SHIFT);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in creating %s: "
+ "probably out of disk space",
+ name);
+
+ return(DB_ERROR);
+ }
+
+ *sum_of_new_sizes += srv_data_file_sizes[i];
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ if (i == 0) {
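+ /* Build tablespace flags with compressed page size 0,
+ i.e. a non-compressed tablespace (the same call is used
+ for the undo tablespaces elsewhere in this file). */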
+ flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+ fil_space_create(name, 0, flags, FIL_TABLESPACE);
+ }
+
+ ut_a(fil_validate());
+
+ if (!fil_node_create(name, srv_data_file_sizes[i], 0,
+ srv_data_file_is_raw_partition[i] != 0)) {
+ return(DB_ERROR);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Create undo tablespace.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespace_create(
+/*=======================*/
+ const char* name, /*!< in: tablespace name */
+ ulint size) /*!< in: tablespace size in pages */
+{
+ os_file_t fh;
+ ibool ret;
+ dberr_t err = DB_SUCCESS;
+
+ os_file_create_subdirs_if_needed(name);
+
+ fh = os_file_create(
+ innodb_file_data_key,
+ name,
+ srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+
+ if (srv_read_only_mode && ret) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "%s opened in read-only mode", name);
+ } else if (ret == FALSE) {
+ if (os_file_get_last_error(false) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our function
+ to return 100; work around that AIX problem */
+ && os_file_get_last_error(false) != 100
+#endif /* UNIV_AIX */
+ ) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Can't create UNDO tablespace %s", name);
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Creating system tablespace with"
+ " existing undo tablespaces is not"
+ " supported. Please delete all undo"
+ " tablespaces before creating new"
+ " system tablespace.");
+ }
+ err = DB_ERROR;
+ } else {
+ ut_a(!srv_read_only_mode);
+
+ /* We created the data file and now write it full of zeros */
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Data file %s did not exist: new to be created",
+ name);
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Setting file %s size to %lu MB",
+ name, size >> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Database physically writes the file full: wait...");
+
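+ /* size is in pages; shifting left by
+ UNIV_PAGE_SIZE_SHIFT converts it to the byte
+ count expected by os_file_set_size(). */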
+ ret = os_file_set_size(name, fh, size << UNIV_PAGE_SIZE_SHIFT);
+
+ if (!ret) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Error in creating %s: probably out of "
+ "disk space", name);
+
+ err = DB_ERROR;
+ }
+
+ os_file_close(fh);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Open an undo tablespace.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespace_open(
+/*=====================*/
+ const char* name, /*!< in: tablespace name */
+ ulint space) /*!< in: tablespace id */
+{
+ os_file_t fh;
+ dberr_t err = DB_ERROR;
+ ibool ret;
+ ulint flags;
+
+ if (!srv_file_check_mode(name)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "UNDO tablespaces must be %s!",
+ srv_read_only_mode ? "writable" : "readable");
+
+ return(DB_ERROR);
+ }
+
+ fh = os_file_create(
+ innodb_file_data_key, name,
+ OS_FILE_OPEN_RETRY
+ | OS_FILE_ON_ERROR_NO_EXIT
+ | OS_FILE_ON_ERROR_SILENT,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE,
+ &ret);
+
+ /* If the file open was successful then load the tablespace. */
+
+ if (ret) {
+ os_offset_t size;
+
+ size = os_file_get_size(fh);
+ ut_a(size != (os_offset_t) -1);
+
+ ret = os_file_close(fh);
+ ut_a(ret);
+
+ /* Load the tablespace into InnoDB's internal
+ data structures. */
+
+ /* We set the biggest space id to the undo tablespace
+ because InnoDB hasn't opened any other tablespace apart
+ from the system tablespace. */
+
+ fil_set_max_space_id_if_bigger(space);
+
+ /* Set the compressed page size to 0 (non-compressed) */
+ flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+ fil_space_create(name, space, flags, FIL_TABLESPACE);
+
+ ut_a(fil_validate());
+
+ os_offset_t n_pages = size / UNIV_PAGE_SIZE;
+
+ /* On 64 bit Windows ulint can be 32 bit and os_offset_t
+ is 64 bit. It is OK to cast the n_pages to ulint because
+ the unit has been scaled to pages, and page counts
+ always fit in 32 bits. */
+ if (fil_node_create(name, (ulint) n_pages, space, FALSE)) {
+ err = DB_SUCCESS;
+ }
+ }
+
+ return(err);
+}
+
+/********************************************************************
+Opens the configured number of undo tablespaces.
+@return DB_SUCCESS or error code */
+static
+dberr_t
+srv_undo_tablespaces_init(
+/*======================*/
+ ibool create_new_db, /*!< in: TRUE if new db being
+ created */
+ const ulint n_conf_tablespaces, /*!< in: configured undo
+ tablespaces */
+ ulint* n_opened) /*!< out: number of UNDO
+ tablespaces successfully
+ discovered and opened */
+{
+ ulint i;
+ dberr_t err = DB_SUCCESS;
+ ulint prev_space_id = 0;
+ ulint n_undo_tablespaces;
+ ulint undo_tablespace_ids[TRX_SYS_N_RSEGS + 1];
+
+ *n_opened = 0;
+
+ ut_a(n_conf_tablespaces <= TRX_SYS_N_RSEGS);
+
+ memset(undo_tablespace_ids, 0x0, sizeof(undo_tablespace_ids));
+
+ /* Create the undo spaces only if we are creating a new
+ instance. We don't allow creating of new undo tablespaces
+ in an existing instance (yet). This restriction exists because
+ we check in several places for SYSTEM tablespaces to be less than
+ the min of user defined tablespace ids. Once we implement saving
+ the location of the undo tablespaces and their space ids this
+ restriction will/should be lifted. */
+
+ for (i = 0; create_new_db && i < n_conf_tablespaces; ++i) {
+ char name[OS_FILE_MAX_PATH];
+
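+ /* E.g. with srv_undo_dir == "/data" (a hypothetical
+ path) this produces "/data/undo001", "/data/undo002",
+ and so on. */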
+ ut_snprintf(
+ name, sizeof(name),
+ "%s%cundo%03lu",
+ srv_undo_dir, SRV_PATH_SEPARATOR, i + 1);
+
+ /* Undo space ids start from 1. */
+ err = srv_undo_tablespace_create(
+ name, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES);
+
+ if (err != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Could not create undo tablespace '%s'.",
+ name);
+
+ return(err);
+ }
+ }
+
+ /* Get the tablespace ids of all the undo segments excluding
+ the system tablespace (0). If we are creating a new instance then
+ we build the undo_tablespace_ids ourselves since they don't
+ already exist. */
+
+ if (!create_new_db) {
+ n_undo_tablespaces = trx_rseg_get_n_undo_tablespaces(
+ undo_tablespace_ids);
+ } else {
+ n_undo_tablespaces = n_conf_tablespaces;
+
+ for (i = 1; i <= n_undo_tablespaces; ++i) {
+ undo_tablespace_ids[i - 1] = i;
+ }
+
+ /* Terminate the list. After the loop above
+ i == n_undo_tablespaces + 1, so the terminator
+ belongs at index i - 1, i.e. n_undo_tablespaces. */
+ undo_tablespace_ids[i - 1] = ULINT_UNDEFINED;
+ }
+
+ /* Open all the undo tablespaces that are currently in use. If we
+ fail to open any of these it is a fatal error. The tablespace ids
+ should be contiguous. It is a fatal error because they are required
+ for recovery and are referenced by the UNDO logs (a.k.a RBS). */
+
+ for (i = 0; i < n_undo_tablespaces; ++i) {
+ char name[OS_FILE_MAX_PATH];
+
+ ut_snprintf(
+ name, sizeof(name),
+ "%s%cundo%03lu",
+ srv_undo_dir, SRV_PATH_SEPARATOR,
+ undo_tablespace_ids[i]);
+
+ /* There should be no gaps in the undo tablespace ids. */
+ ut_a(prev_space_id + 1 == undo_tablespace_ids[i]);
+
+ /* The system space id should not be in this array. */
+ ut_a(undo_tablespace_ids[i] != 0);
+ ut_a(undo_tablespace_ids[i] != ULINT_UNDEFINED);
+
+ /* Undo space ids start from 1. */
+
+ err = srv_undo_tablespace_open(name, undo_tablespace_ids[i]);
+
+ if (err != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to open undo tablespace '%s'.", name);
+
+ return(err);
+ }
+
+ prev_space_id = undo_tablespace_ids[i];
+
+ ++*n_opened;
+ }
+
+ /* Open any extra unused undo tablespaces. These must be contiguous.
+ We stop at the first failure. These are undo tablespaces that are
+ not in use and therefore not required by recovery. We only check
+ that there are no gaps. */
+
+ for (i = prev_space_id + 1; i < TRX_SYS_N_RSEGS; ++i) {
+ char name[OS_FILE_MAX_PATH];
+
+ ut_snprintf(
+ name, sizeof(name),
+ "%s%cundo%03lu", srv_undo_dir, SRV_PATH_SEPARATOR, i);
+
+ /* Undo space ids start from 1. */
+ err = srv_undo_tablespace_open(name, i);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ ++n_undo_tablespaces;
+
+ ++*n_opened;
+ }
+
+ /* If the user configured fewer undo tablespaces than we found,
+ we tolerate that discrepancy, because the extra ones may be
+ unused tablespaces kept for future use. The inverse is an error. */
+
+ if (n_conf_tablespaces > n_undo_tablespaces) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Expected to open %lu undo "
+ "tablespaces but was able\n",
+ n_conf_tablespaces);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: to find only %lu undo "
+ "tablespaces.\n", n_undo_tablespaces);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Set the "
+ "innodb_undo_tablespaces parameter to "
+ "the\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: correct value and retry. Suggested "
+ "value is %lu\n", n_undo_tablespaces);
+
+ return(err != DB_SUCCESS ? err : DB_ERROR);
+
+ } else if (n_undo_tablespaces > 0) {
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Opened %lu undo tablespaces",
+ n_undo_tablespaces);
+
+ if (n_conf_tablespaces == 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Using the system tablespace for all UNDO "
+ "logging because innodb_undo_tablespaces=0");
+ }
+ }
+
+ if (create_new_db) {
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* The undo log tablespace */
+ for (i = 1; i <= n_undo_tablespaces; ++i) {
+
+ fsp_header_init(
+ i, SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************
+Wait for the purge thread(s) to start up. */
+static
+void
+srv_start_wait_for_purge_to_start()
+/*===============================*/
+{
+ /* Wait for the purge coordinator and master thread to startup. */
+
+ purge_state_t state = trx_purge_state();
+
+ ut_a(state != PURGE_STATE_DISABLED);
+
+ while (srv_shutdown_state == SRV_SHUTDOWN_NONE
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
+ && state == PURGE_STATE_INIT) {
+
+ switch (state = trx_purge_state()) {
+ case PURGE_STATE_RUN:
+ case PURGE_STATE_STOP:
+ break;
+
+ case PURGE_STATE_INIT:
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for purge to start");
+
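+ /* os_thread_sleep() takes microseconds:
+ 50000 means we poll every 50 ms. */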
+ os_thread_sleep(50000);
+ break;
+
+ case PURGE_STATE_EXIT:
+ case PURGE_STATE_DISABLED:
+ ut_error;
+ }
+ }
+}
+
+/********************************************************************
+Starts InnoDB and creates a new database if database files
+are not found and the user wants one to be created.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+innobase_start_or_create_for_mysql(void)
+/*====================================*/
+{
+ ibool create_new_db;
+ lsn_t min_flushed_lsn;
+ lsn_t max_flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+ ulint min_arch_log_no;
+ ulint max_arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+ ulint sum_of_new_sizes;
+ ulint sum_of_data_file_sizes;
+ ulint tablespace_size_in_header;
+ dberr_t err;
+ unsigned i;
+ ulint srv_n_log_files_found = srv_n_log_files;
+ ulint io_limit;
+ mtr_t mtr;
+ ib_bh_t* ib_bh;
+ ulint n_recovered_trx;
+ char logfilename[10000];
+ char* logfile0 = NULL;
+ size_t dirnamelen;
+
+ if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) {
+ srv_read_only_mode = true;
+ }
+
+ if (srv_read_only_mode) {
+ ib_logf(IB_LOG_LEVEL_INFO, "Started in read only mode");
+ }
+
+#ifdef HAVE_DARWIN_THREADS
+# ifdef F_FULLFSYNC
+ /* This executable has been compiled on Mac OS X 10.3 or later.
+ Assume that F_FULLFSYNC is available at run-time. */
+ srv_have_fullfsync = TRUE;
+# else /* F_FULLFSYNC */
+ /* This executable has been compiled on Mac OS X 10.2
+ or earlier. Determine if the executable is running
+ on Mac OS X 10.3 or later. */
+ struct utsname utsname;
+ if (uname(&utsname)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: cannot determine Mac OS X version!\n", stderr);
+ } else {
+ srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0;
+ }
+ if (!srv_have_fullfsync) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: On Mac OS X, fsync() may be "
+ "broken on internal drives,\n", stderr);
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: making transactions unsafe!\n", stderr);
+ }
+# endif /* F_FULLFSYNC */
+#endif /* HAVE_DARWIN_THREADS */
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Using %s to ref count buffer pool pages",
+#ifdef PAGE_ATOMIC_REF_COUNT
+ "atomics"
+#else
+ "mutexes"
+#endif /* PAGE_ATOMIC_REF_COUNT */
+ );
+
+
+ if (sizeof(ulint) != sizeof(void*)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: size of InnoDB's ulint is %lu, "
+ "but size of void*\n", (ulong) sizeof(ulint));
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: is %lu. The sizes should be the same "
+ "so that on a 64-bit\n",
+ (ulong) sizeof(void*));
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: platforms you can allocate more than 4 GB "
+ "of memory.\n");
+ }
+
+#ifdef UNIV_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n");
+# ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on "
+ "!!!!!!!!!\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n");
+# endif
+#endif
+
+#ifdef UNIV_BLOB_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_BLOB_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Server restart may fail with UNIV_BLOB_DEBUG\n");
+#endif /* UNIV_BLOB_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_SEARCH_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
+#endif /* UNIV_LOG_LSN_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+ if (srv_use_sys_malloc) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "The InnoDB memory heap is disabled");
+ }
+
+#if defined(COMPILER_HINTS_ENABLED)
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " InnoDB: Compiler hints enabled.");
+#endif /* defined(COMPILER_HINTS_ENABLED) */
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "" IB_ATOMICS_STARTUP_MSG "");
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "" IB_MEMORY_BARRIER_STARTUP_MSG "");
+
+#ifndef HAVE_MEMORY_BARRIER
+#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__
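+ /* On IA32/AMD64 (and on Windows) the hardware memory
+ ordering is strong enough that the missing explicit barrier
+ primitives are not known to cause problems, so no warning is
+ emitted for those platforms (this rationale is inferred from
+ the warning text below). */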
+#else
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "MySQL was built without a memory barrier capability on this"
+ " architecture, which might allow a mutex/rw_lock violation"
+ " under high thread concurrency. This may cause a hang.");
+#endif /* IA32 or AMD64 */
+#endif /* HAVE_MEMORY_BARRIER */
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+ " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+ );
+#ifdef UNIV_ZIP_COPY
+ ib_logf(IB_LOG_LEVEL_INFO, "and extra copying");
+#endif /* UNIV_ZIP_COPY */
+
+
+ /* Since InnoDB does not currently clean up all its internal data
+ structures in MySQL Embedded Server Library server_end(), we
+ print an error message if someone tries to start up InnoDB a
+ second time during the process lifetime. */
+
+ if (srv_start_has_been_called) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: startup called second time "
+ "during the process\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: lifetime. In the MySQL Embedded "
+ "Server Library you\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: cannot call server_init() more "
+ "than once during the\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: process lifetime.\n");
+ }
+
+ srv_start_has_been_called = TRUE;
+
+#ifdef UNIV_DEBUG
+ log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+ /* yydebug = TRUE; */
+
+ srv_is_being_started = TRUE;
+ srv_startup_is_before_trx_rollback_phase = TRUE;
+
+#ifdef __WIN__
+ switch (os_get_os_version()) {
+ case OS_WIN95:
+ case OS_WIN31:
+ case OS_WINNT:
+ /* On Win 95, 98, ME, the Win32 subsystem for
+ Windows 3.1, and NT we use simulated aio. NT does
+ provide async i/o, but when run in conjunction with
+ InnoDB Hot Backup it seemed to corrupt the data files. */
+
+ srv_use_native_aio = FALSE;
+ break;
+
+ case OS_WIN2000:
+ case OS_WINXP:
+ /* On 2000 and XP, async IO is available. */
+ srv_use_native_aio = TRUE;
+ break;
+
+ default:
+ /* Vista and later have both async IO and condition variables */
+ srv_use_native_aio = TRUE;
+ srv_use_native_conditions = TRUE;
+ break;
+ }
+
+#elif defined(LINUX_NATIVE_AIO)
+
+ if (srv_use_native_aio) {
+ ib_logf(IB_LOG_LEVEL_INFO, "Using Linux native AIO");
+ }
+#else
+ /* Currently native AIO is supported only on Windows and Linux,
+ and only when the support is compiled in. In all other
+ cases we ignore the setting of innodb_use_native_aio. */
+ srv_use_native_aio = FALSE;
+#endif /* __WIN__ */
+
+ if (srv_file_flush_method_str == NULL) {
+ /* These are the default options */
+
+ srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#ifndef __WIN__
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
+ srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
+ srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
+ srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT_NO_FSYNC")) {
+ srv_unix_file_flush_method = SRV_UNIX_O_DIRECT_NO_FSYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
+ srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
+ srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+#else
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
+ srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
+ srv_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+ srv_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str,
+ "async_unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#endif /* __WIN__ */
+ } else {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unrecognized value %s for innodb_flush_method",
+ srv_file_flush_method_str);
+ return(DB_ERROR);
+ }
+
+ /* Note that the call srv_boot() also changes the values of
+ some variables to the units used by InnoDB internally */
+
+ /* Set the maximum number of threads which can wait for a semaphore
+ inside InnoDB: this is the 'sync wait array' size, as well as the
+ maximum number of threads that can wait in the 'srv_conc array' for
+ their time to enter InnoDB. */
+
+#define BUF_POOL_SIZE_THRESHOLD (1024 * 1024 * 1024)
+ srv_max_n_threads = 1 /* io_ibuf_thread */
+ + 1 /* io_log_thread */
+ + 1 /* lock_wait_timeout_thread */
+ + 1 /* srv_error_monitor_thread */
+ + 1 /* srv_monitor_thread */
+ + 1 /* srv_master_thread */
+ + 1 /* srv_purge_coordinator_thread */
+ + 1 /* buf_dump_thread */
+ + 1 /* dict_stats_thread */
+ + 1 /* fts_optimize_thread */
+ + 1 /* recv_writer_thread */
+ + 1 /* buf_flush_page_cleaner_thread */
+ + 1 /* trx_rollback_or_clean_all_recovered */
+ + 128 /* added as margin, for use of
+ InnoDB Memcached etc. */
+ + max_connections
+ + srv_n_read_io_threads
+ + srv_n_write_io_threads
+ + srv_n_purge_threads
+ /* FTS Parallel Sort */
+ + fts_sort_pll_degree * FTS_NUM_AUX_INDEX
+ * max_connections;
+
+ if (srv_buf_pool_size < BUF_POOL_SIZE_THRESHOLD) {
+ /* If buffer pool is less than 1 GB,
+ use only one buffer pool instance */
+ srv_buf_pool_instances = 1;
+ }
+
+ srv_boot();
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "%s CPU crc32 instructions",
+ ut_crc32_sse2_enabled ? "Using" : "Not using");
+
+ if (!srv_read_only_mode) {
+
+ mutex_create(srv_monitor_file_mutex_key,
+ &srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+
+ if (srv_innodb_status) {
+
+ srv_monitor_file_name = static_cast<char*>(
+ mem_alloc(
+ strlen(fil_path_to_mysql_datadir)
+ + 20 + sizeof "/innodb_status."));
+
+ sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
+ fil_path_to_mysql_datadir,
+ os_proc_get_number());
+
+ srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+
+ if (!srv_monitor_file) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Unable to create %s: %s",
+ srv_monitor_file_name,
+ strerror(errno));
+
+ return(DB_ERROR);
+ }
+ } else {
+ srv_monitor_file_name = NULL;
+ srv_monitor_file = os_file_create_tmpfile();
+
+ if (!srv_monitor_file) {
+ return(DB_ERROR);
+ }
+ }
+
+ mutex_create(srv_dict_tmpfile_mutex_key,
+ &srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+
+ srv_dict_tmpfile = os_file_create_tmpfile();
+
+ if (!srv_dict_tmpfile) {
+ return(DB_ERROR);
+ }
+
+ mutex_create(srv_misc_tmpfile_mutex_key,
+ &srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+
+ srv_misc_tmpfile = os_file_create_tmpfile();
+
+ if (!srv_misc_tmpfile) {
+ return(DB_ERROR);
+ }
+ }
+
+ /* If the user has set the value of innodb_file_io_threads then
+ we'll emit a message telling the user that this parameter
+ is now deprecated. */
+ if (srv_n_file_io_threads != 4) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "innodb_file_io_threads is deprecated. Please use "
+ "innodb_read_io_threads and innodb_write_io_threads "
+ "instead");
+ }
+
+ /* Now overwrite the value on srv_n_file_io_threads */
+ srv_n_file_io_threads = srv_n_read_io_threads;
+
+ if (!srv_read_only_mode) {
+ /* Add the log and ibuf IO threads. */
+ srv_n_file_io_threads += 2;
+ srv_n_file_io_threads += srv_n_write_io_threads;
+ } else {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Disabling background IO write threads.");
+
+ srv_n_write_io_threads = 0;
+ }
+
+ ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
+
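+ /* By default allow up to 8 * SRV_N_PENDING_IOS_PER_THREAD
+ pending requests per i/o thread (8 * 32 = 256, given the
+ 32-request limit noted below). */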
+ io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
+
+ /* On Windows, when using native aio, the number of aio requests
+ that a thread can handle at a given time is limited to 32,
+ i.e. SRV_N_PENDING_IOS_PER_THREAD. */
+# ifdef __WIN__
+ if (srv_use_native_aio) {
+ io_limit = SRV_N_PENDING_IOS_PER_THREAD;
+ }
+# endif /* __WIN__ */
+
+ if (!os_aio_init(io_limit,
+ srv_n_read_io_threads,
+ srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS)) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Fatal : Cannot initialize AIO sub-system");
+
+ return(DB_ERROR);
+ }
+
+ fil_init(srv_file_per_table ? 50000 : 5000, srv_max_n_open_files);
+
+ double size;
+ char unit;
+
+ if (srv_buf_pool_size >= 1024 * 1024 * 1024) {
+ size = ((double) srv_buf_pool_size) / (1024 * 1024 * 1024);
+ unit = 'G';
+ } else {
+ size = ((double) srv_buf_pool_size) / (1024 * 1024);
+ unit = 'M';
+ }
+
+ /* Print time to initialize the buffer pool */
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Initializing buffer pool, size = %.1f%c", size, unit);
+
+ err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances);
+
+ if (err != DB_SUCCESS) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot allocate memory for the buffer pool");
+
+ return(DB_ERROR);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Completed initialization of buffer pool");
+
+#ifdef UNIV_DEBUG
+ /* We have observed deadlocks with a 5MB buffer pool but
+ the actual lower limit could very well be a little higher. */
+
+ if (srv_buf_pool_size <= 5 * 1024 * 1024) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Small buffer pool size (%luM), the flst_validate() "
+ "debug function can cause a deadlock if the "
+ "buffer pool fills up.",
+ srv_buf_pool_size / 1024 / 1024);
+ }
+#endif /* UNIV_DEBUG */
+
+ fsp_init();
+ log_init();
+
+ lock_sys_create(srv_lock_table_size);
+
+ /* Create i/o-handler threads: */
+
+ for (i = 0; i < srv_n_file_io_threads; ++i) {
+
+ n[i] = i;
+
+ os_thread_create(io_handler_thread, n + i, thread_ids + i);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (0 != ut_strcmp(srv_log_group_home_dir, srv_arch_dir)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error: you must set the log group home dir in my.cnf\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: the same as log arch dir.\n");
+
+ return(DB_ERROR);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
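+ /* A worked example of the limit checked below: the block
+ number returned by log_block_convert_lsn_to_no() is capped
+ at 1G, and each OS_FILE_LOG_BLOCK_SIZE block is 512 bytes,
+ so the addressable log is 2^30 * 512 bytes = 512 GB. */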
+ if (srv_n_log_files * srv_log_file_size * UNIV_PAGE_SIZE
+ >= 512ULL * 1024ULL * 1024ULL * 1024ULL) {
+ /* log_block_convert_lsn_to_no() limits the returned block
+ number to 1G and given that OS_FILE_LOG_BLOCK_SIZE is 512
+ bytes, then we have a limit of 512 GB. If that limit is to
+ be raised, then log_block_convert_lsn_to_no() must be
+ modified. */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Combined size of log files must be < 512 GB");
+
+ return(DB_ERROR);
+ }
+
+ if (srv_n_log_files * srv_log_file_size >= ULINT_MAX) {
+ /* fil_io() takes ulint as an argument and we are passing
+ (next_offset / UNIV_PAGE_SIZE) to it in log_group_write_buf().
+ So (next_offset / UNIV_PAGE_SIZE) must be less than ULINT_MAX.
+ So next_offset must be < ULINT_MAX * UNIV_PAGE_SIZE. This
+ means that we are limited to ULINT_MAX * UNIV_PAGE_SIZE which
+ is 64 TB on 32 bit systems. */
+ fprintf(stderr,
+ " InnoDB: Error: combined size of log files"
+ " must be < %lu GB\n",
+ ULINT_MAX / 1073741824 * UNIV_PAGE_SIZE);
+
+ return(DB_ERROR);
+ }
+
+ sum_of_new_sizes = 0;
+
+ for (i = 0; i < srv_n_data_files; i++) {
+#ifndef __WIN__
+ if (sizeof(off_t) < 5
+ && srv_data_file_sizes[i]
+ >= (ulint) (1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: file size must be < 4 GB"
+ " with this MySQL binary\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: and operating system combination,"
+ " in some OS's < 2 GB\n");
+
+ return(DB_ERROR);
+ }
+#endif
+ sum_of_new_sizes += srv_data_file_sizes[i];
+ }
+
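+ /* sum_of_new_sizes is in pages; 10485760 bytes == 10 MB. */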
+ if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Tablespace size must be at least 10 MB");
+
+ return(DB_ERROR);
+ }
+
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+
+ err = open_or_create_data_files(&create_new_db,
+#ifdef UNIV_LOG_ARCHIVE
+ &min_arch_log_no, &max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &min_flushed_lsn, &max_flushed_lsn,
+ &sum_of_new_sizes);
+ if (err == DB_FAIL) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "The system tablespace must be writable!");
+
+ return(DB_ERROR);
+
+ } else if (err != DB_SUCCESS) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Could not open or create the system tablespace. If "
+ "you tried to add new data files to the system "
+ "tablespace, and it failed here, you should now "
+ "edit innodb_data_file_path in my.cnf back to what "
+ "it was, and remove the new ibdata files InnoDB "
+ "created in this failed attempt. InnoDB only wrote "
+ "those files full of zeros, but did not yet use "
+ "them in any way. But be careful: do not remove "
+ "old data files which contain your precious data!");
+
+ return(err);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ srv_normalize_path_for_win(srv_arch_dir);
+ srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ dirnamelen = strlen(srv_log_group_home_dir);
+ ut_a(dirnamelen < (sizeof logfilename) - 10 - sizeof "ib_logfile");
+ memcpy(logfilename, srv_log_group_home_dir, dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && logfilename[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ logfilename[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ srv_log_file_size_requested = srv_log_file_size;
+
+ if (create_new_db) {
+ bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+ ut_a(success);
+
+ min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ err = create_log_files(create_new_db, logfilename, dirnamelen,
+ max_flushed_lsn, logfile0);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ } else {
+ for (i = 0; i < SRV_N_LOG_FILES_MAX; i++) {
+ os_offset_t size;
+ os_file_stat_t stat_info;
+
+ sprintf(logfilename + dirnamelen,
+ "ib_logfile%u", i);
+
+ err = os_file_get_status(
+ logfilename, &stat_info, false);
+
+ if (err == DB_NOT_FOUND) {
+ if (i == 0) {
+ if (max_flushed_lsn
+ != min_flushed_lsn) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create"
+ " log files because"
+ " data files are"
+ " corrupt or"
+ " not in sync"
+ " with each other");
+ return(DB_ERROR);
+ }
+
+ if (max_flushed_lsn < (lsn_t) 1000) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot create"
+ " log files because"
+ " data files are"
+ " corrupt or the"
+ " database was not"
+ " shut down cleanly"
+ " after creating"
+ " the data files.");
+ return(DB_ERROR);
+ }
+
+ err = create_log_files(
+ create_new_db, logfilename,
+ dirnamelen, max_flushed_lsn,
+ logfile0);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ create_log_files_rename(
+ logfilename, dirnamelen,
+ max_flushed_lsn, logfile0);
+
+ /* Suppress the message about
+ crash recovery. */
+ max_flushed_lsn = min_flushed_lsn
+ = log_get_lsn();
+ goto files_checked;
+ } else if (i < 2) {
+ /* must have at least 2 log files */
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Only one log file found.");
+ return(err);
+ }
+
+ /* opened all files */
+ break;
+ }
+
+ if (!srv_file_check_mode(logfilename)) {
+ return(DB_ERROR);
+ }
+
+ err = open_log_file(&files[i], logfilename, &size);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ut_a(size != (os_offset_t) -1);
+
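+ /* The low UNIV_PAGE_SIZE_SHIFT bits must all be zero,
+ i.e. the file size must be a multiple of the page size. */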
+ if (size & ((1 << UNIV_PAGE_SIZE_SHIFT) - 1)) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Log file %s size "
+ UINT64PF " is not a multiple of"
+ " innodb_page_size",
+ logfilename, size);
+ return(DB_ERROR);
+ }
+
+ size >>= UNIV_PAGE_SIZE_SHIFT;
+
+ if (i == 0) {
+ srv_log_file_size = size;
+ } else if (size != srv_log_file_size) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Log file %s is"
+ " of different size " UINT64PF " bytes"
+ " than other log"
+ " files " UINT64PF " bytes!",
+ logfilename,
+ size << UNIV_PAGE_SIZE_SHIFT,
+ (os_offset_t) srv_log_file_size
+ << UNIV_PAGE_SIZE_SHIFT);
+ return(DB_ERROR);
+ }
+ }
+
+ srv_n_log_files_found = i;
+
+ /* Create the in-memory file space objects. */
+
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", 0);
+
+ fil_space_create(logfilename,
+ SRV_LOG_SPACE_FIRST_ID,
+ fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
+ FIL_LOG);
+
+ ut_a(fil_validate());
+
+ /* srv_log_file_size is measured in pages; if page size is 16KB,
+ then we have a limit of 64TB on 32 bit systems */
+ ut_a(srv_log_file_size <= ULINT_MAX);
+
+ for (unsigned j = 0; j < i; j++) {
+ sprintf(logfilename + dirnamelen, "ib_logfile%u", j);
+
+ if (!fil_node_create(logfilename,
+ (ulint) srv_log_file_size,
+ SRV_LOG_SPACE_FIRST_ID, FALSE)) {
+ return(DB_ERROR);
+ }
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Create the file space object for archived logs. Under
+ MySQL, no archiving is ever done. */
+ fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1,
+ 0, FIL_LOG);
+#endif /* UNIV_LOG_ARCHIVE */
+ log_group_init(0, i, srv_log_file_size * UNIV_PAGE_SIZE,
+ SRV_LOG_SPACE_FIRST_ID,
+ SRV_LOG_SPACE_FIRST_ID + 1);
+ }
+
+files_checked:
+ /* Open all log files and data files in the system
+ tablespace: we keep them open until database
+ shutdown */
+
+ fil_open_log_and_system_tablespace_files();
+
+ err = srv_undo_tablespaces_init(
+ create_new_db,
+ srv_undo_tablespaces,
+ &srv_undo_tablespaces_open);
+
+ /* If force recovery is set very high then we carry on regardless
+ of all errors. Basically this is fingers-crossed mode. */
+
+ if (err != DB_SUCCESS
+ && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+ return(err);
+ }
+
+ /* Initialize objects used by dict stats gathering thread, which
+ can also be used by recovery if it tries to drop some table */
+ if (!srv_read_only_mode) {
+ dict_stats_thread_init();
+ }
+
+ trx_sys_file_format_init();
+
+ trx_sys_create();
+
+ if (create_new_db) {
+
+ ut_a(!srv_read_only_mode);
+
+ mtr_start(&mtr);
+
+ fsp_header_init(0, sum_of_new_sizes, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* To maintain backward compatibility we create only
+ the first rollback segment before the double write buffer.
+ All the remaining rollback segments will be created later,
+ after the double write buffer has been created. */
+ trx_sys_create_sys_pages();
+
+ ib_bh = trx_sys_init_at_db_start();
+ n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ /* The purge system needs to create the purge view and
+ therefore requires that the trx_sys is inited. */
+
+ trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+ err = dict_create();
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+
+ bool success = buf_flush_list(ULINT_MAX, LSN_MAX, NULL);
+ ut_a(success);
+
+ min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ /* Stamp the LSN to the data files. */
+ fil_write_flushed_lsn_to_data_files(max_flushed_lsn, 0);
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ create_log_files_rename(logfilename, dirnamelen,
+ max_flushed_lsn, logfile0);
+#ifdef UNIV_LOG_ARCHIVE
+ } else if (srv_archive_recovery) {
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ " Starting archive recovery from a backup...");
+
+ err = recv_recovery_from_archive_start(
+ min_flushed_lsn, srv_archive_recovery_limit_lsn,
+ min_arch_log_no);
+ if (err != DB_SUCCESS) {
+
+ return(DB_ERROR);
+ }
+ /* Since ibuf init is in dict_boot, and ibuf is needed
+ in any disk i/o, first call dict_boot */
+
+ err = dict_boot();
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ib_bh = trx_sys_init_at_db_start();
+ n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ /* The purge system needs to create the purge view and
+ therefore requires that the trx_sys is inited. */
+
+ trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+
+ recv_recovery_from_archive_finish();
+#endif /* UNIV_LOG_ARCHIVE */
+ } else {
+
+ /* Check if we support the max format that is stamped
+ on the system tablespace.
+ Note: We are NOT allowed to make any modifications to
+ the TRX_SYS_PAGE_NO page before recovery because this
+ page also contains the max_trx_id etc. important system
+ variables that are required for recovery. We need to
+ ensure that we return the system to a state where normal
+ recovery is guaranteed to work. We do this by
+ invalidating the buffer cache; this will force the
+ reread of the page and restoration to its last known
+ consistent state, which is REQUIRED for the recovery
+ process to work. */
+ err = trx_sys_file_format_max_check(
+ srv_max_file_format_at_startup);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Invalidate the buffer pool to ensure that we reread
+ the page that we read above, during recovery.
+ Note that this is not as heavy weight as it seems. At
+ this point there will be only ONE page in the buf_LRU
+ and there must be no page in the buf_flush list. */
+ buf_pool_invalidate();
+
+ /* We always try to do a recovery, even if the database had
+ been shut down normally: this is the normal startup path */
+
+ err = recv_recovery_from_checkpoint_start(
+ LOG_CHECKPOINT, LSN_MAX,
+ min_flushed_lsn, max_flushed_lsn);
+
+ if (err != DB_SUCCESS) {
+
+ return(DB_ERROR);
+ }
+
+ /* Since the insert buffer init is in dict_boot, and the
+ insert buffer is needed in any disk i/o, first we call
+ dict_boot(). Note that trx_sys_init_at_db_start() only needs
+ to access space 0, and the insert buffer at this stage already
+ works for space 0. */
+
+ err = dict_boot();
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ ib_bh = trx_sys_init_at_db_start();
+ n_recovered_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list);
+
+ /* The purge system needs to create the purge view and
+ therefore requires that the trx_sys is inited. */
+
+ trx_purge_sys_create(srv_n_purge_threads, ib_bh);
+
+ /* recv_recovery_from_checkpoint_finish needs trx lists which
+ are initialized in trx_sys_init_at_db_start(). */
+
+ recv_recovery_from_checkpoint_finish();
+
+ if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
+ /* The following call is necessary for the insert
+ buffer to work with multiple tablespaces. We must
+ know the mapping between space id's and .ibd file
+ names.
+
+ In a crash recovery, we check that the info in data
+ dictionary is consistent with what we already know
+ about space id's from the call of
+ fil_load_single_table_tablespaces().
+
+ In a normal startup, we create the space objects for
+ every table in the InnoDB data dictionary that has
+ an .ibd file.
+
+ We also determine the maximum tablespace id used. */
+ dict_check_t dict_check;
+
+ if (recv_needed_recovery) {
+ dict_check = DICT_CHECK_ALL_LOADED;
+ } else if (n_recovered_trx) {
+ dict_check = DICT_CHECK_SOME_LOADED;
+ } else {
+ dict_check = DICT_CHECK_NONE_LOADED;
+ }
+
+ dict_check_tablespaces_and_store_max_id(dict_check);
+ }
+
+ if (!srv_force_recovery
+ && !recv_sys->found_corrupt_log
+ && (srv_log_file_size_requested != srv_log_file_size
+ || srv_n_log_files_found != srv_n_log_files)) {
+ /* Prepare to replace the redo log files. */
+
+ if (srv_read_only_mode) {
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Cannot resize log files "
+ "in read-only mode.");
+ return(DB_READ_ONLY);
+ }
+
+ /* Clean the buffer pool. */
+ bool success = buf_flush_list(
+ ULINT_MAX, LSN_MAX, NULL);
+ ut_a(success);
+
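+ /* The RECOVERY_CRASH(n) calls below are crash-injection
+ points: a debug setting can make the server abort
+ deliberately at stage n, so that tests can exercise
+ crash recovery at every step of the redo log resize. */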
+ RECOVERY_CRASH(1);
+
+ min_flushed_lsn = max_flushed_lsn = log_get_lsn();
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Resizing redo log from %u*%u to %u*%u pages"
+ ", LSN=" LSN_PF,
+ (unsigned) i,
+ (unsigned) srv_log_file_size,
+ (unsigned) srv_n_log_files,
+ (unsigned) srv_log_file_size_requested,
+ max_flushed_lsn);
+
+ buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
+
+ RECOVERY_CRASH(2);
+
+ /* Flush the old log files. */
+ log_buffer_flush_to_disk();
+ /* If innodb_flush_method=O_DSYNC,
+ we need to explicitly flush the log buffers. */
+ fil_flush(SRV_LOG_SPACE_FIRST_ID);
+
+ ut_ad(max_flushed_lsn == log_get_lsn());
+
+ /* Prohibit redo log writes from any other
+ threads until creating a log checkpoint at the
+ end of create_log_files(). */
+ ut_d(recv_no_log_write = TRUE);
+ ut_ad(!buf_pool_check_no_pending_io());
+
+ RECOVERY_CRASH(3);
+
+ /* Stamp the LSN to the data files. */
+ fil_write_flushed_lsn_to_data_files(
+ max_flushed_lsn, 0);
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ RECOVERY_CRASH(4);
+
+ /* Close and free the redo log files, so that
+ we can replace them. */
+ fil_close_log_files(true);
+
+ RECOVERY_CRASH(5);
+
+ /* Free the old log file space. */
+ log_group_close_all();
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Starting to delete and rewrite log files.");
+
+ srv_log_file_size = srv_log_file_size_requested;
+
+ err = create_log_files(create_new_db, logfilename,
+ dirnamelen, max_flushed_lsn,
+ logfile0);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ create_log_files_rename(logfilename, dirnamelen,
+ max_flushed_lsn, logfile0);
+ }
+
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+ recv_recovery_rollback_active();
+
+ /* It is possible that file_format tag has never
+ been set. In this case we initialize it to minimum
+ value. Important to note that we can do it ONLY after
+ we have finished the recovery process so that the
+ image of TRX_SYS_PAGE_NO is not stale. */
+ trx_sys_file_format_tag_init();
+ }
+
+ if (!create_new_db && sum_of_new_sizes > 0) {
+ /* New data file(s) were added */
+ mtr_start(&mtr);
+
+ fsp_header_inc_size(0, sum_of_new_sizes, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Immediately write the log record about increased tablespace
+ size to disk, so that it is durable even if mysqld would crash
+ quickly */
+
+ log_buffer_flush_to_disk();
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Archiving is always off under MySQL */
+ if (!srv_log_archive_on) {
+ ut_a(DB_SUCCESS == log_archive_noarchivelog());
+ } else {
+ mutex_enter(&(log_sys->mutex));
+
+ start_archive = FALSE;
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ start_archive = TRUE;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (start_archive) {
+ ut_a(DB_SUCCESS == log_archive_archivelog());
+ }
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* fprintf(stderr, "Max allowed record size %lu\n",
+ page_get_free_space_of_empty() / 2); */
+
+ if (buf_dblwr == NULL) {
+ /* Create the doublewrite buffer to a new tablespace */
+
+ buf_dblwr_create();
+ }
+
+ /* Here the double write buffer has already been created and so
+ any new rollback segments will be allocated after the double
+ write buffer. The default segment should already exist.
+ We create the new segments only if it's a new database or
+ the database was shut down cleanly. */
+
+ /* Note: When creating the extra rollback segments during an upgrade
+ we violate the latching order, even if the change buffer is empty.
+ We make an exception in sync0sync.cc and check srv_is_being_started
+ for that violation. It cannot create a deadlock because we are
+ essentially still running in single-threaded mode. Only the IO threads
+ should be running at this stage. */
+
+ ut_a(srv_undo_logs > 0);
+ ut_a(srv_undo_logs <= TRX_SYS_N_RSEGS);
+
+ /* The number of rsegs that exist in InnoDB is given by status
+ variable srv_available_undo_logs. The number of rsegs to use can
+ be set using the dynamic global variable srv_undo_logs. */
+
+ srv_available_undo_logs = trx_sys_create_rsegs(
+ srv_undo_tablespaces, srv_undo_logs);
+
+ if (srv_available_undo_logs == ULINT_UNDEFINED) {
+ /* Can only happen if server is read only. */
+ ut_a(srv_read_only_mode);
+ srv_undo_logs = ULONG_UNDEFINED;
+ }
+
+ if (!srv_read_only_mode) {
+ /* Create the thread which watches the timeouts
+ for lock waits */
+ os_thread_create(
+ lock_wait_timeout_thread,
+ NULL, thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which warns of long semaphore waits */
+ os_thread_create(
+ srv_error_monitor_thread,
+ NULL, thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which prints InnoDB monitor info */
+ os_thread_create(
+ srv_monitor_thread,
+ NULL, thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+ }
+
+ /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */
+ err = dict_create_or_check_foreign_constraint_tables();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Create the SYS_TABLESPACES system table */
+ err = dict_create_or_check_sys_tablespace();
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ srv_is_being_started = FALSE;
+
+ ut_a(trx_purge_state() == PURGE_STATE_INIT);
+
+ /* Create the master thread which does purge and other utility
+ operations */
+
+ if (!srv_read_only_mode) {
+
+ os_thread_create(
+ srv_master_thread,
+ NULL, thread_ids + (1 + SRV_MAX_N_IO_THREADS));
+ }
+
+ if (!srv_read_only_mode
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
+
+ os_thread_create(
+ srv_purge_coordinator_thread,
+ NULL, thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+
+ ut_a(UT_ARR_SIZE(thread_ids)
+ > 5 + srv_n_purge_threads + SRV_MAX_N_IO_THREADS);
+
+ /* We've already created the purge coordinator thread above. */
+ for (i = 1; i < srv_n_purge_threads; ++i) {
+ os_thread_create(
+ srv_worker_thread, NULL,
+ thread_ids + 5 + i + SRV_MAX_N_IO_THREADS);
+ }
+
+ srv_start_wait_for_purge_to_start();
+
+ } else {
+ purge_sys->state = PURGE_STATE_DISABLED;
+ }
+
+ if (!srv_read_only_mode) {
+ os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
+ }
+
+#ifdef UNIV_DEBUG
+ /* buf_debug_prints = TRUE; */
+#endif /* UNIV_DEBUG */
+ sum_of_data_file_sizes = 0;
+
+ for (i = 0; i < srv_n_data_files; i++) {
+ sum_of_data_file_sizes += srv_data_file_sizes[i];
+ }
+
+ tablespace_size_in_header = fsp_header_get_tablespace_size();
+
+ if (!srv_read_only_mode
+ && !srv_auto_extend_last_data_file
+ && sum_of_data_file_sizes != tablespace_size_in_header) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: tablespace size"
+ " stored in header is %lu pages, but\n",
+ (ulong) tablespace_size_in_header);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: the sum of data file sizes is %lu pages\n",
+ (ulong) sum_of_data_file_sizes);
+
+ if (srv_force_recovery == 0
+ && sum_of_data_file_sizes < tablespace_size_in_header) {
+ /* This is a fatal error, the tail of a tablespace is
+ missing */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot start InnoDB."
+ " The tail of the system tablespace is\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: missing. Have you edited"
+ " innodb_data_file_path in my.cnf in an\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: inappropriate way, removing"
+ " ibdata files from there?\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: You can set innodb_force_recovery=1"
+ " in my.cnf to force\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: a startup if you are trying"
+ " to recover a badly corrupt database.\n");
+
+ return(DB_ERROR);
+ }
+ }
+
+ if (!srv_read_only_mode
+ && srv_auto_extend_last_data_file
+ && sum_of_data_file_sizes < tablespace_size_in_header) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: tablespace size stored in header"
+ " is %lu pages, but\n",
+ (ulong) tablespace_size_in_header);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: the sum of data file sizes"
+ " is only %lu pages\n",
+ (ulong) sum_of_data_file_sizes);
+
+ if (srv_force_recovery == 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot start InnoDB. The tail of"
+ " the system tablespace is\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: missing. Have you edited"
+ " innodb_data_file_path in my.cnf in an\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: inappropriate way, removing"
+ " ibdata files from there?\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: You can set innodb_force_recovery=1"
+ " in my.cnf to force\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: a startup if you are trying to"
+ " recover a badly corrupt database.\n");
+
+ return(DB_ERROR);
+ }
+ }
+
+ /* Check that os_fast_mutexes work as expected */
+ os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &srv_os_test_mutex);
+
+ if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: pthread_mutex_trylock returns"
+ " an unexpected value on\n");
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: success! Cannot continue.\n");
+ exit(1);
+ }
+
+ os_fast_mutex_unlock(&srv_os_test_mutex);
+
+ os_fast_mutex_lock(&srv_os_test_mutex);
+
+ os_fast_mutex_unlock(&srv_os_test_mutex);
+
+ os_fast_mutex_free(&srv_os_test_mutex);
+
+ if (srv_print_verbose_log) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "%s started; log sequence number " LSN_PF "",
+ INNODB_VERSION_STR, srv_start_lsn);
+ }
+
+ if (srv_force_recovery > 0) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "!!! innodb_force_recovery is set to %lu !!!",
+ (ulong) srv_force_recovery);
+ }
+
+ if (srv_force_recovery == 0) {
+ /* In the insert buffer we may have even bigger tablespace
+ id's, because we may have dropped those tablespaces, but
+ insert buffer merge has not had time to clean the records from
+ the ibuf tree. */
+
+ ibuf_update_max_tablespace_id();
+ }
+
+ if (!srv_read_only_mode) {
+ /* Create the buffer pool dump/load thread */
+ os_thread_create(buf_dump_thread, NULL, NULL);
+
+ /* Create the dict stats gathering thread */
+ os_thread_create(dict_stats_thread, NULL, NULL);
+
+ /* Create the thread that will optimize the FTS sub-system. */
+ fts_optimize_init();
+ }
+
+ srv_was_started = TRUE;
+
+ return(DB_SUCCESS);
+}
+
+#if 0
+/********************************************************************
+Sync all FTS cache before shutdown */
+static
+void
+srv_fts_close(void)
+/*===============*/
+{
+ dict_table_t* table;
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ table; table = UT_LIST_GET_NEXT(table_LRU, table)) {
+ fts_t* fts = table->fts;
+
+ if (fts != NULL) {
+ fts_sync_table(table);
+ }
+ }
+
+ for (table = UT_LIST_GET_FIRST(dict_sys->table_non_LRU);
+ table; table = UT_LIST_GET_NEXT(table_LRU, table)) {
+ fts_t* fts = table->fts;
+
+ if (fts != NULL) {
+ fts_sync_table(table);
+ }
+ }
+}
+#endif
+
+/****************************************************************//**
+Shuts down the InnoDB database.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+innobase_shutdown_for_mysql(void)
+/*=============================*/
+{
+ ulint i;
+
+ if (!srv_was_started) {
+ if (srv_is_being_started) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Shutting down an improperly started, "
+ "or created database!");
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ if (!srv_read_only_mode) {
+ /* Shutdown the FTS optimize sub system. */
+ fts_optimize_start_shutdown();
+
+ fts_optimize_end();
+ }
+
+ /* 1. Flush the buffer pool to disk, write the current lsn to
+ the tablespace header(s), and copy all log data to archive.
+ Step 1 is the real InnoDB shutdown. The remaining steps
+ just free data structures after the shutdown. */
+
+ logs_empty_and_mark_files_at_shutdown();
+
+ if (srv_conc_get_active_threads() != 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Query counter shows %ld queries still "
+ "inside InnoDB at shutdown",
+ srv_conc_get_active_threads());
+ }
+
+ /* 2. Make all threads created by InnoDB exit */
+
+ srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
+
+ /* All threads end up waiting for certain events. Put those events
+ to the signaled state. Then the threads will exit themselves after
+ os_event_wait(). */
+
+ for (i = 0; i < 1000; i++) {
+ /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
+ HERE OR EARLIER */
+
+ if (!srv_read_only_mode) {
+ /* a. Let the lock timeout thread exit */
+ os_event_set(lock_sys->timeout_event);
+
+ /* b. srv error monitor thread exits automatically,
+ no need to do anything here */
+
+ /* c. We wake the master thread so that it exits */
+ srv_wake_master_thread();
+
+ /* d. Wakeup purge threads. */
+ srv_purge_wakeup();
+ }
+
+ /* e. Exit the i/o threads */
+
+ os_aio_wake_all_threads_at_shutdown();
+
+ /* f. dict_stats_thread is signaled from
+ logs_empty_and_mark_files_at_shutdown() and should have
+ already quit or is quitting right now. */
+
+ os_mutex_enter(os_sync_mutex);
+
+ if (os_thread_count == 0) {
+ /* All the threads have exited or are just exiting;
+ NOTE that the threads may not have completed their
+ exit yet. Should we use pthread_join() to make sure
+ they have exited? If we did, we would have to
+ remove the pthread_detach() from
+ os_thread_exit(). Now we just sleep 0.1
+ seconds and hope that is enough! */
+
+ os_mutex_exit(os_sync_mutex);
+
+ os_thread_sleep(100000);
+
+ break;
+ }
+
+ os_mutex_exit(os_sync_mutex);
+
+ os_thread_sleep(100000);
+ }
+
+ if (i == 1000) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "%lu threads created by InnoDB"
+ " had not exited at shutdown!",
+ (ulong) os_thread_count);
+ }
+
+ if (srv_monitor_file) {
+ fclose(srv_monitor_file);
+ srv_monitor_file = 0;
+ if (srv_monitor_file_name) {
+ unlink(srv_monitor_file_name);
+ mem_free(srv_monitor_file_name);
+ }
+ }
+
+ if (srv_dict_tmpfile) {
+ fclose(srv_dict_tmpfile);
+ srv_dict_tmpfile = 0;
+ }
+
+ if (srv_misc_tmpfile) {
+ fclose(srv_misc_tmpfile);
+ srv_misc_tmpfile = 0;
+ }
+
+ if (!srv_read_only_mode) {
+ dict_stats_thread_deinit();
+ }
+
+ /* This must be disabled before closing the buffer pool
+ and closing the data dictionary. */
+ btr_search_disable();
+
+ ibuf_close();
+ log_shutdown();
+ lock_sys_close();
+ trx_sys_file_format_close();
+ trx_sys_close();
+
+ /* We don't create these mutexes in RO mode because we don't create
+ the temp files that they cover. */
+ if (!srv_read_only_mode) {
+ mutex_free(&srv_monitor_file_mutex);
+ mutex_free(&srv_dict_tmpfile_mutex);
+ mutex_free(&srv_misc_tmpfile_mutex);
+ }
+
+ dict_close();
+ btr_search_sys_free();
+
+ /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
+ them */
+ os_aio_free();
+ que_close();
+ row_mysql_close();
+ srv_mon_free();
+ sync_close();
+ srv_free();
+ fil_close();
+
+ /* 4. Free the os_conc_mutex and all os_events and os_mutexes */
+
+ os_sync_free();
+
+ /* 5. Free all allocated memory */
+
+ pars_lexer_close();
+ log_mem_free();
+ buf_pool_free(srv_buf_pool_instances);
+ mem_close();
+
+ /* ut_free_all_mem() frees all allocated memory not freed yet
+ in shutdown, and it will also free the ut_list_mutex, so it
+ should be the last operation of all */
+ ut_free_all_mem();
+
+ if (os_thread_count != 0
+ || os_event_count != 0
+ || os_mutex_count != 0
+ || os_fast_mutex_count != 0) {
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Some resources were not cleaned up in shutdown: "
+ "threads %lu, events %lu, os_mutexes %lu, "
+ "os_fast_mutexes %lu",
+ (ulong) os_thread_count, (ulong) os_event_count,
+ (ulong) os_mutex_count, (ulong) os_fast_mutex_count);
+ }
+
+ if (dict_foreign_err_file) {
+ fclose(dict_foreign_err_file);
+ }
+
+ if (srv_print_verbose_log) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Shutdown completed; log sequence number " LSN_PF "",
+ srv_shutdown_lsn);
+ }
+
+ srv_was_started = FALSE;
+ srv_start_has_been_called = FALSE;
+
+ return(DB_SUCCESS);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+
+/********************************************************************
+Signal all per-table background threads to shutdown, and wait for them to do
+so. */
+UNIV_INTERN
+void
+srv_shutdown_table_bg_threads(void)
+/*===============================*/
+{
+ dict_table_t* table;
+ dict_table_t* first;
+ dict_table_t* last = NULL;
+
+ mutex_enter(&dict_sys->mutex);
+
+ /* Signal all threads that they should stop. */
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ first = table;
+ while (table) {
+ dict_table_t* next;
+ fts_t* fts = table->fts;
+
+ if (fts != NULL) {
+ fts_start_shutdown(table, fts);
+ }
+
+ next = UT_LIST_GET_NEXT(table_LRU, table);
+
+ if (!next) {
+ last = table;
+ }
+
+ table = next;
+ }
+
+ /* We must release dict_sys->mutex here; if we hold on to it in the
+ loop below, we will deadlock if any of the background threads try to
+ acquire it (for example, the FTS thread by calling que_eval_sql).
+
+ Releasing it here and going through dict_sys->table_LRU without
+ holding it is safe because:
+
+ a) MySQL only starts the shutdown procedure after all client
+ threads have been disconnected and no new ones are accepted, so no
+ new tables are added or old ones dropped.
+
+ b) Despite its name, the list is not LRU, and the order stays
+ fixed.
+
+ To safeguard against the above assumptions ever changing, we store
+ the first and last items in the list above, and then check that
+ they've stayed the same below. */
+
+ mutex_exit(&dict_sys->mutex);
+
+ /* Wait for the threads of each table to stop. This is not inside
+ the above loop, because by signaling all the threads first we can
+ overlap their shutting down delays. */
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+ ut_a(first == table);
+ while (table) {
+ dict_table_t* next;
+ fts_t* fts = table->fts;
+
+ if (fts != NULL) {
+ fts_shutdown(table, fts);
+ }
+
+ next = UT_LIST_GET_NEXT(table_LRU, table);
+
+ if (table == last) {
+ ut_a(!next);
+ }
+
+ table = next;
+ }
+}
+
+/*****************************************************************//**
+Get the meta-data filename from the table name. */
+UNIV_INTERN
+void
+srv_get_meta_data_filename(
+/*=======================*/
+ dict_table_t* table, /*!< in: table */
+ char* filename, /*!< out: filename */
+ ulint max_len) /*!< in: filename max length */
+{
+ ulint len;
+ char* path;
+ char* suffix;
+ static const ulint suffix_len = strlen(".cfg");
+
+ if (DICT_TF_HAS_DATA_DIR(table->flags)) {
+ dict_get_and_save_data_dir_path(table, false);
+ ut_a(table->data_dir_path);
+
+ path = os_file_make_remote_pathname(
+ table->data_dir_path, table->name, "cfg");
+ } else {
+ path = fil_make_ibd_name(table->name, false);
+ }
+
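+ /* For example, a table named "db/t1" (a hypothetical
+ name) without DATA DIRECTORY yields "<datadir>/db/t1.ibd"
+ above, which the suffix handling below rewrites to
+ "<datadir>/db/t1.cfg". */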
+ ut_a(path);
+ len = ut_strlen(path);
+ ut_a(max_len >= len);
+
+ suffix = path + (len - suffix_len);
+ if (strncmp(suffix, ".cfg", suffix_len) == 0) {
+ strcpy(filename, path);
+ } else {
+ ut_ad(strncmp(suffix, ".ibd", suffix_len) == 0);
+
+ strncpy(filename, path, len - suffix_len);
+ suffix = filename + (len - suffix_len);
+ strcpy(suffix, ".cfg");
+ }
+
+ mem_free(path);
+
+ srv_normalize_path_for_win(filename);
+}
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
new file mode 100644
index 00000000000..d56d328d8c3
--- /dev/null
+++ b/storage/innobase/sync/sync0arr.cc
@@ -0,0 +1,1156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0arr.cc
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0arr.h"
+#ifdef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "os0sync.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "srv0srv.h"
+#include "ha_prototypes.h"
+
+/*
+ WAIT ARRAY
+ ==========
+
+The wait array consists of cells, each of which has an
+operating system event object created for it. The threads
+waiting for a mutex, for example, can reserve a cell
+in the array and suspend themselves to wait for the event
+to become signaled. When using the wait array, remember to make
+sure that some thread holding the synchronization object
+will eventually know that there is a waiter in the array and
+signal the object, to prevent an infinite wait.
+Why did we choose to implement a wait array? First, to make
+mutexes fast, we had to code our own implementation of them,
+which only in rare cases resorts to using
+slow operating system primitives. Then we had the choice of
+assigning a unique OS event for each mutex, which would
+be simpler, or using a global wait array. In some operating systems,
+the global wait array solution is more efficient and flexible,
+because we can do with a very small number of OS events,
+say 200. In NT 3.51, allocating events seems to be a quadratic
+algorithm: 10 000 events are created quickly, but creating
+100 000 events takes a couple of minutes.
+
+As of 5.0.30 the above mentioned design is changed. Since modern
+operating systems can handle millions of wait events efficiently,
+we no longer have the concept of each cell of the wait array
+having one event. Instead, the event that a thread wants to wait
+on is now embedded in the wait object (mutex or rw_lock). We still
+keep the global wait array for the sake of diagnostics and also to
+avoid infinite waits: the error_monitor thread scans the global
+wait array and signals any waiting threads that have missed their
+signal. */
+
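+/* A minimal usage sketch of the protocol described above, using the
+functions defined later in this file; `mutex' stands for the
+ib_mutex_t being waited on, and retry loops are omitted: */
+#if 0
+	sync_array_t*	arr = sync_array_get();
+	ulint		index;
+
+	if (sync_array_reserve_cell(arr, (void*) mutex, SYNC_MUTEX,
+				    __FILE__, __LINE__, &index)) {
+		/* The cell's event was reset inside reserve_cell and
+		its signal_count recorded, so a signal arriving between
+		that reset and the wait below is not lost. */
+		sync_array_wait_event(arr, index);
+		/* sync_array_wait_event frees the cell itself. */
+	}
+#endif
+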
+/** A cell where an individual thread may wait suspended
+until a resource is released. The suspending is implemented
+using an operating system event semaphore. */
+struct sync_cell_t {
+ void* wait_object; /*!< pointer to the object the
+ thread is waiting for; if NULL
+ the cell is free for use */
+ ib_mutex_t* old_wait_mutex; /*!< the latest wait mutex in cell */
+ rw_lock_t* old_wait_rw_lock;
+ /*!< the latest wait rw-lock
+ in cell */
+ ulint request_type; /*!< lock type requested on the
+ object */
+ const char* file; /*!< in debug version file where
+ requested */
+ ulint line; /*!< in debug version line where
+ requested */
+ os_thread_id_t thread; /*!< thread id of this waiting
+ thread */
+	ibool		waiting;	/*!< TRUE if the thread has already
+					called sync_array_wait_event
+					on this cell */
+ ib_int64_t signal_count; /*!< We capture the signal_count
+ of the wait_object when we
+ reset the event. This value is
+ then passed on to os_event_wait
+ and we wait only if the event
+ has not been signalled in the
+ period between the reset and
+ wait call. */
+ time_t reservation_time;/*!< time when the thread reserved
+ the wait cell */
+};
+
+/* NOTE: It is allowed for a thread to wait
+for an event allocated for the array without owning the
+protecting mutex (depending on the case: OS or database mutex), but
+all changes (set or reset) to the state of the event must be made
+while owning the mutex. */
+
+/** Synchronization array */
+struct sync_array_t {
+ ulint n_reserved; /*!< number of currently reserved
+ cells in the wait array */
+ ulint n_cells; /*!< number of cells in the
+ wait array */
+ sync_cell_t* array; /*!< pointer to wait array */
+ ib_mutex_t mutex; /*!< possible database mutex
+ protecting this data structure */
+ os_ib_mutex_t os_mutex; /*!< Possible operating system mutex
+ protecting the data structure.
+ As this data structure is used in
+ constructing the database mutex,
+ to prevent infinite recursion
+ in implementation, we fall back to
+ an OS mutex. */
+ ulint res_count; /*!< count of cell reservations
+ since creation of the array */
+};
+
+/** User configured sync array size */
+UNIV_INTERN ulong srv_sync_array_size = 32;
+
+/** Locally stored copy of srv_sync_array_size */
+static ulint sync_array_size;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+static sync_array_t** sync_wait_array;
+
+/** count of how many times an object has been signalled */
+static ulint sg_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth); /*!< in: recursion depth */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+static
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n) /*!< in: index */
+{
+ ut_a(arr);
+ ut_a(n < arr->n_cells);
+
+ return(arr->array + n);
+}
+
+/******************************************************************//**
+Reserves the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_enter(
+/*=============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ os_mutex_enter(arr->os_mutex);
+}
+
+/******************************************************************//**
+Releases the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_exit(
+/*============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ os_mutex_exit(arr->os_mutex);
+}
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return own: created wait array */
+static
+sync_array_t*
+sync_array_create(
+/*==============*/
+ ulint n_cells) /*!< in: number of cells in the array
+ to create */
+{
+ ulint sz;
+ sync_array_t* arr;
+
+ ut_a(n_cells > 0);
+
+ /* Allocate memory for the data structures */
+ arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr)));
+ memset(arr, 0x0, sizeof(*arr));
+
+ sz = sizeof(sync_cell_t) * n_cells;
+ arr->array = static_cast<sync_cell_t*>(ut_malloc(sz));
+ memset(arr->array, 0x0, sz);
+
+ arr->n_cells = n_cells;
+
+ /* Then create the mutex to protect the wait array complex */
+ arr->os_mutex = os_mutex_create();
+
+ return(arr);
+}
+
+/******************************************************************//**
+Frees the resources in a wait array. */
+static
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr) /*!< in, own: sync wait array */
+{
+ ut_a(arr->n_reserved == 0);
+
+ sync_array_validate(arr);
+
+ /* Release the mutex protecting the wait array complex */
+
+ os_mutex_free(arr->os_mutex);
+
+ ut_free(arr->array);
+ ut_free(arr);
+}
+
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ ulint i;
+ sync_cell_t* cell;
+ ulint count = 0;
+
+ sync_array_enter(arr);
+
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+ if (cell->wait_object != NULL) {
+ count++;
+ }
+ }
+
+ ut_a(count == arr->n_reserved);
+
+ sync_array_exit(arr);
+}
+
+/*******************************************************************//**
+Returns the event that the thread owning the cell waits for. */
+static
+os_event_t
+sync_cell_get_event(
+/*================*/
+ sync_cell_t* cell) /*!< in: non-empty sync array cell */
+{
+ ulint type = cell->request_type;
+
+ if (type == SYNC_MUTEX) {
+ return(((ib_mutex_t*) cell->wait_object)->event);
+ } else if (type == RW_LOCK_WAIT_EX) {
+ return(((rw_lock_t*) cell->wait_object)->wait_ex_event);
+ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
+ return(((rw_lock_t*) cell->wait_object)->event);
+ }
+}
+
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state.
+@return true if free cell is found, otherwise false */
+UNIV_INTERN
+bool
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index) /*!< out: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+ ulint i;
+
+ ut_a(object);
+ ut_a(index);
+
+ sync_array_enter(arr);
+
+ arr->res_count++;
+
+ /* Reserve a new cell. */
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object == NULL) {
+
+ cell->waiting = FALSE;
+ cell->wait_object = object;
+
+ if (type == SYNC_MUTEX) {
+ cell->old_wait_mutex =
+ static_cast<ib_mutex_t*>(object);
+ } else {
+ cell->old_wait_rw_lock =
+ static_cast<rw_lock_t*>(object);
+ }
+
+ cell->request_type = type;
+
+ cell->file = file;
+ cell->line = line;
+
+ arr->n_reserved++;
+
+ *index = i;
+
+ sync_array_exit(arr);
+
+ /* Make sure the event is reset and also store
+ the value of signal_count at which the event
+ was reset. */
+ event = sync_cell_get_event(cell);
+ cell->signal_count = os_event_reset(event);
+
+ cell->reservation_time = ut_time();
+
+ cell->thread = os_thread_get_curr_id();
+
+ return(true);
+ }
+ }
+
+ /* No free cell found */
+	return(false);
+}
+
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+
+ ut_a(arr);
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object);
+ ut_a(!cell->waiting);
+ ut_ad(os_thread_get_curr_id() == cell->thread);
+
+ event = sync_cell_get_event(cell);
+ cell->waiting = TRUE;
+
+#ifdef UNIV_SYNC_DEBUG
+
+	/* We use simple enter to the mutex below, because if
+	we cannot acquire it at once, mutex_enter would recursively
+	call sync_array routines, leading to trouble.
+	rw_lock_debug_mutex freezes the debug lists. */
+
+ rw_lock_debug_mutex_enter();
+
+ if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
+
+ fputs("########################################\n", stderr);
+ ut_error;
+ }
+
+ rw_lock_debug_mutex_exit();
+#endif
+ sync_array_exit(arr);
+
+ os_event_wait_low(event, cell->signal_count);
+
+ sync_array_free_cell(arr, index);
+}
+
+/******************************************************************//**
+Reports info of a wait array cell. */
+static
+void
+sync_array_cell_print(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_cell_t* cell) /*!< in: sync cell */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
+ ulint writer;
+
+ type = cell->request_type;
+
+ fprintf(file,
+ "--Thread %lu has waited at %s line %lu"
+ " for %.2f seconds the semaphore:\n",
+ (ulong) os_thread_pf(cell->thread),
+ innobase_basename(cell->file), (ulong) cell->line,
+ difftime(time(NULL), cell->reservation_time));
+
+ if (type == SYNC_MUTEX) {
+ /* We use old_wait_mutex in case the cell has already
+ been freed meanwhile */
+ mutex = cell->old_wait_mutex;
+
+ fprintf(file,
+ "Mutex at %p created file %s line %lu, lock var %lu\n"
+#ifdef UNIV_SYNC_DEBUG
+ "Last time reserved in file %s line %lu, "
+#endif /* UNIV_SYNC_DEBUG */
+ "waiters flag %lu\n",
+ (void*) mutex, innobase_basename(mutex->cfile_name),
+ (ulong) mutex->cline,
+ (ulong) mutex->lock_word,
+#ifdef UNIV_SYNC_DEBUG
+ mutex->file_name, (ulong) mutex->line,
+#endif /* UNIV_SYNC_DEBUG */
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+ || type == RW_LOCK_WAIT_EX
+ || type == RW_LOCK_SHARED) {
+
+ fputs(type == RW_LOCK_EX ? "X-lock on"
+ : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
+ : "S-lock on", file);
+
+ rwlock = cell->old_wait_rw_lock;
+
+ fprintf(file,
+ " RW-latch at %p created in file %s line %lu\n",
+ (void*) rwlock, innobase_basename(rwlock->cfile_name),
+ (ulong) rwlock->cline);
+ writer = rw_lock_get_writer(rwlock);
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has"
+ " reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+ "number of readers %lu, waiters flag %lu, "
+ "lock_word: %lx\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rw_lock_get_reader_count(rwlock),
+ (ulong) rwlock->waiters,
+ rwlock->lock_word,
+ innobase_basename(rwlock->last_s_file_name),
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+ (ulong) rwlock->last_x_line);
+ } else {
+ ut_error;
+ }
+
+ if (!cell->waiting) {
+ fputs("wait has ended\n", file);
+ }
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Looks for a cell with the given thread id.
+@return pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+ sync_array_t* arr, /*!< in: wait array */
+ os_thread_id_t thread) /*!< in: thread id */
+{
+ ulint i;
+ sync_cell_t* cell;
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL
+ && os_thread_eq(cell->thread, thread)) {
+
+ return(cell); /* Found */
+ }
+ }
+
+ return(NULL); /* Not found */
+}
+
+/******************************************************************//**
+Recursion step for deadlock detection.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_deadlock_step(
+/*=====================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search
+ started */
+ os_thread_id_t thread, /*!< in: thread to look at */
+ ulint pass, /*!< in: pass value */
+ ulint depth) /*!< in: recursion depth */
+{
+ sync_cell_t* new_cell;
+
+ if (pass != 0) {
+		/* If pass != 0, then we do not know which threads are
+		responsible for releasing the lock, and no deadlock can
+		be detected. */
+
+ return(FALSE);
+ }
+
+ new_cell = sync_array_find_thread(arr, thread);
+
+ if (new_cell == start) {
+ /* Deadlock */
+ fputs("########################################\n"
+ "DEADLOCK of threads detected!\n", stderr);
+
+ return(TRUE);
+
+ } else if (new_cell) {
+ return(sync_array_detect_deadlock(
+ arr, start, new_cell, depth + 1));
+ }
+ return(FALSE);
+}
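+
+/* Example: if thread A waits in cell CA for a mutex held by thread B,
+and B waits in cell CB for an rw-lock x-held by A, then starting from
+CA this step function finds B's cell CB, recurses into it via
+sync_array_detect_deadlock(), finds A's cell CA == start, and reports
+the cycle. */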
+
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth) /*!< in: recursion depth */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+ os_thread_id_t thread;
+ ibool ret;
+ rw_lock_debug_t*debug;
+
+ ut_a(arr);
+ ut_a(start);
+ ut_a(cell);
+ ut_ad(cell->wait_object);
+ ut_ad(os_thread_get_curr_id() == start->thread);
+ ut_ad(depth < 100);
+
+ depth++;
+
+ if (!cell->waiting) {
+
+ return(FALSE); /* No deadlock here */
+ }
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ thread = mutex->thread_id;
+
+			/* Note that mutex->thread_id above may also be
+			OS_THREAD_ID_UNDEFINED, because the thread
+			which held the mutex may not have updated the
+			value yet, or it may have already released the
+			mutex: in this case no deadlock can occur, as
+			the wait array cannot contain a thread with an
+			ID_UNDEFINED value. */
+
+ ret = sync_array_deadlock_step(arr, start, thread, 0,
+ depth);
+ if (ret) {
+ fprintf(stderr,
+ "Mutex %p owned by thread %lu file %s line %lu\n",
+ mutex, (ulong) os_thread_pf(mutex->thread_id),
+ mutex->file_name, (ulong) mutex->line);
+ sync_array_cell_print(stderr, cell);
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE); /* No deadlock */
+
+ } else if (cell->request_type == RW_LOCK_EX
+ || cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ if (((debug->lock_type == RW_LOCK_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || ((debug->lock_type == RW_LOCK_WAIT_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || (debug->lock_type == RW_LOCK_SHARED)) {
+
+				/* The (wait) x-lock request can block
+				indefinitely only if someone (possibly
+				the cell thread itself) holds an s-lock,
+				or someone else (not the cell thread)
+				holds a (wait) x-lock, and that holder
+				is blocked by the start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+print:
+ fprintf(stderr, "rw-lock %p ",
+ (void*) lock);
+ sync_array_cell_print(stderr, cell);
+ rw_lock_debug_print(stderr, debug);
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ if ((debug->lock_type == RW_LOCK_EX)
+ || (debug->lock_type == RW_LOCK_WAIT_EX)) {
+
+				/* The s-lock request can block
+				indefinitely only if someone (possibly
+				the cell thread itself) holds a (wait)
+				x-lock, and that holder is blocked by
+				the start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+ goto print;
+ }
+ }
+ }
+
+ return(FALSE);
+
+ } else {
+ ut_error;
+ }
+
+ return(TRUE); /* Execution never reaches this line: for compiler
+ fooling only */
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Determines if we can wake up the thread waiting for a semaphore. */
+static
+ibool
+sync_arr_cell_can_wake_up(
+/*======================*/
+ sync_cell_t* cell) /*!< in: cell to search */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+
+ os_rmb;
+ if (mutex_get_lock_word(mutex) == 0) {
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ os_rmb;
+ if (lock->lock_word > 0) {
+ /* Either unlocked or only read locked. */
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ /* lock_word == 0 means all readers have left */
+ os_rmb;
+ if (lock->lock_word == 0) {
+
+ return(TRUE);
+ }
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ /* lock_word > 0 means no writer or reserved writer */
+ os_rmb;
+ if (lock->lock_word > 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the cell in array */
+{
+ sync_cell_t* cell;
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object != NULL);
+
+ cell->waiting = FALSE;
+ cell->wait_object = NULL;
+ cell->signal_count = 0;
+
+ ut_a(arr->n_reserved > 0);
+ arr->n_reserved--;
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Increments the signalled count. */
+UNIV_INTERN
+void
+sync_array_object_signalled(void)
+/*=============================*/
+{
+#ifdef HAVE_ATOMIC_BUILTINS
+ (void) os_atomic_increment_ulint(&sg_count, 1);
+#else
+ ++sg_count;
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+static
+void
+sync_array_wake_threads_if_sema_free_low(
+/*=====================================*/
+ sync_array_t* arr) /* in/out: wait array */
+{
+ ulint i = 0;
+ ulint count;
+
+ sync_array_enter(arr);
+
+ for (count = 0; count < arr->n_reserved; ++i) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL) {
+
+ count++;
+
+ if (sync_arr_cell_can_wake_up(cell)) {
+ os_event_t event;
+
+ event = sync_cell_get_event(cell);
+
+ os_event_set(event);
+ }
+ }
+ }
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_array_wake_threads_if_sema_free_low(
+ sync_wait_array[i]);
+ }
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+static
+ibool
+sync_array_print_long_waits_low(
+/*============================*/
+ sync_array_t* arr, /*!< in: sync array instance */
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema, /*!< out: longest-waited-for semaphore */
+ ibool* noticed)/*!< out: TRUE if long wait noticed */
+{
+ ulint i;
+ ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
+ ibool fatal = FALSE;
+ double longest_diff = 0;
+
+ /* For huge tables, skip the check during CHECK TABLE etc... */
+ if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) {
+ return(FALSE);
+ }
+
+#ifdef UNIV_DEBUG_VALGRIND
+ /* Increase the timeouts if running under valgrind because it executes
+	extremely slowly. UNIV_DEBUG_VALGRIND does not necessarily mean that
+ we are running under valgrind but we have no better way to tell.
+ See Bug#58432 innodb.innodb_bug56143 fails under valgrind
+ for an example */
+# define SYNC_ARRAY_TIMEOUT 2400
+ fatal_timeout *= 10;
+#else
+# define SYNC_ARRAY_TIMEOUT 240
+#endif
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ double diff;
+ sync_cell_t* cell;
+ void* wait_object;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ diff = difftime(time(NULL), cell->reservation_time);
+
+ if (diff > SYNC_ARRAY_TIMEOUT) {
+ fputs("InnoDB: Warning: a long semaphore wait:\n",
+ stderr);
+ sync_array_cell_print(stderr, cell);
+ *noticed = TRUE;
+ }
+
+ if (diff > fatal_timeout) {
+ fatal = TRUE;
+ }
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = wait_object;
+ *waiter = cell->thread;
+ }
+ }
+
+#undef SYNC_ARRAY_TIMEOUT
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+{
+ ulint i;
+ ibool fatal = FALSE;
+ ibool noticed = FALSE;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_array_t* arr = sync_wait_array[i];
+
+ sync_array_enter(arr);
+
+ if (sync_array_print_long_waits_low(
+ arr, waiter, sema, &noticed)) {
+
+ fatal = TRUE;
+ }
+
+ sync_array_exit(arr);
+ }
+
+ if (noticed) {
+ ibool old_val;
+
+ fprintf(stderr,
+ "InnoDB: ###### Starts InnoDB Monitor"
+ " for 30 secs to print diagnostic info:\n");
+
+ old_val = srv_print_innodb_monitor;
+
+ /* If some crucial semaphore is reserved, then also the InnoDB
+ Monitor can hang, and we do not get diagnostics. Since in
+ many cases an InnoDB hang is caused by a pwrite() or a pread()
+ call hanging inside the operating system, let us print right
+ now the values of pending calls of these. */
+
+ fprintf(stderr,
+ "InnoDB: Pending preads %lu, pwrites %lu\n",
+ (ulong) os_file_n_pending_preads,
+ (ulong) os_file_n_pending_pwrites);
+
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+
+ os_thread_sleep(30000000);
+
+ srv_print_innodb_monitor = static_cast<my_bool>(old_val);
+ fprintf(stderr,
+ "InnoDB: ###### Diagnostic info printed"
+ " to the standard error stream\n");
+ }
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info_low(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ ulint i;
+ ulint count = 0;
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: reservation count %ld\n",
+ (long) arr->res_count);
+
+ for (i = 0; count < arr->n_reserved; ++i) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL) {
+ count++;
+ sync_array_cell_print(file, cell);
+ }
+ }
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ sync_array_enter(arr);
+
+ sync_array_print_info_low(file, arr);
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Create the primary system wait array(s); they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+ ulint n_threads) /*!< in: Number of slots to
+ create in all arrays */
+{
+ ulint i;
+ ulint n_slots;
+
+ ut_a(sync_wait_array == NULL);
+ ut_a(srv_sync_array_size > 0);
+ ut_a(n_threads > 0);
+
+ sync_array_size = srv_sync_array_size;
+
+ /* We have to use ut_malloc() because the mutex infrastructure
+ hasn't been initialised yet. It is required by mem_alloc() and
+ the heap functions. */
+
+ sync_wait_array = static_cast<sync_array_t**>(
+ ut_malloc(sizeof(*sync_wait_array) * sync_array_size));
+
+ n_slots = 1 + (n_threads - 1) / sync_array_size;
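+
+	/* This is a ceiling division: e.g. with n_threads = 1000 and
+	sync_array_size = 32, n_slots = 1 + 999 / 32 = 32, so the 32
+	arrays provide 32 * 32 = 1024 cells in total. */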
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_wait_array[i] = sync_array_create(n_slots);
+ }
+}
+
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void)
+/*==================*/
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+ sync_array_free(sync_wait_array[i]);
+ }
+
+ ut_free(sync_wait_array);
+ sync_wait_array = NULL;
+}
+
+/**********************************************************************//**
+Print info about the sync array(s). */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+ FILE* file) /*!< in/out: Print to this stream */
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+ sync_array_print_info(file, sync_wait_array[i]);
+ }
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: signal count %ld\n", (long) sg_count);
+
+}
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void)
+/*================*/
+{
+ ulint i;
+ static ulint count;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ i = os_atomic_increment_ulint(&count, 1);
+#else
+ i = count++;
+#endif /* HAVE_ATOMIC_BUILTINS */
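+
+	/* Successive calls cycle through the arrays modulo
+	sync_array_size, spreading waiters across the instances and
+	keeping contention on any single array's OS mutex low. */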
+
+ return(sync_wait_array[i % sync_array_size]);
+}
diff --git a/storage/innobase/sync/sync0rw.cc b/storage/innobase/sync/sync0rw.cc
new file mode 100644
index 00000000000..2d3d16e9065
--- /dev/null
+++ b/storage/innobase/sync/sync0rw.cc
@@ -0,0 +1,1049 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0rw.cc
+The read-write lock (for thread synchronization)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0rw.h"
+#ifdef UNIV_NONINL
+#include "sync0rw.ic"
+#include "sync0arr.ic"
+#endif
+
+#include "os0thread.h"
+#include "mem0mem.h"
+#include "srv0srv.h"
+#include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+#include "ha_prototypes.h"
+
+/*
+ IMPLEMENTATION OF THE RW_LOCK
+ =============================
+The status of a rw_lock is held in lock_word. The initial value of lock_word is
+X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR
+for each x-lock. This describes the lock state for each value of lock_word:
+
+lock_word == X_LOCK_DECR: Unlocked.
+0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers.
+ (X_LOCK_DECR - lock_word) is the
+ number of readers that hold the lock.
+lock_word == 0: Write locked
+-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer.
+ (-lock_word) is the number of readers
+ that hold the lock.
+lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been
+ decremented by X_LOCK_DECR for the first lock
+ and the first recursive lock, then by 1 for
+ each recursive lock thereafter.
+ So the number of locks is:
+			(lock_word == 0) ? 1 : 2 - (lock_word + X_LOCK_DECR)
+
+The lock_word is always read and updated atomically and consistently, so that
+it always represents the state of the lock, and the state of the lock changes
+with a single atomic operation. This lock_word holds all of the information
+that a thread needs in order to determine if it is eligible to gain the lock
+or if it must spin or sleep. The one exception to this is that writer_thread
+must be verified before recursive write locks: to solve this scenario, we make
+writer_thread readable by all threads, but only writeable by the x-lock holder.
+
+The other members of the lock obey the following rules to remain consistent:
+
+recursive: This and the writer_thread field together control the
+ behaviour of recursive x-locking.
+ lock->recursive must be FALSE in following states:
+ 1) The writer_thread contains garbage i.e.: the
+ lock has just been initialized.
+ 2) The lock is not x-held and there is no
+ x-waiter waiting on WAIT_EX event.
+ 3) The lock is x-held or there is an x-waiter
+ waiting on WAIT_EX event but the 'pass' value
+ is non-zero.
+ lock->recursive is TRUE iff:
+ 1) The lock is x-held or there is an x-waiter
+ waiting on WAIT_EX event and the 'pass' value
+ is zero.
+ This flag must be set after the writer_thread field
+ has been updated with a memory ordering barrier.
+ It is unset before the lock_word has been incremented.
+writer_thread: Is used only in recursive x-locking. Can only be safely
+ read iff lock->recursive flag is TRUE.
+ This field is uninitialized at lock creation time and
+ is updated atomically when x-lock is acquired or when
+ move_ownership is called. A thread is only allowed to
+		set the value of this field to its thread_id, i.e. a
+ thread cannot set writer_thread to some other thread's
+ id.
+waiters: May be set to 1 anytime, but to avoid unnecessary wake-up
+ signals, it should only be set to 1 when there are threads
+ waiting on event. Must be 1 when a writer starts waiting to
+ ensure the current x-locking thread sends a wake-up signal
+		during unlock. May only be reset to 0 immediately before
+		a wake-up signal is sent to event. On most platforms, a
+		memory barrier is required after waiters is set, and before
+		verifying lock_word is still held, to ensure some unlocker
+		really does see the flag's new value.
+event: Threads wait on event for read or writer lock when another
+ thread has an x-lock or an x-lock reservation (wait_ex). A
+ thread may only wait on event after performing the following
+ actions in order:
+ (1) Record the counter value of event (with os_event_reset).
+ (2) Set waiters to 1.
+ (3) Verify lock_word <= 0.
+ (1) must come before (2) to ensure signal is not missed.
+ (2) must come before (3) to ensure a signal is sent.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ (1) Verify lock_word == X_LOCK_DECR (unlocked)
+ (2) Reset waiters to 0.
+wait_ex_event: A thread may only wait on the wait_ex_event after it has
+ performed the following actions in order:
+ (1) Decrement lock_word by X_LOCK_DECR.
+ (2) Record counter value of wait_ex_event (os_event_reset,
+ called from sync_array_reserve_cell).
+ (3) Verify that lock_word < 0.
+		(1) must come first to ensure no other thread becomes a reader
+		or the next writer, and to notify the unlocker that a signal
+		must be sent.
+ (2) must come before (3) to ensure the signal is not missed.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ Verify lock_word == 0 (waiting thread holds x_lock)
+*/
+
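+/* A worked decoding of the lock_word encoding above (illustrative
+sketch only; assumes the X_LOCK_DECR constant from sync0rw.h): */
+#if 0
+	lint	lw = lock->lock_word;
+
+	if (lw == X_LOCK_DECR) {
+		/* unlocked */
+	} else if (lw > 0) {
+		/* read locked by (X_LOCK_DECR - lw) readers */
+	} else if (lw == 0) {
+		/* write locked once */
+	} else if (lw > -X_LOCK_DECR) {
+		/* read locked by (-lw) readers, one waiting writer */
+	} else {
+		/* recursively write locked,
+		2 - (lw + X_LOCK_DECR) times */
+	}
+#endif
+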
+UNIV_INTERN rw_lock_stats_t rw_lock_stats;
+
+/* The global list of rw-locks */
+UNIV_INTERN rw_lock_list_t rw_lock_list;
+UNIV_INTERN ib_mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t rw_lock_list_mutex_key;
+UNIV_INTERN mysql_pfs_key_t rw_lock_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+
+UNIV_INTERN ib_mutex_t rw_lock_debug_mutex;
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t rw_lock_debug_mutex_key;
+# endif
+
+/* If deadlock detection does not immediately get the mutex,
+it may wait for this event */
+UNIV_INTERN os_event_t rw_lock_debug_event;
+/* This is set to TRUE, if there may be waiters for the event */
+UNIV_INTERN ibool rw_lock_debug_waiters;
+
+/******************************************************************//**
+Creates a debug info struct. */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void);
+/*======================*/
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info);
+
+/******************************************************************//**
+Creates a debug info struct.
+@return own: debug info struct */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void)
+/*======================*/
+{
+ return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t)));
+}
+
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info)
+{
+ mem_free(info);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ /* If this is the very first time a synchronization object is
+ created, then the following call initializes the sync system. */
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_create(rw_lock_mutex_key, rw_lock_get_mutex(lock),
+ SYNC_NO_ORDER_CHECK);
+
+ lock->mutex.cfile_name = cfile_name;
+ lock->mutex.cline = cline;
+
+ ut_d(lock->mutex.cmutex_name = cmutex_name);
+ ut_d(lock->mutex.ib_mutex_type = 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+# ifdef UNIV_DEBUG
+ UT_NOT_USED(cmutex_name);
+# endif
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ lock->lock_word = X_LOCK_DECR;
+ lock->waiters = 0;
+
+ /* We set this value to signify that lock->writer_thread
+ contains garbage at initialization and cannot be used for
+ recursive x-locking. */
+ lock->recursive = FALSE;
+ /* Silence Valgrind when UNIV_DEBUG_VALGRIND is not enabled. */
+ memset((void*) &lock->writer_thread, 0, sizeof lock->writer_thread);
+ UNIV_MEM_INVALID(&lock->writer_thread, sizeof lock->writer_thread);
+
+#ifdef UNIV_SYNC_DEBUG
+ UT_LIST_INIT(lock->debug_list);
+
+ lock->level = level;
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_d(lock->magic_n = RW_LOCK_MAGIC_N);
+
+ lock->cfile_name = cfile_name;
+ lock->cline = (unsigned int) cline;
+
+ lock->count_os_wait = 0;
+ lock->last_s_file_name = "not yet reserved";
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_s_line = 0;
+ lock->last_x_line = 0;
+ lock->event = os_event_create();
+ lock->wait_ex_event = os_event_create();
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ ut_ad(UT_LIST_GET_FIRST(rw_lock_list) == NULL
+ || UT_LIST_GET_FIRST(rw_lock_list)->magic_n == RW_LOCK_MAGIC_N);
+
+ UT_LIST_ADD_FIRST(list, rw_lock_list, lock);
+
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free_func(
+/*==============*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ ib_mutex_t* mutex;
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+
+ os_rmb;
+ ut_ad(rw_lock_validate(lock));
+ ut_a(lock->lock_word == X_LOCK_DECR);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex = rw_lock_get_mutex(lock);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+
+ os_event_free(lock->event);
+
+ os_event_free(lock->wait_ex_event);
+
+ ut_ad(UT_LIST_GET_PREV(list, lock) == NULL
+ || UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+ ut_ad(UT_LIST_GET_NEXT(list, lock) == NULL
+ || UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+
+ UT_LIST_REMOVE(list, rw_lock_list, lock);
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ ut_d(lock->magic_n = 0);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ /* We have merely removed the rw_lock from the list, the memory
+ has not been freed. Therefore the pointer to mutex is valid. */
+ mutex_free(mutex);
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+ ulint waiters;
+ lint lock_word;
+
+ ut_ad(lock);
+
+ waiters = rw_lock_get_waiters(lock);
+ lock_word = lock->lock_word;
+
+ ut_ad(lock->magic_n == RW_LOCK_MAGIC_N);
+ ut_ad(waiters == 0 || waiters == 1);
+ ut_ad(lock_word > -(2 * X_LOCK_DECR));
+ ut_ad(lock_word <= X_LOCK_DECR);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock
+ will be passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index; /* index of the reserved wait cell */
+ ulint i = 0; /* spin round count */
+ sync_array_t* sync_arr;
+ size_t counter_index;
+
+ /* We reuse the thread id to index into the counter, cache
+ it here for efficiency. */
+
+ counter_index = (size_t) os_thread_get_curr_id();
+
+ ut_ad(rw_lock_validate(lock));
+
+ rw_lock_stats.rw_s_spin_wait_count.add(counter_index, 1);
+lock_loop:
+
+ /* Spin waiting for the writer field to become free */
+ os_rmb;
+ while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+
+ if (i >= SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
+ }
+
+ /* We try once again to obtain the lock */
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+ rw_lock_stats.rw_s_spin_round_count.add(counter_index, i);
+
+ return; /* Success */
+ } else {
+
+ if (i < SYNC_SPIN_ROUNDS) {
+ goto lock_loop;
+ }
+
+ rw_lock_stats.rw_s_spin_round_count.add(counter_index, i);
+
+ sync_arr = sync_array_get_and_reserve_cell(lock,
+ RW_LOCK_SHARED,
+ file_name,
+ line, &index);
+
+ /* Set waiters before checking lock_word to ensure wake-up
+ signal is sent. This may lead to some unnecessary signals. */
+ rw_lock_set_waiter_flag(lock);
+
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+ sync_array_free_cell(sync_arr, index);
+ return; /* Success */
+ }
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_lock_stats.rw_s_os_wait_count.add(counter_index, 1);
+
+ sync_array_wait_event(sync_arr, index);
+
+ i = 0;
+ goto lock_loop;
+ }
+}
+
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current
+thread to be able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock) /*!< in: lock which was x-locked in the
+ buffer read */
+{
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));
+
+ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+}
+
+/******************************************************************//**
+Function for the next writer to call. Waits for readers to exit.
+The caller must have already decremented lock_word by X_LOCK_DECR. */
+UNIV_INLINE
+void
+rw_lock_x_lock_wait(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+#endif
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index;
+ ulint i = 0;
+ sync_array_t* sync_arr;
+ size_t counter_index;
+
+ /* We reuse the thread id to index into the counter, cache
+ it here for efficiency. */
+
+ counter_index = (size_t) os_thread_get_curr_id();
+
+ os_rmb;
+ ut_ad(lock->lock_word <= 0);
+
+ while (lock->lock_word < 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+		if (i < SYNC_SPIN_ROUNDS) {
+ i++;
+ os_rmb;
+ continue;
+ }
+
+ /* If there is still a reader, then go to sleep.*/
+ rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+
+ sync_arr = sync_array_get_and_reserve_cell(lock,
+ RW_LOCK_WAIT_EX,
+ file_name,
+ line, &index);
+
+ i = 0;
+
+ /* Check lock_word to ensure wake-up isn't missed.*/
+ if (lock->lock_word < 0) {
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1);
+
+ /* Add debug info as it is needed to detect possible
+ deadlock. We must add info for WAIT_EX thread for
+ deadlock detection to work properly. */
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
+ file_name, line);
+#endif
+
+ sync_array_wait_event(sync_arr, index);
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(
+ lock, pass, RW_LOCK_WAIT_EX);
+#endif
+ /* It is possible to wake when lock_word < 0.
+ We must pass the while-loop check to proceed.*/
+ } else {
+ sync_array_free_cell(sync_arr, index);
+ }
+ }
+ rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+}
+
+/******************************************************************//**
+Low-level function for acquiring an exclusive lock.
+@return FALSE if did not succeed, TRUE if success. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) {
+
+ /* lock->recursive also tells us if the writer_thread
+ field is stale or active. As we are going to write
+ our own thread id in that field it must be that the
+ current writer_thread value is not active. */
+ ut_a(!lock->recursive);
+
+ /* Decrement occurred: we are writer or next-writer. */
+ rw_lock_set_writer_id_and_recursion_flag(
+ lock, pass ? FALSE : TRUE);
+
+ rw_lock_x_lock_wait(lock,
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ file_name, line);
+
+ } else {
+ os_thread_id_t thread_id = os_thread_get_curr_id();
+
+ if (!pass) {
+ os_rmb;
+ }
+
+ /* Decrement failed: relock or failed lock */
+ if (!pass && lock->recursive
+ && os_thread_eq(lock->writer_thread, thread_id)) {
+ /* Relock */
+ if (lock->lock_word == 0) {
+ lock->lock_word -= X_LOCK_DECR;
+ } else {
+ --lock->lock_word;
+ }
+
+ } else {
+ /* Another thread locked before us */
+ return(FALSE);
+ }
+ }
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, line);
+#endif
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = (unsigned int) line;
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint i; /*!< spin round count */
+ ulint index; /*!< index of the reserved wait cell */
+ sync_array_t* sync_arr;
+ ibool spinning = FALSE;
+ size_t counter_index;
+
+ /* We reuse the thread id to index into the counter, cache
+ it here for efficiency. */
+
+ counter_index = (size_t) os_thread_get_curr_id();
+
+ ut_ad(rw_lock_validate(lock));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ i = 0;
+
+lock_loop:
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+
+ return; /* Locking succeeded */
+
+ } else {
+
+ if (!spinning) {
+ spinning = TRUE;
+
+ rw_lock_stats.rw_x_spin_wait_count.add(
+ counter_index, 1);
+ }
+
+ /* Spin waiting for the lock_word to become free */
+ os_rmb;
+ while (i < SYNC_SPIN_ROUNDS
+ && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+ srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+ if (i >= SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
+ } else {
+ goto lock_loop;
+ }
+ }
+
+ rw_lock_stats.rw_x_spin_round_count.add(counter_index, i);
+
+ sync_arr = sync_array_get_and_reserve_cell(lock, RW_LOCK_EX,
+ file_name, line, &index);
+
+ /* Waiters must be set before checking lock_word, to ensure signal
+ is sent. This could lead to a few unnecessary wake-up signals. */
+ rw_lock_set_waiter_flag(lock);
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ sync_array_free_cell(sync_arr, index);
+ return; /* Locking succeeded */
+ }
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_lock_stats.rw_x_os_wait_count.add(counter_index, 1);
+
+ sync_array_wait_event(sync_arr, index);
+
+ i = 0;
+ goto lock_loop;
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void)
+/*===========================*/
+{
+loop:
+ if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+ return;
+ }
+
+ os_event_reset(rw_lock_debug_event);
+
+ rw_lock_debug_waiters = TRUE;
+
+ if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+ return;
+ }
+
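+	/* The second try above closes the race with
+	rw_lock_debug_mutex_exit(): the event was reset before
+	rw_lock_debug_waiters was set, so a release that happened in
+	between is caught either by that retry or by the wait below. */
+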
+ os_event_wait(rw_lock_debug_event);
+
+ goto loop;
+}
+
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void)
+/*==========================*/
+{
+ mutex_exit(&rw_lock_debug_mutex);
+
+ if (rw_lock_debug_waiters) {
+ rw_lock_debug_waiters = FALSE;
+ os_event_set(rw_lock_debug_event);
+ }
+}
+
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ ulint line) /*!< in: line where requested */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+ ut_ad(file_name);
+
+ info = rw_lock_debug_create();
+
+ rw_lock_debug_mutex_enter();
+
+ info->file_name = file_name;
+ info->line = line;
+ info->lock_type = lock_type;
+ info->thread_id = os_thread_get_curr_id();
+ info->pass = pass;
+
+ UT_LIST_ADD_FIRST(list, lock->debug_list, info);
+
+ rw_lock_debug_mutex_exit();
+
+ if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
+ sync_thread_add_level(lock, lock->level,
+ lock_type == RW_LOCK_EX
+ && lock->lock_word < 0);
+ }
+}
+
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type) /*!< in: lock type */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+
+ if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
+ sync_thread_reset_level(lock);
+ }
+
+ rw_lock_debug_mutex_enter();
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (info != NULL) {
+ if ((pass == info->pass)
+ && ((pass != 0)
+ || os_thread_eq(info->thread_id,
+ os_thread_get_curr_id()))
+ && (info->lock_type == lock_type)) {
+
+ /* Found! */
+ UT_LIST_REMOVE(list, lock->debug_list, info);
+ rw_lock_debug_mutex_exit();
+
+ rw_lock_debug_free(info);
+
+ return;
+ }
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+
+ ut_error;
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0.
+@return TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+ rw_lock_debug_mutex_enter();
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (info != NULL) {
+
+ if (os_thread_eq(info->thread_id, os_thread_get_curr_id())
+ && (info->pass == 0)
+ && (info->lock_type == lock_type)) {
+
+ rw_lock_debug_mutex_exit();
+ /* Found! */
+
+ return(TRUE);
+ }
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ rw_lock_debug_mutex_exit();
+
+ return(FALSE);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode.
+@return TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+{
+ ibool ret = FALSE;
+
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+ if (lock_type == RW_LOCK_SHARED) {
+ if (rw_lock_get_reader_count(lock) > 0) {
+ ret = TRUE;
+ }
+ } else if (lock_type == RW_LOCK_EX) {
+ if (rw_lock_get_writer(lock) == RW_LOCK_EX) {
+ ret = TRUE;
+ }
+ } else {
+ ut_error;
+ }
+
+ return(ret);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file) /*!< in: file where to print */
+{
+ rw_lock_t* lock;
+ ulint count = 0;
+ rw_lock_debug_t* info;
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ fputs("-------------\n"
+ "RW-LATCH INFO\n"
+ "-------------\n", file);
+
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
+
+ count++;
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_enter(&(lock->mutex));
+#endif
+ if (lock->lock_word != X_LOCK_DECR) {
+
+ fprintf(file, "RW-LOCK: %p ", (void*) lock);
+
+ if (rw_lock_get_waiters(lock)) {
+ fputs(" Waiters for the lock exist\n", file);
+ } else {
+ putc('\n', file);
+ }
+
+ rw_lock_debug_mutex_enter();
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+ while (info != NULL) {
+ rw_lock_debug_print(file, info);
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ rw_lock_debug_mutex_exit();
+ }
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_exit(&(lock->mutex));
+#endif
+
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+ fprintf(file, "Total number of rw-locks %ld\n", count);
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+ rw_lock_debug_t* info;
+
+ fprintf(stderr,
+ "-------------\n"
+ "RW-LATCH INFO\n"
+ "RW-LATCH: %p ", (void*) lock);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ /* We used to acquire lock->mutex here, but it would cause a
+ recursive call to sync_thread_add_level() if UNIV_SYNC_DEBUG
+ is defined. Since this function is only invoked from
+ sync_thread_levels_g(), let us choose the lesser evil:
+ performing dirty reads instead of causing bogus deadlocks or
+ assertion failures. */
+#endif
+ if (lock->lock_word != X_LOCK_DECR) {
+
+ if (rw_lock_get_waiters(lock)) {
+ fputs(" Waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+
+ rw_lock_debug_mutex_enter();
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+ while (info != NULL) {
+ rw_lock_debug_print(stderr, info);
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ rw_lock_debug_mutex_exit();
+ }
+}
+
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+ FILE* f, /*!< in: output stream */
+ rw_lock_debug_t* info) /*!< in: debug struct */
+{
+ ulint rwt;
+
+ rwt = info->lock_type;
+
+ fprintf(f, "Locked: thread %lu file %s line %lu ",
+ (ulong) os_thread_pf(info->thread_id), info->file_name,
+ (ulong) info->line);
+ if (rwt == RW_LOCK_SHARED) {
+ fputs("S-LOCK", f);
+ } else if (rwt == RW_LOCK_EX) {
+ fputs("X-LOCK", f);
+ } else if (rwt == RW_LOCK_WAIT_EX) {
+ fputs("WAIT X-LOCK", f);
+ } else {
+ ut_error;
+ }
+ if (info->pass != 0) {
+ fprintf(f, " pass value %lu", (ulong) info->pass);
+ }
+ putc('\n', f);
+}
+
+/***************************************************************//**
+Returns the number of currently locked rw-locks. Works only in the debug
+version.
+@return number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void)
+/*==================*/
+{
+ rw_lock_t* lock;
+ ulint count = 0;
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
+
+ if (lock->lock_word != X_LOCK_DECR) {
+ count++;
+ }
+
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ return(count);
+}
+#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc
new file mode 100644
index 00000000000..60b1798fb0d
--- /dev/null
+++ b/storage/innobase/sync/sync0sync.cc
@@ -0,0 +1,1608 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0sync.cc
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#ifdef UNIV_NONINL
+#include "sync0sync.ic"
+#include "sync0arr.ic"
+#endif
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "buf0types.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#ifdef UNIV_SYNC_DEBUG
+# include "srv0start.h" /* srv_is_being_started */
+#endif /* UNIV_SYNC_DEBUG */
+#include "ha_prototypes.h"
+
+/*
+ REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
+ ============================================
+
+Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc
+takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995
+Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to
+implement our own efficient spin lock mutex. Future operating systems may
+provide efficient spin locks, but we cannot count on that.
+
+Another reason for implementing a spin lock is that on multiprocessor systems
+it can be more efficient for a processor to run a loop waiting for the
+semaphore to be released than to switch to a different thread. A thread switch
+takes 25 us on both platforms mentioned above. See Gray and Reuter's book
+Transaction processing for background.
+
+How long should the spin loop last before suspending the thread? On a
+uniprocessor, spinning does not help at all, because if the thread owning the
+mutex is not executing, it cannot be released. Spinning actually wastes
+resources.
+
+On a multiprocessor, we do not know if the thread owning the mutex is
+executing or not. Thus it would make sense to spin as long as the operation
+guarded by the mutex would typically last, assuming that the thread is
+executing. If the mutex is not released by that time, we may assume that the
+thread owning the mutex is not executing and suspend the waiting thread.
+
+A typical operation (where no i/o is involved) guarded by a mutex or a read-write
+lock may last 1 - 20 us on the current Pentium platform. The longest
+operations are the binary searches on an index node.
+
+We conclude that the best choice is to set the spin time at 20 us. Then the
+system should work well on a multiprocessor. On a uniprocessor we have to
+make sure that thread switches due to mutex collisions are not frequent,
+i.e., they do not happen every 100 us or so, because that wastes too much
+resources. If the thread switches are not frequent, the 20 us wasted in spin
+loop is not too much.
+
+Empirical studies on the effect of spin time should be done for different
+platforms.
+
+
+ IMPLEMENTATION OF THE MUTEX
+ ===========================
+
+For background, see Curt Schimmel's book on Unix implementation on modern
+architectures. The key points in the implementation are atomicity and
+serialization of memory accesses. The test-and-set instruction (XCHG in
+Pentium) must be atomic. As new processors may have weak memory models, also
+serialization of memory references may be necessary. The successor of Pentium,
+P6, has at least one mode where the memory model is weak. As far as we know,
+in Pentium all memory accesses are serialized in the program order and we do
+not have to worry about the memory model. On other processors there are
+special machine instructions called a fence, memory barrier, or storage
+barrier (STBAR in Sparc), which can be used to serialize the memory accesses
+to happen in program order relative to the fence instruction.
+
+Leslie Lamport has devised a "bakery algorithm" to implement a mutex without
+the atomic test-and-set, but his algorithm should be modified for weak memory
+models. We do not use Lamport's algorithm, because we guess it is slower than
+the atomic test-and-set.
+
+Our mutex implementation works as follows: First, we perform the atomic
+test-and-set instruction on the memory word. If the test returns zero, we
+know we got the lock first. If the test returns nonzero, some other thread
+was quicker and got the lock: then we spin in a loop reading the memory word,
+waiting for it to become zero. It is wise to just read the word in the loop, not
+perform numerous test-and-set instructions, because they generate memory
+traffic between the cache and the main memory. The read loop can just access
+the cache, saving bus bandwidth.
+
+If we cannot acquire the mutex lock in the specified time, we reserve a cell
+in the wait array and set the waiters byte in the mutex to 1. To avoid a race
+condition, after setting the waiters byte and before suspending the waiting
+thread, we still have to check that the mutex is reserved, because it may
+have happened that the thread which was holding the mutex has just released
+it and did not see the waiters byte set to 1, a case which would lead the
+other thread to an infinite wait.
+
+LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some
+=======
+thread will eventually call os_event_set() on that particular event.
+Thus no infinite wait is possible in this case.
+
+Proof: After making the reservation the thread sets the waiters field in the
+mutex to 1. Then it checks that the mutex is still reserved by some thread,
+or it reserves the mutex for itself. In any case, some thread (which may be
+also some earlier thread, not necessarily the one currently holding the mutex)
+will set the waiters field to 0 in mutex_exit, and then call
+os_event_set() with the mutex as an argument.
+Q.E.D.
+
+LEMMA 2: If an os_event_set() call is made after some thread has called
+=======
+the os_event_reset() and before it starts to wait on that event, the call
+will not be lost to the second thread. This is true even if there is an
+intervening call to os_event_reset() by another thread.
+Thus no infinite wait is possible in this case.
+
+Proof (non-windows platforms): os_event_reset() returns a monotonically
+increasing value of signal_count. This value is increased at every
+call of os_event_set(). If thread A has called os_event_reset() followed
+by thread B calling os_event_set() and then some other thread C calling
+os_event_reset(), the is_set flag of the event will be set to FALSE;
+but now if thread A calls os_event_wait_low() with the signal_count
+value returned from the earlier call of os_event_reset(), it will
+return immediately without waiting.
+Q.E.D.
+
+Proof (windows): If there is a writer thread which is forced to wait for
+the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX.
+The design of rw_lock ensures that there is one and only one thread
+that is able to change the state to RW_LOCK_WAIT_EX and this thread is
+guaranteed to acquire the lock after it is released by the current
+holders and before any other waiter gets the lock.
+On windows this thread waits on a separate event, i.e., wait_ex_event.
+Since only one thread can wait on this event there is no chance
+of this event getting reset before the writer starts to wait on it.
+Therefore, this thread is guaranteed to catch the os_event_set()
+signal sent unconditionally at the release of the lock.
+Q.E.D. */
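+
+/* A minimal sketch of the acquire protocol described above, written with
+C11 atomics instead of the InnoDB primitives. All demo_* names are
+hypothetical and for illustration only; the block is kept under #if 0 so
+that it is never compiled. */
+#if 0
+#include <stdatomic.h>
+#include <sched.h>
+
+#define DEMO_SPIN_ROUNDS 30
+
+static atomic_int demo_lock_word; /* 0 == free, 1 == reserved */
+
+static void
+demo_mutex_enter(void)
+{
+ int i;
+
+ for (;;) {
+ /* Spin reading the word; the lock is only ever committed
+ with an atomic exchange, as described above. */
+ for (i = 0; i < DEMO_SPIN_ROUNDS
+ && atomic_load(&demo_lock_word) != 0; i++) {
+ /* busy wait */
+ }
+
+ if (atomic_exchange(&demo_lock_word, 1) == 0) {
+ return; /* test-and-set returned zero: we got it */
+ }
+
+ /* Here InnoDB would reserve a wait array cell and set the
+ waiters byte before suspending on an event; this sketch
+ simply yields the processor instead. */
+ sched_yield();
+ }
+}
+
+static void
+demo_mutex_exit(void)
+{
+ atomic_store(&demo_lock_word, 0);
+}
+#endif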
+
+/* Number of spin waits on mutexes: for performance monitoring */
+
+/** The number of iterations in the mutex_spin_wait() spin loop.
+Intended for performance monitoring. */
+static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_round_count;
+/** The number of mutex_spin_wait() calls. Intended for
+performance monitoring. */
+static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_spin_wait_count;
+/** The number of OS waits in mutex_spin_wait(). Intended for
+performance monitoring. */
+static ib_counter_t<ib_int64_t, IB_N_SLOTS> mutex_os_wait_count;
+/** The number of mutex_exit() calls. Intended for performance
+monitoring. */
+UNIV_INTERN ib_int64_t mutex_exit_count;
+
+/** This variable is set to TRUE when sync_init is called */
+UNIV_INTERN ibool sync_initialized = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+/** An acquired mutex or rw-lock and its level in the latching order */
+struct sync_level_t;
+/** Mutexes or rw-locks held by a thread */
+struct sync_thread_t;
+
+/** The latch levels currently owned by threads are stored in this data
+structure; the size of this array is OS_THREAD_MAX_N */
+
+UNIV_INTERN sync_thread_t* sync_thread_level_arrays;
+
+/** Mutex protecting sync_thread_level_arrays */
+UNIV_INTERN ib_mutex_t sync_thread_mutex;
+
+# ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t sync_thread_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Global list of database mutexes (not OS mutexes) created. */
+UNIV_INTERN ut_list_base_node_t mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+UNIV_INTERN ib_mutex_t mutex_list_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t mutex_list_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+UNIV_INTERN ibool sync_order_checks_on = FALSE;
+
+/** Number of slots reserved for each OS thread in the sync level array */
+static const ulint SYNC_THREAD_N_LEVELS = 10000;
+
+/** Array for tracking sync levels per thread. */
+struct sync_arr_t {
+ ulint in_use; /*!< Number of active cells */
+ ulint n_elems; /*!< Number of elements in the array */
+ ulint max_elems; /*!< Maximum elements */
+ ulint next_free; /*!< ULINT_UNDEFINED or index of next
+ free slot */
+ sync_level_t* elems; /*!< Array elements */
+};
+
+/** Mutexes or rw-locks held by a thread */
+struct sync_thread_t{
+ os_thread_id_t id; /*!< OS thread id */
+ sync_arr_t* levels; /*!< level array for this thread; if
+ this is NULL this slot is unused */
+};
+
+/** An acquired mutex or rw-lock and its level in the latching order */
+struct sync_level_t{
+ void* latch; /*!< pointer to a mutex or an
+ rw-lock; NULL means that
+ the slot is empty */
+ ulint level; /*!< level of the latch in the
+ latching order. This field is
+ overloaded to serve as a node in a
+ linked list of free nodes too. When
+ latch == NULL then this will contain
+ the ordinal value of the next free
+ element */
+};
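+
+/* The overloading of sync_level_t::level as a free-list link amounts to a
+classic index-linked free list over the elems array. A sketch of the two
+operations (hypothetical demo_* helpers, illustration only; the real logic
+is inlined in sync_thread_add_level() and sync_thread_reset_level()): */
+#if 0
+static ulint
+demo_slot_alloc(sync_arr_t* arr)
+{
+ ulint i;
+
+ if (arr->next_free == ULINT_UNDEFINED) {
+ i = arr->n_elems++; /* no freed slot: grow at the end */
+ } else {
+ i = arr->next_free; /* reuse the head of the free list */
+ arr->next_free = arr->elems[i].level;
+ }
+
+ return(i);
+}
+
+static void
+demo_slot_free(sync_arr_t* arr, ulint i)
+{
+ arr->elems[i].latch = NULL; /* mark the slot empty */
+ arr->elems[i].level = arr->next_free; /* link to the old head */
+ arr->next_free = i;
+}
+#endif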
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+ ib_mutex_t* mutex, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+ const char* cmutex_name, /*!< in: mutex name */
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ mutex_reset_lock_word(mutex);
+#else
+ os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mutex->os_fast_mutex);
+ mutex->lock_word = 0;
+#endif
+ mutex->event = os_event_create();
+ mutex_set_waiters(mutex, 0);
+#ifdef UNIV_DEBUG
+ mutex->magic_n = MUTEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+ mutex->line = 0;
+ mutex->file_name = "not yet reserved";
+ mutex->level = level;
+#endif /* UNIV_SYNC_DEBUG */
+ mutex->cfile_name = cfile_name;
+ mutex->cline = cline;
+ mutex->count_os_wait = 0;
+
+ /* Check that lock_word is aligned; this is important on Intel */
+ ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0);
+
+ /* NOTE! The very first mutexes are not put on the mutex list */
+
+ if ((mutex == &mutex_list_mutex)
+#ifdef UNIV_SYNC_DEBUG
+ || (mutex == &sync_thread_mutex)
+#endif /* UNIV_SYNC_DEBUG */
+ ) {
+
+ return;
+ }
+
+ mutex_enter(&mutex_list_mutex);
+
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 0
+ || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N);
+
+ UT_LIST_ADD_FIRST(list, mutex_list, mutex);
+
+ mutex_exit(&mutex_list_mutex);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro mutex_free(), not directly this function!
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free_func(
+/*============*/
+ ib_mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_a(mutex_get_lock_word(mutex) == 0);
+ ut_a(mutex_get_waiters(mutex) == 0);
+
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 1);
+ ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex);
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+ goto func_exit;
+ }
+#endif /* UNIV_MEM_DEBUG */
+
+ if (mutex != &mutex_list_mutex
+#ifdef UNIV_SYNC_DEBUG
+ && mutex != &sync_thread_mutex
+#endif /* UNIV_SYNC_DEBUG */
+ ) {
+
+ mutex_enter(&mutex_list_mutex);
+
+ ut_ad(!UT_LIST_GET_PREV(list, mutex)
+ || UT_LIST_GET_PREV(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+ ut_ad(!UT_LIST_GET_NEXT(list, mutex)
+ || UT_LIST_GET_NEXT(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+
+ mutex_exit(&mutex_list_mutex);
+ }
+
+ os_event_free(mutex->event);
+#ifdef UNIV_MEM_DEBUG
+func_exit:
+#endif /* UNIV_MEM_DEBUG */
+#if !defined(HAVE_ATOMIC_BUILTINS)
+ os_fast_mutex_free(&(mutex->os_fast_mutex));
+#endif
+ /* If we free the mutex protecting the mutex list (freeing is
+ not necessary), we have to reset the magic number AFTER removing
+ it from the list. */
+#ifdef UNIV_DEBUG
+ mutex->magic_n = 0;
+#endif /* UNIV_DEBUG */
+ return;
+}
+
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return 0 if succeeded, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name __attribute__((unused)),
+ /*!< in: file name where mutex
+ requested */
+ ulint line __attribute__((unused)))
+ /*!< in: line where requested */
+{
+ ut_ad(mutex_validate(mutex));
+
+ if (!ib_mutex_test_and_set(mutex)) {
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+ return(0); /* Succeeded! */
+ }
+
+ return(1);
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+ const ib_mutex_t* mutex) /*!< in: mutex */
+{
+ ut_a(mutex);
+ ut_a(mutex->magic_n == MUTEX_MAGIC_N);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only in the debug
+version.
+@return TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+ const ib_mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex_validate(mutex));
+
+ return(mutex_get_lock_word(mutex) == 1
+ && os_thread_eq(mutex->thread_id, os_thread_get_curr_id()));
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ ulint n) /*!< in: value to set */
+{
+ volatile ulint* ptr; /* declared volatile to ensure that
+ the value is stored to memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ *ptr = n; /* Here we assume that the write of a single
+ word in memory is atomic */
+ os_wmb;
+}
+
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+ ib_mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint i; /* spin round count */
+ ulint index; /* index of the reserved wait cell */
+ sync_array_t* sync_arr;
+ size_t counter_index;
+
+ counter_index = (size_t) os_thread_get_curr_id();
+
+ ut_ad(mutex);
+
+ /* This update is not thread safe, but we don't mind if the count
+ isn't exact. Moved out of the ifdef that follows because we are
+ willing to pay the cost of counting, as the data is valuable.
+ Count the number of calls to mutex_spin_wait. */
+ mutex_spin_wait_count.add(counter_index, 1);
+
+mutex_loop:
+
+ i = 0;
+
+ /* Spin waiting for the lock word to become zero. Note that we do
+ not have to assume that the read access to the lock word is atomic,
+ as the actual locking is always committed with atomic test-and-set.
+ In reality, however, all processors probably have an atomic read of
+ a memory word. */
+
+spin_loop:
+ os_rmb;
+ while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+
+ if (i >= SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
+ }
+
+ mutex_spin_round_count.add(counter_index, i);
+
+ if (ib_mutex_test_and_set(mutex) == 0) {
+ /* Succeeded! */
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+ return;
+ }
+
+ /* We may end up with a situation where lock_word is 0 but the OS
+ fast mutex is still reserved. On FreeBSD the OS does not seem to
+ schedule a thread which is constantly calling pthread_mutex_trylock
+ (in ib_mutex_test_and_set implementation). Then we could end up
+ spinning here indefinitely. The following 'i++' stops this infinite
+ spin. */
+
+ i++;
+
+ if (i < SYNC_SPIN_ROUNDS) {
+ goto spin_loop;
+ }
+
+ sync_arr = sync_array_get_and_reserve_cell(mutex, SYNC_MUTEX,
+ file_name, line, &index);
+
+ /* The memory order of the array reservation and the change in the
+ waiters field is important: when we suspend a thread, we first
+ reserve the cell and then set waiters field to 1. When threads are
+ released in mutex_exit, the waiters field is first set to zero and
+ then the event is set to the signaled state. */
+
+ mutex_set_waiters(mutex, 1);
+
+ /* Still try to reserve the mutex a few more times */
+ for (i = 0; i < 4; i++) {
+ if (ib_mutex_test_and_set(mutex) == 0) {
+ /* Succeeded! Free the reserved wait cell */
+
+ sync_array_free_cell(sync_arr, index);
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+ return;
+
+ /* Note that in this case we leave the waiters field
+ set to 1. We cannot reset it to zero, as we do not
+ know if there are other waiters. */
+ }
+ }
+
+ /* Now we know that there has been some thread holding the mutex
+ after the changes in the wait array and the waiters field were
+ made. Now there is no risk of infinite wait on the event. */
+
+ mutex_os_wait_count.add(counter_index, 1);
+
+ mutex->count_os_wait++;
+
+ sync_array_wait_event(sync_arr, index);
+
+ goto mutex_loop;
+}
+
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+ ib_mutex_t* mutex) /*!< in: mutex */
+{
+ mutex_set_waiters(mutex, 0);
+
+ /* The memory order of resetting the waiters field and
+ signaling the object is important. See LEMMA 1 above. */
+ os_event_set(mutex->event);
+ sync_array_object_signalled();
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ const char* file_name, /*!< in: file where requested */
+ ulint line) /*!< in: line where requested */
+{
+ ut_ad(mutex);
+ ut_ad(file_name);
+
+ sync_thread_add_level(mutex, mutex->level, FALSE);
+
+ mutex->file_name = file_name;
+ mutex->line = line;
+}
+
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+ ib_mutex_t* mutex, /*!< in: mutex */
+ const char** file_name, /*!< out: file where requested */
+ ulint* line, /*!< out: line where requested */
+ os_thread_id_t* thread_id) /*!< out: id of the thread which owns
+ the mutex */
+{
+ ut_ad(mutex);
+
+ *file_name = mutex->file_name;
+ *line = mutex->line;
+ *thread_id = mutex->thread_id;
+}
+
+/******************************************************************//**
+Prints debug info of currently reserved mutexes. */
+static
+void
+mutex_list_print_info(
+/*==================*/
+ FILE* file) /*!< in: file where to print */
+{
+ ib_mutex_t* mutex;
+ const char* file_name;
+ ulint line;
+ os_thread_id_t thread_id;
+ ulint count = 0;
+
+ fputs("----------\n"
+ "MUTEX INFO\n"
+ "----------\n", file);
+
+ mutex_enter(&mutex_list_mutex);
+
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+
+ while (mutex != NULL) {
+ count++;
+
+ if (mutex_get_lock_word(mutex) != 0) {
+ mutex_get_debug_info(mutex, &file_name, &line,
+ &thread_id);
+ fprintf(file,
+ "Locked mutex: addr %p thread %ld"
+ " file %s line %ld\n",
+ (void*) mutex, os_thread_pf(thread_id),
+ file_name, line);
+ }
+
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ }
+
+ fprintf(file, "Total number of mutexes %ld\n", count);
+
+ mutex_exit(&mutex_list_mutex);
+}
+
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void)
+/*==================*/
+{
+ ib_mutex_t* mutex;
+ ulint count = 0;
+
+ mutex_enter(&mutex_list_mutex);
+
+ for (mutex = UT_LIST_GET_FIRST(mutex_list);
+ mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ count++;
+ }
+ }
+
+ mutex_exit(&mutex_list_mutex);
+
+ ut_a(count >= 1);
+
+ /* Subtract one, because this function itself was holding
+ one mutex (mutex_list_mutex) */
+
+ return(count - 1);
+}
+
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked. Works only in
+the debug version.
+@return TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void)
+/*================*/
+{
+ return(mutex_n_reserved() + rw_lock_n_locked() == 0);
+}
+
+/******************************************************************//**
+Looks for the thread slot for the calling thread.
+@return pointer to thread slot, NULL if not found */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_slot(void)
+/*====================================*/
+
+{
+ ulint i;
+ os_thread_id_t id;
+
+ id = os_thread_get_curr_id();
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ sync_thread_t* slot;
+
+ slot = &sync_thread_level_arrays[i];
+
+ if (slot->levels && os_thread_eq(slot->id, id)) {
+
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Looks for an unused thread slot.
+@return pointer to thread slot */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_free(void)
+/*====================================*/
+
+{
+ ulint i;
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ sync_thread_t* slot;
+
+ slot = &sync_thread_level_arrays[i];
+
+ if (slot->levels == NULL) {
+
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Print warning. */
+static
+void
+sync_print_warning(
+/*===============*/
+ const sync_level_t* slot) /*!< in: slot for which to
+ print warning */
+{
+ ib_mutex_t* mutex;
+
+ mutex = static_cast<ib_mutex_t*>(slot->latch);
+
+ if (mutex->magic_n == MUTEX_MAGIC_N) {
+ fprintf(stderr,
+ "Mutex created at %s %lu\n",
+ innobase_basename(mutex->cfile_name),
+ (ulong) mutex->cline);
+
+ if (mutex_get_lock_word(mutex) != 0) {
+ ulint line;
+ const char* file_name;
+ os_thread_id_t thread_id;
+
+ mutex_get_debug_info(
+ mutex, &file_name, &line, &thread_id);
+
+ fprintf(stderr,
+ "InnoDB: Locked mutex:"
+ " addr %p thread %ld file %s line %ld\n",
+ (void*) mutex, os_thread_pf(thread_id),
+ file_name, (ulong) line);
+ } else {
+ fputs("Not locked\n", stderr);
+ }
+ } else {
+ rw_lock_t* lock;
+
+ lock = static_cast<rw_lock_t*>(slot->latch);
+
+ rw_lock_print(lock);
+ }
+}
+
+/******************************************************************//**
+Checks if all the level values stored in the level array are greater than
+the given limit.
+@return TRUE if all greater */
+static
+ibool
+sync_thread_levels_g(
+/*=================*/
+ sync_arr_t* arr, /*!< in: pointer to level array for an OS
+ thread */
+ ulint limit, /*!< in: level limit */
+ ulint warn) /*!< in: TRUE=display a diagnostic message */
+{
+ ulint i;
+
+ for (i = 0; i < arr->n_elems; i++) {
+ const sync_level_t* slot;
+
+ slot = &arr->elems[i];
+
+ if (slot->latch != NULL && slot->level <= limit) {
+ if (warn) {
+ fprintf(stderr,
+ "InnoDB: sync levels should be"
+ " > %lu but a level is %lu\n",
+ (ulong) limit, (ulong) slot->level);
+
+ sync_print_warning(slot);
+ }
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Checks if the level value is stored in the level array.
+@return slot if found or NULL */
+static
+const sync_level_t*
+sync_thread_levels_contain(
+/*=======================*/
+ sync_arr_t* arr, /*!< in: pointer to level array for an OS
+ thread */
+ ulint level) /*!< in: level */
+{
+ ulint i;
+
+ for (i = 0; i < arr->n_elems; i++) {
+ const sync_level_t* slot;
+
+ slot = &arr->elems[i];
+
+ if (slot->latch != NULL && slot->level == level) {
+
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level) /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+{
+ ulint i;
+ sync_arr_t* arr;
+ sync_thread_t* thread_slot;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < arr->n_elems; i++) {
+ sync_level_t* slot;
+
+ slot = &arr->elems[i];
+
+ if (slot->latch != NULL && slot->level == level) {
+
+ mutex_exit(&sync_thread_mutex);
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+ ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is
+ allowed to be owned by the thread */
+{
+ ulint i;
+ sync_arr_t* arr;
+ sync_thread_t* thread_slot;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < arr->n_elems; ++i) {
+ const sync_level_t* slot;
+
+ slot = &arr->elems[i];
+
+ if (slot->latch != NULL
+ && (!dict_mutex_allowed
+ || (slot->level != SYNC_DICT
+ && slot->level != SYNC_DICT_OPERATION
+ && slot->level != SYNC_FTS_CACHE))) {
+
+ mutex_exit(&sync_thread_mutex);
+ ut_error;
+
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Checks if the level array for the current thread is empty,
+except for the btr_search_latch.
+@return a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_trx(
+/*============================*/
+ ibool has_search_latch)
+ /*!< in: TRUE if and only if the thread
+ is supposed to hold btr_search_latch */
+{
+ ulint i;
+ sync_arr_t* arr;
+ sync_thread_t* thread_slot;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ ut_a(!has_search_latch
+ || sync_thread_levels_contains(SYNC_SEARCH_SYS));
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < arr->n_elems; ++i) {
+ const sync_level_t* slot;
+
+ slot = &arr->elems[i];
+
+ if (slot->latch != NULL
+ && (!has_search_latch
+ || slot->level != SYNC_SEARCH_SYS)) {
+
+ mutex_exit(&sync_thread_mutex);
+ ut_error;
+
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called for the first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /*!< in: pointer to a mutex or an rw-lock */
+ ulint level, /*!< in: level in the latching order; if
+ SYNC_LEVEL_VARYING, nothing is done */
+ ibool relock) /*!< in: TRUE if re-entering an x-lock */
+{
+ ulint i;
+ sync_level_t* slot;
+ sync_arr_t* array;
+ sync_thread_t* thread_slot;
+
+ if (!sync_order_checks_on) {
+
+ return;
+ }
+
+ if ((latch == (void*) &sync_thread_mutex)
+ || (latch == (void*) &mutex_list_mutex)
+ || (latch == (void*) &rw_lock_debug_mutex)
+ || (latch == (void*) &rw_lock_list_mutex)) {
+
+ return;
+ }
+
+ if (level == SYNC_LEVEL_VARYING) {
+
+ return;
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+ ulint sz;
+
+ sz = sizeof(*array)
+ + (sizeof(*array->elems) * SYNC_THREAD_N_LEVELS);
+
+ /* We have to allocate the level array for a new thread */
+ array = static_cast<sync_arr_t*>(calloc(sz, sizeof(char)));
+ ut_a(array != NULL);
+
+ array->next_free = ULINT_UNDEFINED;
+ array->max_elems = SYNC_THREAD_N_LEVELS;
+ array->elems = (sync_level_t*) &array[1];
+
+ thread_slot = sync_thread_level_arrays_find_free();
+
+ thread_slot->levels = array;
+ thread_slot->id = os_thread_get_curr_id();
+ }
+
+ array = thread_slot->levels;
+
+ if (relock) {
+ goto levels_ok;
+ }
+
+ /* NOTE that there is a problem with _NODE and _LEAF levels: if the
+ B-tree height changes, then a leaf can change to an internal node
+ or the other way around. We do not know at present if this can cause
+ unnecessary assertion failures below. */
+
+ switch (level) {
+ case SYNC_NO_ORDER_CHECK:
+ case SYNC_EXTERN_STORAGE:
+ case SYNC_TREE_NODE_FROM_HASH:
+ /* Do no order checking */
+ break;
+ case SYNC_TRX_SYS_HEADER:
+ if (srv_is_being_started) {
+ /* This is violated during trx_sys_create_rsegs()
+ when creating additional rollback segments when
+ upgrading in innobase_start_or_create_for_mysql(). */
+ break;
+ }
+ case SYNC_MEM_POOL:
+ case SYNC_MEM_HASH:
+ case SYNC_RECV:
+ case SYNC_FTS_BG_THREADS:
+ case SYNC_WORK_QUEUE:
+ case SYNC_FTS_TOKENIZE:
+ case SYNC_FTS_OPTIMIZE:
+ case SYNC_FTS_CACHE:
+ case SYNC_FTS_CACHE_INIT:
+ case SYNC_LOG:
+ case SYNC_LOG_FLUSH_ORDER:
+ case SYNC_ANY_LATCH:
+ case SYNC_FILE_FORMAT_TAG:
+ case SYNC_DOUBLEWRITE:
+ case SYNC_SEARCH_SYS:
+ case SYNC_THREADS:
+ case SYNC_LOCK_SYS:
+ case SYNC_LOCK_WAIT_SYS:
+ case SYNC_TRX_SYS:
+ case SYNC_IBUF_BITMAP_MUTEX:
+ case SYNC_RSEG:
+ case SYNC_TRX_UNDO:
+ case SYNC_PURGE_LATCH:
+ case SYNC_PURGE_QUEUE:
+ case SYNC_DICT_AUTOINC_MUTEX:
+ case SYNC_DICT_OPERATION:
+ case SYNC_DICT_HEADER:
+ case SYNC_TRX_I_S_RWLOCK:
+ case SYNC_TRX_I_S_LAST_READ:
+ case SYNC_IBUF_MUTEX:
+ case SYNC_INDEX_ONLINE_LOG:
+ case SYNC_STATS_AUTO_RECALC:
+ if (!sync_thread_levels_g(array, level, TRUE)) {
+ fprintf(stderr,
+ "InnoDB: sync_thread_levels_g(array, %lu)"
+ " does not hold!\n", level);
+ ut_error;
+ }
+ break;
+ case SYNC_TRX:
+ /* Either the thread must own the lock_sys->mutex, or
+ it is allowed to own only ONE trx->mutex. */
+ if (!sync_thread_levels_g(array, level, FALSE)) {
+ ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+ ut_a(sync_thread_levels_contain(array, SYNC_LOCK_SYS));
+ }
+ break;
+ case SYNC_BUF_FLUSH_LIST:
+ case SYNC_BUF_POOL:
+ /* We can have multiple mutexes of this type; therefore we
+ can only check whether the greater than condition holds. */
+ if (!sync_thread_levels_g(array, level-1, TRUE)) {
+ fprintf(stderr,
+ "InnoDB: sync_thread_levels_g(array, %lu)"
+ " does not hold!\n", level-1);
+ ut_error;
+ }
+ break;
+
+ case SYNC_BUF_PAGE_HASH:
+ /* Multiple page_hash locks are only allowed during
+ buf_validate and that is where buf_pool mutex is already
+ held. */
+ /* Fall through */
+
+ case SYNC_BUF_BLOCK:
+ /* Either the thread must own the buffer pool mutex
+ (buf_pool->mutex), or it is allowed to latch only ONE
+ buffer block (block->mutex or buf_pool->zip_mutex). */
+ if (!sync_thread_levels_g(array, level, FALSE)) {
+ ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+ ut_a(sync_thread_levels_contain(array, SYNC_BUF_POOL));
+ }
+ break;
+ case SYNC_REC_LOCK:
+ if (sync_thread_levels_contain(array, SYNC_LOCK_SYS)) {
+ ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1,
+ TRUE));
+ } else {
+ ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK, TRUE));
+ }
+ break;
+ case SYNC_IBUF_BITMAP:
+ /* Either the thread must own the master mutex to all
+ the bitmap pages, or it is allowed to latch only ONE
+ bitmap page. */
+ if (sync_thread_levels_contain(array,
+ SYNC_IBUF_BITMAP_MUTEX)) {
+ ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1,
+ TRUE));
+ } else {
+ /* This is violated during trx_sys_create_rsegs()
+ when creating additional rollback segments when
+ upgrading in innobase_start_or_create_for_mysql(). */
+ ut_a(srv_is_being_started
+ || sync_thread_levels_g(array, SYNC_IBUF_BITMAP,
+ TRUE));
+ }
+ break;
+ case SYNC_FSP_PAGE:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP));
+ break;
+ case SYNC_FSP:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP)
+ || sync_thread_levels_g(array, SYNC_FSP, TRUE));
+ break;
+ case SYNC_TRX_UNDO_PAGE:
+ /* Purge is allowed to read in as many UNDO pages as it likes;
+ there was a bogus rule here earlier that forced the caller to
+ acquire the purge_sys_t::mutex. The purge mutex did not really
+ protect anything because it was only ever acquired by the
+ single purge thread. The purge thread can read the UNDO pages
+ without any covering mutex. */
+
+ ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
+ || sync_thread_levels_contain(array, SYNC_RSEG)
+ || sync_thread_levels_g(array, level - 1, TRUE));
+ break;
+ case SYNC_RSEG_HEADER:
+ ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
+ break;
+ case SYNC_RSEG_HEADER_NEW:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+ break;
+ case SYNC_TREE_NODE:
+ ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE)
+ || sync_thread_levels_contain(array, SYNC_DICT_OPERATION)
+ || sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE));
+ break;
+ case SYNC_TREE_NODE_NEW:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+ break;
+ case SYNC_INDEX_TREE:
+ ut_a(sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE));
+ break;
+ case SYNC_IBUF_TREE_NODE:
+ ut_a(sync_thread_levels_contain(array, SYNC_IBUF_INDEX_TREE)
+ || sync_thread_levels_g(array, SYNC_IBUF_TREE_NODE - 1,
+ TRUE));
+ break;
+ case SYNC_IBUF_TREE_NODE_NEW:
+ /* ibuf_add_free_page() allocates new pages for the
+ change buffer while only holding the tablespace
+ x-latch. These pre-allocated new pages may only be
+ taken in use while holding ibuf_mutex, in
+ btr_page_alloc_for_ibuf(). */
+ ut_a(sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)
+ || sync_thread_levels_contain(array, SYNC_FSP));
+ break;
+ case SYNC_IBUF_INDEX_TREE:
+ if (sync_thread_levels_contain(array, SYNC_FSP)) {
+ ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+ } else {
+ ut_a(sync_thread_levels_g(
+ array, SYNC_IBUF_TREE_NODE - 1, TRUE));
+ }
+ break;
+ case SYNC_IBUF_PESS_INSERT_MUTEX:
+ ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+ ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+ break;
+ case SYNC_IBUF_HEADER:
+ ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+ ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+ ut_a(!sync_thread_levels_contain(array,
+ SYNC_IBUF_PESS_INSERT_MUTEX));
+ break;
+ case SYNC_DICT:
+#ifdef UNIV_DEBUG
+ ut_a(buf_debug_prints
+ || sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#else /* UNIV_DEBUG */
+ ut_a(sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#endif /* UNIV_DEBUG */
+ break;
+ default:
+ ut_error;
+ }
+
+levels_ok:
+ if (array->next_free == ULINT_UNDEFINED) {
+ ut_a(array->n_elems < array->max_elems);
+
+ i = array->n_elems++;
+ } else {
+ i = array->next_free;
+ array->next_free = array->elems[i].level;
+ }
+
+ ut_a(i < array->n_elems);
+ ut_a(i != ULINT_UNDEFINED);
+
+ ++array->in_use;
+
+ slot = &array->elems[i];
+
+ ut_a(slot->latch == NULL);
+
+ slot->latch = latch;
+ slot->level = level;
+
+ mutex_exit(&sync_thread_mutex);
+}
+
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation the program does */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+ void* latch) /*!< in: pointer to a mutex or an rw-lock */
+{
+ sync_arr_t* array;
+ sync_thread_t* thread_slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(FALSE);
+ }
+
+ if ((latch == (void*) &sync_thread_mutex)
+ || (latch == (void*) &mutex_list_mutex)
+ || (latch == (void*) &rw_lock_debug_mutex)
+ || (latch == (void*) &rw_lock_list_mutex)) {
+
+ return(FALSE);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ ut_error;
+
+ mutex_exit(&sync_thread_mutex);
+ return(FALSE);
+ }
+
+ array = thread_slot->levels;
+
+ for (i = 0; i < array->n_elems; i++) {
+ sync_level_t* slot;
+
+ slot = &array->elems[i];
+
+ if (slot->latch != latch) {
+ continue;
+ }
+
+ slot->latch = NULL;
+
+ /* Update the free slot list. See comment in sync_level_t
+ for the level field. */
+ slot->level = array->next_free;
+ array->next_free = i;
+
+ ut_a(array->in_use >= 1);
+ --array->in_use;
+
+ /* If all cells are idle then reset the free
+ list. The assumption is that this will save
+ time when we need to scan up to n_elems. */
+
+ if (array->in_use == 0) {
+ array->n_elems = 0;
+ array->next_free = ULINT_UNDEFINED;
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(TRUE);
+ }
+
+ if (((ib_mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) {
+ rw_lock_t* rw_lock;
+
+ rw_lock = (rw_lock_t*) latch;
+
+ if (rw_lock->level == SYNC_LEVEL_VARYING) {
+ mutex_exit(&sync_thread_mutex);
+
+ return(TRUE);
+ }
+ }
+
+ ut_error;
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(FALSE);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void)
+/*===========*/
+{
+ ut_a(sync_initialized == FALSE);
+
+ sync_initialized = TRUE;
+
+ sync_array_init(OS_THREAD_MAX_N);
+
+#ifdef UNIV_SYNC_DEBUG
+ /* Create the thread latch level array where the latch levels
+ are stored for each OS thread */
+
+ sync_thread_level_arrays = static_cast<sync_thread_t*>(
+ calloc(sizeof(sync_thread_t), OS_THREAD_MAX_N));
+
+ ut_a(sync_thread_level_arrays != NULL);
+
+#endif /* UNIV_SYNC_DEBUG */
+ /* Init the mutex list and create the mutex to protect it. */
+
+ UT_LIST_INIT(mutex_list);
+ mutex_create(mutex_list_mutex_key, &mutex_list_mutex,
+ SYNC_NO_ORDER_CHECK);
+#ifdef UNIV_SYNC_DEBUG
+ mutex_create(sync_thread_mutex_key, &sync_thread_mutex,
+ SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Init the rw-lock list and create the mutex to protect it. */
+
+ UT_LIST_INIT(rw_lock_list);
+ mutex_create(rw_lock_list_mutex_key, &rw_lock_list_mutex,
+ SYNC_NO_ORDER_CHECK);
+
+#ifdef UNIV_SYNC_DEBUG
+ mutex_create(rw_lock_debug_mutex_key, &rw_lock_debug_mutex,
+ SYNC_NO_ORDER_CHECK);
+
+ rw_lock_debug_event = os_event_create();
+ rw_lock_debug_waiters = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Frees all debug memory. */
+static
+void
+sync_thread_level_arrays_free(void)
+/*===============================*/
+
+{
+ ulint i;
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ sync_thread_t* slot;
+
+ slot = &sync_thread_level_arrays[i];
+
+ /* If this slot was allocated then free the slot memory too. */
+ if (slot->levels != NULL) {
+ free(slot->levels);
+ slot->levels = NULL;
+ }
+ }
+
+ free(sync_thread_level_arrays);
+ sync_thread_level_arrays = NULL;
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Frees the resources in InnoDB's own synchronization data structures. Use
+os_sync_free() after calling this. */
+UNIV_INTERN
+void
+sync_close(void)
+/*===========*/
+{
+ ib_mutex_t* mutex;
+
+ sync_array_close();
+
+ for (mutex = UT_LIST_GET_FIRST(mutex_list);
+ mutex != NULL;
+ /* No op */) {
+
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ continue;
+ }
+#endif /* UNIV_MEM_DEBUG */
+
+ mutex_free(mutex);
+
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+ }
+
+ mutex_free(&mutex_list_mutex);
+#ifdef UNIV_SYNC_DEBUG
+ mutex_free(&sync_thread_mutex);
+
+ /* Switch latching order checks off in sync0sync.cc */
+ sync_order_checks_on = FALSE;
+
+ sync_thread_level_arrays_free();
+#endif /* UNIV_SYNC_DEBUG */
+
+ sync_initialized = FALSE;
+}
+
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+ FILE* file) /*!< in: file where to print */
+{
+ fprintf(file,
+ "Mutex spin waits " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n"
+ "RW-shared spins " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n"
+ "RW-excl spins " UINT64PF ", rounds " UINT64PF ", "
+ "OS waits " UINT64PF "\n",
+ (ib_uint64_t) mutex_spin_wait_count,
+ (ib_uint64_t) mutex_spin_round_count,
+ (ib_uint64_t) mutex_os_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_s_spin_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_s_spin_round_count,
+ (ib_uint64_t) rw_lock_stats.rw_s_os_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_spin_wait_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_spin_round_count,
+ (ib_uint64_t) rw_lock_stats.rw_x_os_wait_count);
+
+ fprintf(file,
+ "Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
+ "%.2f RW-excl\n",
+ (double) mutex_spin_round_count /
+ (mutex_spin_wait_count ? mutex_spin_wait_count : 1),
+ (double) rw_lock_stats.rw_s_spin_round_count /
+ (rw_lock_stats.rw_s_spin_wait_count
+ ? rw_lock_stats.rw_s_spin_wait_count : 1),
+ (double) rw_lock_stats.rw_x_spin_round_count /
+ (rw_lock_stats.rw_x_spin_wait_count
+ ? rw_lock_stats.rw_x_spin_wait_count : 1));
+}
+
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+ mutex_list_print_info(file);
+
+ rw_lock_list_print_info(file);
+#endif /* UNIV_SYNC_DEBUG */
+
+ sync_array_print(file);
+
+ sync_print_wait_info(file);
+}
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
new file mode 100644
index 00000000000..3d07091dd67
--- /dev/null
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -0,0 +1,1667 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.cc
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+ The include chain "univ.i" -> "my_global.h" causes a different path
+ to be taken further down with pthread functions and types,
+ so it must come first.
+ From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
+#include <mysql/plugin.h>
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "ha_prototypes.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "sync0types.h"
+#include "trx0i_s.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "ut0mem.h"
+#include "ut0ut.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then the 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
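+
+/* A sketch of the growth rule described above (hypothetical demo_* helper,
+illustration only): chunk 0 holds TABLE_CACHE_INITIAL_ROWSNUM rows and each
+further chunk holds half of the rows allocated so far, so the capacity
+grows by a factor of 1.5 per chunk. */
+#if 0
+#include <stdio.h>
+
+static void
+demo_print_chunk_capacities(void)
+{
+ unsigned long total = 0;
+ unsigned long chunk;
+ int i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+ chunk = (i == 0) ? TABLE_CACHE_INITIAL_ROWSNUM : total / 2;
+ total += chunk;
+ printf("chunk %d: %lu rows, cumulative %lu\n",
+ i, chunk, total);
+ }
+}
+#endif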
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that the lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be a noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table. Duplicates may appear
+if this is enabled; also, if this is enabled, searching the hash is a
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+struct i_s_mem_chunk_t {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+};
+
+/** This represents one table's cache. */
+struct i_s_table_cache_t {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+};
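+
+/* Rows are addressed as if the chunks formed one flat array; the offset
+field makes the mapping from a row number to a chunk straightforward. A
+sketch of the lookup (hypothetical demo_* helper, illustration only): */
+#if 0
+static void*
+demo_get_nth_row(const i_s_table_cache_t* table_cache, ulint n)
+{
+ ulint i;
+
+ ut_a(n < table_cache->rows_used);
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+ const i_s_mem_chunk_t* chunk = &table_cache->chunks[i];
+
+ if (n < chunk->offset + chunk->rows_allocd) {
+ return((char*) chunk->base
+ + (n - chunk->offset) * table_cache->row_size);
+ }
+ }
+
+ return(NULL); /* not reached if n is within rows_used */
+}
+#endif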
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_t {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ ullint last_read; /*!< last time the cache was read;
+ measured in microseconds since
+ epoch */
+ ib_mutex_t last_read_mutex;/*!< mutex protecting the
+ last_read member - it is updated
+ inside a shared lock of the
+ rw_lock member */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t* locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that may become unavailable
+ when we release
+ lock_sys->mutex or trx_sys->mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ ibool is_truncated; /*!< this is TRUE if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** Pointer to the intermediate buffer above, through which the C++
+code in handler/i_s.cc accesses it. */
+UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/* Key to register the lock/mutex with performance schema */
+#ifdef UNIV_PFS_RWLOCK
+UNIV_INTERN mysql_pfs_key_t trx_i_s_cache_lock_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+UNIV_INTERN mysql_pfs_key_t cache_last_read_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*******************************************************************//**
+For a record lock that is in the waiting state, retrieves the only bit
+that is set; for a table lock, returns ULINT_UNDEFINED.
+@return record number within the heap, or ULINT_UNDEFINED */
+static
+ulint
+wait_lock_get_heap_no(
+/*==================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ret = lock_rec_find_set_bit(lock);
+ ut_a(ret != ULINT_UNDEFINED);
+ break;
+ case LOCK_TABLE:
+ ret = ULINT_UNDEFINED;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ mem_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+		/* rows_used == rows_allocd means that a new chunk needs
+		to be allocated: either there are no more empty rows in
+		the last allocated chunk or nothing has been allocated
+		yet (rows_used == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+			/* Memory is increased by the formula
+			new = old + old / 2. We are deliberately not
+			being aggressive here (i.e. not using the common
+			new = old * 2) because the allocated memory will
+			not be freed until InnoDB exits (it is reused).
+			So it is better to allocate the memory in more
+			steps and have less unused/wasted memory than to
+			use fewer allocation steps (allocation is done
+			only once in a lifetime anyway) but end up with
+			lots of unused/wasted memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ chunk->base = mem_alloc2(req_bytes, &got_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu, "
+ "row size=%lu, "
+ "req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
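+
+/* Illustrative growth sequence (assuming TABLE_CACHE_INITIAL_ROWSNUM
+is 1024; the constant is defined earlier in this file): successive
+chunks are requested with 1024, 512, 768, 1152, ... rows, i.e. each new
+chunk holds half of the rows allocated so far, so the total grows by a
+factor of 1.5 per chunk. */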
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a row in the locks cache.
+@return TRUE if valid */
+static
+ibool
+i_s_locks_row_validate(
+/*===================*/
+ const i_s_locks_row_t* row) /*!< in: row to validate */
+{
+ ut_ad(row->lock_trx_id != 0);
+ ut_ad(row->lock_mode != NULL);
+ ut_ad(row->lock_type != NULL);
+ ut_ad(row->lock_table != NULL);
+ ut_ad(row->lock_table_id != 0);
+
+ if (row->lock_space == ULINT_UNDEFINED) {
+ /* table lock */
+ ut_ad(!strcmp("TABLE", row->lock_type));
+ ut_ad(row->lock_index == NULL);
+ ut_ad(row->lock_data == NULL);
+ ut_ad(row->lock_page == ULINT_UNDEFINED);
+ ut_ad(row->lock_rec == ULINT_UNDEFINED);
+ } else {
+ /* record lock */
+ ut_ad(!strcmp("RECORD", row->lock_type));
+ ut_ad(row->lock_index != NULL);
+ /* row->lock_data == NULL if buf_page_try_get() == NULL */
+ ut_ad(row->lock_page != ULINT_UNDEFINED);
+ ut_ad(row->lock_rec != ULINT_UNDEFINED);
+ }
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* stmt;
+ size_t stmt_len;
+ const char* s;
+
+ ut_ad(lock_mutex_own());
+
+ row->trx_id = trx->id;
+ row->trx_started = (ib_time_t) trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+ row->requested_lock_row = requested_lock_row;
+ ut_ad(requested_lock_row == NULL
+ || i_s_locks_row_validate(requested_lock_row));
+
+ if (trx->lock.wait_lock != NULL) {
+
+ ut_a(requested_lock_row != NULL);
+ row->trx_wait_started = (ib_time_t) trx->lock.wait_started;
+ } else {
+ ut_a(requested_lock_row == NULL);
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = (ullint) TRX_WEIGHT(trx);
+
+ if (trx->mysql_thd == NULL) {
+		/* For internal transactions, e.g. purge, and for
+		transactions being recovered at startup, there is no
+		associated MySQL thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ goto thd_done;
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+
+ stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+
+ if (stmt != NULL) {
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+
+ if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) {
+ stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN;
+ }
+
+ memcpy(query, stmt, stmt_len);
+ query[stmt_len] = '\0';
+
+ row->trx_query = static_cast<const char*>(
+ ha_storage_put_memlim(
+ cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache)));
+
+ row->trx_query_cs = innobase_get_charset(trx->mysql_thd);
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+thd_done:
+ s = trx->op_info;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s, row->trx_operation_state,
+ TRX_I_S_TRX_OP_STATE_MAX_LEN, cache);
+
+ if (row->trx_operation_state == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_operation_state = NULL;
+ }
+
+ row->trx_tables_in_use = trx->n_mysql_tables_in_use;
+
+ row->trx_tables_locked = trx->mysql_n_tables_locked;
+
+	/* These are protected either by both trx->mutex and
+	lock_sys->mutex, or by lock_sys->mutex alone. For reading, it
+	suffices to hold lock_sys->mutex. */
+
+ row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
+
+ row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
+
+ row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
+
+ row->trx_rows_modified = trx->undo_no;
+
+ row->trx_concurrency_tickets = trx->n_tickets_to_enter_innodb;
+
+ switch (trx->isolation_level) {
+ case TRX_ISO_READ_UNCOMMITTED:
+ row->trx_isolation_level = "READ UNCOMMITTED";
+ break;
+ case TRX_ISO_READ_COMMITTED:
+ row->trx_isolation_level = "READ COMMITTED";
+ break;
+ case TRX_ISO_REPEATABLE_READ:
+ row->trx_isolation_level = "REPEATABLE READ";
+ break;
+ case TRX_ISO_SERIALIZABLE:
+ row->trx_isolation_level = "SERIALIZABLE";
+ break;
+	/* Should not happen: all defined isolation levels are handled above */
+ default:
+ row->trx_isolation_level = "UNKNOWN";
+ }
+
+ row->trx_unique_checks = (ibool) trx->check_unique_secondary;
+
+ row->trx_foreign_key_checks = (ibool) trx->check_foreigns;
+
+ s = trx->detailed_error;
+
+ if (s != NULL && s[0] != '\0') {
+
+ TRX_I_S_STRING_COPY(s,
+ row->trx_foreign_key_error,
+ TRX_I_S_TRX_FK_ERROR_MAX_LEN, cache);
+
+ if (row->trx_foreign_key_error == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+ row->trx_foreign_key_error = NULL;
+ }
+
+ row->trx_has_search_latch = (ibool) trx->has_search_latch;
+
+ row->trx_search_latch_timeout = trx->search_latch_timeout;
+
+ row->trx_is_read_only = trx->read_only;
+
+ row->trx_is_autocommit_non_locking = trx_is_autocommit_non_locking(trx);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated.
+@return number of bytes written to "buf", including the terminating
+NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock_rec_get_space_id(lock),
+ lock_rec_get_page_no(lock),
+ &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = (const page_t*) buf_block_get_frame(block);
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ if (page_rec_is_infimum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "infimum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else if (page_rec_is_supremum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else {
+
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ ulint offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, n_fields,
+ &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (UNIV_UNLIKELY(heap != NULL)) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_locks_row_t object. Returns its first argument.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_locks_row(
+/*===========*/
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ ulint heap_no,/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock_get_trx_id(lock);
+ row->lock_mode = lock_get_mode_str(lock);
+ row->lock_type = lock_get_type_str(lock);
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return(FALSE);
+ }
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return(FALSE);
+ }
+
+ row->lock_space = lock_rec_get_space_id(lock);
+ row->lock_page = lock_rec_get_page_no(lock);
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+
+ break;
+ case LOCK_TABLE:
+ row->lock_index = NULL;
+
+ row->lock_space = ULINT_UNDEFINED;
+ row->lock_page = ULINT_UNDEFINED;
+ row->lock_rec = ULINT_UNDEFINED;
+
+ row->lock_data = NULL;
+
+ break;
+ default:
+ ut_error;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+ ut_ad(i_s_locks_row_validate(row));
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ ut_ad(i_s_locks_row_validate(requested_lock_row));
+ ut_ad(i_s_locks_row_validate(blocking_lock_row));
+
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock),
+ lock_rec_get_space_id(lock));
+
+ ret = ut_fold_ulint_pair(ret,
+ lock_rec_get_page_no(lock));
+
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
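+
+/* Worked example (illustrative values): for a record lock of
+transaction 7 on space 5, page 3, heap_no 2, the fold is
+ut_fold_ulint_pair(ut_fold_ulint_pair(ut_fold_ulint_pair(7, 5), 3), 2),
+so each of the four identifying values contributes to the hash cell
+that the row lands in. */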
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ ut_ad(i_s_locks_row_validate(row));
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_space == lock_rec_get_space_id(lock)
+ && row->lock_page == lock_rec_get_page_no(lock)
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that has a specified id.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ut_ad(i_s_locks_row_validate(hash_chain->value)),
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ ut_ad(i_s_locks_row_validate(dst_row));
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(lock_mutex_own());
+
+	/* If the transaction is waiting, we add the wait lock and all
+	locks from other transactions that are blocking the wait lock. */
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ ulint wait_lock_heap_no;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->lock.wait_lock != NULL);
+
+ wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->lock.wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->lock.wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
+ ULINT_UNDEFINED);
+
+ for (curr_lock = lock_queue_iterator_get_prev(&iter);
+ curr_lock != NULL;
+ curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+ if (lock_has_to_wait(trx->lock.wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->lock.wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+					/* heap_no is the same
+					for the waiting and the
+					blocking locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time, in microseconds, that must elapse after the cache
+was last read before it may be updated again. We use this technique to
+ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */
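+
+/* For example (illustrative query, not from the original source): a
+statement such as
+SELECT * FROM information_schema.innodb_trx
+JOIN information_schema.innodb_locks ON ...
+reads the two tables well within 0.1 sec of each other, so the second
+read is served from the same cache snapshot as the first. */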
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return TRUE if can be updated */
+static
+ibool
+can_cache_be_updated(
+/*=================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+ /* Here we read cache->last_read without acquiring its mutex
+ because last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ now = ut_time_us(NULL);
+ if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Declare a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ hash_table_clear(cache->locks_hash);
+
+ ha_storage_empty(&cache->storage);
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache_low(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ ibool only_ac_nl, /*!< in: only select non-locking
+ autocommit transactions */
+ trx_list_t* trx_list) /*!< in: trx list */
+{
+ const trx_t* trx;
+
+ ut_ad(trx_list == &trx_sys->rw_trx_list
+ || trx_list == &trx_sys->ro_trx_list
+ || trx_list == &trx_sys->mysql_trx_list);
+
+ ut_ad(only_ac_nl == (trx_list == &trx_sys->mysql_trx_list));
+
+ /* Iterate over the transaction list and add each one
+ to innodb_trx's cache. We also add all locks that are relevant
+ to each transaction into innodb_locks' and innodb_lock_waits'
+ caches. */
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list);
+ trx != NULL;
+ trx =
+ (trx_list == &trx_sys->mysql_trx_list
+ ? UT_LIST_GET_NEXT(mysql_trx_list, trx)
+ : UT_LIST_GET_NEXT(trx_list, trx))) {
+
+ i_s_trx_row_t* trx_row;
+ i_s_locks_row_t* requested_lock_row;
+
+ if (trx->state == TRX_STATE_NOT_STARTED
+ || (only_ac_nl && !trx_is_autocommit_non_locking(trx))) {
+
+ continue;
+ }
+
+ assert_trx_nonlocking_or_in_list(trx);
+
+ ut_ad(trx->in_ro_trx_list
+ == (trx_list == &trx_sys->ro_trx_list));
+
+ ut_ad(trx->in_rw_trx_list
+ == (trx_list == &trx_sys->rw_trx_list));
+
+ if (!add_trx_relevant_locks_to_cache(cache, trx,
+ &requested_lock_row)) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ trx_row = (i_s_trx_row_t*)
+ table_cache_create_empty_row(&cache->innodb_trx,
+ cache);
+
+ /* memory could not be allocated */
+ if (trx_row == NULL) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_trx.rows_used--;
+ cache->is_truncated = TRUE;
+ return;
+ }
+ }
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ trx_i_s_cache_clear(cache);
+
+ fetch_data_into_cache_low(cache, FALSE, &trx_sys->rw_trx_list);
+ fetch_data_into_cache_low(cache, FALSE, &trx_sys->ro_trx_list);
+
+	/* Only select non-locking autocommit transactions, because they
+	can only be on the MySQL transaction list (hence the TRUE
+	argument). */
+ fetch_data_into_cache_low(cache, TRUE, &trx_sys->mysql_trx_list);
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+
+ lock_mutex_enter();
+
+ mutex_enter(&trx_sys->mutex);
+
+ fetch_data_into_cache(cache);
+
+ mutex_exit(&trx_sys->mutex);
+
+ lock_mutex_exit();
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire lock mutex
+ release lock mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ acquire trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::rw_lock */
+
+ rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
+ SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ mutex_create(cache_last_read_mutex_key,
+ &cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ hash_table_free(cache->locks_hash);
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+ memset(cache, 0, sizeof *cache);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED));
+#endif
+
+ /* update cache last read time */
+ now = ut_time_us(NULL);
+ mutex_enter(&cache->last_read_mutex);
+ cache->last_read = now;
+ mutex_exit(&cache->last_read_mutex);
+
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)
+ || rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ table_cache = &cache->innodb_trx;
+ break;
+ case I_S_INNODB_LOCKS:
+ table_cache = &cache->innodb_locks;
+ break;
+ case I_S_INNODB_LOCK_WAITS:
+ table_cache = &cache->innodb_lock_waits;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(table_cache);
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
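+
+/* Illustrative sketch (not part of the original source) of how the
+C++ code in handler/i_s.cc is expected to drive this cache; the exact
+usage lives in that file: */
+#if 0
+	ulint	i;
+	void*	row;
+
+	/* writer side: refresh the cache if it has been idle for long
+	enough (see can_cache_be_updated()) */
+	trx_i_s_cache_start_write(cache);
+	trx_i_s_possibly_fetch_data_into_cache(cache);
+	trx_i_s_cache_end_write(cache);
+
+	/* reader side: iterate over the cached rows under a shared lock */
+	trx_i_s_cache_start_read(cache);
+
+	for (i = 0; i < trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX);
+	     i++) {
+
+		row = trx_i_s_cache_get_nth_row(cache, I_S_INNODB_TRX, i);
+		/* ... copy *row into the INFORMATION SCHEMA table ... */
+	}
+
+	trx_i_s_cache_end_read(cache);
+#endif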
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Provide a buffer of at least TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes
+to be 100% sure that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+ if (row->lock_space != ULINT_UNDEFINED) {
+ /* record lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT ":%lu:%lu:%lu",
+ row->lock_trx_id, row->lock_space,
+ row->lock_page, row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT":" UINT64PF,
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+	/* the typecast is safe because snprintf(3) never returns a
+	negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
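+
+/* Example output (illustrative ids): for a record lock held by
+transaction 1234 on space 5, page 3, heap number 2 the resulting id is
+"1234:5:3:2"; for a table lock held by the same transaction on the
+table with id 16 it is "1234:16". */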
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
new file mode 100644
index 00000000000..56d46311f62
--- /dev/null
+++ b/storage/innobase/trx/trx0purge.cc
@@ -0,0 +1,1405 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.cc
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+
+#ifdef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "read0read.h"
+#include "fut0fut.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "os0thread.h"
+#include "srv0mon.h"
+#include "mtr0log.h"
+
+/** Maximum allowable purge history length. 0 means 'infinite'. */
+UNIV_INTERN ulong srv_max_purge_lag = 0;
+
+/** Maximum DML user thread delay, in microseconds. */
+UNIV_INTERN ulong srv_max_purge_lag_delay = 0;
+
+/** The global data structure coordinating a purge */
+UNIV_INTERN trx_purge_t* purge_sys = NULL;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec;
+
+#ifdef UNIV_PFS_RWLOCK
+/* Key to register trx_purge_latch with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register purge_sys_bh_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN my_bool srv_purge_view_update_only_debug;
+#endif /* UNIV_DEBUG */
+
+/****************************************************************//**
+Builds a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+trx_purge_graph_build(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint n_purge_threads) /*!< in: number of purge
+ threads */
+{
+ ulint i;
+ mem_heap_t* heap;
+ que_fork_t* fork;
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = trx;
+
+ for (i = 0; i < n_purge_threads; ++i) {
+ que_thr_t* thr;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = row_purge_node_create(thr, heap);
+ }
+
+ return(fork);
+}
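+
+/* The resulting graph is a fork with one query thread per purge
+thread, each owning its own purge node, roughly:
+
+	fork --- thr[0] --- row_purge_node
+	     \-- thr[1] --- row_purge_node
+	     ...
+	     \-- thr[n_purge_threads - 1] --- row_purge_node */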
+
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(
+/*=================*/
+ ulint n_purge_threads, /*!< in: number of purge
+ threads */
+ ib_bh_t* ib_bh) /*!< in, own: UNDO log min
+ binary heap */
+{
+ purge_sys = static_cast<trx_purge_t*>(mem_zalloc(sizeof(*purge_sys)));
+
+ purge_sys->state = PURGE_STATE_INIT;
+ purge_sys->event = os_event_create();
+
+ /* Take ownership of ib_bh, we are responsible for freeing it. */
+ purge_sys->ib_bh = ib_bh;
+
+ rw_lock_create(trx_purge_latch_key,
+ &purge_sys->latch, SYNC_PURGE_LATCH);
+
+ mutex_create(
+ purge_sys_bh_mutex_key, &purge_sys->bh_mutex,
+ SYNC_PURGE_QUEUE);
+
+ purge_sys->heap = mem_heap_create(256);
+
+ ut_a(n_purge_threads > 0);
+
+ purge_sys->sess = sess_open();
+
+ purge_sys->trx = purge_sys->sess->trx;
+
+ ut_a(purge_sys->trx->sess == purge_sys->sess);
+
+	/* A purge transaction is not a real transaction; we use a
+	transaction here only because the query threads code requires it.
+	It is otherwise quite unnecessary. We should get rid of it
+	eventually. */
+ purge_sys->trx->id = 0;
+ purge_sys->trx->start_time = ut_time();
+ purge_sys->trx->state = TRX_STATE_ACTIVE;
+ purge_sys->trx->op_info = "purge trx";
+
+ purge_sys->query = trx_purge_graph_build(
+ purge_sys->trx, n_purge_threads);
+
+ purge_sys->view = read_view_purge_open(purge_sys->heap);
+}
+
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void)
+/*======================*/
+{
+ que_graph_free(purge_sys->query);
+
+ ut_a(purge_sys->trx->id == 0);
+ ut_a(purge_sys->sess->trx == purge_sys->trx);
+
+ purge_sys->trx->state = TRX_STATE_NOT_STARTED;
+
+ sess_close(purge_sys->sess);
+
+ purge_sys->sess = NULL;
+
+ purge_sys->view = NULL;
+
+ rw_lock_free(&purge_sys->latch);
+ mutex_free(&purge_sys->bh_mutex);
+
+ mem_heap_free(purge_sys->heap);
+
+ ib_bh_free(purge_sys->ib_bh);
+
+ os_event_free(purge_sys->event);
+
+ purge_sys->event = NULL;
+
+ mem_free(purge_sys);
+
+ purge_sys = NULL;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/********************************************************************//**
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+ trx_rsegf_t* rseg_header;
+ trx_ulogf_t* undo_header;
+
+ undo = trx->update_undo;
+ rseg = undo->rseg;
+
+ rseg_header = trx_rsegf_get(
+ undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
+ mtr);
+
+ undo_header = undo_page + undo->hdr_offset;
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ ulint hist_size;
+#ifdef UNIV_DEBUG
+ trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR;
+#endif /* UNIV_DEBUG */
+
+ /* The undo log segment will not be reused */
+
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ ut_error;
+ }
+
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ hist_size = mtr_read_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
+
+ ut_ad(undo->size == flst_get_len(
+ seg_header + TRX_UNDO_PAGE_LIST, mtr));
+
+ mlog_write_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size + undo->size, MLOG_4BYTES, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+ flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+ undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_increment_ulint(&trx_sys->rseg_history_len, 1);
+#else
+ mutex_enter(&trx_sys->mutex);
+ ++trx_sys->rseg_history_len;
+ mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ srv_wake_purge_thread_if_not_active();
+
+ /* Write the trx number to the undo log header */
+ mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+
+ /* Write information about delete markings to the undo log header */
+
+ if (!undo->del_marks) {
+ mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, mtr);
+ }
+
+ if (rseg->last_page_no == FIL_NULL) {
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->last_offset = undo->hdr_offset;
+ rseg->last_trx_no = trx->no;
+ rseg->last_del_marks = undo->del_marks;
+ }
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is in the history list. Cuts the end of the
+history list at the youngest undo log in this segment. */
+static
+void
+trx_purge_free_segment(
+/*===================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ fil_addr_t hdr_addr, /*!< in: the file address of log_hdr */
+ ulint n_removed_logs) /*!< in: count of how many undo logs we
+ will cut off from the end of the
+ history list */
+{
+ mtr_t mtr;
+ trx_rsegf_t* rseg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint seg_size;
+ ulint hist_size;
+ ibool marked = FALSE;
+
+ /* fputs("Freeing an update undo log segment\n", stderr); */
+
+ for (;;) {
+ page_t* undo_page;
+
+ mtr_start(&mtr);
+
+ mutex_enter(&rseg->mutex);
+
+ rseg_hdr = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no, &mtr);
+
+ undo_page = trx_undo_page_get(
+ rseg->space, rseg->zip_size, hdr_addr.page, &mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ /* Mark the last undo log totally purged, so that if the
+ system crashes, the tail of the undo log will not get accessed
+ again. The list of pages in the undo log tail gets inconsistent
+ during the freeing of the segment, and therefore purge should
+ not try to access them again. */
+
+ if (!marked) {
+ mlog_write_ulint(
+ log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, &mtr);
+
+ marked = TRUE;
+ }
+
+ if (fseg_free_step_not_header(
+ seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr)) {
+
+ break;
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ mtr_commit(&mtr);
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_decrement_ulint(&trx_sys->rseg_history_len, n_removed_logs);
+#else
+ mutex_enter(&trx_sys->mutex);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ do {
+
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.cc. */
+
+ } while(!fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER, &mtr));
+
+ hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, &mtr);
+ ut_ad(hist_size >= seg_size);
+
+ mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ hist_size - seg_size, MLOG_4BYTES, &mtr);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Removes unnecessary history data from a rollback segment. */
+static
+void
+trx_purge_truncate_rseg_history(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ const purge_iter_t* limit) /*!< in: truncate offset */
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ trx_rsegf_t* rseg_hdr;
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_removed_logs = 0;
+ mtr_t mtr;
+ trx_id_t undo_trx_no;
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ hdr_addr.page, &mtr);
+
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+ if (undo_trx_no >= limit->trx_no) {
+
+ if (undo_trx_no == limit->trx_no) {
+
+ trx_undo_truncate_start(
+ rseg, rseg->space, hdr_addr.page,
+ hdr_addr.boffset, limit->undo_no);
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ os_atomic_decrement_ulint(
+ &trx_sys->rseg_history_len, n_removed_logs);
+#else
+ mutex_enter(&trx_sys->mutex);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&trx_sys->mutex);
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE,
+ n_removed_logs, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ prev_hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+ n_removed_logs++;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
+ && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);
+
+ n_removed_logs = 0;
+ } else {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ }
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/********************************************************************//**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages! */
+static
+void
+trx_purge_truncate_history(
+/*========================*/
+ purge_iter_t* limit, /*!< in: truncate limit */
+ const read_view_t* view) /*!< in: purge view */
+{
+ ulint i;
+
+	/* We play it safe and set the truncate limit at most to the purge
+	view low_limit number, though this is not strictly necessary */
+
+ if (limit->trx_no >= view->low_limit_no) {
+ limit->trx_no = view->low_limit_no;
+ limit->undo_no = 0;
+ }
+
+ ut_ad(limit->trx_no <= purge_sys->view->low_limit_no);
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg = trx_sys->rseg_array[i];
+
+ if (rseg != NULL) {
+ ut_a(rseg->id == i);
+ trx_purge_truncate_rseg_history(rseg, limit);
+ }
+ }
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys->iter.trx_no past the purged log. */
+static
+void
+trx_purge_rseg_get_next_history_log(
+/*================================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint* n_pages_handled)/*!< in/out: number of UNDO pages
+ handled */
+{
+ const void* ptr;
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ ibool del_marks;
+ mtr_t mtr;
+ rseg_queue_t rseg_queue;
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_a(rseg->last_page_no != FIL_NULL);
+
+ purge_sys->iter.trx_no = rseg->last_trx_no + 1;
+ purge_sys->iter.undo_no = 0;
+ purge_sys->next_stored = FALSE;
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);
+
+ log_hdr = undo_page + rseg->last_offset;
+
+ /* Increase the purge page count by one for every handled log */
+
+ (*n_pages_handled)++;
+
+ prev_log_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+
+ if (prev_log_addr.page == FIL_NULL) {
+ /* No logs left in the history list */
+
+ rseg->last_page_no = FIL_NULL;
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ mutex_enter(&trx_sys->mutex);
+
+ /* Add debug code to track history list corruption reported
+ on the MySQL mailing list on Nov 9, 2004. The fut0lst.cc
+ file-based list was corrupt. The prev node pointer was
+ FIL_NULL, even though the list length was over 8 million nodes!
+		We assume that purge truncates the history list in large
+		pieces, so if we reach the head of the list here, the
+		list cannot be longer than 2,000,000 undo logs now. */
+
+ if (trx_sys->rseg_history_len > 2000000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: purge reached the"
+ " head of the history list,\n"
+ "InnoDB: but its length is still"
+ " reported as %lu! Make a detailed bug\n"
+ "InnoDB: report, and submit it"
+ " to http://bugs.mysql.com\n",
+ (ulong) trx_sys->rseg_history_len);
+ ut_ad(0);
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+ return;
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ mtr_commit(&mtr);
+
+ /* Read the trx number and del marks from the previous log header */
+ mtr_start(&mtr);
+
+ log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ prev_log_addr.page, &mtr)
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+ del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&(rseg->mutex));
+
+ rseg->last_page_no = prev_log_addr.page;
+ rseg->last_offset = prev_log_addr.boffset;
+ rseg->last_trx_no = trx_no;
+ rseg->last_del_marks = del_marks;
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+	/* Purge can also produce events; however, these are already ordered
+	in the rollback segment and any user-generated event will be greater
+	than the events that purge produces, i.e. purge can never produce
+	events from an empty rollback segment. */
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+
+ mutex_exit(&purge_sys->bh_mutex);
+
+ mutex_exit(&rseg->mutex);
+}
+
+/***********************************************************************//**
+Chooses the rollback segment with the smallest trx_id.
+@return zip_size if the log is for a compressed table, ULINT_UNDEFINED
+	if there are no rollback segments to purge, 0 for non-compressed
+	tables */
+static
+ulint
+trx_purge_get_rseg_with_min_trx_id(
+/*===============================*/
+ trx_purge_t* purge_sys) /*!< in/out: purge instance */
+
+{
+ ulint zip_size = 0;
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+ /* Only purge consumes events from the binary heap, user
+ threads only produce the events. */
+
+ if (!ib_bh_is_empty(purge_sys->ib_bh)) {
+ trx_rseg_t* rseg;
+
+ rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg;
+ ib_bh_pop(purge_sys->ib_bh);
+
+ mutex_exit(&purge_sys->bh_mutex);
+
+ purge_sys->rseg = rseg;
+ } else {
+ mutex_exit(&purge_sys->bh_mutex);
+
+ purge_sys->rseg = NULL;
+
+ return(ULINT_UNDEFINED);
+ }
+
+ ut_a(purge_sys->rseg != NULL);
+
+ mutex_enter(&purge_sys->rseg->mutex);
+
+ ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
+
+ /* We assume in purge of externally stored fields that space id is
+ in the range of UNDO tablespace space ids */
+ ut_a(purge_sys->rseg->space <= srv_undo_tablespaces_open);
+
+ zip_size = purge_sys->rseg->zip_size;
+
+ ut_a(purge_sys->iter.trx_no <= purge_sys->rseg->last_trx_no);
+
+ purge_sys->iter.trx_no = purge_sys->rseg->last_trx_no;
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+ purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ mutex_exit(&purge_sys->rseg->mutex);
+
+ return(zip_size);
+}
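+
+/* Illustrative note (not in the original source): purge_sys->ib_bh is a
+binary min-heap of rseg_queue_t entries ordered by trx_no, so the
+ib_bh_first()/ib_bh_pop() pair above always yields the rollback segment
+whose last undo log has the smallest transaction number. For example, if
+segments were pushed with last_trx_no values 50, 23 and 77, purge would
+visit them in the order 23, 50, 77, processing undo logs oldest first. */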
+
+/***********************************************************************//**
+Position the purge sys "iterator" on the undo record to use for purging. */
+static
+void
+trx_purge_read_undo_rec(
+/*====================*/
+ trx_purge_t* purge_sys, /*!< in/out: purge instance */
+ ulint zip_size) /*!< in: block size or 0 */
+{
+ ulint offset;
+ ulint page_no;
+ ib_uint64_t undo_no;
+
+ purge_sys->hdr_offset = purge_sys->rseg->last_offset;
+ page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
+
+ if (purge_sys->rseg->last_del_marks) {
+ mtr_t mtr;
+ trx_undo_rec_t* undo_rec = NULL;
+
+ mtr_start(&mtr);
+
+ undo_rec = trx_undo_get_first_rec(
+ purge_sys->rseg->space,
+ zip_size,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, RW_S_LATCH, &mtr);
+
+ if (undo_rec != NULL) {
+ offset = page_offset(undo_rec);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ page_no = page_get_page_no(page_align(undo_rec));
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ mtr_commit(&mtr);
+ } else {
+ offset = 0;
+ undo_no = 0;
+ }
+
+ purge_sys->offset = offset;
+ purge_sys->page_no = page_no;
+ purge_sys->iter.undo_no = undo_no;
+
+ purge_sys->next_stored = TRUE;
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ ulint zip_size;
+
+ ut_ad(purge_sys->next_stored == FALSE);
+
+ zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
+
+ if (purge_sys->rseg != NULL) {
+ trx_purge_read_undo_rec(purge_sys, zip_size);
+ } else {
+ /* There is nothing to do yet. */
+ os_thread_yield();
+ }
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ ulint* n_pages_handled,/*!< in/out: number of UNDO pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* rec_copy;
+ trx_undo_rec_t* rec2;
+ page_t* undo_page;
+ page_t* page;
+ ulint offset;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ mtr_t mtr;
+
+ ut_ad(purge_sys->next_stored);
+ ut_ad(purge_sys->iter.trx_no < purge_sys->view->low_limit_no);
+
+ space = purge_sys->rseg->space;
+ zip_size = purge_sys->rseg->zip_size;
+ page_no = purge_sys->page_no;
+ offset = purge_sys->offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(
+ purge_sys->rseg, n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr);
+
+ rec = undo_page + offset;
+
+ rec2 = rec;
+
+ for (;;) {
+ ulint type;
+ trx_undo_rec_t* next_rec;
+ ulint cmpl_info;
+
+ /* Try first to find the next record which requires a purge
+ operation from the same page of the same undo log */
+
+ next_rec = trx_undo_page_get_next_rec(
+ rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset);
+
+ if (next_rec == NULL) {
+ rec2 = trx_undo_get_next_rec(
+ rec2, purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, &mtr);
+ break;
+ }
+
+ rec2 = next_rec;
+
+ type = trx_undo_rec_get_type(rec2);
+
+ if (type == TRX_UNDO_DEL_MARK_REC) {
+
+ break;
+ }
+
+ cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+
+ if (trx_undo_rec_get_extern_storage(rec2)) {
+ break;
+ }
+
+ if ((type == TRX_UNDO_UPD_EXIST_REC)
+ && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ break;
+ }
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(
+ purge_sys->rseg, n_pages_handled);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(
+ space, zip_size, page_no, &mtr);
+
+ rec = undo_page + offset;
+ } else {
+ page = page_align(rec2);
+
+ purge_sys->offset = rec2 - page;
+ purge_sys->page_no = page_get_page_no(page);
+ purge_sys->iter.undo_no = trx_undo_rec_get_undo_no(rec2);
+
+ if (undo_page != page) {
+ /* We advance to a new page of the undo log: */
+ (*n_pages_handled)++;
+ }
+ }
+
+ rec_copy = trx_undo_rec_copy(rec, heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+static __attribute__((warn_unused_result, nonnull))
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr, /*!< out: roll pointer to undo record */
+ ulint* n_pages_handled,/*!< in/out: number of UNDO log pages
+ handled */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ if (!purge_sys->next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys->next_stored) {
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Purge: No logs left in the"
+ " history list\n");
+ }
+
+ return(NULL);
+ }
+ }
+
+ if (purge_sys->iter.trx_no >= purge_sys->view->low_limit_no) {
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
+ os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ FALSE, purge_sys->rseg->id,
+ purge_sys->page_no, purge_sys->offset);
+
+ /* The following call will advance the stored values of the
+ purge iterator. */
+
+ return(trx_purge_get_next_rec(n_pages_handled, heap));
+}
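+
+/* Illustrative sketch (not in the original source): the roll pointer
+built above is a 7-byte (DATA_ROLL_PTR_LEN) value which, assuming the
+usual InnoDB layout, packs is_insert into the top bit, the rollback
+segment id into the next 7 bits, then a 32-bit page number and a 16-bit
+byte offset. The function name below is hypothetical and the block is
+not compiled. */
+#if 0
+static ib_uint64_t
+example_roll_ptr(void)
+{
+	/* is_insert = FALSE, rseg id = 3, page_no = 5, offset = 0x110 */
+	return(((ib_uint64_t) 3 << 48)
+	       | ((ib_uint64_t) 5 << 16)
+	       | (ib_uint64_t) 0x110);	/* == 0x0003000000050110 */
+}
+#endif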
+
+/*******************************************************************//**
+Fetches the next batch of undo records and attaches them to the purge
+nodes of the purge query graph.
+@return number of undo log pages handled in the batch */
+static
+ulint
+trx_purge_attach_undo_recs(
+/*=======================*/
+ ulint n_purge_threads,/*!< in: number of purge threads */
+ trx_purge_t* purge_sys, /*!< in/out: purge instance */
+ purge_iter_t* limit, /*!< out: records read up to */
+ ulint batch_size) /*!< in: no. of pages to purge */
+{
+ que_thr_t* thr;
+ ulint i = 0;
+ ulint n_pages_handled = 0;
+ ulint n_thrs = UT_LIST_GET_LEN(purge_sys->query->thrs);
+
+ ut_a(n_purge_threads > 0);
+
+ *limit = purge_sys->iter;
+
+ /* Debug code to validate some pre-requisites and reset done flag. */
+ for (thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+ thr != NULL && i < n_purge_threads;
+ thr = UT_LIST_GET_NEXT(thrs, thr), ++i) {
+
+ purge_node_t* node;
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+ ut_a(node->undo_recs == NULL);
+ ut_a(node->done);
+
+ node->done = FALSE;
+ }
+
+	/* There should never be fewer nodes than threads; the inverse,
+	however, is allowed because we only use purge threads as needed. */
+
+ /* Fetch and parse the UNDO records. The UNDO records are added
+ to a per purge node vector. */
+ thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+ ut_a(n_thrs > 0 && thr != NULL);
+
+ ut_ad(trx_purge_check_limit());
+
+ i = 0;
+
+ for (;;) {
+ purge_node_t* node;
+ trx_purge_rec_t* purge_rec;
+
+ ut_a(!thr->is_active);
+
+ /* Get the purge node. */
+ node = (purge_node_t*) thr->child;
+ ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ purge_rec = static_cast<trx_purge_rec_t*>(
+ mem_heap_zalloc(node->heap, sizeof(*purge_rec)));
+
+ /* Track the max {trx_id, undo_no} for truncating the
+ UNDO logs once we have purged the records. */
+
+ if (purge_sys->iter.trx_no > limit->trx_no
+ || (purge_sys->iter.trx_no == limit->trx_no
+ && purge_sys->iter.undo_no >= limit->undo_no)) {
+
+ *limit = purge_sys->iter;
+ }
+
+ /* Fetch the next record, and advance the purge_sys->iter. */
+ purge_rec->undo_rec = trx_purge_fetch_next_rec(
+ &purge_rec->roll_ptr, &n_pages_handled, node->heap);
+
+ if (purge_rec->undo_rec != NULL) {
+
+ if (node->undo_recs == NULL) {
+ node->undo_recs = ib_vector_create(
+ ib_heap_allocator_create(node->heap),
+ sizeof(trx_purge_rec_t),
+ batch_size);
+ } else {
+ ut_a(!ib_vector_is_empty(node->undo_recs));
+ }
+
+ ib_vector_push(node->undo_recs, purge_rec);
+
+ if (n_pages_handled >= batch_size) {
+
+ break;
+ }
+ } else {
+ break;
+ }
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+
+ if (!(++i % n_purge_threads)) {
+ thr = UT_LIST_GET_FIRST(purge_sys->query->thrs);
+ }
+
+ ut_a(thr != NULL);
+ }
+
+ ut_ad(trx_purge_check_limit());
+
+ return(n_pages_handled);
+}
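+
+/* Illustrative trace (not in the original source): with
+n_purge_threads == 2 the loop above deals records out round-robin.
+Records r0, r1, r2, r3 are attached to purge nodes in the order
+node0, node1, node0, node1: after every record thr advances to the
+next query thread, and the (++i % n_purge_threads) test wraps thr
+back to the first thread after each full round. */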
+
+/*******************************************************************//**
+Calculate the DML delay required.
+@return delay in microseconds or ULINT_MAX */
+static
+ulint
+trx_purge_dml_delay(void)
+/*=====================*/
+{
+	/* Determine how long data manipulation language (DML) statements
+	need to be delayed in order to reduce the lag of the purge
+	thread. */
+ ulint delay = 0; /* in microseconds; default: no delay */
+
+	/* If purge lag is set (i.e. > 0) then calculate the new DML delay.
+ Note: we do a dirty read of the trx_sys_t data structure here,
+ without holding trx_sys->mutex. */
+
+ if (srv_max_purge_lag > 0) {
+ float ratio;
+
+ ratio = float(trx_sys->rseg_history_len) / srv_max_purge_lag;
+
+ if (ratio > 1.0) {
+ /* If the history list length exceeds the
+ srv_max_purge_lag, the data manipulation
+ statements are delayed by at least 5000
+ microseconds. */
+ delay = (ulint) ((ratio - .5) * 10000);
+ }
+
+ if (delay > srv_max_purge_lag_delay) {
+ delay = srv_max_purge_lag_delay;
+ }
+
+ MONITOR_SET(MONITOR_DML_PURGE_DELAY, delay);
+ }
+
+ return(delay);
+}
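+
+/* Worked example (illustrative, not in the original source): with
+srv_max_purge_lag = 100000 and trx_sys->rseg_history_len = 200000,
+ratio = 2.0, so delay = (2.0 - 0.5) * 10000 = 15000 microseconds,
+subject to the srv_max_purge_lag_delay cap. Just above the threshold
+(ratio slightly over 1.0) the formula yields the minimum delay of
+roughly 5000 microseconds mentioned in the comment above. */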
+
+/*******************************************************************//**
+Wait for pending purge jobs to complete. */
+static
+void
+trx_purge_wait_for_workers_to_complete(
+/*===================================*/
+ trx_purge_t* purge_sys) /*!< in: purge instance */
+{
+ ulint n_submitted = purge_sys->n_submitted;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ /* Ensure that the work queue empties out. */
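+	/* Note: because the compare value and the swap value below are
+	both n_submitted, this CAS never changes n_completed; it is used
+	only as an atomic read of n_completed compared against
+	n_submitted. */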
+ while (!os_compare_and_swap_ulint(
+ &purge_sys->n_completed, n_submitted, n_submitted)) {
+#else
+ mutex_enter(&purge_sys->bh_mutex);
+
+ while (purge_sys->n_completed < n_submitted) {
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+ if (srv_get_task_queue_length() > 0) {
+ srv_release_threads(SRV_WORKER, 1);
+ }
+
+ os_thread_yield();
+
+#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+ }
+
+#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&purge_sys->bh_mutex);
+#endif /* !HAVE_ATOMIC_BUILTINS */
+
+ /* None of the worker threads should be doing any work. */
+ ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+	/* No tasks should be left in the work queue now
+	that all the worker threads have completed. */
+ ut_a(srv_get_task_queue_length() == 0);
+}
+
+/******************************************************************//**
+Remove old historical changes from the rollback segments. */
+static
+void
+trx_purge_truncate(void)
+/*====================*/
+{
+ ut_ad(trx_purge_check_limit());
+
+ if (purge_sys->limit.trx_no == 0) {
+ trx_purge_truncate_history(&purge_sys->iter, purge_sys->view);
+ } else {
+ trx_purge_truncate_history(&purge_sys->limit, purge_sys->view);
+ }
+}
+
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(
+/*======*/
+ ulint n_purge_threads, /*!< in: number of purge tasks
+ to submit to the work queue */
+ ulint batch_size, /*!< in: the maximum number of records
+ to purge in one batch */
+ bool truncate) /*!< in: truncate history if true */
+{
+ que_thr_t* thr = NULL;
+ ulint n_pages_handled;
+
+ ut_a(n_purge_threads > 0);
+
+ srv_dml_needed_delay = trx_purge_dml_delay();
+
+	/* All the tasks submitted so far should have completed. */
+ ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ purge_sys->view = NULL;
+
+ mem_heap_empty(purge_sys->heap);
+
+ purge_sys->view = read_view_purge_open(purge_sys->heap);
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+#ifdef UNIV_DEBUG
+ if (srv_purge_view_update_only_debug) {
+ return(0);
+ }
+#endif
+
+ /* Fetch the UNDO recs that need to be purged. */
+ n_pages_handled = trx_purge_attach_undo_recs(
+ n_purge_threads, purge_sys, &purge_sys->limit, batch_size);
+
+	/* Do we do an asynchronous purge or not? */
+ if (n_purge_threads > 1) {
+ ulint i = 0;
+
+ /* Submit the tasks to the work queue. */
+ for (i = 0; i < n_purge_threads - 1; ++i) {
+ thr = que_fork_scheduler_round_robin(
+ purge_sys->query, thr);
+
+ ut_a(thr != NULL);
+
+ srv_que_task_enqueue_low(thr);
+ }
+
+ thr = que_fork_scheduler_round_robin(purge_sys->query, thr);
+ ut_a(thr != NULL);
+
+ purge_sys->n_submitted += n_purge_threads - 1;
+
+ goto run_synchronously;
+
+ /* Do it synchronously. */
+ } else {
+ thr = que_fork_scheduler_round_robin(purge_sys->query, NULL);
+ ut_ad(thr);
+
+run_synchronously:
+ ++purge_sys->n_submitted;
+
+ que_run_threads(thr);
+
+ os_atomic_inc_ulint(
+ &purge_sys->bh_mutex, &purge_sys->n_completed, 1);
+
+ if (n_purge_threads > 1) {
+ trx_purge_wait_for_workers_to_complete(purge_sys);
+ }
+ }
+
+ ut_a(purge_sys->n_submitted == purge_sys->n_completed);
+
+#ifdef UNIV_DEBUG
+ rw_lock_x_lock(&purge_sys->latch);
+ if (purge_sys->limit.trx_no == 0) {
+ purge_sys->done = purge_sys->iter;
+ } else {
+ purge_sys->done = purge_sys->limit;
+ }
+ rw_lock_x_unlock(&purge_sys->latch);
+#endif /* UNIV_DEBUG */
+
+ if (truncate) {
+ trx_purge_truncate();
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_INVOKED, 1);
+ MONITOR_INC_VALUE(MONITOR_PURGE_N_PAGE_HANDLED, n_pages_handled);
+
+ return(n_pages_handled);
+}
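+
+/* Usage sketch (illustrative, not in the original source; the exact
+caller lives in the server layer, and the variable names here are
+assumed): the purge coordinator typically drives batches along the
+lines of
+
+	n_pages_purged = trx_purge(srv_n_purge_threads,
+				   srv_purge_batch_size, false);
+
+passing truncate = true periodically so that trx_purge_truncate()
+can free up the history list. */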
+
+/*******************************************************************//**
+Get the purge state.
+@return purge state. */
+UNIV_INTERN
+purge_state_t
+trx_purge_state(void)
+/*=================*/
+{
+ purge_state_t state;
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ state = purge_sys->state;
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ return(state);
+}
+
+/*******************************************************************//**
+Stop purge and wait for it to stop, move to PURGE_STATE_STOP. */
+UNIV_INTERN
+void
+trx_purge_stop(void)
+/*================*/
+{
+ purge_state_t state;
+ ib_int64_t sig_count = os_event_reset(purge_sys->event);
+
+ ut_a(srv_n_purge_threads > 0);
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ ut_a(purge_sys->state != PURGE_STATE_INIT);
+ ut_a(purge_sys->state != PURGE_STATE_EXIT);
+ ut_a(purge_sys->state != PURGE_STATE_DISABLED);
+
+ ++purge_sys->n_stop;
+
+ state = purge_sys->state;
+
+ if (state == PURGE_STATE_RUN) {
+ ib_logf(IB_LOG_LEVEL_INFO, "Stopping purge");
+
+		/* We need to wake up the purge thread in case it is
+		suspended, so that it can acknowledge the state change. */
+
+ srv_purge_wakeup();
+ }
+
+ purge_sys->state = PURGE_STATE_STOP;
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ if (state != PURGE_STATE_STOP) {
+
+ /* Wait for purge coordinator to signal that it
+ is suspended. */
+ os_event_wait_low(purge_sys->event, sig_count);
+ } else {
+ bool once = true;
+
+ rw_lock_x_lock(&purge_sys->latch);
+
+ /* Wait for purge to signal that it has actually stopped. */
+ while (purge_sys->running) {
+
+ if (once) {
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Waiting for purge to stop");
+ once = false;
+ }
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ os_thread_sleep(10000);
+
+ rw_lock_x_lock(&purge_sys->latch);
+ }
+
+ rw_lock_x_unlock(&purge_sys->latch);
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_STOP_COUNT, 1);
+}
+
+/*******************************************************************//**
+Resume purge, move to PURGE_STATE_RUN. */
+UNIV_INTERN
+void
+trx_purge_run(void)
+/*===============*/
+{
+ rw_lock_x_lock(&purge_sys->latch);
+
+	switch (purge_sys->state) {
+ case PURGE_STATE_INIT:
+ case PURGE_STATE_EXIT:
+ case PURGE_STATE_DISABLED:
+ ut_error;
+
+ case PURGE_STATE_RUN:
+ case PURGE_STATE_STOP:
+ break;
+ }
+
+ if (purge_sys->n_stop > 0) {
+
+ ut_a(purge_sys->state == PURGE_STATE_STOP);
+
+ --purge_sys->n_stop;
+
+ if (purge_sys->n_stop == 0) {
+
+ ib_logf(IB_LOG_LEVEL_INFO, "Resuming purge");
+
+ purge_sys->state = PURGE_STATE_RUN;
+ }
+
+ MONITOR_INC_VALUE(MONITOR_PURGE_RESUME_COUNT, 1);
+ } else {
+ ut_a(purge_sys->state == PURGE_STATE_RUN);
+ }
+
+ rw_lock_x_unlock(&purge_sys->latch);
+
+ srv_purge_wakeup();
+}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
new file mode 100644
index 00000000000..a698b37c2a6
--- /dev/null
+++ b/storage/innobase/trx/trx0rec.cc
@@ -0,0 +1,1656 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.cc
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "read0read.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of the inserted undo log record on the undo log
+page. */
+UNIV_INLINE
+void
+trx_undof_page_add_undo_rec_log(
+/*============================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint old_free, /*!< in: start offset of the inserted entry */
+ ulint new_free, /*!< in: end offset of the entry */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ const byte* log_end;
+ ulint len;
+
+ log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN);
+
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN];
+ log_ptr = mlog_write_initial_log_record_fast(
+ undo_page, MLOG_UNDO_INSERT, log_ptr, mtr);
+ len = new_free - old_free - 4;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+
+ if (log_ptr + len <= log_end) {
+ memcpy(log_ptr, undo_page + old_free + 2, len);
+ mlog_close(mtr, log_ptr + len);
+ } else {
+ mlog_close(mtr, log_ptr);
+ mlog_catenate_string(mtr, undo_page + old_free + 2, len);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page) /*!< in: page or NULL */
+{
+ ulint len;
+ byte* rec;
+ ulint first_free;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ len = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ if (page == NULL) {
+
+ return(ptr + len);
+ }
+
+ first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ rec = page + first_free;
+
+ mach_write_to_2(rec, first_free + 4 + len);
+ mach_write_to_2(rec + 2 + len, first_free);
+
+ mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ first_free + 4 + len);
+ ut_memcpy(rec + 2, ptr, len);
+
+ return(ptr + len);
+}
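+
+/* Illustrative layout (not in the original source): after the writes
+above, the undo record occupies the page as follows:
+
+	first_free                         first_free + 4 + len
+	|                                  |
+	v                                  v
+	+-------+------------------+-------+
+	| next  | undo record body | start |
+	| (2 B) | (len bytes)      | (2 B) |
+	+-------+------------------+-------+
+
+"next" holds the offset just past this record (where the next record
+will begin), and "start" holds first_free, the offset of this record's
+own beginning, so the records can also be traversed backwards. */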
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Calculates the free space left for extending an undo log record.
+@return bytes left */
+UNIV_INLINE
+ulint
+trx_undo_left(
+/*==========*/
+ const page_t* page, /*!< in: undo log page */
+ const byte* ptr) /*!< in: pointer to page */
+{
+ /* The '- 10' is a safety margin, in case we have some small
+ calculation error below */
+
+ return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END);
+}
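+
+/* Worked example (illustrative, not in the original source): with the
+default UNIV_PAGE_SIZE of 16384 bytes and a FIL_PAGE_DATA_END trailer
+of 8 bytes, a ptr at page offset 1000 leaves
+16384 - 1000 - 10 - 8 = 15366 bytes for extending the record. */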
+
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written up to ptr. Update the first free value by the number of
+bytes written for this undo record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+ulint
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ page_t* undo_page, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free; /*!< offset within undo_page */
+ ulint end_of_rec; /*!< offset within undo_page */
+ byte* ptr_to_first_free;
+ /* pointer within undo_page
+ that points to the next free
+					offset value within undo_page. */
+
+ ut_ad(ptr > undo_page);
+ ut_ad(ptr < undo_page + UNIV_PAGE_SIZE);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) {
+
+ return(0);
+ }
+
+ ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
+
+ first_free = mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+
+ end_of_rec = ptr - undo_page;
+
+ /* Write offset of the next undo log record */
+ mach_write_to_2(undo_page + first_free, end_of_rec);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+
+ /* Write this log entry to the UNDO log */
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ end_of_rec, mtr);
+
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reports in the undo log of an insert of a clustered index record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free;
+ byte* ptr;
+ ulint i;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+ if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) {
+
+ /* Not enough space for writing the general parameters */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_ull_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_ull_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ bool* updated_extern, /*!< out: true if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ table_id_t* table_id) /*!< out: table id */
+{
+ byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
+
+ *updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
+ type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
+
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_ull_read_much_compressed(ptr);
+ ptr += mach_ull_get_much_compressed_size(*undo_no);
+
+ *table_id = mach_ull_read_much_compressed(ptr);
+ ptr += mach_ull_get_much_compressed_size(*table_id);
+
+ return(ptr);
+}
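+
+/* Worked example (illustrative, not in the original source), assuming
+the usual constants TRX_UNDO_CMPL_INFO_MULT == 16, TRX_UNDO_UPD_EXTERN
+== 128 and TRX_UNDO_UPD_EXIST_REC == 12: a stored type byte of
+12 + 1 * 16 + 128 == 156 decodes above into *updated_extern == true,
+*cmpl_info == 1 and *type == TRX_UNDO_UPD_EXIST_REC. */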
+
+/**********************************************************************//**
+Reads from an undo log record a stored column value.
+@return remaining part of undo log record after reading these values */
+static
+byte*
+trx_undo_rec_get_col_val(
+/*=====================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ byte** field, /*!< out: pointer to stored field */
+ ulint* len, /*!< out: length of the field, or UNIV_SQL_NULL */
+ ulint* orig_len)/*!< out: original length of the locally
+ stored part of an externally stored column, or 0 */
+{
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*orig_len);
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+ *field = ptr;
+ ptr += *len;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ /* we do not have access to index->table here
+ ut_ad(dict_table_get_format(index->table) >= UNIV_FORMAT_B
+ || *len >= col->max_prefix
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += *len - UNIV_EXTERN_STORAGE_FIELD;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return(ptr);
+}
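+
+/* Illustrative note (not in the original source): the compressed length
+read above is overloaded. UNIV_SQL_NULL marks a NULL column; a value of
+exactly UNIV_EXTERN_STORAGE_FIELD marks an externally stored column whose
+longer prefix follows (original length, then real length, then the data);
+any other value >= UNIV_EXTERN_STORAGE_FIELD means an externally stored
+column whose locally stored part is len - UNIV_EXTERN_STORAGE_FIELD bytes
+long, and smaller values are plain in-record lengths. */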
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ *ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(*ref, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ dfield = dtuple_get_nth_field(*ref, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Fetch a prefix of an externally stored column, for writing to the undo log
+of an update or delete marking of a clustered index record.
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+/*====================*/
+ byte* ext_buf, /*!< in: buffer to hold the prefix
+ data and BLOB pointer */
+ ulint prefix_len, /*!< in: prefix size to store
+ in the undo log */
+	ulint		zip_size,	/*!< in: compressed page size in bytes,
+ or 0 for uncompressed BLOB */
+ const byte* field, /*!< in: an externally stored column */
+ ulint* len) /*!< in: length of field;
+ out: used length of ext_buf */
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, prefix_len, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
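+
+/* Worked example (illustrative, not in the original source): for an
+ordering BLOB column in an Antelope-format table, prefix_len would
+typically be REC_ANTELOPE_MAX_INDEX_COL_LEN (768), so on return *len is
+at most 768 + BTR_EXTERN_FIELD_REF_SIZE (20) = 788 bytes: the fetched
+prefix followed by the 20-byte BLOB pointer copied above. */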
+
+/**********************************************************************//**
+Writes to the undo log a prefix of an externally stored column.
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+/*============================*/
+ byte* ptr, /*!< in: undo log position,
+ at least 15 bytes must be available */
+ byte* ext_buf, /*!< in: a buffer of
+ DICT_MAX_FIELD_LEN_BY_FORMAT() size,
+ or NULL when should not fetch
+ a longer prefix */
+	ulint		prefix_len,	/*!< in: prefix size to store in the
+					undo log */
+	ulint		zip_size,	/*!< in: compressed page size in bytes,
+					or 0 for uncompressed BLOB */
+ const byte** field, /*!< in/out: the locally stored part of
+ the externally stored column */
+ ulint* len) /*!< in/out: length of field, in bytes */
+{
+ if (ext_buf) {
+ ut_a(prefix_len > 0);
+
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, prefix_len, zip_size,
+ *field, len);
+
+ ptr += mach_write_compressed(ptr, *len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reports in the undo log of an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if it
+succeeded, 0 on failure */
+static
+ulint
+trx_undo_page_report_modify(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_table_t* table;
+ ulint first_free;
+ byte* ptr;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_VERSION_56_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
+ table = index->table;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+ if (trx_undo_left(undo_page, ptr) < 50) {
+
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ ut_ad(!rec_get_deleted_flag(rec, dict_table_is_comp(table)));
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_ull_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_ull_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+ /* If it is an update of a delete marked record, then we are
+ allowed to ignore blob prefixes if the delete marking was done
+ by some other trx as it must have committed by now for us to
+ allow an over-write. */
+ if (ignore_prefix) {
+ ignore_prefix = (trx_id != trx->id);
+ }
+ ptr += mach_ull_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_ROLL_PTR), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+
+ ptr += mach_ull_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, upd_get_n_fields(update));
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ ulint pos = upd_get_nth_field(update, i)->field_no;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos, &flen);
+
+ if (trx_undo_left(undo_page, ptr) < 15) {
+
+ return(0);
+ }
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, pos);
+ ulint prefix_len
+ = dict_max_field_len_store_undo(
+ table, col);
+
+ ut_ad(prefix_len + BTR_EXTERN_FIELD_REF_SIZE
+ <= sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ col->ord_part
+ && !ignore_prefix
+ && flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL, prefix_len,
+ dict_table_zip_size(table),
+ &field, &flen);
+
+ /* Notify purge that it eventually has to
+ free the old externally stored field */
+
+ trx->update_undo->del_marks = TRUE;
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ /*----------------------------------------*/
+ /* In the case of a delete marking, and also in the case of an update
+ where any ordering field of any index changes, store the values of all
+ columns which occur as ordering fields in any index. This info is used
+ in the purge of old versions where we use it to build and search the
+ delete marked index records, to look if we can remove them from the
+index tree. Note that, starting from 4.0.14, externally stored fields
+can also be ordering fields in some index. Starting from 5.2, we no
+longer store the first REC_MAX_INDEX_COL_LEN bytes of the column in
+the undo log record,
+ but we can construct the column prefix fields in the index by
+ fetching the first page of the BLOB that is pointed to by the
+ clustered index. This works also in crash recovery, because all pages
+ (including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+
+ trx->update_undo->del_marks = TRUE;
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (col->ord_part) {
+ ulint pos;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5 + 15) {
+
+ return(0);
+ }
+
+ pos = dict_index_get_nth_col_pos(index,
+ col_no);
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos,
+ &flen);
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ const dict_col_t* col =
+ dict_index_get_nth_col(
+ index, pos);
+ ulint prefix_len =
+ dict_max_field_len_store_undo(
+ table, col);
+
+ ut_a(prefix_len < sizeof ext_buf);
+
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_ANTELOPE_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL, prefix_len,
+ dict_table_zip_size(table),
+ &field, &flen);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr)
+ < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ mach_write_to_2(old_ptr, ptr - old_ptr);
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_page, ptr) < 2) {
+
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+ mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+ mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ ptr - undo_page);
+
+ /* Write to the REDO log about this change in the UNDO log */
+
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ ptr - undo_page, mtr);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = mach_read_from_1(ptr);
+ ptr += 1;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_ull_read_compressed(ptr);
+ ptr += mach_ull_get_compressed_size(*trx_id);
+
+ *roll_ptr = mach_ull_read_compressed(ptr);
+ ptr += mach_ull_get_compressed_size(*roll_ptr);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record the number of updated fields.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_n_upd_fields(
+/*=================================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* n) /*!< out: number of fields */
+{
+ *n = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*n);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record a stored field number.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_field_no(
+/*=============================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* field_no)/*!< out: field number */
+{
+ *field_no = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*field_no);
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error is detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ ulint i;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields);
+ } else {
+ n_fields = 0;
+ }
+
+ update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_TRX_ID_LEN));
+
+ trx_write_trx_id(buf, trx_id);
+
+ upd_field_set_field_no(upd_field,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+
+ buf = static_cast<byte*>(mem_heap_alloc(heap, DATA_ROLL_PTR_LEN));
+
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(
+ upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (i = 0; i < n_fields; i++) {
+
+ byte* field;
+ ulint len;
+ ulint field_no;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ if (field_no >= dict_index_get_n_fields(index)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access"
+ " update undo rec field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index has only %lu fields\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Run also CHECK TABLE ",
+ (ulong) dict_index_get_n_fields(index));
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fprintf(stderr, "\n"
+ "InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
+ (ulong) n_fields, (ulong) i, ptr);
+ ut_ad(0);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ upd_field_set_field_no(upd_field, field_no, index, trx);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = orig_len;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+ }
+
+ *upd = update;
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record, for purge.
+It contains the columns which occur as ordering in any index of the table.
+Any missing columns are indicated by col->mtype == DATA_MISSING.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ ulint row_len;
+
+ ut_ad(index);
+ ut_ad(ptr);
+ ut_ad(row);
+ ut_ad(heap);
+ ut_ad(dict_index_is_clust(index));
+
+ row_len = dict_table_get_n_cols(index->table);
+
+ *row = dtuple_create(heap, row_len);
+
+ /* Mark all columns in the row uninitialized, so that
+ we can distinguish missing fields from fields that are SQL NULL. */
+ for (ulint i = 0; i < row_len; i++) {
+ dfield_get_type(dtuple_get_nth_field(*row, i))
+ ->mtype = DATA_MISSING;
+ }
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ byte* field;
+ ulint field_no;
+ const dict_col_t* col;
+ ulint col_no;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ col = dict_index_get_nth_col(index, field_no);
+ col_no = dict_col_get_no(col);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield = dtuple_get_nth_field(*row, col_no);
+ dict_col_copy_type(
+ dict_table_get_nth_col(index->table, col_no),
+ dfield_get_type(dfield));
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_len(dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ dfield_set_ext(dfield);
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part) {
+ ut_a(dfield_get_len(dfield)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_get_format(index->table)
+ >= UNIV_FORMAT_B
+ || dfield_get_len(dfield)
+ >= REC_ANTELOPE_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Erases the unused undo log page end.
+@return TRUE if the page contained something, FALSE if it was empty */
+static __attribute__((nonnull))
+ibool
+trx_undo_erase_page_end(
+/*====================*/
+ page_t* undo_page, /*!< in/out: undo page whose end to erase */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint first_free;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ memset(undo_page + first_free, 0xff,
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free);
+
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+ return(first_free != TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+}
+
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (page == NULL) {
+
+ return(ptr);
+ }
+
+ trx_undo_erase_page_end(page, mtr);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: in case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the
+ inserted undo log record,
+ 0 if BTR_NO_UNDO_LOG
+ flag was specified */
+{
+ trx_t* trx;
+ trx_undo_t* undo;
+ ulint page_no;
+ buf_block_t* undo_block;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+#ifdef UNIV_DEBUG
+ int loop_count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!srv_read_only_mode);
+ ut_a(dict_index_is_clust(index));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+
+ *roll_ptr = 0;
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(thr);
+ ut_ad((op_type != TRX_UNDO_INSERT_OP)
+ || (clust_entry && !update && !rec));
+
+ trx = thr_get_trx(thr);
+
+ /* This table is visible only to the session that created it. */
+ if (trx->read_only) {
+ ut_ad(!srv_read_only_mode);
+ /* MySQL should block writes to non-temporary tables. */
+ ut_a(DICT_TF2_FLAG_IS_SET(index->table, DICT_TF2_TEMPORARY));
+ if (trx->rseg == 0) {
+ trx_assign_rseg(trx);
+ }
+ }
+
+ rseg = trx->rseg;
+
+ mtr_start(&mtr);
+ mutex_enter(&trx->undo_mutex);
+
+ /* If the undo log is not assigned yet, assign one */
+
+ switch (op_type) {
+ case TRX_UNDO_INSERT_OP:
+ undo = trx->insert_undo;
+
+ if (undo == NULL) {
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+ undo = trx->insert_undo;
+
+ if (undo == NULL) {
+ /* Did not succeed */
+ ut_ad(err != DB_SUCCESS);
+ goto err_exit;
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ }
+ break;
+ default:
+ ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+
+ undo = trx->update_undo;
+
+ if (undo == NULL) {
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+ undo = trx->update_undo;
+
+ if (undo == NULL) {
+ /* Did not succeed */
+ ut_ad(err != DB_SUCCESS);
+ goto err_exit;
+ }
+ }
+
+ ut_ad(err == DB_SUCCESS);
+ }
+
+ page_no = undo->last_page_no;
+ undo_block = buf_page_get_gen(
+ undo->space, undo->zip_size, page_no, RW_X_LATCH,
+ undo->guess_block, BUF_GET, __FILE__, __LINE__, &mtr);
+ buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+ do {
+ page_t* undo_page;
+ ulint offset;
+
+ undo_page = buf_block_get_frame(undo_block);
+ ut_ad(page_no == buf_block_get_page_no(undo_block));
+
+ switch (op_type) {
+ case TRX_UNDO_INSERT_OP:
+ offset = trx_undo_page_report_insert(
+ undo_page, trx, index, clust_entry, &mtr);
+ break;
+ default:
+ ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+ offset = trx_undo_page_report_modify(
+ undo_page, trx, index, rec, offsets, update,
+ cmpl_info, &mtr);
+ }
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ /* The record did not fit on the page. We erase the
+ end segment of the undo log page and write a log
+			record of it: this is to ensure that in the debug
+			version the replica page constructed using the log
+			records stays identical to the original page */
+
+ if (!trx_undo_erase_page_end(undo_page, &mtr)) {
+ /* The record did not fit on an empty
+ undo page. Discard the freshly allocated
+ page and return an error. */
+
+ /* When we remove a page from an undo
+ log, this is analogous to a
+ pessimistic insert in a B-tree, and we
+ must reserve the counterpart of the
+ tree latch, which is the rseg
+ mutex. We must commit the mini-transaction
+ first, because it may be holding lower-level
+ latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+
+ mutex_enter(&rseg->mutex);
+ trx_undo_free_last_page(trx, undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ err = DB_UNDO_RECORD_TOO_BIG;
+ goto err_exit;
+ }
+
+ mtr_commit(&mtr);
+ } else {
+ /* Success */
+
+ mtr_commit(&mtr);
+
+ undo->empty = FALSE;
+ undo->top_page_no = page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no;
+ undo->guess_block = undo_block;
+
+ trx->undo_no++;
+
+ mutex_exit(&trx->undo_mutex);
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ op_type == TRX_UNDO_INSERT_OP,
+ rseg->id, page_no, offset);
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(page_no == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ ut_ad(++loop_count < 2);
+ mtr_start(&mtr);
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&rseg->mutex);
+ undo_block = trx_undo_add_page(trx, undo, &mtr);
+ mutex_exit(&rseg->mutex);
+
+ page_no = undo->last_page_no;
+ } while (undo_block != NULL);
+
+ /* Did not succeed: out of space */
+ err = DB_OUT_OF_FILE_SPACE;
+
+err_exit:
+ mutex_exit(&trx->undo_mutex);
+ mtr_commit(&mtr);
+ return(err);
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ const page_t* undo_page;
+ trx_rseg_t* rseg;
+ ibool is_insert;
+ mtr_t mtr;
+
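+
+	/* Decode the roll pointer into rollback segment id, undo page
+	number and byte offset, then copy the record from the S-latched
+	undo page into the caller's heap. */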
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ rseg = trx_rseg_get_on_id(rseg_id);
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ page_no, &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page + offset, heap);
+
+ mtr_commit(&mtr);
+
+ return(undo_rec);
+}
+
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page.
+
+@retval true if the undo log has been
+truncated and we cannot fetch the old version
+@retval false if the undo log record is available */
+static __attribute__((nonnull, warn_unused_result))
+bool
+trx_undo_get_undo_rec(
+/*==================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ trx_id_t trx_id, /*!< in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t**undo_rec, /*!< out, own: copy of the record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ bool missing_history;
+
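+	/* Hold the purge system latch in S mode across the check and the
+	copy, so that purge cannot truncate the undo log in between. */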
+ rw_lock_s_lock(&purge_sys->latch);
+ missing_history = read_view_sees_trx_id(purge_sys->view, trx_id);
+
+ if (!missing_history) {
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+
+ rw_lock_s_unlock(&purge_sys->latch);
+
+ return(missing_history);
+}
+
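+/* ATTRIB_USED_ONLY_IN_DEBUG marks function parameters that are referenced
+only by debug assertions; in non-debug builds the attribute suppresses
+unused-parameter warnings. */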
+#ifdef UNIV_DEBUG
+#define ATTRIB_USED_ONLY_IN_DEBUG
+#else /* UNIV_DEBUG */
+#define ATTRIB_USED_ONLY_IN_DEBUG __attribute__((unused))
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. The caller must
+hold a latch on the index page of the clustered index record.
+@retval true if previous version was built, or if it was an insert
+or the table has been rebuilt
+@retval false if the previous version is earlier than purge_view,
+which means that it may have been removed */
+UNIV_INTERN
+bool
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers)/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ ulint info_bits;
+ ulint cmpl_info;
+ bool dummy_extern;
+ byte* buf;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&purge_sys->latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(index_mtr, index_rec,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_a(dict_index_is_clust(index));
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+ /* The record rec is the first inserted version */
+ return(true);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap)) {
+		/* The undo record may already have been purged;
+		this can happen during purge or a semi-consistent read. */
+ return(false);
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ if (table_id != index->table->id) {
+ /* The table should have been rebuilt, but purge has
+ not yet removed the undo log records for the
+ now-dropped old table (table_id). */
+ return(true);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys->view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys->view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ NULL, heap, &update);
+ ut_a(ptr);
+
+# if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+ ut_a(!rec_offs_any_null_extern(rec, offsets));
+# endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
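+	/* If the update changes any stored field size or any external
+	storage flag, build the previous version as a new physical record;
+	otherwise produce it by copying rec and undoing the update in
+	place. */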
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ ulint n_ext;
+
+ /* We should confirm the existence of disowned external data,
+ if the previous version record is delete marked. If the trx_id
+ of the previous record is seen by purge view, we should treat
+ it as missing history, because the disowned external data
+ might be purged already.
+
+ The inherited external data (BLOBs) can be freed (purged)
+ after trx_id was committed, provided that no view was started
+ before trx_id. If the purge view can see the committed
+ delete-marked record by trx_id, no transactions need to access
+ the BLOB. */
+
+		/* The row_upd_changes_disowned_external(update) call could
+		be omitted, but the synchronization on purge_sys->latch is
+		likely more expensive. */
+
+ if ((update->info_bits & REC_INFO_DELETED_FLAG)
+ && row_upd_changes_disowned_external(update)) {
+ bool missing_extern;
+
+ rw_lock_s_lock(&purge_sys->latch);
+ missing_extern = read_view_sees_trx_id(purge_sys->view,
+ trx_id);
+ rw_lock_s_unlock(&purge_sys->latch);
+
+ if (missing_extern) {
+				/* Treat this as a fresh insert, so as
+				not to trigger an assertion failure in
+				the caller. */
+ return(true);
+ }
+ }
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(
+ rec, index, offsets, &n_ext, heap);
+ n_ext += btr_push_update_extern_fields(entry, update, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ row_upd_index_replace_new_col_vals(entry, index, update, heap);
+
+ buf = static_cast<byte*>(
+ mem_heap_alloc(
+ heap,
+ rec_get_converted_size(index, entry, n_ext)));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = static_cast<byte*>(
+ mem_heap_alloc(heap, rec_offs_size(offsets)));
+
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, offsets);
+ row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
+ }
+
+ return(true);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
new file mode 100644
index 00000000000..bc11f1d76bd
--- /dev/null
+++ b/storage/innobase/trx/trx0roll.cc
@@ -0,0 +1,1391 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.cc
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#ifdef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "usr0sess.h"
+#include "srv0start.h"
+#include "read0read.h"
+#include "row0undo.h"
+#include "row0mysql.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include "srv0mon.h"
+#include "trx0sys.h"
+
+/** This many pages must be undone before a truncate is tried within
+rollback */
+#define TRX_ROLL_TRUNC_THRESHOLD 1
+
+/** true if trx_rollback_or_clean_all_recovered() thread is active */
+bool trx_rollback_or_clean_is_active;
+
+/** In crash recovery, the current trx to be rolled back; NULL otherwise */
+static const trx_t* trx_roll_crash_recv_trx = NULL;
+
+/** In crash recovery we set this to the undo n:o of the current trx to be
+rolled back, so that we can print the percentage of rollback progress. */
+static undo_no_t trx_roll_max_undo_no;
+
+/** Auxiliary variable which tracks the last progress percentage we printed */
+static ulint trx_roll_progress_printed_pct;
+
+/****************************************************************//**
+Finishes a transaction rollback. */
+static
+void
+trx_rollback_finish(
+/*================*/
+ trx_t* trx); /*!< in: transaction */
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL. */
+static
+void
+trx_rollback_to_savepoint_low(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+{
+ que_thr_t* thr;
+ mem_heap_t* heap;
+ roll_node_t* roll_node;
+
+ heap = mem_heap_create(512);
+
+ roll_node = roll_node_create(heap);
+
+ if (savept != NULL) {
+ roll_node->partial = TRUE;
+ roll_node->savept = *savept;
+ assert_trx_in_list(trx);
+ } else {
+ assert_trx_nonlocking_or_in_list(trx);
+ }
+
+ trx->error_state = DB_SUCCESS;
+
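+	/* Only build and execute an undo query graph if the transaction
+	actually wrote undo log; otherwise there is nothing to undo. */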
+ if (trx->insert_undo || trx->update_undo) {
+ thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(
+ static_cast<que_fork_t*>(que_node_get_parent(thr))));
+
+ que_run_threads(thr);
+
+ ut_a(roll_node->undo_thr != NULL);
+ que_run_threads(roll_node->undo_thr);
+
+ /* Free the memory reserved by the undo graph. */
+ que_graph_free(static_cast<que_t*>(
+ roll_node->undo_thr->common.parent));
+ }
+
+ if (savept == NULL) {
+ trx_rollback_finish(trx);
+ MONITOR_INC(MONITOR_TRX_ROLLBACK);
+ } else {
+ trx->lock.que_state = TRX_QUE_RUNNING;
+ MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
+ }
+
+ ut_a(trx->error_state == DB_SUCCESS);
+ ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
+
+ mem_heap_free(heap);
+
+ /* There might be work for utility threads.*/
+ srv_active_wake_master_thread();
+
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+}
+
+/*******************************************************************//**
+Rollback a transaction to a given savepoint or do a complete rollback.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_to_savepoint(
+/*======================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+{
+ ut_ad(!trx_mutex_own(trx));
+
+ trx_start_if_not_started_xa(trx);
+
+ trx_rollback_to_savepoint_low(trx, savept);
+
+ return(trx->error_state);
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+static
+dberr_t
+trx_rollback_for_mysql_low(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions,
+ then the transaction object does not have an InnoDB session
+ object, and we set a dummy session that we use for all MySQL
+ transactions. */
+
+ trx_rollback_to_savepoint_low(trx, NULL);
+
+ trx->op_info = "";
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ return(trx->error_state);
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx_sys->mutex
+ here, because the rollback should be invoked for a running
+ active MySQL transaction (or recovered prepared transaction)
+ that is associated with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ut_ad(trx->in_mysql_trx_list);
+ return(DB_SUCCESS);
+
+ case TRX_STATE_ACTIVE:
+ ut_ad(trx->in_mysql_trx_list);
+ assert_trx_nonlocking_or_in_list(trx);
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_PREPARED:
+ ut_ad(!trx_is_autocommit_non_locking(trx));
+ return(trx_rollback_for_mysql_low(trx));
+
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ assert_trx_in_list(trx);
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ dberr_t err;
+
+ /* We are reading trx->state without holding trx_sys->mutex
+ here, because the statement rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->in_mysql_trx_list);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ return(DB_SUCCESS);
+ case TRX_STATE_ACTIVE:
+ assert_trx_nonlocking_or_in_list(trx);
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx_rollback_to_savepoint(
+ trx, &trx->last_sql_stat_start);
+
+ if (trx->fts_trx) {
+ fts_savepoint_rollback_last_stmt(trx);
+ }
+
+ /* The following call should not be needed,
+ but we play it safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The statement rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Search for a savepoint using name.
+@return savepoint if found else NULL */
+static
+trx_named_savept_t*
+trx_savepoint_find(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const char* name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+ if (0 == ut_strcmp(savep->name, name)) {
+ return(savep);
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+static
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+ mem_free(savep->name);
+ mem_free(savep);
+}
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+	trx_named_savept_t*	savep)	/*!< in: free all savepoints starting
+					with this savepoint */
+{
+ while (savep != NULL) {
+ trx_named_savept_t* next_savep;
+
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly
+inserted row holds a lock where the lock information is carried by the trx
+id stored in the row, these locks are naturally released in the rollback.
+Savepoints which were set after this savepoint are deleted.
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+trx_rollback_to_savepoint_for_mysql_low(
+/*====================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_named_savept_t* savep, /*!< in/out: savepoint */
+ ib_int64_t* mysql_binlog_cache_pos)
+ /*!< out: the MySQL binlog
+ cache position corresponding
+ to this savepoint; MySQL needs
+ this information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ dberr_t err;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->in_mysql_trx_list);
+
+ /* Free all savepoints strictly later than savep. */
+
+ trx_roll_savepoints_free(
+ trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx_rollback_to_savepoint(trx, &savep->savept);
+
+ /* Store the current undo_no of the transaction so that
+ we know where to roll back if we have to roll back the
+ next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly
+inserted row holds a lock where the lock information is carried by the trx
+id stored in the row, these locks are naturally released in the rollback.
+Savepoints which were set after this savepoint are deleted.
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+
+ /* We are reading trx->state without holding trx_sys->mutex
+ here, because the savepoint rollback should be invoked for a
+ running active MySQL transaction that is associated with the
+ current thread. */
+ ut_ad(trx->in_mysql_trx_list);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep == NULL) {
+ return(DB_NO_SAVEPOINT);
+ }
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: transaction has a savepoint ", stderr);
+ ut_print_name(stderr, trx, FALSE, savep->name);
+ fputs(" though it is not started\n", stderr);
+ return(DB_ERROR);
+ case TRX_STATE_ACTIVE:
+ return(trx_rollback_to_savepoint_for_mysql_low(
+ trx, savep, mysql_binlog_cache_pos));
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The savepoint rollback is only allowed on an ACTIVE
+ transaction, not a PREPARED or COMMITTED one. */
+ break;
+ }
+
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ trx_start_if_not_started_xa(trx);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+
+ mem_free(savep->name);
+ mem_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = static_cast<trx_named_savept_t*>(mem_alloc(sizeof(*savep)));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+ ut_ad(trx->in_mysql_trx_list);
+
+ savep = trx_savepoint_find(trx, savepoint_name);
+
+ if (savep != NULL) {
+ trx_roll_savepoint_free(trx, savep);
+ }
+
+ return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return(trx == trx_roll_crash_recv_trx);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ dict_table_t* table;
+ ib_int64_t rows_to_undo;
+ const char* unit = "";
+ ibool dictionary_locked = FALSE;
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ mutex_enter(&trx_sys->mutex);
+
+ trx_roll_crash_recv_trx = trx;
+
+ trx_roll_max_undo_no = trx->undo_no;
+
+ trx_roll_progress_printed_pct = 0;
+
+ rows_to_undo = trx_roll_max_undo_no;
+
+ mutex_exit(&trx_sys->mutex);
+
+ if (rows_to_undo > 1000000000) {
+ rows_to_undo = rows_to_undo / 1000000;
+ unit = "M";
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
+ " rows to undo\n",
+ trx->id,
+ (ulong) rows_to_undo, unit);
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ row_mysql_lock_data_dictionary(trx);
+ dictionary_locked = TRUE;
+ }
+
+ que_run_threads(thr);
+ ut_a(roll_node->undo_thr != NULL);
+
+ que_run_threads(roll_node->undo_thr);
+
+ trx_rollback_finish(thr_get_trx(roll_node->undo_thr));
+
+ /* Free the memory reserved by the undo graph */
+ que_graph_free(static_cast<que_t*>(
+ roll_node->undo_thr->common.parent));
+
+ ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ && trx->table_id != 0) {
+
+		/* If the transaction was for a dictionary operation,
+		we drop the relevant table only if it still exists and
+		is not flagged as DISCARDED. */
+
+ table = dict_table_open_on_id(
+ trx->table_id, dictionary_locked,
+ DICT_TABLE_OP_NORMAL);
+
+ if (table && !dict_table_is_discarded(table)) {
+
+ dberr_t err;
+
+			/* Ensure that the table doesn't get evicted from the
+			cache; this keeps the drop logic simple. */
+
+ if (table->can_be_evicted) {
+ dict_table_move_from_lru_to_non_lru(table);
+ }
+
+ dict_table_close(table, dictionary_locked, FALSE);
+
+ ib_logf(IB_LOG_LEVEL_WARN,
+ "Dropping table '%s', with id " UINT64PF " "
+ "in recovery",
+ table->name, trx->table_id);
+
+ err = row_drop_table_for_mysql(table->name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ ut_a(err == DB_SUCCESS);
+ }
+ }
+
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Rollback of trx with id " TRX_ID_FMT " completed", trx->id);
+
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+/*******************************************************************//**
+Rollback or clean up any resurrected incomplete transactions. It assumes
+that the caller holds the trx_sys_t::mutex and it will release the
+lock if it does a clean up or rollback.
+@return TRUE if the transaction was cleaned up or rolled back
+and trx_sys->mutex was released. */
+static
+ibool
+trx_rollback_resurrected(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction to rollback or clean */
+ ibool all) /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ /* The trx->is_recovered flag and trx->state are set
+ atomically under the protection of the trx->mutex (and
+ lock_sys->mutex) in lock_trx_release_locks(). We do not want
+ to accidentally clean up a non-recovered transaction here. */
+
+ trx_mutex_enter(trx);
+ bool is_recovered = trx->is_recovered;
+ trx_state_t state = trx->state;
+ trx_mutex_exit(trx);
+
+ if (!is_recovered) {
+ return(FALSE);
+ }
+
+ switch (state) {
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ mutex_exit(&trx_sys->mutex);
+ fprintf(stderr,
+ "InnoDB: Cleaning up trx with id " TRX_ID_FMT "\n",
+ trx->id);
+ trx_cleanup_at_db_startup(trx);
+ trx_free_for_background(trx);
+ return(TRUE);
+ case TRX_STATE_ACTIVE:
+ if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ mutex_exit(&trx_sys->mutex);
+ trx_rollback_active(trx);
+ trx_free_for_background(trx);
+ return(TRUE);
+ }
+ return(FALSE);
+ case TRX_STATE_PREPARED:
+ return(FALSE);
+ case TRX_STATE_NOT_STARTED:
+ break;
+ }
+
+ ut_error;
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+ ibool all) /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+{
+ trx_t* trx;
+
+ ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+
+ if (trx_sys_get_n_rw_trx() == 0) {
+
+ return;
+ }
+
+ if (all) {
+ fprintf(stderr,
+ "InnoDB: Starting in background the rollback"
+ " of uncommitted transactions\n");
+ }
+
+ /* Note: For XA recovered transactions, we rely on MySQL to
+ do rollback. They will be in TRX_STATE_PREPARED state. If the server
+	is shut down and they are still lingering in trx_sys_t::trx_list
+ then the shutdown will hang. */
+
+ /* Loop over the transaction list as long as there are
+ recovered transactions to clean up or recover. */
+
+ do {
+ mutex_enter(&trx_sys->mutex);
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ assert_trx_in_rw_list(trx);
+
+ /* If this function does a cleanup or rollback
+ then it will release the trx_sys->mutex, therefore
+ we need to reacquire it before retrying the loop. */
+
+ if (trx_rollback_resurrected(trx, all)) {
+
+ mutex_enter(&trx_sys->mutex);
+
+ break;
+ }
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+ } while (trx != NULL);
+
+ if (all) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rollback of non-prepared"
+ " transactions completed\n");
+ }
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(trx_rollback_or_clean_all_recovered)(
+/*================================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ ut_ad(!srv_read_only_mode);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_register_thread(trx_rollback_clean_thread_key);
+#endif /* UNIV_PFS_THREAD */
+
+ trx_rollback_or_clean_recovered(TRUE);
+
+ trx_rollback_or_clean_is_active = false;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always exit via os_thread_exit() and not by returning
+	from its start function. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*******************************************************************//**
+Creates an undo number array.
+@return own: undo number array */
+static
+trx_undo_arr_t*
+trx_undo_arr_create(
+/*================*/
+	ulint		n_cells)	/*!< in: number of cells */
+{
+ trx_undo_arr_t* arr;
+ mem_heap_t* heap;
+ ulint sz = sizeof(*arr) + sizeof(*arr->infos) * n_cells;
+
+ heap = mem_heap_create(sz);
+
+ arr = static_cast<trx_undo_arr_t*>(mem_heap_zalloc(heap, sz));
+
+ arr->n_cells = n_cells;
+
+ arr->infos = (trx_undo_inf_t*) (arr + 1);
+
+ arr->heap = heap;
+
+ return(arr);
+}
+
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ mem_heap_free(arr->heap);
+}
+
+/*******************************************************************//**
+Stores info of an undo log record to the array if it is not stored yet.
+@return FALSE if the record already existed in the array */
+static
+ibool
+trx_undo_arr_store_info(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ ulint i;
+ trx_undo_arr_t* arr;
+ ulint n = 0;
+ ulint n_used;
+ trx_undo_inf_t* stored_here = NULL;
+
+ arr = trx->undo_no_arr;
+ n_used = arr->n_used;
+
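+	/* Scan the array: claim the first free cell for undo_no, but if
+	some other cell already holds undo_no, release the claimed cell
+	again and report that the record is already being processed. */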
+ for (i = 0; i < arr->n_cells; i++) {
+ trx_undo_inf_t* cell;
+
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!cell->in_use) {
+ if (!stored_here) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ stored_here = cell;
+ }
+ } else {
+ n++;
+
+ if (cell->undo_no == undo_no) {
+
+ if (stored_here) {
+ stored_here->in_use = FALSE;
+ ut_ad(arr->n_used > 0);
+ arr->n_used--;
+ }
+
+ ut_ad(arr->n_used == n_used);
+
+ return(FALSE);
+ }
+ }
+
+ if (n == n_used && stored_here) {
+
+ ut_ad(arr->n_used == 1 + n_used);
+
+ return(TRUE);
+ }
+ }
+
+ ut_error;
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Removes an undo number from the array. */
+static
+void
+trx_undo_arr_remove_info(
+/*=====================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ ulint i;
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ trx_undo_inf_t* cell;
+
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use && cell->undo_no == undo_no) {
+ cell->in_use = FALSE;
+ ut_ad(arr->n_used > 0);
+ --arr->n_used;
+ break;
+ }
+ }
+}
+
+/*******************************************************************//**
+Gets the biggest undo number in an array.
+@return biggest value, 0 if the array is empty */
+static
+undo_no_t
+trx_undo_arr_get_biggest(
+/*=====================*/
+ const trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ ulint i;
+ undo_no_t biggest = 0;
+ ulint n_checked = 0;
+
+ for (i = 0; i < arr->n_cells && n_checked < arr->n_used; ++i) {
+
+ const trx_undo_inf_t* cell = &arr->infos[i];
+
+ if (cell->in_use) {
+
+ ++n_checked;
+
+ if (cell->undo_no > biggest) {
+
+ biggest = cell->undo_no;
+ }
+ }
+ }
+
+ return(biggest);
+}
+
+/***********************************************************************//**
+Tries to truncate the undo logs. */
+static
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ undo_no_t limit;
+ const trx_undo_arr_t* arr;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&((trx->rseg)->mutex)));
+
+ trx->pages_undone = 0;
+
+ arr = trx->undo_no_arr;
+
+ limit = trx->undo_no;
+
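+	/* Records with undo numbers >= limit will be truncated. If some
+	undo numbers are still being processed by query threads, raise the
+	limit above the biggest such number so that those records are
+	preserved. */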
+ if (arr->n_used > 0) {
+ undo_no_t biggest;
+
+ biggest = trx_undo_arr_get_biggest(arr);
+
+ if (biggest >= limit) {
+
+ limit = biggest + 1;
+ }
+ }
+
+ if (trx->insert_undo) {
+ trx_undo_truncate_end(trx, trx->insert_undo, limit);
+ }
+
+ if (trx->update_undo) {
+ trx_undo_truncate_end(trx, trx->update_undo, limit);
+ }
+}
+
+/***********************************************************************//**
+Pops the topmost undo log record in a single undo log and updates the info
+about the topmost record in the undo log memory struct.
+@return undo log record, the page s-latched */
+static
+trx_undo_rec_t*
+trx_roll_pop_top_rec(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ ulint offset;
+ trx_undo_rec_t* prev_rec;
+ page_t* prev_rec_page;
+
+ ut_ad(mutex_own(&trx->undo_mutex));
+
+ undo_page = trx_undo_page_get_s_latched(
+ undo->space, undo->zip_size, undo->top_page_no, mtr);
+
+ offset = undo->top_offset;
+
+ /* fprintf(stderr, "Thread %lu undoing trx " TRX_ID_FMT
+ " undo record " TRX_ID_FMT "\n",
+ os_thread_get_curr_id(), trx->id, undo->top_undo_no); */
+
+ prev_rec = trx_undo_get_prev_rec(
+ undo_page + offset, undo->hdr_page_no, undo->hdr_offset,
+ true, mtr);
+
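+	/* The record at top_offset is returned to the caller; here we only
+	move the cached top of the stack to its predecessor, which may
+	reside on a previous page. */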
+ if (prev_rec == NULL) {
+
+ undo->empty = TRUE;
+ } else {
+ prev_rec_page = page_align(prev_rec);
+
+ if (prev_rec_page != undo_page) {
+
+ trx->pages_undone++;
+ }
+
+ undo->top_page_no = page_get_page_no(prev_rec_page);
+ undo->top_offset = prev_rec - prev_rec_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+ }
+
+ return(undo_page + offset);
+}
+
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record into the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_t* undo;
+ trx_undo_t* ins_undo;
+ trx_undo_t* upd_undo;
+ trx_undo_rec_t* undo_rec;
+ trx_undo_rec_t* undo_rec_copy;
+ undo_no_t undo_no;
+ ibool is_insert;
+ trx_rseg_t* rseg;
+ ulint progress_pct;
+ mtr_t mtr;
+
+ rseg = trx->rseg;
+try_again:
+ mutex_enter(&(trx->undo_mutex));
+
+ if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
+ mutex_enter(&rseg->mutex);
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&rseg->mutex);
+ }
+
+ ins_undo = trx->insert_undo;
+ upd_undo = trx->update_undo;
+
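+	/* Treat the insert and update undo logs as a single stack ordered
+	by undo number: pop from the log whose top record has the bigger
+	undo number. */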
+ if (!ins_undo || ins_undo->empty) {
+ undo = upd_undo;
+ } else if (!upd_undo || upd_undo->empty) {
+ undo = ins_undo;
+ } else if (upd_undo->top_undo_no > ins_undo->top_undo_no) {
+ undo = upd_undo;
+ } else {
+ undo = ins_undo;
+ }
+
+ if (!undo || undo->empty || limit > undo->top_undo_no) {
+
+ if ((trx->undo_no_arr)->n_used == 0) {
+ /* Rollback is ending */
+
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(NULL);
+ }
+
+ is_insert = (undo == ins_undo);
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ is_insert, undo->rseg->id, undo->top_page_no, undo->top_offset);
+
+ mtr_start(&mtr);
+
+ undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
+
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ ut_ad(undo_no + 1 == trx->undo_no);
+
+	/* We print rollback progress info if we are in crash recovery
+	and the transaction has at least 1000 row operations to undo. */
+
+ if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) {
+
+ progress_pct = 100 - (ulint)
+ ((undo_no * 100) / trx_roll_max_undo_no);
+ if (progress_pct != trx_roll_progress_printed_pct) {
+ if (trx_roll_progress_printed_pct == 0) {
+ fprintf(stderr,
+ "\nInnoDB: Progress in percents:"
+ " %lu", (ulong) progress_pct);
+ } else {
+ fprintf(stderr,
+ " %lu", (ulong) progress_pct);
+ }
+ fflush(stderr);
+ trx_roll_progress_printed_pct = progress_pct;
+ }
+ }
+
+ trx->undo_no = undo_no;
+
+ if (!trx_undo_arr_store_info(trx, undo_no)) {
+ /* A query thread is already processing this undo log record */
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ goto try_again;
+ }
+
+ undo_rec_copy = trx_undo_rec_copy(undo_rec, heap);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ return(undo_rec_copy);
+}
+
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread obtains the undo log record by some means
+other than the pop function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number of the record */
+{
+ ibool ret;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ ret = trx_undo_arr_store_info(trx, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_arr_t* arr;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ arr = trx->undo_no_arr;
+
+ trx_undo_arr_remove_info(arr, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+static
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ ut_ad(trx_mutex_own(trx));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Starts a rollback operation, creates the UNDO graph that will do the
+actual undo operation.
+@return query graph thread that will perform the UNDO operations. */
+static
+que_thr_t*
+trx_rollback_start(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ ib_id_t roll_limit) /*!< in: rollback to undo no (for
+ partial undo), 0 if we are rolling back
+ the entire transaction */
+{
+ que_t* roll_graph;
+
+ ut_ad(trx_mutex_own(trx));
+
+ ut_ad(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+ /* Initialize the rollback field in the transaction */
+
+ trx->roll_limit = roll_limit;
+
+ ut_a(trx->roll_limit <= trx->undo_no);
+
+ trx->pages_undone = 0;
+
+ if (trx->undo_no_arr == NULL) {
+		/* A single query thread performs the rollback,
+		so one cell suffices. */
+ trx->undo_no_arr = trx_undo_arr_create(1);
+ }
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+
+ trx->lock.que_state = TRX_QUE_ROLLING_BACK;
+
+ return(que_fork_start_command(roll_graph));
+}
+
+/****************************************************************//**
+Finishes a transaction rollback. */
+static
+void
+trx_rollback_finish(
+/*================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+ trx_commit(trx);
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(mem_heap_zalloc(heap, sizeof(*node)));
+
+ node->state = ROLL_NODE_SEND;
+
+ node->common.type = QUE_NODE_ROLLBACK;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+
+ node = static_cast<roll_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
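+	/* The node executes in two phases: ROLL_NODE_SEND starts the
+	rollback and spawns the undo query thread; ROLL_NODE_WAIT hands
+	control back to the parent after the undo thread has been
+	started. */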
+ if (node->state == ROLL_NODE_SEND) {
+ trx_t* trx;
+ ib_id_t roll_limit = 0;
+
+ trx = thr_get_trx(thr);
+
+ trx_mutex_enter(trx);
+
+ node->state = ROLL_NODE_WAIT;
+
+ ut_a(node->undo_thr == NULL);
+
+ roll_limit = node->partial ? node->savept.least_undo_no : 0;
+
+ trx_commit_or_rollback_prepare(trx);
+
+ node->undo_thr = trx_rollback_start(trx, roll_limit);
+
+ trx_mutex_exit(trx);
+
+ } else {
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
new file mode 100644
index 00000000000..003d1036a8c
--- /dev/null
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -0,0 +1,425 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.cc
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "ut0bh.h"
+#include "srv0mon.h"
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register rseg_mutex_key with performance schema */
+UNIV_INTERN mysql_pfs_key_t rseg_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint rseg_slot_no, /*!< in: rseg id == slot number in trx sys */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ trx_rsegf_t* rsegf;
+ trx_sysf_t* sys_header;
+ ulint i;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, 0, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(FIL_NULL);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ page_no = buf_block_get_page_no(block);
+
+ /* Get the rollback segment file page */
+ rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ /* Initialize max size field */
+ mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size,
+ MLOG_4BYTES, mtr);
+
+ /* Initialize the history list */
+
+ mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr);
+ flst_init(rsegf + TRX_RSEG_HISTORY, mtr);
+
+ /* Reset the undo log slots */
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr);
+ }
+
+ /* Add the rollback segment info to the free slot in
+ the trx system header */
+
+ sys_header = trx_sysf_get(mtr);
+
+ trx_sysf_rseg_set_space(sys_header, rseg_slot_no, space, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, rseg_slot_no, page_no, mtr);
+
+ return(page_no);
+}
+
+/***********************************************************************//**
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+	trx_rseg_t*	rseg)	/*!< in, own: instance to free */
+{
+ trx_undo_t* undo;
+ trx_undo_t* next_undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
+ ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
+
+ for (undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+ undo != NULL;
+ undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ trx_undo_mem_free(undo);
+ }
+
+ for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+ undo != NULL;
+ undo = next_undo) {
+
+ next_undo = UT_LIST_GET_NEXT(undo_list, undo);
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+
+ trx_undo_mem_free(undo);
+ }
+
+	/* The cast discards the const qualifier on the rseg array
+	slot, because this function is like a destructor. */
+
+ *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = NULL;
+
+ mem_free(rseg);
+}
+
+/***************************************************************************
+Creates and initializes a rollback segment object. The values for the
+fields are read from the header. The object is inserted into the rseg
+list of the trx system object and a pointer is inserted in the rseg
+array in the trx system object.
+@return own: rollback segment object */
+static
+trx_rseg_t*
+trx_rseg_mem_create(
+/*================*/
+ ulint id, /*!< in: rollback segment id */
+ ulint space, /*!< in: space where the segment
+ placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the segment
+ header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint len;
+ trx_rseg_t* rseg;
+ fil_addr_t node_addr;
+ trx_rsegf_t* rseg_header;
+ trx_ulogf_t* undo_log_hdr;
+ ulint sum_of_undo_sizes;
+
+ rseg = static_cast<trx_rseg_t*>(mem_zalloc(sizeof(trx_rseg_t)));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->zip_size = zip_size;
+ rseg->page_no = page_no;
+
+ mutex_create(rseg_mutex_key, &rseg->mutex, SYNC_RSEG);
+
+	/* The cast discards the const qualifier on the rseg array
+	slot, because this function is like a constructor. */
+ *((trx_rseg_t**) trx_sys->rseg_array + rseg->id) = rseg;
+
+ rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ rseg->max_size = mtr_read_ulint(
+ rseg_header + TRX_RSEG_MAX_SIZE, MLOG_4BYTES, mtr);
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ sum_of_undo_sizes = trx_undo_lists_init(rseg);
+
+ rseg->curr_size = mtr_read_ulint(
+ rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr)
+ + 1 + sum_of_undo_sizes;
+
+ len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+
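+	/* If the rollback segment has a history list of committed
+	transactions waiting for purge, record the position of the last
+	undo log header and queue the segment for the purge system. */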
+ if (len > 0) {
+ rseg_queue_t rseg_queue;
+
+ trx_sys->rseg_history_len += len;
+
+ node_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+
+ rseg->last_page_no = node_addr.page;
+ rseg->last_offset = node_addr.boffset;
+
+ undo_log_hdr = trx_undo_page_get(
+ rseg->space, rseg->zip_size, node_addr.page,
+ mtr) + node_addr.boffset;
+
+ rseg->last_trx_no = mach_read_from_8(
+ undo_log_hdr + TRX_UNDO_TRX_NO);
+
+ rseg->last_del_marks = mtr_read_ulint(
+ undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = rseg->last_trx_no;
+
+ if (rseg->last_page_no != FIL_NULL) {
+ const void* ptr;
+
+ /* There is no need to cover this operation by the purge
+ mutex because we are still bootstrapping. */
+
+ ptr = ib_bh_push(ib_bh, &rseg_queue);
+ ut_a(ptr != NULL);
+ }
+ } else {
+ rseg->last_page_no = FIL_NULL;
+ }
+
+ return(rseg);
+}
+
+/********************************************************************
+Creates the memory copies for the rollback segments and initializes the
+rseg array in trx_sys at a database startup. */
+static
+void
+trx_rseg_create_instance(
+/*=====================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ib_bh_t* ib_bh, /*!< in/out: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ ulint page_no;
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no != FIL_NULL) {
+ ulint space;
+ ulint zip_size;
+ trx_rseg_t* rseg = NULL;
+
+ ut_a(!trx_rseg_get_on_id(i));
+
+ space = trx_sysf_rseg_get_space(sys_header, i, mtr);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ rseg = trx_rseg_mem_create(
+ i, space, zip_size, page_no, ib_bh, mtr);
+
+ ut_a(rseg->id == i);
+ } else {
+ ut_a(trx_sys->rseg_array[i] == NULL);
+ }
+ }
+}
+
+/*********************************************************************
+Creates a rollback segment.
+@return pointer to new rollback segment if create successful */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ ulint space) /*!< in: id of UNDO tablespace */
+{
+ mtr_t mtr;
+ ulint slot_no;
+ trx_rseg_t* rseg = NULL;
+
+ mtr_start(&mtr);
+
+ /* To obey the latching order, acquire the file space
+ x-latch before the trx_sys->mutex. */
+ mtr_x_lock(fil_space_get_latch(space, NULL), &mtr);
+
+ slot_no = trx_sysf_rseg_find_free(&mtr);
+
+ if (slot_no != ULINT_UNDEFINED) {
+ ulint id;
+ ulint page_no;
+ ulint zip_size;
+ trx_sysf_t* sys_header;
+
+ page_no = trx_rseg_header_create(
+ space, 0, ULINT_MAX, slot_no, &mtr);
+
+ ut_a(page_no != FIL_NULL);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ id = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);
+ ut_a(id == space);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ rseg = trx_rseg_mem_create(
+ slot_no, space, zip_size, page_no,
+ purge_sys->ib_bh, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ return(rseg);
+}
+
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_array_init(
+/*================*/
+	trx_sysf_t*	sys_header,	/*!< in/out: trx system header */
+ ib_bh_t* ib_bh, /*!< in: rseg queue */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sys->rseg_history_len = 0;
+
+ trx_rseg_create_instance(sys_header, ib_bh, mtr);
+}
+
+/********************************************************************
+Get the number of unique rollback tablespaces in use except space id 0.
+The last space id will be the sentinel value ULINT_UNDEFINED. The array
+will be sorted on space id. Note: space_ids should have space for
+TRX_SYS_N_RSEGS + 1 elements.
+@return number of unique rollback tablespaces in use. */
+UNIV_INTERN
+ulint
+trx_rseg_get_n_undo_tablespaces(
+/*============================*/
+ ulint* space_ids) /*!< out: array of space ids of
+ UNDO tablespaces */
+{
+ ulint i;
+ mtr_t mtr;
+ trx_sysf_t* sys_header;
+ ulint n_undo_tablespaces = 0;
+ ulint space_ids_aux[TRX_SYS_N_RSEGS + 1];
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
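+	/* Collect the distinct non-system space ids referenced by the
+	rollback segment slots; duplicates are filtered with a linear
+	scan, which is cheap because TRX_SYS_N_RSEGS is small. */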
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ ulint page_no;
+ ulint space;
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, &mtr);
+
+ if (page_no == FIL_NULL) {
+ continue;
+ }
+
+ space = trx_sysf_rseg_get_space(sys_header, i, &mtr);
+
+ if (space != 0) {
+ ulint j;
+ ibool found = FALSE;
+
+ for (j = 0; j < n_undo_tablespaces; ++j) {
+ if (space_ids[j] == space) {
+ found = TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ ut_a(n_undo_tablespaces <= i);
+ space_ids[n_undo_tablespaces++] = space;
+ }
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ ut_a(n_undo_tablespaces <= TRX_SYS_N_RSEGS);
+
+ space_ids[n_undo_tablespaces] = ULINT_UNDEFINED;
+
+ if (n_undo_tablespaces > 0) {
+ ut_ulint_sort(space_ids, space_ids_aux, 0, n_undo_tablespaces);
+ }
+
+ return(n_undo_tablespaces);
+}
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
new file mode 100644
index 00000000000..52830a77b12
--- /dev/null
+++ b/storage/innobase/trx/trx0sys.cc
@@ -0,0 +1,1311 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.cc
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+#include "read0read.h"
+
+/** The file format tag structure with id and name. */
+struct file_format_t {
+ ulint id; /*!< id of the file format */
+ const char* name; /*!< text representation of the
+ file format */
+ ib_mutex_t mutex; /*!< covers changes to the above
+ fields */
+};
+
+/** The transaction system */
+UNIV_INTERN trx_sys_t* trx_sys = NULL;
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Master binlog file position. We have successfully got the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB. */
+UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1;
+/* @} */
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Binlog file position, or -1 if unknown */
+UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1;
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** List of animal names representing file format. */
+static const char* file_format_name_map[] = {
+ "Antelope",
+ "Barracuda",
+ "Cheetah",
+ "Dragon",
+ "Elk",
+ "Fox",
+ "Gazelle",
+ "Hornet",
+ "Impala",
+ "Jaguar",
+ "Kangaroo",
+ "Leopard",
+ "Moose",
+ "Nautilus",
+ "Ocelot",
+ "Porpoise",
+ "Quail",
+ "Rabbit",
+ "Shark",
+ "Tiger",
+ "Urchin",
+ "Viper",
+ "Whale",
+ "Xenops",
+ "Yak",
+ "Zebra"
+};
+
+/** The number of elements in the file format name array. */
+static const ulint FILE_FORMAT_NAME_N
+ = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key;
+UNIV_INTERN mysql_pfs_key_t trx_sys_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+UNIV_INTERN uint trx_rseg_n_slots_debug = 0;
+#endif /* UNIV_DEBUG */
+
+/** This is used to track the maximum file format id known to InnoDB. It's
+updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
+or create a table. */
+static file_format_t file_format_max;
+
+#ifdef UNIV_DEBUG
+/****************************************************************//**
+Checks whether a trx is in either rw_trx_list or ro_trx_list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ const trx_t* in_trx) /*!< in: transaction */
+{
+ const trx_t* trx;
+ trx_list_t* trx_list;
+
+ /* Non-locking autocommits should not hold any locks. */
+ assert_trx_in_list(in_trx);
+
+ trx_list = in_trx->read_only
+ ? &trx_sys->ro_trx_list : &trx_sys->rw_trx_list;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_ad(trx_assert_started(in_trx));
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list);
+ trx != NULL && trx != in_trx;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ assert_trx_in_list(trx);
+ ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+ }
+
+ return(trx != NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+ mtr_t mtr;
+ trx_sysf_t* sys_header;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ if (!srv_read_only_mode) {
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ mlog_write_ull(
+ sys_header + TRX_SYS_TRX_ID_STORE,
+ trx_sys->max_trx_id, &mtr);
+
+ mtr_commit(&mtr);
+ }
+}
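+
+/* Illustration (a sketch, assuming TRX_SYS_TRX_ID_WRITE_MARGIN == 256):
+the id allocator in trx_sys_get_new_trx_id() only calls the flush above
+once per margin-sized batch, along the lines of
+
+ if (!(trx_sys->max_trx_id % TRX_SYS_TRX_ID_WRITE_MARGIN)) {
+ trx_sys_flush_max_trx_id();
+ }
+
+so at most TRX_SYS_TRX_ID_WRITE_MARGIN - 1 ids can be lost in a crash,
+which trx_sys_init_at_db_start() compensates for at the next startup. */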
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this updates the latest master binlog position up to
+which replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ const char* file_name,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+
+ if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
+
+ /* We cannot fit the name into the 512 bytes we have reserved */
+
+ return;
+ }
+
+ sys_header = trx_sysf_get(mtr);
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+ TRX_SYS_MYSQL_LOG_MAGIC_N,
+ MLOG_4BYTES, mtr);
+ }
+
+ if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
+ file_name)) {
+
+ mlog_write_string(sys_header + field
+ + TRX_SYS_MYSQL_LOG_NAME,
+ (byte*) file_name, 1 + ut_strlen(file_name),
+ mtr);
+ }
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+ || (offset >> 32) > 0) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+ (ulint)(offset >> 32),
+ MLOG_4BYTES, mtr);
+ }
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ MLOG_4BYTES, mtr);
+}
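+
+/* Illustration (worked arithmetic, not from the original sources) for
+the split above of a 64-bit offset into two 32-bit words:
+
+ offset == 0x1234567890AB
+ high word == (ulint) (offset >> 32) == 0x1234
+ low word == (ulint) (offset & 0xFFFFFFFFUL) == 0x567890AB
+
+The high word is only rewritten when it is, or becomes, non-zero. */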
+
+/*****************************************************************//**
+Reads the MySQL binlog offset info from the trx system header if
+the magic number shows it valid, stores it in global variables,
+and prints it to stderr. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+ ulint trx_sys_mysql_bin_log_pos_high;
+ ulint trx_sys_mysql_bin_log_pos_low;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+ trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+ trx_sys_mysql_bin_log_pos
+ = (((ib_int64_t) trx_sys_mysql_bin_log_pos_high) << 32)
+ + (ib_int64_t) trx_sys_mysql_bin_log_pos_low;
+
+ ut_memcpy(trx_sys_mysql_bin_log_name,
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+ fprintf(stderr,
+ "InnoDB: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
+ trx_sys_mysql_bin_log_name);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ fprintf(stderr,
+ "InnoDB: In a MySQL replication slave the last"
+ " master binlog file\n"
+ "InnoDB: position %lu %lu, file name %s\n",
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+ /* Copy the master log position info to global variables we can
+ use in ha_innobase.cc to initialize glob_mi to the right values */
+
+ ut_memcpy(trx_sys_mysql_master_log_name,
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME,
+ TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+ trx_sys_mysql_master_log_pos
+ = (((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+ + ((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+ mtr_commit(&mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ trx_sysf_t* sys_header;
+
+ sys_header = trx_sysf_get(mtr);
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+ ulint page_no;
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+ ulint slot_no;
+ buf_block_t* block;
+ page_t* page;
+ ulint page_no;
+ byte* ptr;
+ ulint len;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
+
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
+ that the whole array is initialized. */
+ ptr = TRX_SYS_RSEGS + sys_header;
+ len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
+ * TRX_SYS_RSEG_SLOT_SIZE;
+ memset(ptr, 0xff, len);
+ ptr += len;
+ ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
+
+ /* Initialize all of the page. This part used to be uninitialized. */
+ memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
+
+ mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ + page - sys_header, mtr);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ slot_no = trx_sysf_rseg_find_free(mtr);
+ page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
+ mtr);
+
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
+}
+
+/*****************************************************************//**
+Compare two rseg_queue_t instances on trx_no. */
+static
+int
+trx_rseg_compare_last_trx_no(
+/*=========================*/
+ const void* p1, /*!< in: elem to compare */
+ const void* p2) /*!< in: elem to compare */
+{
+ ib_int64_t cmp;
+
+ const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1;
+ const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2;
+
+ cmp = rseg_q1->trx_no - rseg_q2->trx_no;
+
+ if (cmp < 0) {
+ return(-1);
+ } else if (cmp > 0) {
+ return(1);
+ }
+
+ return(0);
+}
+
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started.
+@return min binary heap of rsegs to purge */
+UNIV_INTERN
+ib_bh_t*
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+ mtr_t mtr;
+ ib_bh_t* ib_bh;
+ trx_sysf_t* sys_header;
+ ib_uint64_t rows_to_undo = 0;
+ const char* unit = "";
+
+ /* We create the min binary heap here and pass ownership to
+ purge when we init the purge sub-system. Purge is responsible
+ for freeing the binary heap. */
+
+ ib_bh = ib_bh_create(
+ trx_rseg_compare_last_trx_no,
+ sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+ trx_rseg_array_init(sys_header, ib_bh, &mtr);
+ }
+
+ /* VERY important: after the database is started, max_trx_id value is
+ divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
+ trx_sys_get_new_trx_id will evaluate to TRUE when the function
+is called for the first time, and the value for trx id will be written
+ to the disk-based header! Thus trx id values will not overlap when
+ the database is repeatedly started! */
+
+ trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
+ + ut_uint64_align_up(mach_read_from_8(sys_header
+ + TRX_SYS_TRX_ID_STORE),
+ TRX_SYS_TRX_ID_WRITE_MARGIN);
+
+ ut_d(trx_sys->rw_max_trx_id = trx_sys->max_trx_id);
+
+ UT_LIST_INIT(trx_sys->mysql_trx_list);
+
+ trx_dummy_sess = sess_open();
+
+ trx_lists_init_at_db_start();
+
+ /* This S lock is not strictly required; it is here only to satisfy
+ the debug code (assertions). We are still running in single-threaded
+ bootstrap mode. */
+
+ mutex_enter(&trx_sys->mutex);
+
+ ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+
+ if (UT_LIST_GET_LEN(trx_sys->rw_trx_list) > 0) {
+ const trx_t* trx;
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ ut_ad(trx->is_recovered);
+ assert_trx_in_rw_list(trx);
+
+ if (trx_state_eq(trx, TRX_STATE_ACTIVE)) {
+ rows_to_undo += trx->undo_no;
+ }
+ }
+
+ if (rows_to_undo > 1000000000) {
+ unit = "M";
+ rows_to_undo = rows_to_undo / 1000000;
+ }
+
+ fprintf(stderr,
+ "InnoDB: %lu transaction(s) which must be"
+ " rolled back or cleaned up\n"
+ "InnoDB: in total %lu%s row operations to undo\n",
+ (ulong) UT_LIST_GET_LEN(trx_sys->rw_trx_list),
+ (ulong) rows_to_undo, unit);
+
+ fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
+ trx_sys->max_trx_id);
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+ UT_LIST_INIT(trx_sys->view_list);
+
+ mtr_commit(&mtr);
+
+ return(ib_bh);
+}
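+
+/* Illustration (worked example, assuming TRX_SYS_TRX_ID_WRITE_MARGIN
+== 256) of the max_trx_id computation above. If the stored counter
+reads 1000, then
+
+ ut_uint64_align_up(1000, 256) == 1024
+ 1024 + 2 * 256 == 1536
+
+so the in-memory counter restarts at 1536, safely above any id handed
+out but not yet flushed before the last shutdown or crash. */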
+
+/*****************************************************************//**
+Creates the trx_sys instance and initializes ib_bh and mutex. */
+UNIV_INTERN
+void
+trx_sys_create(void)
+/*================*/
+{
+ ut_ad(trx_sys == NULL);
+
+ trx_sys = static_cast<trx_sys_t*>(mem_zalloc(sizeof(*trx_sys)));
+
+ mutex_create(trx_sys_mutex_key, &trx_sys->mutex, SYNC_TRX_SYS);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at database creation. */
+UNIV_INTERN
+void
+trx_sys_create_sys_pages(void)
+/*==========================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Update the file format tag.
+@return always TRUE */
+static
+ibool
+trx_sys_file_format_max_write(
+/*==========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name, can
+ be NULL */
+{
+ mtr_t mtr;
+ byte* ptr;
+ buf_block_t* block;
+ ib_uint64_t tag_value;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (name) {
+ *name = file_format_max.name;
+ }
+
+ mlog_write_ull(ptr, tag_value, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Read the file format tag.
+@return the file format or ULINT_UNDEFINED if not set. */
+static
+ulint
+trx_sys_file_format_max_read(void)
+/*==============================*/
+{
+ mtr_t mtr;
+ const byte* ptr;
+ const buf_block_t* block;
+ ib_id_t file_format_id;
+
+ /* Since this is called during the startup phase it's safe to
+ read the value without a covering mutex. */
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+
+ mtr_commit(&mtr);
+
+ file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or it contains garbage. */
+ return(ULINT_UNDEFINED);
+ }
+
+ return((ulint) file_format_id);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ ut_a(id < FILE_FORMAT_NAME_N);
+
+ return(file_format_name_map[id]);
+}
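+
+/* Illustration (not from the original sources): the id indexes
+file_format_name_map, and the on-disk tag written by
+trx_sys_file_format_max_write() is offset by the magic constant:
+
+ trx_sys_file_format_id_to_name(0) returns "Antelope"
+ trx_sys_file_format_id_to_name(1) returns "Barracuda"
+
+ tag on disk == format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N
+*/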
+
+/*****************************************************************//**
+Check for the max file format tag stored on disk. Note: If max_format_id
+is UNIV_FORMAT_MAX + 1, then we only print a warning.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id) /*!< in: max format id to check */
+{
+ ulint format_id;
+
+ /* Check the file format in the tablespace. Do not try to
+ recover if the file format is not supported by the engine
+ unless forced by the user. */
+ format_id = trx_sys_file_format_max_read();
+ if (format_id == ULINT_UNDEFINED) {
+ /* Format ID was not set. Set it to minimum possible
+ value. */
+ format_id = UNIV_FORMAT_MIN;
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "Highest supported file format is %s.",
+ trx_sys_file_format_id_to_name(UNIV_FORMAT_MAX));
+
+ if (format_id > UNIV_FORMAT_MAX) {
+
+ ut_a(format_id < FILE_FORMAT_NAME_N);
+
+ ib_logf(max_format_id <= UNIV_FORMAT_MAX
+ ? IB_LOG_LEVEL_ERROR : IB_LOG_LEVEL_WARN,
+ "The system tablespace is in a file "
+ "format that this version doesn't support - %s.",
+ trx_sys_file_format_id_to_name(format_id));
+
+ if (max_format_id <= UNIV_FORMAT_MAX) {
+ return(DB_ERROR);
+ }
+ }
+
+ format_id = (format_id > max_format_id) ? format_id : max_format_id;
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set the file format id, unless it already has the given value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name or
+ NULL if not needed. */
+{
+ ibool ret = FALSE;
+
+ ut_a(format_id <= UNIV_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ /* Only update if not already same value. */
+ if (format_id != file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Tags the system tablespace with the minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void)
+/*==============================*/
+{
+ ulint format_id;
+
+ format_id = trx_sys_file_format_max_read();
+
+ /* If format_id is not set then set it to the minimum. */
+ if (format_id == ULINT_UNDEFINED) {
+ trx_sys_file_format_max_set(UNIV_FORMAT_MIN, NULL);
+ }
+}
+
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id) /*!< in: file format identifier */
+{
+ ibool ret = FALSE;
+
+ ut_a(name);
+ ut_a(file_format_max.name != NULL);
+ ut_a(format_id <= UNIV_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ if (format_id > file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
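+
+/* Illustration (a sketch, assuming the univ.i constants UNIV_FORMAT_A
+== 0 for Antelope and UNIV_FORMAT_B == 1 for Barracuda): opening or
+creating a Barracuda table while the tag is still Antelope bumps it
+exactly once:
+
+ const char* name;
+
+ trx_sys_file_format_max_upgrade(&name, UNIV_FORMAT_B);
+
+A second call with the same id returns FALSE and leaves the tag alone. */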
+
+/*****************************************************************//**
+Get the name representation of the current maximum file format.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void)
+/*=============================*/
+{
+ return(file_format_max.name);
+}
+
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void)
+/*==========================*/
+{
+ mutex_create(file_format_max_mutex_key,
+ &file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = UNIV_FORMAT_MIN;
+
+ file_format_max.name = trx_sys_file_format_id_to_name(
+ file_format_max.id);
+}
+
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void)
+/*===========================*/
+{
+ /* Does nothing at the moment */
+}
+
+/*********************************************************************
+Creates the rollback segments.
+@return number of rollback segments that are active. */
+UNIV_INTERN
+ulint
+trx_sys_create_rsegs(
+/*=================*/
+ ulint n_spaces, /*!< number of tablespaces for UNDO logs */
+ ulint n_rsegs) /*!< number of rollback segments to create */
+{
+ mtr_t mtr;
+ ulint n_used;
+
+ ut_a(n_spaces < TRX_SYS_N_RSEGS);
+ ut_a(n_rsegs <= TRX_SYS_N_RSEGS);
+
+ if (srv_read_only_mode) {
+ return(ULINT_UNDEFINED);
+ }
+
+ /* This is executed in single-threaded mode; therefore it is not
+ necessary to use the same mtr in trx_rseg_create(). n_used cannot
+ change while the function is executing. */
+
+ mtr_start(&mtr);
+ n_used = trx_sysf_rseg_find_free(&mtr);
+ mtr_commit(&mtr);
+
+ if (n_used == ULINT_UNDEFINED) {
+ n_used = TRX_SYS_N_RSEGS;
+ }
+
+ /* Do not create additional rollback segments if innodb_force_recovery
+ has been set and the database was not shut down cleanly. */
+
+ if (!srv_force_recovery && !recv_needed_recovery && n_used < n_rsegs) {
+ ulint i;
+ ulint new_rsegs = n_rsegs - n_used;
+
+ for (i = 0; i < new_rsegs; ++i) {
+ ulint space;
+
+ /* Tablespace 0 is the system tablespace. All UNDO
+ log tablespaces start from 1. */
+
+ if (n_spaces > 0) {
+ space = (i % n_spaces) + 1;
+ } else {
+ space = 0; /* System tablespace */
+ }
+
+ if (trx_rseg_create(space) != NULL) {
+ ++n_used;
+ } else {
+ break;
+ }
+ }
+ }
+
+ ib_logf(IB_LOG_LEVEL_INFO,
+ "%lu rollback segment(s) are active.", n_used);
+
+ return(n_used);
+}
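+
+/* Illustration (worked example, not from the original sources) of the
+round-robin space assignment above, with n_spaces == 2 and four new
+segments to create:
+
+ i == 0: space = (0 % 2) + 1 == 1
+ i == 1: space = (1 % 2) + 1 == 2
+ i == 2: space = (2 % 2) + 1 == 1
+ i == 3: space = (3 % 2) + 1 == 2
+
+With n_spaces == 0 every new segment lands in the system tablespace. */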
+
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page) /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+{
+ const trx_sysf_t* sys_header;
+
+ sys_header = page + TRX_SYS;
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ fprintf(stderr,
+ "mysqlbackup: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+ }
+}
+
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id) /*!< out: file format of the system table
+ space */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ ib_id_t file_format_id;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " mysqlbackup: Error: trying to read system "
+ "tablespace file format,\n"
+ " mysqlbackup: but could not open the tablespace "
+ "file %s!\n", pathname);
+ return(FALSE);
+ }
+
+ /* Read the page on which the file format is stored */
+
+ success = os_file_read_no_error_handling(
+ file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, UNIV_PAGE_SIZE);
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " mysqlbackup: Error: trying to read system "
+ "tablespace file format,\n"
+ " mysqlbackup: but failed to read the tablespace "
+ "file %s!\n", pathname);
+
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
+ ptr = page + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+ file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
+
+ if (file_format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or it contains garbage. */
+ return(TRUE);
+ }
+
+ *format_id = (ulint) file_format_id;
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id) /*!< out: file format of the per-table
+ data file */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ ib_uint32_t flags;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ innodb_file_data_key,
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " mysqlbackup: Error: trying to read per-table "
+ "tablespace format,\n"
+ " mysqlbackup: but could not open the tablespace "
+ "file %s!\n", pathname);
+
+ return(FALSE);
+ }
+
+ /* Read the first page of the per-table datafile */
+
+ success = os_file_read_no_error_handling(file, page, 0, UNIV_PAGE_SIZE);
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " mysqlbackup: Error: trying to per-table data file "
+ "format,\n"
+ " mysqlbackup: but failed to read the tablespace "
+ "file %s!\n", pathname);
+
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
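+ /* 54 == FSP_HEADER_OFFSET + FSP_SPACE_FLAGS (assumed offsets):
+ the tablespace flags field on page 0 of the data file */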
+ ptr = page + 54;
+ flags = mach_read_from_4(ptr);
+ if (flags == 0) {
+ /* file format is Antelope */
+ *format_id = 0;
+ return(TRUE);
+ } else if (flags & 1) {
+ /* tablespace flags are ok */
+ *format_id = (flags / 32) % 128;
+ return(TRUE);
+ } else {
+ /* bad tablespace flags */
+ return(FALSE);
+ }
+}
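+
+/* Illustration (worked decoding, assuming the pre-5.6 table flags
+layout where the format id occupies bits 5..11, hence the
+(flags / 32) % 128 above). For a Barracuda table:
+
+ flags == 0x21
+ flags & 1 != 0, so the flags are valid
+ (0x21 / 32) % 128 == 1, i.e. "Barracuda"
+
+flags == 0 means an untagged Antelope tablespace. */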
+
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ if (!(id < FILE_FORMAT_NAME_N)) {
+ /* unknown id */
+ return("Unknown");
+ }
+
+ return(file_format_name_map[id]);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+ ulint i;
+ trx_t* trx;
+ read_view_t* view;
+
+ ut_ad(trx_sys != NULL);
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+ /* Check that all read views are closed except the read view owned
+ by purge. */
+
+ mutex_enter(&trx_sys->mutex);
+
+ if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+ fprintf(stderr,
+ "InnoDB: Error: all read views were not closed"
+ " before shutdown:\n"
+ "InnoDB: %lu read views open \n",
+ UT_LIST_GET_LEN(trx_sys->view_list) - 1);
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+ sess_close(trx_dummy_sess);
+ trx_dummy_sess = NULL;
+
+ trx_purge_sys_close();
+
+ /* Free the double write data structures. */
+ buf_dblwr_free();
+
+ mutex_enter(&trx_sys->mutex);
+
+ ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+
+ /* Only prepared transactions may be left in the system. Free them. */
+ ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == trx_sys->n_prepared_trx);
+
+ while ((trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list)) != NULL) {
+ trx_free_prepared(trx);
+ }
+
+ /* There can't be any active transactions. */
+ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_rseg_t* rseg;
+
+ rseg = trx_sys->rseg_array[i];
+
+ if (rseg != NULL) {
+ trx_rseg_mem_free(rseg);
+ } else {
+ break;
+ }
+ }
+
+ view = UT_LIST_GET_FIRST(trx_sys->view_list);
+
+ while (view != NULL) {
+ read_view_t* prev_view = view;
+
+ view = UT_LIST_GET_NEXT(view_list, prev_view);
+
+ /* Views are allocated from the trx_sys->global_read_view_heap.
+ So, we simply remove the element here. */
+ UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->ro_trx_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->rw_trx_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
+
+ mutex_exit(&trx_sys->mutex);
+
+ mutex_free(&trx_sys->mutex);
+
+ mem_free(trx_sys);
+
+ trx_sys = NULL;
+}
+
+/*********************************************************************
+Check if there are any active (non-prepared) transactions.
+@return total number of active transactions or 0 if none */
+UNIV_INTERN
+ulint
+trx_sys_any_active_transactions(void)
+/*=================================*/
+{
+ ulint total_trx = 0;
+
+ mutex_enter(&trx_sys->mutex);
+
+ total_trx = UT_LIST_GET_LEN(trx_sys->rw_trx_list)
+ + UT_LIST_GET_LEN(trx_sys->mysql_trx_list);
+
+ ut_a(total_trx >= trx_sys->n_prepared_trx);
+ total_trx -= trx_sys->n_prepared_trx;
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(total_trx);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Validate the trx_list_t.
+@return TRUE if valid. */
+static
+ibool
+trx_sys_validate_trx_list_low(
+/*===========================*/
+ trx_list_t* trx_list) /*!< in: &trx_sys->ro_trx_list
+ or &trx_sys->rw_trx_list */
+{
+ const trx_t* trx;
+ const trx_t* prev_trx = NULL;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_ad(trx_list == &trx_sys->ro_trx_list
+ || trx_list == &trx_sys->rw_trx_list);
+
+ for (trx = UT_LIST_GET_FIRST(*trx_list);
+ trx != NULL;
+ prev_trx = trx, trx = UT_LIST_GET_NEXT(trx_list, prev_trx)) {
+
+ assert_trx_in_list(trx);
+ ut_ad(trx->read_only == (trx_list == &trx_sys->ro_trx_list));
+
+ ut_a(prev_trx == NULL || prev_trx->id > trx->id);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Validate the trx_sys_t::ro_trx_list and trx_sys_t::rw_trx_list.
+@return TRUE if lists are valid. */
+UNIV_INTERN
+ibool
+trx_sys_validate_trx_list(void)
+/*===========================*/
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_a(trx_sys_validate_trx_list_low(&trx_sys->ro_trx_list));
+ ut_a(trx_sys_validate_trx_list_low(&trx_sys->rw_trx_list));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
new file mode 100644
index 00000000000..c2d5c1f7c7f
--- /dev/null
+++ b/storage/innobase/trx/trx0trx.cc
@@ -0,0 +1,2306 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.cc
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+#include "ha_prototypes.h"
+#include "srv0mon.h"
+#include "ut0vec.h"
+
+#include <set>
+
+/** Set of table_id */
+typedef std::set<table_id_t> table_id_set;
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t* trx_dummy_sess = NULL;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_mutex_key;
+/* Key to register the mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t trx_undo_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error,
+ sizeof(trx->detailed_error));
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object. It must be explicitly
+started with trx_start_if_not_started() before using it. The default
+isolation level is TRX_ISO_REPEATABLE_READ.
+@return transaction instance, should never be NULL */
+static
+trx_t*
+trx_create(void)
+/*============*/
+{
+ trx_t* trx;
+ mem_heap_t* heap;
+ ib_alloc_t* heap_alloc;
+
+ trx = static_cast<trx_t*>(mem_zalloc(sizeof(*trx)));
+
+ mutex_create(trx_mutex_key, &trx->mutex, SYNC_TRX);
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->no = TRX_ID_MAX;
+
+ trx->support_xa = TRUE;
+
+ trx->check_foreigns = TRUE;
+ trx->check_unique_secondary = TRUE;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+
+ mutex_create(trx_undo_mutex_key, &trx->undo_mutex, SYNC_TRX_UNDO);
+
+ trx->error_state = DB_SUCCESS;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ trx->lock.lock_heap = mem_heap_create_typed(
+ 256, MEM_HEAP_FOR_LOCK_HEAP);
+
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+ trx->global_read_view_heap = mem_heap_create(256);
+
+ trx->xid.formatID = -1;
+
+ trx->op_info = "";
+
+ trx->api_trx = false;
+
+ trx->api_auto_commit = false;
+
+ trx->read_write = true;
+
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 8);
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ /* Remember to free the vector explicitly in trx_free(). */
+ trx->autoinc_locks = ib_vector_create(heap_alloc, sizeof(void**), 4);
+
+ /* Remember to free the vector explicitly in trx_free(). */
+ heap = mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 128);
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ trx->lock.table_locks = ib_vector_create(
+ heap_alloc, sizeof(void**), 32);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+ trx_t* trx;
+
+ trx = trx_create();
+
+ trx->sess = trx_dummy_sess;
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ mutex_enter(&trx_sys->mutex);
+
+ ut_d(trx->in_mysql_trx_list = TRUE);
+ UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object. */
+static
+void
+trx_free(
+/*=====*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_ad(!trx->in_ro_trx_list);
+ ut_ad(!trx->in_rw_trx_list);
+ ut_ad(!trx->in_mysql_trx_list);
+
+ mutex_free(&trx->undo_mutex);
+
+ if (trx->undo_no_arr != NULL) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ ut_a(trx->lock.wait_lock == NULL);
+ ut_a(trx->lock.wait_thr == NULL);
+
+ ut_a(!trx->has_search_latch);
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock.lock_heap) {
+ mem_heap_free(trx->lock.lock_heap);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ if (trx->global_read_view_heap) {
+ mem_heap_free(trx->global_read_view_heap);
+ }
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(trx->autoinc_locks);
+
+ if (trx->lock.table_locks != NULL) {
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(trx->lock.table_locks);
+ }
+
+ mutex_free(&trx->mutex);
+
+ mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ if (trx->declared_to_be_inside_innodb) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "Freeing a trx (%p, " TRX_ID_FMT ") which is declared "
+ "to be processing inside InnoDB", trx, trx->id);
+
+ trx_print(stderr, trx, 600);
+ putc('\n', stderr);
+
+ /* This is an error but not a fatal error. We must keep
+ the counters like srv_conc_n_threads accurate. */
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ if (trx->n_mysql_tables_in_use != 0
+ || trx->mysql_n_tables_locked != 0) {
+
+ ib_logf(IB_LOG_LEVEL_ERROR,
+ "MySQL is freeing a thd though "
+ "trx->n_mysql_tables_in_use is %lu and "
+ "trx->mysql_n_tables_locked is %lu.",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+
+ trx_print(stderr, trx, 600);
+ ut_print_buf(stderr, trx, sizeof(trx_t));
+ putc('\n', stderr);
+ }
+
+ ut_a(trx->state == TRX_STATE_NOT_STARTED);
+ ut_a(trx->insert_undo == NULL);
+ ut_a(trx->update_undo == NULL);
+ ut_a(trx->read_view == NULL);
+
+ trx_free(trx);
+}
+
+/********************************************************************//**
+At shutdown, frees a transaction object that is in the PREPARED state. */
+UNIV_INTERN
+void
+trx_free_prepared(
+/*==============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ ut_a(trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx_undo_free_prepared(trx);
+
+ assert_trx_in_rw_list(trx);
+
+ ut_a(!trx->read_only);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+ ut_d(trx->in_rw_trx_list = FALSE);
+
+ /* Undo trx_resurrect_table_locks(). */
+ UT_LIST_INIT(trx->lock.trx_locks);
+
+ trx_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ mutex_enter(&trx_sys->mutex);
+
+ ut_ad(trx->in_mysql_trx_list);
+ ut_d(trx->in_mysql_trx_list = FALSE);
+ UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ ut_ad(trx_sys_validate_trx_list());
+
+ mutex_exit(&trx_sys->mutex);
+
+ trx_free_for_background(trx);
+}
+
+/****************************************************************//**
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the start
+of the list. This function is used at database startup to insert
+incomplete transactions into the list. */
+static
+void
+trx_list_rw_insert_ordered(
+/*=======================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ trx_t* trx2;
+
+ ut_ad(!trx->read_only);
+
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+
+ ut_a(srv_is_being_started);
+ ut_ad(!trx->in_ro_trx_list);
+ ut_ad(!trx->in_rw_trx_list);
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ ut_ad(trx->is_recovered);
+
+ for (trx2 = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx2 != NULL;
+ trx2 = UT_LIST_GET_NEXT(trx_list, trx2)) {
+
+ assert_trx_in_rw_list(trx2);
+
+ if (trx->id >= trx2->id) {
+
+ ut_ad(trx->id > trx2->id);
+ break;
+ }
+ }
+
+ if (trx2 != NULL) {
+ trx2 = UT_LIST_GET_PREV(trx_list, trx2);
+
+ if (trx2 == NULL) {
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+ } else {
+ UT_LIST_INSERT_AFTER(
+ trx_list, trx_sys->rw_trx_list, trx2, trx);
+ }
+ } else {
+ UT_LIST_ADD_LAST(trx_list, trx_sys->rw_trx_list, trx);
+ }
+
+#ifdef UNIV_DEBUG
+ if (trx->id > trx_sys->rw_max_trx_id) {
+ trx_sys->rw_max_trx_id = trx->id;
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(!trx->in_rw_trx_list);
+ ut_d(trx->in_rw_trx_list = TRUE);
+}
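+
+/* Illustration (not from the original sources): with rw_trx_list
+holding ids [90, 75, 40] (descending), a resurrected trx with id 80
+scans until 75, the first id smaller than 80, and is inserted after 90,
+giving [90, 80, 75, 40]. An id of 95 would be added at the list start. */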
+
+/****************************************************************//**
+Resurrect the table locks for a resurrected transaction. */
+static
+void
+trx_resurrect_table_locks(
+/*======================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const trx_undo_t* undo) /*!< in: undo log */
+{
+ mtr_t mtr;
+ page_t* undo_page;
+ trx_undo_rec_t* undo_rec;
+ table_id_set tables;
+
+ ut_ad(undo == trx->insert_undo || undo == trx->update_undo);
+
+ if (trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
+ || undo->empty) {
+ return;
+ }
+
+ mtr_start(&mtr);
+ /* trx_rseg_mem_create() may have acquired an X-latch on this
+ page, so we cannot acquire an S-latch. */
+ undo_page = trx_undo_page_get(
+ undo->space, undo->zip_size, undo->top_page_no, &mtr);
+ undo_rec = undo_page + undo->top_offset;
+
+ do {
+ ulint type;
+ ulint cmpl_info;
+ bool updated_extern;
+ undo_no_t undo_no;
+ table_id_t table_id;
+
+ page_t* undo_rec_page = page_align(undo_rec);
+
+ if (undo_rec_page != undo_page) {
+ if (!mtr_memo_release(&mtr,
+ buf_block_align(undo_page),
+ MTR_MEMO_PAGE_X_FIX)) {
+ /* The page of the previous undo_rec
+ should have been latched by
+ trx_undo_page_get() or
+ trx_undo_get_prev_rec(). */
+ ut_ad(0);
+ }
+
+ undo_page = undo_rec_page;
+ }
+
+ trx_undo_rec_get_pars(
+ undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.insert(table_id);
+
+ undo_rec = trx_undo_get_prev_rec(
+ undo_rec, undo->hdr_page_no,
+ undo->hdr_offset, false, &mtr);
+ } while (undo_rec);
+
+ mtr_commit(&mtr);
+
+ for (table_id_set::const_iterator i = tables.begin();
+ i != tables.end(); i++) {
+ if (dict_table_t* table = dict_table_open_on_id(
+ *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
+ if (table->ibd_file_missing
+ || dict_table_is_temporary(table)) {
+ mutex_enter(&dict_sys->mutex);
+ dict_table_close(table, TRUE, FALSE);
+ dict_table_remove_from_cache(table);
+ mutex_exit(&dict_sys->mutex);
+ continue;
+ }
+
+ lock_table_ix_resurrect(table, trx);
+
+ DBUG_PRINT("ib_trx",
+ ("resurrect" TRX_ID_FMT
+ " table '%s' IX lock from %s undo",
+ trx->id, table->name,
+ undo == trx->insert_undo
+ ? "insert" : "update"));
+
+ dict_table_close(table, FALSE, FALSE);
+ }
+ }
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing inserts at the time of the
+crash; they need to be undone.
+@return trx_t instance */
+static
+trx_t*
+trx_resurrect_insert(
+/*=================*/
+ trx_undo_t* undo, /*!< in: entry to UNDO */
+ trx_rseg_t* rseg) /*!< in: rollback segment */
+{
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ trx->rseg = rseg;
+ trx->xid = undo->xid;
+ trx->id = undo->trx_id;
+ trx->insert_undo = undo;
+ trx->is_recovered = TRUE;
+
+ /* This is single-threaded startup code; we do not need the
+ protection of trx->mutex or trx_sys->mutex here. */
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+
+ /* Prepared transactions are left in the prepared state
+ waiting for a commit or abort decision from MySQL */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+
+ fprintf(stderr,
+ "InnoDB: Transaction " TRX_ID_FMT " was in the"
+ " XA prepared state.\n", trx->id);
+
+ if (srv_force_recovery == 0) {
+
+ trx->state = TRX_STATE_PREPARED;
+ trx_sys->n_prepared_trx++;
+ trx_sys->n_prepared_recovered_trx++;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since innodb_force_recovery"
+ " > 0, we will rollback it anyway.\n");
+
+ trx->state = TRX_STATE_ACTIVE;
+ }
+ } else {
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+ }
+
+ /* We give a dummy value for the trx no; this should have no
+ relevance since purge is not interested in committed
+ transaction numbers, unless they are in the history
+ list, in which case it looks up the number from the disk-based
+ undo log structure. */
+
+ trx->no = trx->id;
+ } else {
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* A running transaction always has the number
+ field inited to TRX_ID_MAX */
+
+ trx->no = TRX_ID_MAX;
+ }
+
+ /* trx_start_low() is not called when resurrecting, so we need to
+ initialize the start time here. */
+ if (trx->state == TRX_STATE_ACTIVE
+ || trx->state == TRX_STATE_PREPARED) {
+ trx->start_time = ut_time();
+ }
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+
+ if (!undo->empty) {
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+
+ return(trx);
+}
+
+/****************************************************************//**
+Prepared transactions are left in the prepared state waiting for a
+commit or abort decision from MySQL */
+static
+void
+trx_resurrect_update_in_prepared_state(
+/*===================================*/
+ trx_t* trx, /*!< in,out: transaction */
+ const trx_undo_t* undo) /*!< in: update UNDO record */
+{
+ /* This is single-threaded startup code; we do not need the
+ protection of trx->mutex or trx_sys->mutex here. */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+ fprintf(stderr,
+ "InnoDB: Transaction " TRX_ID_FMT
+ " was in the XA prepared state.\n", trx->id);
+
+ if (srv_force_recovery == 0) {
+ if (trx_state_eq(trx, TRX_STATE_NOT_STARTED)) {
+ trx_sys->n_prepared_trx++;
+ trx_sys->n_prepared_recovered_trx++;
+ } else {
+ ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED));
+ }
+
+ trx->state = TRX_STATE_PREPARED;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since innodb_force_recovery"
+ " > 0, we will rollback it anyway.\n");
+
+ trx->state = TRX_STATE_ACTIVE;
+ }
+ } else {
+ trx->state = TRX_STATE_COMMITTED_IN_MEMORY;
+ }
+}
+
+/****************************************************************//**
+Resurrect the transactions that were doing updates at the time of the
+crash; they need to be undone. */
+static
+void
+trx_resurrect_update(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_undo_t* undo, /*!< in/out: update UNDO record */
+ trx_rseg_t* rseg) /*!< in/out: rollback segment */
+{
+ trx->rseg = rseg;
+ trx->xid = undo->xid;
+ trx->id = undo->trx_id;
+ trx->update_undo = undo;
+ trx->is_recovered = TRUE;
+
+ /* This is single-threaded startup code; we do not need the
+ protection of trx->mutex or trx_sys->mutex here. */
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+ trx_resurrect_update_in_prepared_state(trx, undo);
+
+ /* We give a dummy value for the trx number */
+
+ trx->no = trx->id;
+
+ } else {
+ trx->state = TRX_STATE_ACTIVE;
+
+ /* A running transaction always has the number field inited to
+ TRX_ID_MAX */
+
+ trx->no = TRX_ID_MAX;
+ }
+
+ /* trx_start_low() is not called when resurrecting, so we need to
+ initialize the start time here. */
+ if (trx->state == TRX_STATE_ACTIVE
+ || trx->state == TRX_STATE_PREPARED) {
+ trx->start_time = ut_time();
+ }
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+
+ if (!undo->empty && undo->top_undo_no >= trx->undo_no) {
+
+ trx->undo_no = undo->top_undo_no + 1;
+ }
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+ ulint i;
+
+ ut_a(srv_is_being_started);
+
+ UT_LIST_INIT(trx_sys->ro_trx_list);
+ UT_LIST_INIT(trx_sys->rw_trx_list);
+
+ /* Look through the rollback segments to see if there exist undo
+ logs for transactions. */
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+
+ rseg = trx_sys->rseg_array[i];
+
+ if (rseg == NULL) {
+ continue;
+ }
+
+ /* Resurrect transactions that were doing inserts. */
+ for (undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t* trx;
+
+ trx = trx_resurrect_insert(undo, rseg);
+
+ trx_list_rw_insert_ordered(trx);
+
+ trx_resurrect_table_locks(trx, undo);
+ }
+
+ /* Resurrect transactions that were doing updates. */
+ for (undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+ undo != NULL;
+ undo = UT_LIST_GET_NEXT(undo_list, undo)) {
+ trx_t* trx;
+ ibool trx_created;
+
+ /* Check the trx_sys->rw_trx_list first. */
+ mutex_enter(&trx_sys->mutex);
+ trx = trx_get_rw_trx_by_id(undo->trx_id);
+ mutex_exit(&trx_sys->mutex);
+
+ if (trx == NULL) {
+ trx = trx_allocate_for_background();
+ trx_created = TRUE;
+ } else {
+ trx_created = FALSE;
+ }
+
+ trx_resurrect_update(trx, undo, rseg);
+
+ if (trx_created) {
+ trx_list_rw_insert_ordered(trx);
+ }
+
+ trx_resurrect_table_locks(trx, undo);
+ }
+ }
+}
+
+/******************************************************************//**
+Assigns a rollback segment to a transaction in a round-robin fashion.
+@return assigned rollback segment instance */
+static
+trx_rseg_t*
+trx_assign_rseg_low(
+/*================*/
+ ulong max_undo_logs, /*!< in: maximum number of UNDO logs to use */
+ ulint n_tablespaces) /*!< in: number of rollback tablespaces */
+{
+ ulint i;
+ trx_rseg_t* rseg;
+ static ulint latest_rseg = 0;
+
+ if (srv_read_only_mode) {
+ ut_a(max_undo_logs == ULONG_UNDEFINED);
+ return(NULL);
+ }
+
+ /* This breaks true round robin but that should be OK. */
+
+ ut_a(max_undo_logs > 0 && max_undo_logs <= TRX_SYS_N_RSEGS);
+
+ i = latest_rseg++;
+ i %= max_undo_logs;
+
+ /* Note: The assumption here is that there can't be any gaps in
+ the array. Once we implement more flexible rollback segment
+ management this may not hold. The assertion checks for that case. */
+
+ if (trx_sys->rseg_array[0] == NULL) {
+ return(NULL);
+ }
+
+ /* Skip the system tablespace if we have more than one tablespace
+ defined for rollback segments. We want all UNDO records to be in
+ the non-system tablespaces. */
+
+ do {
+ rseg = trx_sys->rseg_array[i];
+ ut_a(rseg == NULL || i == rseg->id);
+
+ i = (rseg == NULL) ? 0 : i + 1;
+
+ } while (rseg == NULL
+ || (rseg->space == 0
+ && n_tablespaces > 0
+ && trx_sys->rseg_array[1] != NULL));
+
+ return(rseg);
+}
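+
+/* Illustration (not from the original sources): the do-while above
+skips empty slots and, when dedicated UNDO tablespaces exist, also
+skips segments living in the system tablespace. With rseg_array ==
+[rseg0 in space 0, rseg1 in space 1, rseg2 in space 2, NULL, ...] and
+n_tablespaces == 2, the rotation only ever returns rseg1 or rseg2;
+rseg0 is passed over because rseg->space == 0. */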
+
+/****************************************************************//**
+Assign a read-only transaction a rollback segment, if it is attempting
+to write to a TEMPORARY table. */
+UNIV_INTERN
+void
+trx_assign_rseg(
+/*============*/
+ trx_t* trx) /*!< A read-only transaction that
+ needs to be assigned a RBS. */
+{
+ ut_a(trx->rseg == 0);
+ ut_a(trx->read_only);
+ ut_a(!srv_read_only_mode);
+ ut_a(!trx_is_autocommit_non_locking(trx));
+
+ trx->rseg = trx_assign_rseg_low(srv_undo_logs, srv_undo_tablespaces);
+}
+
+/****************************************************************//**
+Starts a transaction. */
+static
+void
+trx_start_low(
+/*==========*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->rseg == NULL);
+
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->start_line != 0);
+ ut_ad(!trx->is_recovered);
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* Check whether it is an AUTOCOMMIT SELECT */
+ trx->auto_commit = (trx->api_trx && trx->api_auto_commit)
+ || thd_trx_is_auto_commit(trx->mysql_thd);
+
+ trx->read_only =
+ (trx->api_trx && !trx->read_write)
+ || (!trx->ddl && thd_trx_is_read_only(trx->mysql_thd))
+ || srv_read_only_mode;
+
+ if (!trx->auto_commit) {
+ ++trx->will_lock;
+ } else if (trx->will_lock == 0) {
+ trx->read_only = TRUE;
+ }
+
+ if (!trx->read_only) {
+ trx->rseg = trx_assign_rseg_low(
+ srv_undo_logs, srv_undo_tablespaces);
+ }
+
+ /* The initial value for trx->no: TRX_ID_MAX is used in
+ read_view_open_now: */
+
+ trx->no = TRX_ID_MAX;
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ ut_a(ib_vector_is_empty(trx->lock.table_locks));
+
+ mutex_enter(&trx_sys->mutex);
+
+ /* If this transaction came from trx_allocate_for_mysql(),
+ trx->in_mysql_trx_list would hold. In that case, the trx->state
+ change must be protected by the trx_sys->mutex, so that
+ lock_print_info_all_transactions() will have a consistent view. */
+
+ trx->state = TRX_STATE_ACTIVE;
+
+ trx->id = trx_sys_get_new_trx_id();
+
+ ut_ad(!trx->in_rw_trx_list);
+ ut_ad(!trx->in_ro_trx_list);
+
+ if (trx->read_only) {
+
+		/* Note: The trx_sys_t::ro_trx_list doesn't really need to
+		be ordered; we could exploit this by using a list type that
+		doesn't need a list-wide lock, to increase concurrency. */
+
+ if (!trx_is_autocommit_non_locking(trx)) {
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->ro_trx_list, trx);
+ ut_d(trx->in_ro_trx_list = TRUE);
+ }
+ } else {
+
+ ut_ad(trx->rseg != NULL
+ || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
+
+ ut_ad(!trx_is_autocommit_non_locking(trx));
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->rw_trx_list, trx);
+ ut_d(trx->in_rw_trx_list = TRUE);
+#ifdef UNIV_DEBUG
+ if (trx->id > trx_sys->rw_max_trx_id) {
+ trx_sys->rw_max_trx_id = trx->id;
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ ut_ad(trx_sys_validate_trx_list());
+
+ mutex_exit(&trx_sys->mutex);
+
+ trx->start_time = ut_time();
+
+ MONITOR_INC(MONITOR_TRX_ACTIVE);
+}
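+
+/* The read-only decision above, restated as a pure predicate; the
+function and parameter names are illustrative, and the api_trx and ddl
+conditions are folded into the first two flags. */
+#if 0
+static bool
+trx_starts_read_only(bool thd_read_only, bool server_read_only,
+		     bool auto_commit, int will_lock)
+{
+	bool	read_only = thd_read_only || server_read_only;
+
+	/* An AUTOCOMMIT statement that has expressed no intention
+	to lock is treated as a non-locking read-only transaction. */
+	if (auto_commit && will_lock == 0) {
+		read_only = true;
+	}
+
+	return(read_only);
+}
+#endif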
+
+/****************************************************************//**
+Set the transaction serialisation number. */
+static
+void
+trx_serialisation_number_get(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_rseg_t* rseg;
+
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&rseg->mutex));
+
+ mutex_enter(&trx_sys->mutex);
+
+ trx->no = trx_sys_get_new_trx_id();
+
+	/* If the rollback segment is not empty then the
+ new trx_t::no can't be less than any trx_t::no
+ already in the rollback segment. User threads only
+ produce events when a rollback segment is empty. */
+
+ if (rseg->last_page_no == FIL_NULL) {
+ void* ptr;
+ rseg_queue_t rseg_queue;
+
+ rseg_queue.rseg = rseg;
+ rseg_queue.trx_no = trx->no;
+
+ mutex_enter(&purge_sys->bh_mutex);
+
+		/* This is to reduce the pressure on the trx_sys_t::mutex,
+		though in reality it should make very little (read: no)
+		difference because this code path is only taken when the
+		rbs is empty. */
+
+ mutex_exit(&trx_sys->mutex);
+
+ ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
+ ut_a(ptr);
+
+ mutex_exit(&purge_sys->bh_mutex);
+ } else {
+ mutex_exit(&trx_sys->mutex);
+ }
+}
+
+/****************************************************************//**
+Assign the transaction its history serialisation number and write the
+update UNDO log record to the assigned rollback segment. */
+static __attribute__((nonnull))
+void
+trx_write_serialisation_history(
+/*============================*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ trx_rseg_t* rseg;
+
+ rseg = trx->rseg;
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to some other state: these modifications to the file data
+ structure define the transaction as committed in the file
+ based domain, at the serialization point of the log sequence
+ number lsn obtained below. */
+
+ if (trx->update_undo != NULL) {
+ page_t* undo_hdr_page;
+ trx_undo_t* undo = trx->update_undo;
+
+ /* We have to hold the rseg mutex because update
+ log headers have to be put to the history list in the
+ (serialisation) order of the UNDO trx number. This is
+ required for the purge in-memory data structures too. */
+
+ mutex_enter(&rseg->mutex);
+
+ /* Assign the transaction serialisation number and also
+ update the purge min binary heap if this is the first
+ UNDO log being written to the assigned rollback segment. */
+
+ trx_serialisation_number_get(trx);
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction commit for this transaction. */
+
+ undo_hdr_page = trx_undo_set_state_at_finish(undo, mtr);
+
+ trx_undo_update_cleanup(trx, undo_hdr_page, mtr);
+ } else {
+ mutex_enter(&rseg->mutex);
+ }
+
+ if (trx->insert_undo != NULL) {
+ trx_undo_set_state_at_finish(trx->insert_undo, mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+
+ /* Update the latest MySQL binlog name and offset info
+ in trx sys header if MySQL binlogging is on or the database
+ server is a MySQL replication slave */
+
+ if (trx->mysql_log_file_name
+ && trx->mysql_log_file_name[0] != '\0') {
+
+ trx_sys_update_mysql_binlog_offset(
+ trx->mysql_log_file_name,
+ trx->mysql_log_offset,
+ TRX_SYS_MYSQL_LOG_INFO, mtr);
+
+ trx->mysql_log_file_name = NULL;
+ }
+}
+
+/********************************************************************
+Finalize a transaction containing updates for an FTS table. */
+static __attribute__((nonnull))
+void
+trx_finalize_for_fts_table(
+/*=======================*/
+ fts_trx_table_t* ftt) /* in: FTS trx table */
+{
+ fts_t* fts = ftt->table->fts;
+ fts_doc_ids_t* doc_ids = ftt->added_doc_ids;
+
+ mutex_enter(&fts->bg_threads_mutex);
+
+ if (fts->fts_status & BG_THREAD_STOP) {
+ /* The table is about to be dropped, no use
+ adding anything to its work queue. */
+
+ mutex_exit(&fts->bg_threads_mutex);
+ } else {
+ mem_heap_t* heap;
+ mutex_exit(&fts->bg_threads_mutex);
+
+ ut_a(fts->add_wq);
+
+ heap = static_cast<mem_heap_t*>(doc_ids->self_heap->arg);
+
+ ib_wqueue_add(fts->add_wq, doc_ids, heap);
+
+ /* fts_trx_table_t no longer owns the list. */
+ ftt->added_doc_ids = NULL;
+ }
+}
+
+/******************************************************************//**
+Finalize a transaction containing updates to FTS tables. */
+static __attribute__((nonnull))
+void
+trx_finalize_for_fts(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ bool is_commit) /*!< in: true if the transaction was
+ committed, false if it was rolled back. */
+{
+ if (is_commit) {
+ const ib_rbt_node_t* node;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_last(trx->fts_trx->savepoints));
+
+ tables = savepoint->tables;
+
+ for (node = rbt_first(tables);
+ node;
+ node = rbt_next(tables, node)) {
+ fts_trx_table_t** ftt;
+
+ ftt = rbt_value(fts_trx_table_t*, node);
+
+ if ((*ftt)->added_doc_ids) {
+ trx_finalize_for_fts_table(*ftt);
+ }
+ }
+ }
+
+ fts_trx_free(trx->fts_trx);
+ trx->fts_trx = NULL;
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static
+void
+trx_flush_log_if_needed_low(
+/*========================*/
+ lsn_t lsn) /*!< in: lsn up to which logs are to be
+ flushed. */
+{
+ switch (srv_flush_log_at_trx_commit) {
+ case 0:
+ /* Do nothing */
+ break;
+ case 1:
+ /* Write the log and optionally flush it to disk */
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ srv_unix_file_flush_method != SRV_UNIX_NOSYNC);
+ break;
+ case 2:
+ /* Write the log but do not flush it to disk */
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+
+ break;
+ default:
+ ut_error;
+ }
+}
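+
+/* The switch above, spelled out as a table; the indices 0, 1 and 2
+are the values of innodb_flush_log_at_trx_commit (for value 1, the
+flush is additionally suppressed by innodb_flush_method=nosync).
+Illustration only. */
+#if 0
+struct flush_policy_t { bool write_log; bool flush_to_disk; };
+
+static const flush_policy_t	flush_policy[] = {
+	{false,	false},	/* 0: do nothing at commit */
+	{true,	true},	/* 1: write the log and fsync it */
+	{true,	false},	/* 2: write the log, let the OS flush later */
+};
+#endif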
+
+/**********************************************************************//**
+If required, flushes the log to disk based on the value of
+innodb_flush_log_at_trx_commit. */
+static __attribute__((nonnull))
+void
+trx_flush_log_if_needed(
+/*====================*/
+ lsn_t lsn, /*!< in: lsn up to which logs are to be
+ flushed. */
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx->op_info = "flushing log";
+ trx_flush_log_if_needed_low(lsn);
+ trx->op_info = "";
+}
+
+/****************************************************************//**
+Commits a transaction in memory. */
+static __attribute__((nonnull))
+void
+trx_commit_in_memory(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ lsn_t lsn) /*!< in: log sequence number of the mini-transaction
+ commit of trx_write_serialisation_history(), or 0
+ if the transaction did not modify anything */
+{
+ trx->must_flush_log_later = FALSE;
+
+ if (trx_is_autocommit_non_locking(trx)) {
+ ut_ad(trx->read_only);
+ ut_a(!trx->is_recovered);
+ ut_ad(trx->rseg == NULL);
+ ut_ad(!trx->in_ro_trx_list);
+ ut_ad(!trx->in_rw_trx_list);
+
+		/* Note: We are asserting without holding the lock mutex. But
+		that is OK because this transaction is not waiting and cannot
+		be rolled back, and no new locks can (or should) be added,
+		because it is flagged as a non-locking read-only transaction. */
+
+ ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+
+ /* This state change is not protected by any mutex, therefore
+ there is an inherent race here around state transition during
+ printouts. We ignore this race for the sake of efficiency.
+ However, the trx_sys_t::mutex will protect the trx_t instance
+ and it cannot be removed from the mysql_trx_list and freed
+ without first acquiring the trx_sys_t::mutex. */
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
+
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ read_view_remove(trx->global_read_view, false);
+
+ MONITOR_INC(MONITOR_TRX_NL_RO_COMMIT);
+ } else {
+ lock_trx_release_locks(trx);
+
+ /* Remove the transaction from the list of active
+ transactions now that it no longer holds any user locks. */
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+ mutex_enter(&trx_sys->mutex);
+
+ assert_trx_in_list(trx);
+
+ if (trx->read_only) {
+ UT_LIST_REMOVE(trx_list, trx_sys->ro_trx_list, trx);
+ ut_d(trx->in_ro_trx_list = FALSE);
+ MONITOR_INC(MONITOR_TRX_RO_COMMIT);
+ } else {
+ UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+ ut_d(trx->in_rw_trx_list = FALSE);
+ MONITOR_INC(MONITOR_TRX_RW_COMMIT);
+ }
+
+ /* If this transaction came from trx_allocate_for_mysql(),
+ trx->in_mysql_trx_list would hold. In that case, the
+ trx->state change must be protected by trx_sys->mutex, so that
+ lock_print_info_all_transactions() will have a consistent
+ view. */
+
+ trx->state = TRX_STATE_NOT_STARTED;
+
+ /* We already own the trx_sys_t::mutex, by doing it here we
+ avoid a potential context switch later. */
+ read_view_remove(trx->global_read_view, true);
+
+ ut_ad(trx_sys_validate_trx_list());
+
+ mutex_exit(&trx_sys->mutex);
+ }
+
+ if (trx->global_read_view != NULL) {
+
+ mem_heap_empty(trx->global_read_view_heap);
+
+ trx->global_read_view = NULL;
+ }
+
+ trx->read_view = NULL;
+
+ if (lsn) {
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+		/* NOTE that we could possibly make group commit more
+		efficient here: call os_thread_yield() to allow other
+		trxs to come to commit as well! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if
+ the OS does not crash. We may also flush the log files to
+ disk, making the transaction durable also at an OS crash or a
+ power outage.
+
+ The idea in InnoDB's group commit is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which commits the whole
+ group. Note that this group commit will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ If we are calling trx_commit() under prepare_commit_mutex, we
+ will delay possible log write and flush to a separate function
+ trx_commit_complete_for_mysql(), which is only called when the
+ thread has released the mutex. This is to make the
+ group commit algorithm to work. Otherwise, the prepare_commit
+ mutex would serialize all commits and prevent a group of
+ transactions from gathering. */
+
+ if (trx->flush_log_later) {
+ /* Do nothing yet */
+ trx->must_flush_log_later = TRUE;
+ } else if (srv_flush_log_at_trx_commit == 0
+ || thd_requested_durability(trx->mysql_thd)
+ == HA_IGNORE_DURABILITY) {
+ /* Do nothing */
+ } else {
+ trx_flush_log_if_needed(lsn, trx);
+ }
+
+ trx->commit_lsn = lsn;
+
+		/* Tell the server that some activity has happened, since
+		the trx did change something. Background utility threads
+		like the master thread, purge thread or page_cleaner thread
+		might have some work to do. */
+ srv_active_wake_master_thread();
+ }
+
+ /* undo_no is non-zero if we're doing the final commit. */
+ bool not_rollback = trx->undo_no != 0;
+ /* Free all savepoints, starting from the first. */
+ trx_named_savept_t* savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ trx_roll_savepoints_free(trx, savep);
+
+ trx->rseg = NULL;
+ trx->undo_no = 0;
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ trx->ddl = false;
+#ifdef UNIV_DEBUG
+ ut_ad(trx->start_file != 0);
+ ut_ad(trx->start_line != 0);
+ trx->start_file = 0;
+ trx->start_line = 0;
+#endif /* UNIV_DEBUG */
+
+ trx->will_lock = 0;
+ trx->read_only = FALSE;
+ trx->auto_commit = FALSE;
+
+ if (trx->fts_trx) {
+ trx_finalize_for_fts(trx, not_rollback);
+ }
+
+ ut_ad(trx->lock.wait_thr == NULL);
+ ut_ad(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
+ ut_ad(!trx->in_ro_trx_list);
+ ut_ad(!trx->in_rw_trx_list);
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+
+ trx->error_state = DB_SUCCESS;
+
+ /* trx->in_mysql_trx_list would hold between
+ trx_allocate_for_mysql() and trx_free_for_mysql(). It does not
+ hold for recovered transactions or system transactions. */
+}
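+
+/* A sketch of the deferred-flush protocol described above; the call
+sequence is an illustration of how the server layer is expected to
+drive it, not a verbatim caller. */
+#if 0
+	trx->flush_log_later = TRUE;
+	trx_commit_for_mysql(trx);	/* commit in memory, defer flush */
+	trx->flush_log_later = FALSE;
+
+	/* ... the server layer releases prepare_commit_mutex ... */
+
+	trx_commit_complete_for_mysql(trx);	/* flush up to commit_lsn */
+#endif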
+
+/****************************************************************//**
+Commits a transaction and a mini-transaction. */
+UNIV_INTERN
+void
+trx_commit_low(
+/*===========*/
+ trx_t* trx, /*!< in/out: transaction */
+ mtr_t* mtr) /*!< in/out: mini-transaction (will be committed),
+ or NULL if trx made no modifications */
+{
+ lsn_t lsn;
+
+ assert_trx_nonlocking_or_in_list(trx);
+ ut_ad(!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+ ut_ad(!mtr || mtr->state == MTR_ACTIVE);
+ ut_ad(!mtr == !(trx->insert_undo || trx->update_undo));
+
+ /* undo_no is non-zero if we're doing the final commit. */
+ if (trx->fts_trx && trx->undo_no != 0) {
+ dberr_t error;
+
+ ut_a(!trx_is_autocommit_non_locking(trx));
+
+ error = fts_commit(trx);
+
+		/* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY
+		instead of dying. This is a possible scenario if there
+		is a crash between the insert into the DELETED table
+		committing and the transaction committing. The fix
+		would be to return an error from this function. */
+ if (error != DB_SUCCESS && error != DB_DUPLICATE_KEY) {
+ /* FTS-FIXME: once we can return values from this
+ function, we should do so and signal an error
+ instead of just dying. */
+
+ ut_error;
+ }
+ }
+
+ if (mtr) {
+ trx_write_serialisation_history(trx, mtr);
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this
+ log sequence number. The transaction becomes 'durable' when
+ we write the log to disk, but in the logical sense the commit
+ in the file-based data structures (undo logs etc.) happens
+ here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come
+ in exactly the same order as commit lsn's, if the transactions
+ have different rollback segments. To get exactly the same
+ order we should hold the kernel mutex up to this point,
+ adding to the contention of the kernel mutex. However, if
+ a transaction T2 is able to see modifications made by
+ a transaction T1, T2 will always get a bigger transaction
+ number and a bigger commit lsn than T1. */
+
+ /*--------------*/
+ mtr_commit(mtr);
+ /*--------------*/
+ lsn = mtr->end_lsn;
+ } else {
+ lsn = 0;
+ }
+
+ trx_commit_in_memory(trx, lsn);
+}
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit(
+/*=======*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mtr_t local_mtr;
+ mtr_t* mtr;
+
+ if (trx->insert_undo || trx->update_undo) {
+ mtr = &local_mtr;
+ mtr_start(mtr);
+ } else {
+ mtr = NULL;
+ }
+
+ trx_commit_low(trx, mtr);
+}
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->is_recovered);
+
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ trx->rseg = NULL;
+ trx->undo_no = 0;
+ trx->last_sql_stat_start.least_undo_no = 0;
+
+ mutex_enter(&trx_sys->mutex);
+
+ ut_a(!trx->read_only);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->rw_trx_list, trx);
+
+ assert_trx_in_rw_list(trx);
+ ut_d(trx->in_rw_trx_list = FALSE);
+
+ mutex_exit(&trx_sys->mutex);
+
+ /* Change the transaction state without mutex protection, now
+ that it no longer is in the trx_list. Recovered transactions
+ are never placed in the mysql_trx_list. */
+ ut_ad(trx->is_recovered);
+ ut_ad(!trx->in_ro_trx_list);
+ ut_ad(!trx->in_rw_trx_list);
+ ut_ad(!trx->in_mysql_trx_list);
+ trx->state = TRX_STATE_NOT_STARTED;
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx) /*!< in: active transaction */
+{
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+	if (trx->read_view != NULL) {
+		return(trx->read_view);
+	}
+
+	trx->read_view = read_view_open_now(
+		trx->id, trx->global_read_view_heap);
+
+	trx->global_read_view = trx->read_view;
+
+	return(trx->read_view);
+}
+
+/****************************************************************//**
+Prepares a transaction for commit/rollback. */
+UNIV_INTERN
+void
+trx_commit_or_rollback_prepare(
+/*===========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ /* We are reading trx->state without holding trx_sys->mutex
+ here, because the commit or rollback should be invoked for a
+ running (or recovered prepared) transaction that is associated
+ with the current thread. */
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ /* If the trx is in a lock wait state, moves the waiting
+ query thread to the suspended state */
+
+ if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+
+ ut_a(trx->lock.wait_thr != NULL);
+ trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
+ trx->lock.wait_thr = NULL;
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+ }
+
+ ut_a(trx->lock.n_active_thrs == 1);
+ return;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+trx_commit_node_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(mem_heap_alloc(heap, sizeof(*node)));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+
+ node = static_cast<commit_node_t*>(thr->run_node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ trx_t* trx;
+
+ node->state = COMMIT_NODE_WAIT;
+
+ trx = thr_get_trx(thr);
+
+ ut_a(trx->lock.wait_thr == NULL);
+ ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
+
+ trx_commit_or_rollback_prepare(trx);
+
+ trx->lock.que_state = TRX_QUE_COMMITTING;
+
+ trx_commit(trx);
+
+ ut_ad(trx->lock.wait_thr == NULL);
+
+ trx->lock.que_state = TRX_QUE_RUNNING;
+
+ thr = NULL;
+ } else {
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+dberr_t
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+	/* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must make sure here that trx has
+	been started. */
+
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ /* Update the info whether we should skip XA steps that eat
+ CPU time.
+
+ For the duration of the transaction trx->support_xa is
+ not reread from thd so any changes in the value take
+ effect in the next transaction. This is to avoid a
+		scenario where some undo log records generated by a
+		transaction contain XA information while other undo log
+		records, generated by the same transaction, do not. */
+ trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+ ut_d(trx->start_file = __FILE__);
+ ut_d(trx->start_line = __LINE__);
+
+ trx_start_low(trx);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_PREPARED:
+ trx->op_info = "committing";
+ trx_commit(trx);
+ MONITOR_DEC(MONITOR_TRX_ACTIVE);
+ trx->op_info = "";
+ return(DB_SUCCESS);
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+ ut_error;
+ return(DB_CORRUPTION);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE. */
+UNIV_INTERN
+void
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(trx);
+
+ if (!trx->must_flush_log_later
+ || thd_requested_durability(trx->mysql_thd)
+ == HA_IGNORE_DURABILITY) {
+ return;
+ }
+
+ trx_flush_log_if_needed(trx->commit_lsn, trx);
+
+ trx->must_flush_log_later = FALSE;
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ trx->undo_no = 0;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+ if (trx->fts_trx) {
+ fts_savepoint_laststmt_refresh(trx);
+ }
+
+ return;
+ }
+
+ ut_error;
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Caller must hold trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print_low(
+/*==========*/
+ FILE* f,
+ /*!< in: output stream */
+ const trx_t* trx,
+ /*!< in: transaction */
+ ulint max_query_len,
+ /*!< in: max query length to print,
+ or 0 to use the default max length */
+ ulint n_rec_locks,
+ /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ ulint n_trx_locks,
+ /*!< in: length of trx->lock.trx_locks */
+ ulint heap_size)
+ /*!< in: mem_heap_get_size(trx->lock.lock_heap) */
+{
+ ibool newline;
+ const char* op_info;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
+
+ /* trx->state cannot change from or to NOT_STARTED while we
+ are holding the trx_sys->mutex. It may change from ACTIVE to
+ PREPARED or COMMITTED. */
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ fputs(", not started", f);
+ goto state_ok;
+ case TRX_STATE_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_PREPARED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong) difftime(time(NULL), trx->start_time));
+ goto state_ok;
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ goto state_ok;
+ }
+ fprintf(f, ", state %lu", (ulong) trx->state);
+ ut_ad(0);
+state_ok:
+
+ /* prevent a race condition */
+ op_info = trx->op_info;
+
+ if (*op_info) {
+ putc(' ', f);
+ fputs(op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ if (trx->declared_to_be_inside_innodb) {
+ fprintf(f, ", thread declared inside InnoDB %lu",
+ (ulong) trx->n_tickets_to_enter_innodb);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ /* trx->lock.que_state of an ACTIVE transaction may change
+ while we are not holding trx->mutex. We perform a dirty read
+ for performance reasons. */
+
+ switch (trx->lock.que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+ }
+
+ if (n_trx_locks > 0 || heap_size > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) n_trx_locks,
+ (ulong) heap_size,
+ (ulong) n_rec_locks);
+ }
+
+ if (trx->has_search_latch) {
+ newline = TRUE;
+ fputs(", holds adaptive hash latch", f);
+ }
+
+ if (trx->undo_no != 0) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(
+ f, trx->mysql_thd, static_cast<uint>(max_query_len));
+ }
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+The caller must hold lock_sys->mutex and trx_sys->mutex.
+When possible, use trx_print() instead. */
+UNIV_INTERN
+void
+trx_print_latched(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ut_ad(lock_mutex_own());
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ trx_print_low(f, trx, max_query_len,
+ lock_number_of_rows_locked(&trx->lock),
+ UT_LIST_GET_LEN(trx->lock.trx_locks),
+ mem_heap_get_size(trx->lock.lock_heap));
+}
+
+/**********************************************************************//**
+Prints info about a transaction.
+Acquires and releases lock_sys->mutex and trx_sys->mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print,
+ or 0 to use the default max length */
+{
+ ulint n_rec_locks;
+ ulint n_trx_locks;
+ ulint heap_size;
+
+ lock_mutex_enter();
+ n_rec_locks = lock_number_of_rows_locked(&trx->lock);
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ lock_mutex_exit();
+
+ mutex_enter(&trx_sys->mutex);
+ trx_print_low(f, trx, max_query_len,
+ n_rec_locks, n_trx_locks, heap_size);
+ mutex_exit(&trx_sys->mutex);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Asserts that a transaction has been started.
+The caller must hold trx_sys->mutex.
+@return TRUE if started */
+UNIV_INTERN
+ibool
+trx_assert_started(
+/*===============*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ /* Non-locking autocommits should not hold any locks and this
+ function is only called from the locking code. */
+ assert_trx_in_list(trx);
+
+ /* trx->state can change from or to NOT_STARTED while we are holding
+ trx_sys->mutex for non-locking autocommit selects but not for other
+ types of transactions. It may change from ACTIVE to PREPARED. Unless
+ we are holding lock_sys->mutex, it may also change to COMMITTED. */
+
+ switch (trx->state) {
+ case TRX_STATE_PREPARED:
+ return(TRUE);
+
+ case TRX_STATE_ACTIVE:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ return(TRUE);
+
+ case TRX_STATE_NOT_STARTED:
+ break;
+ }
+
+ ut_error;
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return TRUE if weight(a) >= weight(b) */
+UNIV_INTERN
+ibool
+trx_weight_ge(
+/*==========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b) /*!< in: the second transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit != b_notrans_edit) {
+
+ return(a_notrans_edit);
+ }
+
+ /* Either both had edited non-transactional tables or both had
+ not, we fall back to comparing the number of altered/locked
+ rows. */
+
+#if 0
+ fprintf(stderr,
+ "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+ __func__,
+ a->undo_no, UT_LIST_GET_LEN(a->lock.trx_locks),
+ b->undo_no, UT_LIST_GET_LEN(b->lock.trx_locks));
+#endif
+
+ return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+}
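+
+/* TRX_WEIGHT() is defined in trx0trx.h as, roughly, the number of
+undo log entries plus the number of lock structs held; a sketch of
+the final comparison: */
+#if 0
+	ib_uint64_t	weight_a = a->undo_no
+		+ UT_LIST_GET_LEN(a->lock.trx_locks);
+	ib_uint64_t	weight_b = b->undo_no
+		+ UT_LIST_GET_LEN(b->lock.trx_locks);
+
+	return(weight_a >= weight_b);
+#endif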
+
+/****************************************************************//**
+Prepares a transaction. */
+static
+void
+trx_prepare(
+/*========*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx_rseg_t* rseg;
+ lsn_t lsn;
+ mtr_t mtr;
+
+ rseg = trx->rseg;
+ /* Only fresh user transactions can be prepared.
+ Recovered transactions cannot. */
+ ut_a(!trx->is_recovered);
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mtr_start(&mtr);
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the
+ file-based world, at the serialization point of lsn. */
+
+ mutex_enter(&rseg->mutex);
+
+ if (trx->insert_undo != NULL) {
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction prepare for this transaction. */
+
+ trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+ &mtr);
+ }
+
+ if (trx->update_undo) {
+ trx_undo_set_state_at_prepare(
+ trx, trx->update_undo, &mtr);
+ }
+
+ mutex_exit(&rseg->mutex);
+
+ /*--------------*/
+ mtr_commit(&mtr); /* This mtr commit makes the
+ transaction prepared in the file-based
+ world */
+ /*--------------*/
+ lsn = mtr.end_lsn;
+ ut_ad(lsn);
+ } else {
+ lsn = 0;
+ }
+
+ /*--------------------------------------*/
+ ut_a(trx->state == TRX_STATE_ACTIVE);
+ mutex_enter(&trx_sys->mutex);
+ trx->state = TRX_STATE_PREPARED;
+ trx_sys->n_prepared_trx++;
+ mutex_exit(&trx_sys->mutex);
+ /*--------------------------------------*/
+
+ if (lsn) {
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ TODO: find out if MySQL holds some mutex when calling this.
+ That would spoil our group prepare algorithm. */
+
+ trx_flush_log_if_needed(lsn, trx);
+ }
+}
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL. */
+UNIV_INTERN
+void
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx) /*!< in/out: trx handle */
+{
+ trx_start_if_not_started_xa(trx);
+
+ trx->op_info = "preparing";
+
+ trx_prepare(trx);
+
+ trx->op_info = "";
+}
+
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len) /*!< in: number of slots in xid_list */
+{
+ const trx_t* trx;
+ ulint count = 0;
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+	/* We should copy those transactions which are in the prepared
+	state into the xid_list. */
+
+ mutex_enter(&trx_sys->mutex);
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ assert_trx_in_rw_list(trx);
+
+ /* The state of a read-write transaction cannot change
+ from or to NOT_STARTED while we are holding the
+ trx_sys->mutex. It may change to PREPARED, but not if
+ trx->is_recovered. It may also change to COMMITTED. */
+ if (trx_state_eq(trx, TRX_STATE_PREPARED)) {
+ xid_list[count] = trx->xid;
+
+ if (count == 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Starting recovery for"
+ " XA transactions...\n");
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction " TRX_ID_FMT " in"
+ " prepared state after recovery\n",
+ trx->id);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction contains changes"
+ " to " TRX_ID_FMT " rows\n",
+ trx->undo_no);
+
+ count++;
+
+ if (count == len) {
+ break;
+ }
+ }
+ }
+
+ mutex_exit(&trx_sys->mutex);
+
+	if (count > 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: %d transactions in prepared state"
+ " after recovery\n",
+ int (count));
+ }
+
+ return(int (count));
+}
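+
+/* A sketch of how the server layer is expected to drive the function
+above during XA recovery: collect prepared XIDs in batches, then
+resolve each one, e.g. via trx_get_trx_by_xid(); the batch size is
+illustrative. */
+#if 0
+	XID	xid_list[128];
+	int	n;
+
+	while ((n = trx_recover_for_mysql(xid_list, 128)) > 0) {
+		for (int i = 0; i < n; i++) {
+			/* decide whether to commit or roll back
+			xid_list[i], then resolve it by XID */
+		}
+	}
+#endif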
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+static __attribute__((nonnull, warn_unused_result))
+trx_t*
+trx_get_trx_by_xid_low(
+/*===================*/
+ const XID* xid) /*!< in: X/Open XA transaction
+ identifier */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&trx_sys->mutex));
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->rw_trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ assert_trx_in_rw_list(trx);
+
+		/* Compare two X/Open XA transaction ids: their
+		lengths should be the same, and the binary data of
+		gtrid_length+bqual_length bytes should compare
+		equal */
+
+ if (trx->is_recovered
+ && trx_state_eq(trx, TRX_STATE_PREPARED)
+ && xid->gtrid_length == trx->xid.gtrid_length
+ && xid->bqual_length == trx->xid.bqual_length
+ && memcmp(xid->data, trx->xid.data,
+ xid->gtrid_length + xid->bqual_length) == 0) {
+
+ /* Invalidate the XID, so that subsequent calls
+ will not find it. */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
+ break;
+ }
+ }
+
+ return(trx);
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return trx or NULL; on match, the trx->xid will be invalidated;
+note that the trx may have been committed, unless the caller is
+holding lock_sys->mutex */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+ const XID* xid) /*!< in: X/Open XA transaction identifier */
+{
+ trx_t* trx;
+
+ if (xid == NULL) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&trx_sys->mutex);
+
+ /* Recovered/Resurrected transactions are always only on the
+ trx_sys_t::rw_trx_list. */
+ trx = trx_get_trx_by_xid_low(xid);
+
+ mutex_exit(&trx_sys->mutex);
+
+ return(trx);
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+
+ /* Update the info whether we should skip XA steps
+ that eat CPU time.
+
+ For the duration of the transaction trx->support_xa is
+ not reread from thd so any changes in the value take
+		effect in the next transaction. This is to avoid a
+		scenario where some undo log records generated by a
+		transaction contain XA information while others,
+		generated by the same transaction, do not. */
+ trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+ trx_start_low(trx);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INTERN
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ trx_start_low(trx);
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
+/*************************************************************//**
+Starts the transaction for a DDL operation. */
+UNIV_INTERN
+void
+trx_start_for_ddl_low(
+/*==================*/
+ trx_t* trx, /*!< in/out: transaction */
+ trx_dict_op_t op) /*!< in: dictionary operation type */
+{
+ switch (trx->state) {
+ case TRX_STATE_NOT_STARTED:
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+
+ trx_set_dict_operation(trx, op);
+
+		/* Ensure it is not flagged as an auto-commit-non-locking
+		transaction. */
+ trx->will_lock = 1;
+
+ trx->ddl = true;
+
+ trx_start_low(trx);
+ return;
+
+ case TRX_STATE_ACTIVE:
+ /* We have this start if not started idiom, therefore we
+ can't add stronger checks here. */
+ trx->ddl = true;
+
+ ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
+ ut_ad(trx->will_lock > 0);
+ return;
+ case TRX_STATE_PREPARED:
+ case TRX_STATE_COMMITTED_IN_MEMORY:
+ break;
+ }
+
+ ut_error;
+}
+
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
new file mode 100644
index 00000000000..290271c6cab
--- /dev/null
+++ b/storage/innobase/trx/trx0undo.cc
@@ -0,0 +1,2026 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.cc
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+#ifdef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#ifndef UNIV_HOTBACKUP
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+#include "srv0mon.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old versions which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
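+
+/* A sketch of the "smallest transaction number first" selection
+described above, assuming each rollback segment exposes the trx
+number of its oldest unpurged log in last_trx_no; InnoDB proper keeps
+this ordering in the purge_sys->ib_bh min-heap instead of scanning. */
+#if 0
+static trx_rseg_t*
+purge_pick_rseg(trx_rseg_t** rsegs, int n)
+{
+	trx_rseg_t*	best = NULL;
+
+	for (int i = 0; i < n; i++) {
+		if (rsegs[i] != NULL
+		    && (best == NULL
+			|| rsegs[i]->last_trx_no < best->last_trx_no)) {
+			best = rsegs[i];
+		}
+	}
+
+	return(best);
+}
+#endif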
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+The contention of the trx_sys_t::mutex should be minimized. When a transaction
+does its first insert or modify in an index, an undo log is assigned for it.
+Then we must have an x-latch on the rollback segment header.
+	When the transaction does more modifications or rolls back, the undo log is
+protected with undo_mutex in the transaction.
+ When the transaction commits, its insert undo log is either reset and
+cached for a fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header byte offset on page */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Gets the previous record in an undo log from the previous page.
+@return undo log record, the page s-latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(
+/*=================================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ bool shared, /*!< in: true=S-latch, false=X-latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+ page_t* prev_page;
+ page_t* undo_page;
+
+ undo_page = page_align(rec);
+
+ prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+
+ if (prev_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ space = page_get_space_id(undo_page);
+ zip_size = fil_space_get_zip_size(space);
+
+ buf_block_t* block = buf_page_get(space, zip_size, prev_page_no,
+ shared ? RW_S_LATCH : RW_X_LATCH,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ prev_page = buf_block_get_frame(block);
+
+ return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ bool shared, /*!< in: true=S-latch, false=X-latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_rec_t* prev_rec;
+
+ prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset);
+
+ if (prev_rec) {
+
+ return(prev_rec);
+ }
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset,
+ shared, mtr));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log from the next page.
+@return undo log record, the page latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(
+/*=================================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* undo_page, /*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_ulogf_t* log_hdr;
+ ulint next_page_no;
+ page_t* next_page;
+ ulint next;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+ next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (next != 0) {
+
+ return(NULL);
+ }
+ }
+
+ next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+ if (next_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ if (mode == RW_S_LATCH) {
+ next_page = trx_undo_page_get_s_latched(space, zip_size,
+ next_page_no, mtr);
+ } else {
+ ut_ad(mode == RW_X_LATCH);
+ next_page = trx_undo_page_get(space, zip_size,
+ next_page_no, mtr);
+ }
+
+ return(trx_undo_page_get_first_rec(next_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ trx_undo_rec_t* next_rec;
+
+ next_rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+
+ if (next_rec) {
+ return(next_rec);
+ }
+
+ space = page_get_space_id(page_align(rec));
+ zip_size = fil_space_get_zip_size(space);
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ page_align(rec),
+ page_no, offset,
+ RW_S_LATCH, mtr));
+}
+
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+
+ if (mode == RW_S_LATCH) {
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, mtr);
+ } else {
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+ }
+
+ rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);
+
+ if (rec) {
+ return(rec);
+ }
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ undo_page, page_no, offset,
+ mode, mtr));
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log page initialization. */
+UNIV_INLINE
+void
+trx_undo_page_init_log(
+/*===================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint type, /*!< in: undo log type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr);
+
+ mlog_catenate_ulint_compressed(mtr, type);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint type;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &type);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ trx_undo_page_init(page, type, mtr);
+ }
+
+ return(ptr);
+}
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+
+ fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG);
+
+ trx_undo_page_init_log(undo_page, type, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Creates a new undo log segment in file.
+@return DB_SUCCESS if page creation OK; possible error codes are:
+DB_TOO_MANY_CONCURRENT_TRXS, DB_OUT_OF_FILE_SPACE */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+trx_undo_seg_create(
+/*================*/
+ trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */
+ trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page
+ x-latched */
+ ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint* id, /*!< out: slot index within rseg header */
+ page_t** undo_page,
+ /*!< out: segment header page x-latched, NULL
+ if there was an error */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ ulint space;
+ buf_block_t* block;
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_reserved;
+ ibool success;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(mtr && id && rseg_hdr);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ /* fputs(type == TRX_UNDO_INSERT
+ ? "Creating insert undo log segment\n"
+ : "Creating update undo log segment\n", stderr); */
+ slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: cannot find a free slot for"
+ " an undo log. Do you have too\n"
+ "InnoDB: many active transactions"
+ " running concurrently?\n");
+
+ return(DB_TOO_MANY_CONCURRENT_TRXS);
+ }
+
+ space = page_get_space_id(page_align(rseg_hdr));
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create_general(space, 0,
+ TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
+
+ fil_space_release_free_extents(space, n_reserved);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ *undo_page = buf_block_get_frame(block);
+
+ page_hdr = *undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = *undo_page + TRX_UNDO_SEG_HDR;
+
+ trx_undo_page_init(*undo_page, type, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr);
+
+ flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr);
+
+ flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST,
+ page_hdr + TRX_UNDO_PAGE_NODE, mtr);
+
+ trx_rsegf_set_nth_undo(rseg_hdr, slot_no,
+ page_get_page_no(*undo_page), mtr);
+ *id = slot_no;
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header initialization. */
+UNIV_INLINE
+void
+trx_undo_header_create_log(
+/*=======================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
+
+ mlog_catenate_ull_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a new undo log header in file. NOTE that this function has its own
+log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
+this function!
+@return header byte offset on page */
+static
+ulint
+trx_undo_header_create(
+/*===================*/
+ page_t* undo_page, /*!< in/out: undo log segment
+ header page, x-latched; it is
+ assumed that there is
+ TRX_UNDO_LOG_XA_HDR_SIZE bytes
+ free space on it */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint prev_log;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
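+	/* Check that the page has room even for the extended XA-style
+	header; trx_undo_header_add_space_for_xid() may later grow this
+	header from TRX_UNDO_LOG_OLD_HDR_SIZE to
+	TRX_UNDO_LOG_XA_HDR_SIZE. */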
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+
+ if (prev_log != 0) {
+ prev_log_hdr = undo_page + prev_log;
+
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free);
+ }
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE);
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
+
+ /* Write the log record about the header creation */
+ trx_undo_header_create_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Write X/Open XA Transaction Identification (XID) to undo log header */
+static
+void
+trx_undo_write_xid(
+/*===============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ const XID* xid, /*!< in: X/Open XA Transaction Identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
+ (ulint) xid->formatID, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
+ (ulint) xid->gtrid_length, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ (ulint) xid->bqual_length, MLOG_4BYTES, mtr);
+
+ mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
+ XIDDATASIZE, mtr);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identification (XID) from undo log header */
+static
+void
+trx_undo_read_xid(
+/*==============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ XID* xid) /*!< out: X/Open XA Transaction Identification */
+{
+ xid->formatID = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+
+ xid->gtrid_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+ xid->bqual_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/***************************************************************//**
+Adds space for the XA XID after an undo log old-style header. */
+static
+void
+trx_undo_header_add_space_for_xid(
+/*==============================*/
+ page_t* undo_page,/*!< in: undo log segment header page */
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ ulint free;
+ ulint new_free;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ /* free is now the end offset of the old style undo log header */
+
+ ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE
+ - TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ /* Add space for a XID after the header, update the free offset
+ fields on the undo log page and in the undo log header */
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free,
+ MLOG_2BYTES, mtr);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header reuse. */
+UNIV_INLINE
+void
+trx_undo_insert_header_reuse_log(
+/*=============================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr);
+
+ mlog_catenate_ull_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ trx_id_t trx_id;
+ /* Silence a GCC warning about possibly uninitialized variable
+ when mach_ull_parse_compressed() is not inlined. */
+ ut_d(trx_id = 0);
+ /* Declare the variable uninitialized in Valgrind, so that the
+ above initialization will not mask any bugs. */
+ UNIV_MEM_INVALID(&trx_id, sizeof trx_id);
+
+ ptr = mach_ull_parse_compressed(ptr, end_ptr, &trx_id);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (type == MLOG_UNDO_HDR_CREATE) {
+ trx_undo_header_create(page, trx_id, mtr);
+ } else {
+ ut_ad(type == MLOG_UNDO_HDR_REUSE);
+ trx_undo_insert_header_reuse(page, trx_id, mtr);
+ }
+ }
+
+ return(ptr);
+}
+
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE;
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+ /* Insert undo data is not needed after commit: we may free all
+ the space on the page */
+
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ /* Write the log record MLOG_UNDO_HDR_REUSE */
+ trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Writes the redo log entry of an update undo log header discard. */
+UNIV_INLINE
+void
+trx_undo_discard_latest_log(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log header page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(end_ptr);
+
+ if (page) {
+ trx_undo_discard_latest_update_undo(page, mtr);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint free;
+ ulint prev_hdr_offset;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ log_hdr = undo_page + free;
+
+ prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG);
+
+ if (prev_hdr_offset != 0) {
+ prev_log_hdr = undo_page + prev_hdr_offset;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ mach_read_from_2(prev_log_hdr
+ + TRX_UNDO_LOG_START));
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ }
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED);
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset);
+
+ trx_undo_discard_latest_log(undo_page, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return X-latched block if success, else NULL */
+UNIV_INTERN
+buf_block_t*
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ buf_block_t* new_block;
+ page_t* new_page;
+ trx_rseg_t* rseg;
+ ulint n_reserved;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+ rseg = trx->rseg;
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(NULL);
+ }
+
+ header_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+ FSP_UNDO, mtr)) {
+
+ return(NULL);
+ }
+
+ new_block = fseg_alloc_free_page_general(
+ TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_page,
+ undo->top_page_no + 1, FSP_UP, TRUE, mtr, mtr);
+
+ fil_space_release_free_extents(undo->space, n_reserved);
+
+ if (new_block == NULL) {
+
+ /* No space left */
+
+ return(NULL);
+ }
+
+ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
+ undo->last_page_no = buf_block_get_page_no(new_block);
+
+ new_page = buf_block_get_frame(new_block);
+
+ trx_undo_page_init(new_page, undo->type, mtr);
+
+ flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+ return(new_block);
+}
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+ulint
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ibool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ ulint space, /*!< in: space */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* undo_page;
+ fil_addr_t last_addr;
+ trx_rsegf_t* rseg_header;
+ ulint hist_size;
+ ulint zip_size;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ zip_size = rseg->zip_size;
+
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ space, page_no, mtr);
+
+ last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST, mtr);
+ rseg->curr_size--;
+
+ if (in_history) {
+ rseg_header = trx_rsegf_get(space, zip_size,
+ rseg->page_no, mtr);
+
+ hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr);
+ ut_ad(hist_size > 0);
+ mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size - 1, MLOG_4BYTES, mtr);
+ }
+
+ return(last_addr.page);
+}
+
+/********************************************************************//**
+Frees the last undo log page.
+The caller must hold the rollback segment mutex. */
+UNIV_INTERN
+void
+trx_undo_free_last_page_func(
+/*==========================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in/out: undo log memory copy */
+ mtr_t* mtr) /*!< in/out: mini-transaction which does not
+ have a latch to any undo log page or which
+ has allocated the undo log page */
+{
+ ut_ad(mutex_own(&trx->undo_mutex));
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+
+ undo->last_page_no = trx_undo_free_page(
+ undo->rseg, FALSE, undo->space,
+ undo->hdr_page_no, undo->last_page_no, mtr);
+
+ undo->size--;
+}
+
+/********************************************************************//**
+Empties an undo log header page of undo records for that undo log. Other
+undo logs may still have records on that page, if it is an update undo log. */
+static
+void
+trx_undo_empty_header_page(
+/*=======================*/
+ ulint space, /*!< in: space */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* header_page;
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ log_hdr = header_page + hdr_offset;
+
+ end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end_func(
+/*=======================*/
+#ifdef UNIV_DEBUG
+ const trx_t* trx, /*!< in: transaction whose undo log it is */
+#endif /* UNIV_DEBUG */
+ trx_undo_t* undo, /*!< in: undo log */
+ undo_no_t limit) /*!< in: all undo records with undo number
+ >= this value should be truncated */
+{
+ page_t* undo_page;
+ ulint last_page_no;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* trunc_here;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
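+	/* Walk the undo log backwards from its last page, freeing every
+	page whose records all have undo numbers >= limit; the surviving
+	last page is trimmed at function_exit. */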
+ for (;;) {
+ mtr_start(&mtr);
+
+ trunc_here = NULL;
+
+ last_page_no = undo->last_page_no;
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ last_page_no, &mtr);
+
+ rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
+ undo->hdr_offset);
+ while (rec) {
+ if (trx_undo_rec_get_undo_no(rec) >= limit) {
+ /* Truncate at least this record off, maybe
+ more */
+ trunc_here = rec;
+ } else {
+ goto function_exit;
+ }
+
+ rec = trx_undo_page_get_prev_rec(rec,
+ undo->hdr_page_no,
+ undo->hdr_offset);
+ }
+
+ if (last_page_no == undo->hdr_page_no) {
+
+ goto function_exit;
+ }
+
+ ut_ad(last_page_no == undo->last_page_no);
+ trx_undo_free_last_page(trx, undo, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+function_exit:
+ if (trunc_here) {
+ mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE,
+ trunc_here - undo_page, MLOG_2BYTES, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit) /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ ulint page_no;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (!limit) {
+
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ rec = trx_undo_get_first_rec(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ RW_X_LATCH, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = page_align(rec);
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (trx_undo_rec_get_undo_no(last_rec) >= limit) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ if (page_no == hdr_page_no) {
+ trx_undo_empty_header_page(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ &mtr);
+ } else {
+ trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
+ page_no, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is not in the history list. */
+static
+void
+trx_undo_seg_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: undo log */
+{
+ trx_rseg_t* rseg;
+ fseg_header_t* file_seg;
+ trx_rsegf_t* rseg_header;
+ trx_usegf_t* seg_header;
+ ibool finished;
+ mtr_t mtr;
+
+ rseg = undo->rseg;
+
+ do {
+
+ mtr_start(&mtr);
+
+ mutex_enter(&(rseg->mutex));
+
+ seg_header = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no,
+ &mtr) + TRX_UNDO_SEG_HDR;
+
+ file_seg = seg_header + TRX_UNDO_FSEG_HEADER;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
+ &mtr);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ } while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/********************************************************************//**
+Creates and initializes an undo log memory object according to the values
+in the header in file, when the database is started. The memory object is
+inserted in the appropriate list of rseg.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create_at_db_start(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint page_no,/*!< in: undo log segment page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_upagef_t* page_header;
+ trx_usegf_t* seg_header;
+ trx_ulogf_t* undo_header;
+ trx_undo_t* undo;
+ ulint type;
+ ulint state;
+ trx_id_t trx_id;
+ ulint offset;
+ fil_addr_t last_addr;
+ page_t* last_page;
+ trx_undo_rec_t* rec;
+ XID xid;
+ ibool xid_exists = FALSE;
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ page_no, mtr);
+
+ page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+ type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES,
+ mtr);
+ seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+ state = mach_read_from_2(seg_header + TRX_UNDO_STATE);
+
+ offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG);
+
+ undo_header = undo_page + offset;
+
+ trx_id = mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
+
+ xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ MLOG_1BYTE, mtr);
+
+	/* Read the X/Open XA transaction identification if it exists,
+	or reset it to the null XID. */
+
+ memset(&xid, 0, sizeof(xid));
+ xid.formatID = -1;
+
+ if (xid_exists == TRUE) {
+ trx_undo_read_xid(undo_header, &xid);
+ }
+
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid,
+ page_no, offset);
+ mutex_exit(&(rseg->mutex));
+
+ undo->dict_operation = mtr_read_ulint(
+ undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr);
+
+ undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
+ undo->state = state;
+ undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ /* If the log segment is being freed, the page list is inconsistent! */
+ if (state == TRX_UNDO_TO_FREE) {
+
+ goto add_to_list;
+ }
+
+ last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ last_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ undo->last_page_no, mtr);
+
+ rec = trx_undo_page_get_last_rec(last_page, page_no, offset);
+
+ if (rec == NULL) {
+ undo->empty = TRUE;
+ } else {
+ undo->empty = FALSE;
+ undo->top_offset = rec - last_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ }
+add_to_list:
+ if (type == TRX_UNDO_INSERT) {
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
+ undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
+ undo);
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+ }
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy. This
+function is only called when the database is started or a new rollback
+segment is created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg) /*!< in: rollback segment memory object */
+{
+ ulint size = 0;
+ trx_rsegf_t* rseg_header;
+ ulint i;
+ mtr_t mtr;
+
+ UT_LIST_INIT(rseg->update_undo_list);
+ UT_LIST_INIT(rseg->update_undo_cached);
+ UT_LIST_INIT(rseg->insert_undo_list);
+ UT_LIST_INIT(rseg->insert_undo_cached);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get_new(
+ rseg->space, rseg->zip_size, rseg->page_no, &mtr);
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+ ulint page_no;
+
+ page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
+
+ /* In forced recovery: try to avoid operations which look
+ at database pages; undo logs are rapidly changing data, and
+ the probability that they are in an inconsistent state is
+ high */
+
+ if (page_no != FIL_NULL
+ && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+ trx_undo_t* undo;
+
+ undo = trx_undo_mem_create_at_db_start(
+ rseg, i, page_no, &mtr);
+
+ size += undo->size;
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+
+ /* Found a used slot */
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ return(size);
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo = static_cast<trx_undo_t*>(mem_alloc(sizeof(*undo)));
+
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ undo->id = id;
+ undo->type = type;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->space = rseg->space;
+ undo->zip_size = rseg->zip_size;
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->empty = TRUE;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->empty = TRUE;
+}
+
+/********************************************************************//**
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: the undo object to be freed */
+{
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
+ ut_error;
+ }
+
+ mem_free(undo);
+}
+
+/**********************************************************************//**
+Creates a new undo log.
+@return DB_SUCCESS if successful in creating the new undo log object;
+possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS,
+DB_OUT_OF_FILE_SPACE, DB_OUT_OF_MEMORY */
+static __attribute__((nonnull, warn_unused_result))
+dberr_t
+trx_undo_create(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory copy */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification*/
+ trx_undo_t** undo, /*!< out: the new undo log object, undefined
+				if it did not succeed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rsegf_t* rseg_header;
+ ulint page_no;
+ ulint offset;
+ ulint id;
+ page_t* undo_page;
+ dberr_t err;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ rseg->curr_size++;
+
+ rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no,
+ mtr);
+
+ err = trx_undo_seg_create(rseg, rseg_header, type, &id,
+ &undo_page, mtr);
+
+ if (err != DB_SUCCESS) {
+ /* Did not succeed */
+
+ rseg->curr_size--;
+
+ return(err);
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(undo_page,
+ undo_page + offset, mtr);
+ }
+
+ *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid,
+ page_no, offset);
+ if (*undo == NULL) {
+
+ err = DB_OUT_OF_MEMORY;
+ }
+
+ return(err);
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/********************************************************************//**
+Reuses a cached undo log.
+@return the undo log memory object, NULL if none cached */
+static
+trx_undo_t*
+trx_undo_reuse_cached(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is used */
+ const XID* xid, /*!< in: X/Open XA transaction identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (type == TRX_UNDO_INSERT) {
+
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ }
+
+ ut_ad(undo->size == 1);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
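+	/* An insert undo log can simply reuse (overwrite) its single
+	header, while an update undo log must get a new header chained
+	after the old ones, which purge may still need. */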
+ if (type == TRX_UNDO_INSERT) {
+ offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ } else {
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ }
+
+ trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset);
+
+ return(undo);
+}
+
+/**********************************************************************//**
+Marks an undo log header as a header of a data dictionary operation
+transaction. */
+static
+void
+trx_undo_mark_as_dict_operation(
+/*============================*/
+ trx_t* trx, /*!< in: dict op transaction */
+ trx_undo_t* undo, /*!< in: assigned undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* hdr_page;
+
+ hdr_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ undo->table_id = 0;
+ break;
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ break;
+ }
+
+ mlog_write_ulint(hdr_page + undo->hdr_offset
+ + TRX_UNDO_DICT_TRANS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ mlog_write_ull(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID,
+ undo->table_id, mtr);
+
+ undo->dict_operation = TRUE;
+}
+
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if the undo log assignment was successful; possible
+error codes are: DB_TOO_MANY_CONCURRENT_TRXS, DB_OUT_OF_FILE_SPACE,
+DB_READ_ONLY, DB_OUT_OF_MEMORY */
+UNIV_INTERN
+dberr_t
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ mtr_t mtr;
+ dberr_t err = DB_SUCCESS;
+
+ ut_ad(trx);
+
+ if (trx->rseg == NULL) {
+ return(DB_READ_ONLY);
+ }
+
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ mtr_start(&mtr);
+
+ mutex_enter(&rseg->mutex);
+
+ DBUG_EXECUTE_IF(
+ "ib_create_table_fail_too_many_trx",
+ err = DB_TOO_MANY_CONCURRENT_TRXS;
+ goto func_exit;
+ );
+
+ undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
+ &mtr);
+ if (undo == NULL) {
+ err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
+ &undo, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ if (type == TRX_UNDO_INSERT) {
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo);
+ ut_ad(trx->insert_undo == NULL);
+ trx->insert_undo = undo;
+ } else {
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo);
+ ut_ad(trx->update_undo == NULL);
+ trx->update_undo = undo;
+ }
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ trx_undo_mark_as_dict_operation(trx, undo, &mtr);
+ }
+
+func_exit:
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ page_t* undo_page;
+ ulint state;
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
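+	/* A small single-page segment is cached for reuse; larger ones
+	are marked for freeing (insert undo) or purging (update undo). */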
+ if (undo->size == 1
+ && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE)
+ < TRX_UNDO_PAGE_REUSE_LIMIT) {
+
+ state = TRX_UNDO_CACHED;
+
+ } else if (undo->type == TRX_UNDO_INSERT) {
+
+ state = TRX_UNDO_TO_FREE;
+ } else {
+ state = TRX_UNDO_TO_PURGE;
+ }
+
+ undo->state = state;
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr);
+
+ return(undo_page);
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* undo_header;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(trx && undo && mtr);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ /*------------------------------*/
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = trx->xid;
+ /*------------------------------*/
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+ MLOG_2BYTES, mtr);
+
+ offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ undo_header = undo_page + offset;
+
+ mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ trx_undo_write_xid(undo_header, &undo->xid, mtr);
+
+ return(undo_page);
+}
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+
+ undo = trx->update_undo;
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ trx_purge_add_update_undo_to_history(trx, undo_page, mtr);
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo);
+
+ trx->update_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE);
+
+ trx_undo_mem_free(undo);
+ }
+}
+
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+
+ undo = trx->insert_undo;
+ ut_ad(undo);
+
+ rseg = trx->rseg;
+
+ mutex_enter(&(rseg->mutex));
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo);
+ trx->insert_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+
+ MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_FREE);
+
+ /* Delete first the undo log segment in the file */
+
+ mutex_exit(&(rseg->mutex));
+
+ trx_undo_seg_free(undo);
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_ad(rseg->curr_size > undo->size);
+
+ rseg->curr_size -= undo->size;
+
+ trx_undo_mem_free(undo);
+ }
+
+ mutex_exit(&(rseg->mutex));
+}
+
+/********************************************************************//**
+At shutdown, frees the undo logs of a PREPARED transaction. */
+UNIV_INTERN
+void
+trx_undo_free_prepared(
+/*===================*/
+ trx_t* trx) /*!< in/out: PREPARED transaction */
+{
+ ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS);
+
+ if (trx->update_undo) {
+ ut_a(trx->update_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->update_undo_list,
+ trx->update_undo);
+ trx_undo_mem_free(trx->update_undo);
+ }
+ if (trx->insert_undo) {
+ ut_a(trx->insert_undo->state == TRX_UNDO_PREPARED);
+ UT_LIST_REMOVE(undo_list, trx->rseg->insert_undo_list,
+ trx->insert_undo);
+ trx_undo_mem_free(trx->insert_undo);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/usr/usr0sess.cc b/storage/innobase/usr/usr0sess.cc
new file mode 100644
index 00000000000..ab7ba6bea09
--- /dev/null
+++ b/storage/innobase/usr/usr0sess.cc
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file usr/usr0sess.cc
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+#ifdef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#include "trx0trx.h"
+
+/*********************************************************************//**
+Opens a session.
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void)
+/*===========*/
+{
+ sess_t* sess;
+
+ sess = static_cast<sess_t*>(mem_zalloc(sizeof(*sess)));
+
+ sess->state = SESS_ACTIVE;
+
+ sess->trx = trx_allocate_for_background();
+ sess->trx->sess = sess;
+
+ UT_LIST_INIT(sess->graphs);
+
+ return(sess);
+}
+
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+ sess_t* sess) /*!< in, own: session object */
+{
+ ut_a(UT_LIST_GET_LEN(sess->graphs) == 0);
+
+ trx_free_for_background(sess->trx);
+ mem_free(sess);
+}
diff --git a/storage/innobase/ut/ut0bh.cc b/storage/innobase/ut/ut0bh.cc
new file mode 100644
index 00000000000..1a3038a0d71
--- /dev/null
+++ b/storage/innobase/ut/ut0bh.cc
@@ -0,0 +1,159 @@
+/***************************************************************************//**
+
+Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file ut/ut0bh.cc
+Binary min-heap implementation.
+
+Created 2010-05-28 by Sunny Bains
+*******************************************************/
+
+#include "ut0bh.h"
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0bh.ic"
+#endif
+
+#include <string.h>
+
+/**********************************************************************//**
+Create a binary heap.
+@return a new binary heap */
+UNIV_INTERN
+ib_bh_t*
+ib_bh_create(
+/*=========*/
+ ib_bh_cmp_t compare, /*!< in: comparator */
+ ulint sizeof_elem, /*!< in: size of one element */
+ ulint max_elems) /*!< in: max elements allowed */
+{
+ ulint sz;
+ ib_bh_t* ib_bh;
+
+ sz = sizeof(*ib_bh) + (sizeof_elem * max_elems);
+
+ ib_bh = (ib_bh_t*) ut_malloc(sz);
+ memset(ib_bh, 0x0, sz);
+
+ ib_bh->compare = compare;
+ ib_bh->max_elems = max_elems;
+ ib_bh->sizeof_elem = sizeof_elem;
+
+ return(ib_bh);
+}
+
+/**********************************************************************//**
+Free a binary heap. */
+UNIV_INTERN
+void
+ib_bh_free(
+/*=======*/
+ ib_bh_t* ib_bh) /*!< in/own: instance */
+{
+ ut_free(ib_bh);
+}
+
+/**********************************************************************//**
+Add an element to the binary heap. Note: The element is copied.
+@return pointer to added element or NULL if full. */
+UNIV_INTERN
+void*
+ib_bh_push(
+/*=======*/
+ ib_bh_t* ib_bh, /*!< in/out: instance */
+ const void* elem) /*!< in: element to add */
+{
+ void* ptr;
+
+ if (ib_bh_is_full(ib_bh)) {
+ return(NULL);
+ } else if (ib_bh_is_empty(ib_bh)) {
+ ++ib_bh->n_elems;
+ return(ib_bh_set(ib_bh, 0, elem));
+ } else {
+ ulint i;
+
+ i = ib_bh->n_elems;
+
+ ++ib_bh->n_elems;
+
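+		/* Sift up: while the parent slot (at i >> 1 in this
+		implicit tree layout) sorts after the new element, pull
+		the parent down and continue from its position. */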
+ for (ptr = ib_bh_get(ib_bh, i >> 1);
+ i > 0 && ib_bh->compare(ptr, elem) > 0;
+ i >>= 1, ptr = ib_bh_get(ib_bh, i >> 1)) {
+
+ ib_bh_set(ib_bh, i, ptr);
+ }
+
+ ptr = ib_bh_set(ib_bh, i, elem);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Remove the first element from the binary heap. */
+UNIV_INTERN
+void
+ib_bh_pop(
+/*======*/
+ ib_bh_t* ib_bh) /*!< in/out: instance */
+{
+ byte* ptr;
+ byte* last;
+ ulint parent = 0;
+
+ if (ib_bh_is_empty(ib_bh)) {
+ return;
+ } else if (ib_bh_size(ib_bh) == 1) {
+ --ib_bh->n_elems;
+ return;
+ }
+
+ last = (byte*) ib_bh_last(ib_bh);
+
+ /* Start from the child node */
+ ptr = (byte*) ib_bh_get(ib_bh, 1);
+
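+	/* Sift down: follow the smaller child at each level until the
+	last element, which will be moved up, sorts before the child. */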
+ while (ptr < last) {
+ /* If the "right" child node is < "left" child node */
+ if (ib_bh->compare(ptr + ib_bh->sizeof_elem, ptr) < 0) {
+ ptr += ib_bh->sizeof_elem;
+ }
+
+ if (ib_bh->compare(last, ptr) <= 0) {
+ break;
+ }
+
+ ib_bh_set(ib_bh, parent, ptr);
+
+ parent = (ptr - (byte*) ib_bh_first(ib_bh))
+ / ib_bh->sizeof_elem;
+
+ if ((parent << 1) >= ib_bh_size(ib_bh)) {
+ break;
+ }
+
+ ptr = (byte*) ib_bh_get(ib_bh, parent << 1);
+ }
+
+ --ib_bh->n_elems;
+
+ ib_bh_set(ib_bh, parent, last);
+}
diff --git a/storage/innobase/ut/ut0byte.cc b/storage/innobase/ut/ut0byte.cc
new file mode 100644
index 00000000000..bc592edc6bf
--- /dev/null
+++ b/storage/innobase/ut/ut0byte.cc
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0byte.cc
+Byte utilities
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0byte.h"
+
+#ifdef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
new file mode 100644
index 00000000000..1caf27ebae3
--- /dev/null
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -0,0 +1,318 @@
+/*****************************************************************************
+
+Copyright (C) 2009, 2010 Facebook, Inc. All Rights Reserved.
+Copyright (c) 2011, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0crc32.cc
+CRC32 implementation from Facebook, based on the zlib implementation.
+
+Created Aug 8, 2011, Vasil Dimov, based on mysys/my_crc32.c and
+mysys/my_perf.c, contributed by Facebook under the following license.
+********************************************************************/
+
+/* Copyright (C) 2009-2010 Facebook, Inc. All Rights Reserved.
+
+ Dual licensed under BSD license and GPLv2.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY FACEBOOK, INC. ``AS IS'' AND ANY EXPRESS OR
+ IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ EVENT SHALL FACEBOOK, INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
+
+/* The below CRC32 implementation is based on the implementation included with
+ * zlib with modifications to process 8 bytes at a time and using SSE 4.2
+ * extensions when available. The polynomial constant has been changed to
+ * match the one used by SSE 4.2 and does not return the same value as the
+ * version used by zlib. This implementation only supports 64-bit
+ * little-endian processors. The original zlib copyright notice follows. */
+
+/* crc32.c -- compute the CRC-32 of a buf stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of buf at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "univ.i"
+#include "ut0crc32.h"
+
+#include <string.h>
+
+ib_ut_crc32_t ut_crc32;
+
+/* Precalculated table used to generate the CRC32 if the CPU does not
+have support for it */
+static ib_uint32_t ut_crc32_slice8_table[8][256];
+static ibool ut_crc32_slice8_table_initialized = FALSE;
+
+/* Flag that tells whether the CPU supports CRC32 or not */
+UNIV_INTERN bool ut_crc32_sse2_enabled = false;
+
+/********************************************************************//**
+Initializes the table that is used to generate the CRC32 if the CPU does
+not have support for it. */
+static
+void
+ut_crc32_slice8_table_init()
+/*========================*/
+{
+ /* bit-reversed poly 0x1EDC6F41 (from SSE42 crc32 instruction) */
+ static const ib_uint32_t poly = 0x82f63b78;
+ ib_uint32_t n;
+ ib_uint32_t k;
+ ib_uint32_t c;
+
+ for (n = 0; n < 256; n++) {
+ c = n;
+ for (k = 0; k < 8; k++) {
+ c = (c & 1) ? (poly ^ (c >> 1)) : (c >> 1);
+ }
+ ut_crc32_slice8_table[0][n] = c;
+ }
+
+ for (n = 0; n < 256; n++) {
+ c = ut_crc32_slice8_table[0][n];
+ for (k = 1; k < 8; k++) {
+ c = ut_crc32_slice8_table[0][c & 0xFF] ^ (c >> 8);
+ ut_crc32_slice8_table[k][n] = c;
+ }
+ }
+
+ ut_crc32_slice8_table_initialized = TRUE;
+}
+
+#if defined(__GNUC__) && defined(__x86_64__)
+/********************************************************************//**
+Fetches CPU info */
+static
+void
+ut_cpuid(
+/*=====*/
+ ib_uint32_t vend[3], /*!< out: CPU vendor */
+ ib_uint32_t* model, /*!< out: CPU model */
+ ib_uint32_t* family, /*!< out: CPU family */
+ ib_uint32_t* stepping, /*!< out: CPU stepping */
+ ib_uint32_t* features_ecx, /*!< out: CPU features ecx */
+ ib_uint32_t* features_edx) /*!< out: CPU features edx */
+{
+ ib_uint32_t sig;
+ asm("cpuid" : "=b" (vend[0]), "=c" (vend[2]), "=d" (vend[1]) : "a" (0));
+ asm("cpuid" : "=a" (sig), "=c" (*features_ecx), "=d" (*features_edx)
+ : "a" (1)
+ : "ebx");
+
+ *model = ((sig >> 4) & 0xF);
+ *family = ((sig >> 8) & 0xF);
+ *stepping = (sig & 0xF);
+
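+	/* Intel CPUs, and AMD family 0xF CPUs, encode additional model
+	and family bits in the extended CPUID fields. */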
+ if (memcmp(vend, "GenuineIntel", 12) == 0
+ || (memcmp(vend, "AuthenticAMD", 12) == 0 && *family == 0xF)) {
+
+ *model += (((sig >> 16) & 0xF) << 4);
+ *family += ((sig >> 20) & 0xFF);
+ }
+}
+
+/* opcodes taken from objdump of "crc32b (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_byte \
+ asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf0, 0x0a" \
+ : "=c"(crc) : "c"(crc), "d"(buf)); \
+ len--, buf++
+
+/* opcodes taken from objdump of "crc32q (%%rdx), %%rcx"
+for RHEL4 support (GCC 3 doesn't support this instruction) */
+#define ut_crc32_sse42_quadword \
+ asm(".byte 0xf2, 0x48, 0x0f, 0x38, 0xf1, 0x0a" \
+ : "=c"(crc) : "c"(crc), "d"(buf)); \
+ len -= 8, buf += 8
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+/********************************************************************//**
+Calculates CRC32 using CPU instructions.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_sse42(
+/*===========*/
+ const byte* buf, /*!< in: data over which to calculate CRC32 */
+ ulint len) /*!< in: data length */
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+ ib_uint64_t crc = (ib_uint32_t) (-1);
+
+ ut_a(ut_crc32_sse2_enabled);
+
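+	/* Consume single bytes until buf is 8-byte aligned, then feed
+	aligned quadwords to the hardware CRC32 instruction. */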
+ while (len && ((ulint) buf & 7)) {
+ ut_crc32_sse42_byte;
+ }
+
+ while (len >= 32) {
+ ut_crc32_sse42_quadword;
+ ut_crc32_sse42_quadword;
+ ut_crc32_sse42_quadword;
+ ut_crc32_sse42_quadword;
+ }
+
+ while (len >= 8) {
+ ut_crc32_sse42_quadword;
+ }
+
+ while (len) {
+ ut_crc32_sse42_byte;
+ }
+
+ return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+#else
+ ut_error;
+ /* silence compiler warning about unused parameters */
+ return((ib_uint32_t) buf[len]);
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+}
+
+#define ut_crc32_slice8_byte \
+ crc = (crc >> 8) ^ ut_crc32_slice8_table[0][(crc ^ *buf++) & 0xFF]; \
+ len--
+
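+/* Slice-by-8: XOR 64 data bits into the CRC at once, then combine eight
+table lookups, one per byte lane, to advance eight bytes per iteration. */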
+#define ut_crc32_slice8_quadword \
+ crc ^= *(ib_uint64_t*) buf; \
+ crc = ut_crc32_slice8_table[7][(crc ) & 0xFF] ^ \
+ ut_crc32_slice8_table[6][(crc >> 8) & 0xFF] ^ \
+ ut_crc32_slice8_table[5][(crc >> 16) & 0xFF] ^ \
+ ut_crc32_slice8_table[4][(crc >> 24) & 0xFF] ^ \
+ ut_crc32_slice8_table[3][(crc >> 32) & 0xFF] ^ \
+ ut_crc32_slice8_table[2][(crc >> 40) & 0xFF] ^ \
+ ut_crc32_slice8_table[1][(crc >> 48) & 0xFF] ^ \
+ ut_crc32_slice8_table[0][(crc >> 56)]; \
+ len -= 8, buf += 8
+
+/********************************************************************//**
+Calculates CRC32 manually.
+@return CRC-32C (polynomial 0x11EDC6F41) */
+UNIV_INLINE
+ib_uint32_t
+ut_crc32_slice8(
+/*============*/
+ const byte* buf, /*!< in: data over which to calculate CRC32 */
+ ulint len) /*!< in: data length */
+{
+ ib_uint64_t crc = (ib_uint32_t) (-1);
+
+ ut_a(ut_crc32_slice8_table_initialized);
+
+ while (len && ((ulint) buf & 7)) {
+ ut_crc32_slice8_byte;
+ }
+
+ while (len >= 32) {
+ ut_crc32_slice8_quadword;
+ ut_crc32_slice8_quadword;
+ ut_crc32_slice8_quadword;
+ ut_crc32_slice8_quadword;
+ }
+
+ while (len >= 8) {
+ ut_crc32_slice8_quadword;
+ }
+
+ while (len) {
+ ut_crc32_slice8_byte;
+ }
+
+ return((ib_uint32_t) ((~crc) & 0xFFFFFFFF));
+}
+
+/********************************************************************//**
+Initializes the data structures used by ut_crc32(). Does not do any
+allocations; calling it twice would be harmless but pointless. */
+UNIV_INTERN
+void
+ut_crc32_init()
+/*===========*/
+{
+#if defined(__GNUC__) && defined(__x86_64__)
+ ib_uint32_t vend[3];
+ ib_uint32_t model;
+ ib_uint32_t family;
+ ib_uint32_t stepping;
+ ib_uint32_t features_ecx;
+ ib_uint32_t features_edx;
+
+ ut_cpuid(vend, &model, &family, &stepping,
+ &features_ecx, &features_edx);
+
+ /* Valgrind does not understand the CRC32 instructions:
+
+ vex amd64->IR: unhandled instruction bytes: 0xF2 0x48 0xF 0x38 0xF0 0xA
+ valgrind: Unrecognised instruction at address 0xad3db5.
+ Your program just tried to execute an instruction that Valgrind
+ did not recognise. There are two possible reasons for this.
+ 1. Your program has a bug and erroneously jumped to a non-code
+ location. If you are running Memcheck and you just saw a
+ warning about a bad jump, it's probably your program's fault.
+ 2. The instruction is legitimate but Valgrind doesn't handle it,
+ i.e. it's Valgrind's fault. If you think this is the case or
+ you are not sure, please let us know and we'll try to fix it.
+ Either way, Valgrind will now raise a SIGILL signal which will
+ probably kill your program.
+
+ */
+#ifndef UNIV_DEBUG_VALGRIND
+ ut_crc32_sse2_enabled = (features_ecx >> 20) & 1;
+#endif /* UNIV_DEBUG_VALGRIND */
+
+#endif /* defined(__GNUC__) && defined(__x86_64__) */
+
+ if (ut_crc32_sse2_enabled) {
+ ut_crc32 = ut_crc32_sse42;
+ } else {
+ ut_crc32_slice8_table_init();
+ ut_crc32 = ut_crc32_slice8;
+ }
+}
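+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal usage sketch (assumes ut_crc32_init() has already run):
+callers go through the ut_crc32 function pointer rather than calling
+ut_crc32_sse42() or ut_crc32_slice8() directly. */
+ib_uint32_t
+test_ut_crc32_dispatch(void)
+{
+	const byte	data[] = "123456789";
+
+	/* Both implementations return the same CRC-32C value; the
+	widely published check value for "123456789" is 0xE3069283. */
+	return(ut_crc32(data, sizeof(data) - 1));
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */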
diff --git a/storage/innobase/ut/ut0dbg.cc b/storage/innobase/ut/ut0dbg.cc
new file mode 100644
index 00000000000..a1cad144da4
--- /dev/null
+++ b/storage/innobase/ut/ut0dbg.cc
@@ -0,0 +1,139 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file ut/ut0dbg.cc
+Debug utilities for Innobase.
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#include "univ.i"
+#include "ut0dbg.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+#endif /* !UNIV_HOTBACKUP */
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+#else
+/** This is used to eliminate compiler warnings */
+UNIV_INTERN ulint ut_dbg_zero = 0;
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion (optional) */
+ const char* file, /*!< in: source file containing the assertion */
+ ulint line) /*!< in: line number of the assertion */
+{
+ ut_print_timestamp(stderr);
+#ifdef UNIV_HOTBACKUP
+ fprintf(stderr, " InnoDB: Assertion failure in file %s line %lu\n",
+ file, line);
+#else /* UNIV_HOTBACKUP */
+ fprintf(stderr,
+ " InnoDB: Assertion failure in thread %lu"
+ " in file %s line %lu\n",
+ os_thread_pf(os_thread_get_curr_id()),
+ innobase_basename(file), line);
+#endif /* UNIV_HOTBACKUP */
+ if (expr) {
+ fprintf(stderr,
+ "InnoDB: Failing assertion: %s\n", expr);
+ }
+
+ fputs("InnoDB: We intentionally generate a memory trap.\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com.\n"
+ "InnoDB: If you get repeated assertion failures"
+ " or crashes, even\n"
+ "InnoDB: immediately after the mysqld startup, there may be\n"
+ "InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
+ "InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <unistd.h>
+
+#ifndef timersub
+#define timersub(a, b, r) \
+ do { \
+ (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((r)->tv_usec < 0) { \
+ (r)->tv_sec--; \
+ (r)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif /* timersub */
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+ speedo_t* speedo) /*!< out: speedo */
+{
+ gettimeofday(&speedo->tv, NULL);
+
+ getrusage(RUSAGE_SELF, &speedo->ru);
+}
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+ const speedo_t* speedo) /*!< in: speedo */
+{
+ struct rusage ru_now;
+ struct timeval tv_now;
+ struct timeval tv_diff;
+
+ getrusage(RUSAGE_SELF, &ru_now);
+
+ gettimeofday(&tv_now, NULL);
+
+#define PRINT_TIMEVAL(prefix, tvp) \
+ fprintf(stderr, "%s% 5ld.%06ld sec\n", \
+ prefix, (tvp)->tv_sec, (tvp)->tv_usec)
+
+ timersub(&tv_now, &speedo->tv, &tv_diff);
+ PRINT_TIMEVAL("real", &tv_diff);
+
+ timersub(&ru_now.ru_utime, &speedo->ru.ru_utime, &tv_diff);
+ PRINT_TIMEVAL("user", &tv_diff);
+
+ timersub(&ru_now.ru_stime, &speedo->ru.ru_stime, &tv_diff);
+ PRINT_TIMEVAL("sys ", &tv_diff);
+}
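+
+/* A minimal usage sketch: bracket the code to be measured between
+speedo_reset() and speedo_show(). */
+void
+test_speedo(void)
+{
+	speedo_t	speedo;
+	ulint		i;
+	volatile ulint	sink = 0;
+
+	speedo_reset(&speedo);
+
+	for (i = 0; i < 10000000; i++) {
+		sink += i;
+	}
+
+	/* Prints the real/user/sys time spent since the reset. */
+	speedo_show(&speedo);
+}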
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/innobase/ut/ut0list.cc b/storage/innobase/ut/ut0list.cc
new file mode 100644
index 00000000000..f906061d185
--- /dev/null
+++ b/storage/innobase/ut/ut0list.cc
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0list.cc
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0list.h"
+#ifdef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+/****************************************************************//**
+Create a new list.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void)
+/*=================*/
+{
+ ib_list_t* list;
+
+ list = static_cast<ib_list_t*>(mem_alloc(sizeof(*list)));
+
+ list->first = NULL;
+ list->last = NULL;
+ list->is_heap_list = FALSE;
+
+ return(list);
+}
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ ib_list_t* list;
+
+ list = static_cast<ib_list_t*>(mem_heap_alloc(heap, sizeof(*list)));
+
+ list->first = NULL;
+ list->last = NULL;
+ list->is_heap_list = TRUE;
+
+ return(list);
+}
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list) /*!< in: list */
+{
+ ut_a(!list->is_heap_list);
+
+ /* We don't check that the list is empty because it's entirely valid
+ to e.g. have all the nodes allocated from a single heap that is then
+ freed after the list itself is freed. */
+
+ mem_free(list);
+}
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ return(ib_list_add_after(list, ib_list_get_first(list), data, heap));
+}
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ return(ib_list_add_after(list, ib_list_get_last(list), data, heap));
+}
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ ib_list_node_t* node;
+
+ node = static_cast<ib_list_node_t*>(
+ mem_heap_alloc(heap, sizeof(*node)));
+
+ node->data = data;
+
+ if (!list->first) {
+ /* Empty list. */
+
+ ut_a(!prev_node);
+
+ node->prev = NULL;
+ node->next = NULL;
+
+ list->first = node;
+ list->last = node;
+ } else if (!prev_node) {
+ /* Start of list. */
+
+ node->prev = NULL;
+ node->next = list->first;
+
+ list->first->prev = node;
+
+ list->first = node;
+ } else {
+ /* Middle or end of list. */
+
+ node->prev = prev_node;
+ node->next = prev_node->next;
+
+ prev_node->next = node;
+
+ if (node->next) {
+ node->next->prev = node;
+ } else {
+ list->last = node;
+ }
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node) /*!< in: node to remove */
+{
+ if (node->prev) {
+ node->prev->next = node->next;
+ } else {
+ /* First item in list. */
+
+ ut_ad(list->first == node);
+
+ list->first = node->next;
+ }
+
+ if (node->next) {
+ node->next->prev = node->prev;
+ } else {
+ /* Last item in list. */
+
+ ut_ad(list->last == node);
+
+ list->last = node->prev;
+ }
+
+ node->prev = node->next = NULL;
+}
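+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal usage sketch: list nodes are allocated from a
+caller-supplied heap, so the heap must outlive the list. This assumes
+the usual InnoDB heap routines mem_heap_create()/mem_heap_free() from
+mem0mem.h. */
+void
+test_ib_list(void)
+{
+	mem_heap_t*	heap = mem_heap_create(256);
+	ib_list_t*	list = ib_list_create();
+	int		value = 42;
+	ib_list_node_t*	node;
+
+	node = ib_list_add_last(list, &value, heap);
+	ut_a(ib_list_get_first(list) == node);
+
+	ib_list_remove(list, node);
+
+	ib_list_free(list);	/* frees only the list header */
+	mem_heap_free(heap);	/* frees the nodes */
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */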
diff --git a/storage/innobase/ut/ut0mem.cc b/storage/innobase/ut/ut0mem.cc
new file mode 100644
index 00000000000..2bb5d9ce332
--- /dev/null
+++ b/storage/innobase/ut/ut0mem.cc
@@ -0,0 +1,609 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0mem.cc
+Memory primitives
+
+Created 5/11/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "os0thread.h"
+# include "srv0srv.h"
+
+#include <stdlib.h>
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+UNIV_INTERN ulint ut_total_allocated_memory = 0;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+UNIV_INTERN os_fast_mutex_t ut_list_mutex;
+
+#ifdef UNIV_PFS_MUTEX
+/* Key to register server_mutex with performance schema */
+UNIV_INTERN mysql_pfs_key_t ut_list_mutex_key;
+#endif
+
+/** Dynamically allocated memory block */
+struct ut_mem_block_t{
+ UT_LIST_NODE_T(ut_mem_block_t) mem_block_list;
+ /*!< mem block list node */
+ ulint size; /*!< size of allocated memory */
+ ulint magic_n;/*!< magic number (UT_MEM_MAGIC_N) */
+};
+
+/** The value of ut_mem_block_t::magic_n. Used in detecting
+memory corruption. */
+#define UT_MEM_MAGIC_N 1601650166
+
+/** List of all memory blocks allocated from the operating system
+with malloc. Protected by ut_list_mutex. */
+static UT_LIST_BASE_NODE_T(ut_mem_block_t) ut_mem_block_list;
+
+/** Flag: has ut_mem_block_list been initialized? */
+static ibool ut_mem_block_list_inited = FALSE;
+
+/** A dummy pointer for generating a null pointer exception in
+ut_malloc_low() */
+static ulint* ut_mem_null_ptr = NULL;
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void)
+/*=============*/
+{
+ ut_a(!ut_mem_block_list_inited);
+ os_fast_mutex_init(ut_list_mutex_key, &ut_list_mutex);
+ UT_LIST_INIT(ut_mem_block_list);
+ ut_mem_block_list_inited = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Allocates memory.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+ ulint n, /*!< in: number of bytes to allocate */
+ ibool assert_on_error)/*!< in: if TRUE, we crash mysqld if the
+ memory cannot be allocated */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint retry_count;
+ void* ret;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ ret = malloc(n);
+ ut_a(ret || !assert_on_error);
+
+ return(ret);
+ }
+
+ ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
+ ut_a(ut_mem_block_list_inited);
+
+ retry_count = 0;
+retry:
+ os_fast_mutex_lock(&ut_list_mutex);
+
+ ret = malloc(n + sizeof(ut_mem_block_t));
+
+ if (ret == NULL && retry_count < 60) {
+ if (retry_count == 0) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: cannot allocate"
+ " %lu bytes of\n"
+ "InnoDB: memory with malloc!"
+ " Total allocated memory\n"
+ "InnoDB: by InnoDB %lu bytes."
+ " Operating system errno: %lu\n"
+ "InnoDB: Check if you should"
+ " increase the swap file or\n"
+ "InnoDB: ulimits of your operating system.\n"
+ "InnoDB: On FreeBSD check you"
+ " have compiled the OS with\n"
+ "InnoDB: a big enough maximum process size.\n"
+ "InnoDB: Note that in most 32-bit"
+ " computers the process\n"
+ "InnoDB: memory space is limited"
+ " to 2 GB or 4 GB.\n"
+ "InnoDB: We keep retrying"
+ " the allocation for 60 seconds...\n",
+ (ulong) n, (ulong) ut_total_allocated_memory,
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif
+ );
+ }
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ /* Sleep for a second and retry the allocation; maybe this is
+ just a temporary shortage of memory */
+
+ os_thread_sleep(1000000);
+
+ retry_count++;
+
+ goto retry;
+ }
+
+ if (ret == NULL) {
+ /* Flush stderr to make more probable that the error
+ message gets in the error file before we generate a seg
+ fault */
+
+ fflush(stderr);
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ /* Make an intentional seg fault so that we get a stack
+ trace */
+ if (assert_on_error) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: We now intentionally"
+ " generate a seg fault so that\n"
+ "InnoDB: on Linux we get a stack trace.\n");
+
+ if (*ut_mem_null_ptr) ut_mem_null_ptr = 0;
+ } else {
+ return(NULL);
+ }
+ }
+
+ UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));
+
+ ((ut_mem_block_t*) ret)->size = n + sizeof(ut_mem_block_t);
+ ((ut_mem_block_t*) ret)->magic_n = UT_MEM_MAGIC_N;
+
+ ut_total_allocated_memory += n + sizeof(ut_mem_block_t);
+
+ UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
+ ((ut_mem_block_t*) ret));
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ return((void*)((byte*) ret + sizeof(ut_mem_block_t)));
+#else /* !UNIV_HOTBACKUP */
+ void* ret = malloc(n);
+ ut_a(ret || !assert_on_error);
+
+ return(ret);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is
+a no-op. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+ void* ptr) /*!< in, own: memory block, can be NULL */
+{
+#ifndef UNIV_HOTBACKUP
+ ut_mem_block_t* block;
+
+ if (ptr == NULL) {
+ return;
+ } else if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ free(ptr);
+ return;
+ }
+
+ block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
+
+ os_fast_mutex_lock(&ut_list_mutex);
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+ ut_a(ut_total_allocated_memory >= block->size);
+
+ ut_total_allocated_memory -= block->size;
+
+ UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block);
+ free(block);
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+#else /* !UNIV_HOTBACKUP */
+ free(ptr);
+#endif /* !UNIV_HOTBACKUP */
+}
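+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal sketch: when srv_use_sys_malloc is not set, every block
+returned by ut_malloc() is preceded by a hidden ut_mem_block_t header,
+so only pointers that came from ut_malloc()/ut_malloc_low() may be
+passed to ut_free(). */
+void
+test_ut_malloc_free(void)
+{
+	byte*	buf = static_cast<byte*>(ut_malloc(100));
+
+	buf[0] = 0;	/* ... use buf[0] .. buf[99] ... */
+
+	/* ut_free() locates the header at buf - sizeof(ut_mem_block_t)
+	and validates its magic number before freeing. */
+	ut_free(buf);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */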
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.cc. Do not use this
+function elsewhere; the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+ realloc() changes the size of the memory block pointed to
+ by ptr to size bytes. The contents will be unchanged to
+ the minimum of the old and new sizes; newly allocated mem-
+ ory will be uninitialized. If ptr is NULL, the call is
+ equivalent to malloc(size); if size is equal to zero, the
+ call is equivalent to free(ptr). Unless ptr is NULL, it
+ must have been returned by an earlier call to malloc(),
+ calloc() or realloc().
+
+RETURN VALUE
+ realloc() returns a pointer to the newly allocated memory,
+ which is suitably aligned for any kind of variable and may
+ be different from ptr, or NULL if the request fails. If
+ size was equal to 0, either NULL or a pointer suitable to
+ be passed to free() is returned. If realloc() fails the
+ original block is left untouched - it is not freed or
+ moved.
+@return own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+ void* ptr, /*!< in: pointer to old block or NULL */
+ ulint size) /*!< in: desired size */
+{
+ ut_mem_block_t* block;
+ ulint old_size;
+ ulint min_size;
+ void* new_ptr;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ return(realloc(ptr, size));
+ }
+
+ if (ptr == NULL) {
+
+ return(ut_malloc(size));
+ }
+
+ if (size == 0) {
+ ut_free(ptr);
+
+ return(NULL);
+ }
+
+ block = (ut_mem_block_t*)((byte*) ptr - sizeof(ut_mem_block_t));
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+
+ old_size = block->size - sizeof(ut_mem_block_t);
+
+ if (size < old_size) {
+ min_size = size;
+ } else {
+ min_size = old_size;
+ }
+
+ new_ptr = ut_malloc(size);
+
+ if (new_ptr == NULL) {
+
+ return(NULL);
+ }
+
+ /* Copy the old data from ptr */
+ ut_memcpy(new_ptr, ptr, min_size);
+
+ ut_free(ptr);
+
+ return(new_ptr);
+}
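+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal sketch: ut_realloc() follows the realloc() contract quoted
+above, so a grow-and-copy sequence looks exactly like its libc
+counterpart (error handling omitted for brevity). */
+void
+test_ut_realloc(void)
+{
+	char*	buf = static_cast<char*>(ut_malloc(16));
+
+	ut_strlcpy(buf, "hello", 16);
+
+	/* Grow the buffer; the old contents are copied over. */
+	buf = static_cast<char*>(ut_realloc(buf, 64));
+
+	ut_free(buf);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */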
+
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void)
+/*=================*/
+{
+ ut_mem_block_t* block;
+
+ ut_a(ut_mem_block_list_inited);
+ ut_mem_block_list_inited = FALSE;
+ os_fast_mutex_free(&ut_list_mutex);
+
+ while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) {
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+ ut_a(ut_total_allocated_memory >= block->size);
+
+ ut_total_allocated_memory -= block->size;
+
+ UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block);
+ free(block);
+ }
+
+ if (ut_total_allocated_memory != 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: after shutdown"
+ " total allocated memory is %lu\n",
+ (ulong) ut_total_allocated_memory);
+ }
+
+ ut_mem_block_list_inited = FALSE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size) /*!< in: size of destination buffer */
+{
+ ulint src_size = strlen(src);
+
+ if (size != 0) {
+ ulint n = ut_min(src_size, size - 1);
+
+ memcpy(dst, src, n);
+ dst[n] = '\0';
+ }
+
+ return(src_size);
+}
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size) /*!< in: size of destination buffer */
+{
+ ulint src_size = strlen(src);
+
+ if (size != 0) {
+ ulint n = ut_min(src_size, size - 1);
+
+ memcpy(dst, src + src_size - n, n + 1);
+ }
+
+ return(src_size);
+}
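+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal sketch: as with BSD strlcpy(), the return value is
+strlen(src), so truncation is detected by comparing it with the buffer
+size; the _rev variant keeps the tail of the string instead of the
+head. */
+void
+test_ut_strlcpy(void)
+{
+	char	buf[8];
+
+	if (ut_strlcpy(buf, "a very long string", sizeof(buf))
+	    >= sizeof(buf)) {
+		/* Truncated: buf now holds "a very " (7 chars + NUL). */
+	}
+
+	ut_strlcpy_rev(buf, "a very long string", sizeof(buf));
+	/* buf now holds " string", the last 7 characters. */
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */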
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+ const char* s1, /*!< in: string to search in */
+ const char* s2) /*!< in: string to search for */
+{
+ ulint count = 0;
+ ulint len = strlen(s2);
+
+ if (len == 0) {
+
+ return(0);
+ }
+
+ for (;;) {
+ s1 = strstr(s1, s2);
+
+ if (!s1) {
+
+ break;
+ }
+
+ count++;
+ s1 += len;
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Concatenate 3 strings.
+@return own: concatenated string, must be freed with mem_free() */
+
+char*
+ut_str3cat(
+/*=======*/
+ /* out, own: concatenated string, must be
+ freed with mem_free() */
+ const char* s1, /* in: string 1 */
+ const char* s2, /* in: string 2 */
+ const char* s3) /* in: string 3 */
+{
+ char* s;
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+ ulint s3_len = strlen(s3);
+
+ s = static_cast<char*>(mem_alloc(s1_len + s2_len + s3_len + 1));
+
+ memcpy(s, s1, s1_len);
+ memcpy(s + s1_len, s2, s2_len);
+ memcpy(s + s1_len + s2_len, s3, s3_len);
+
+ s[s1_len + s2_len + s3_len] = '\0';
+
+ return(s);
+}
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+ const char* str, /*!< in: string to operate on */
+ const char* s1, /*!< in: string to replace */
+ const char* s2) /*!< in: string to replace s1 with */
+{
+ char* new_str;
+ char* ptr;
+ const char* str_end;
+ ulint str_len = strlen(str);
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+ ulint count = 0;
+ int len_delta = (int) s2_len - (int) s1_len;
+
+ str_end = str + str_len;
+
+ if (len_delta <= 0) {
+ len_delta = 0;
+ } else {
+ count = ut_strcount(str, s1);
+ }
+
+ new_str = static_cast<char*>(
+ mem_alloc(str_len + count * len_delta + 1));
+
+ ptr = new_str;
+
+ while (str) {
+ const char* next = strstr(str, s1);
+
+ if (!next) {
+ next = str_end;
+ }
+
+ memcpy(ptr, str, next - str);
+ ptr += next - str;
+
+ if (next == str_end) {
+
+ break;
+ }
+
+ memcpy(ptr, s2, s2_len);
+ ptr += s2_len;
+
+ str = next + s1_len;
+ }
+
+ *ptr = '\0';
+
+ return(new_str);
+}
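+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* A minimal sketch: ut_strcount() sizes the buffer that
+ut_strreplace() allocates; the result is mem_alloc()ed and must be
+released with mem_free(). */
+void
+test_ut_strreplace(void)
+{
+	char*	s;
+
+	ut_a(ut_strcount("a/b/c", "/") == 2);
+
+	s = ut_strreplace("a/b/c", "/", "::");
+	ut_a(strcmp(s, "a::b::c") == 0);
+
+	mem_free(s);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */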
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ut_str_sql_format()
+{
+ char buf[128];
+ ulint ret;
+
+#define CALL_AND_TEST(str, str_len, buf, buf_size, ret_expected, buf_expected)\
+ do {\
+ ibool ok = TRUE;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ fprintf(stderr, "TESTING \"%s\", %lu, %lu\n",\
+ str, (ulint) str_len, (ulint) buf_size);\
+ ret = ut_str_sql_format(str, str_len, buf, buf_size);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\"\n\n",\
+ (ulint) ret, buf);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+ CALL_AND_TEST("abcd", 4, buf, 0, 0, "xxxxxxxxxx");
+
+ CALL_AND_TEST("abcd", 4, buf, 1, 1, "");
+
+ CALL_AND_TEST("abcd", 4, buf, 2, 1, "");
+
+ CALL_AND_TEST("abcd", 0, buf, 3, 3, "''");
+ CALL_AND_TEST("abcd", 1, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 2, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 3, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 4, buf, 3, 1, "");
+
+ CALL_AND_TEST("abcd", 0, buf, 4, 3, "''");
+ CALL_AND_TEST("abcd", 1, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 2, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 3, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 4, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcde", 5, buf, 4, 4, "'a'");
+ CALL_AND_TEST("'", 1, buf, 4, 3, "''");
+ CALL_AND_TEST("''", 2, buf, 4, 3, "''");
+ CALL_AND_TEST("a'", 2, buf, 4, 4, "'a'");
+ CALL_AND_TEST("'a", 2, buf, 4, 3, "''");
+ CALL_AND_TEST("ab", 2, buf, 4, 4, "'a'");
+
+ CALL_AND_TEST("abcdef", 0, buf, 5, 3, "''");
+ CALL_AND_TEST("abcdef", 1, buf, 5, 4, "'a'");
+ CALL_AND_TEST("abcdef", 2, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 3, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 4, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 5, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 6, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("'", 1, buf, 5, 5, "''''");
+ CALL_AND_TEST("''", 2, buf, 5, 5, "''''");
+ CALL_AND_TEST("a'", 2, buf, 5, 4, "'a'");
+ CALL_AND_TEST("'a", 2, buf, 5, 5, "''''");
+ CALL_AND_TEST("ab", 2, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abc", 3, buf, 5, 5, "'ab'");
+
+ CALL_AND_TEST("ab", 2, buf, 6, 5, "'ab'");
+
+ CALL_AND_TEST("a'b'c", 5, buf, 32, 10, "'a''b''c'");
+ CALL_AND_TEST("a'b'c'", 6, buf, 32, 12, "'a''b''c'''");
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/ut/ut0rbt.cc b/storage/innobase/ut/ut0rbt.cc
new file mode 100644
index 00000000000..e93844af600
--- /dev/null
+++ b/storage/innobase/ut/ut0rbt.cc
@@ -0,0 +1,1328 @@
+/***************************************************************************//**
+
+Copyright (c) 2007, 2010, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/********************************************************************//**
+Red-Black tree implementation
+
+(c) 2007 Oracle/Innobase Oy
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/**********************************************************************//**
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+ 1. Every node is either red or black.
+ 2. Every leaf (NULL - in our case tree->nil) is black.
+ 3. If a node is red, then both its children are black.
+ 4. Every simple path from a node to a descendant leaf contains the
+ same number of black nodes.
+
+ from (3) above, the implication is that on any path from the root
+ to a leaf, red nodes must not be adjacent.
+
+ However, any number of black nodes may appear in a sequence.
+ */
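+
+/* For example (an illustration, not part of the original text), the
+following tree satisfies all four properties: the root is black, no
+red node has a red child, and every path from the root to a nil leaf
+passes through the same number of black nodes:
+
+	         7(B)
+	        /    \
+	    3(R)      18(B)
+	    /  \
+	 1(B)   5(B)
+*/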
+
+#if defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t) (t->root->left)
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
+/**********************************************************************//**
+Print out the sub-tree recursively. */
+static
+void
+rbt_print_subtree(
+/*==============*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ const ib_rbt_node_t* node, /*!< in: node to print */
+ ib_rbt_print_node print) /*!< in: print key function */
+{
+ if (node != tree->nil) {
+ print(node);
+ rbt_print_subtree(tree, node->left, print);
+ rbt_print_subtree(tree, node->right, print);
+ }
+}
+
+/**********************************************************************//**
+Verify that the keys are in order.
+@return TRUE if OK, FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+	const ib_rbt_t*	tree)		/*!< in: tree to verify */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* prev = NULL;
+
+ /* Iterate over all the nodes, comparing each node with the prev */
+ for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
+
+ if (prev) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, prev->value,
+ node->value);
+ } else {
+ result = tree->compare(
+ prev->value, node->value);
+ }
+
+ if (result >= 0) {
+ return(FALSE);
+ }
+ }
+
+ prev = node;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Check that every path from the root to the leaves has the same count.
+Count is expressed in the number of black nodes.
+@return 0 on failure else black height of the subtree */
+static
+ulint
+rbt_count_black_nodes(
+/*==================*/
+ const ib_rbt_t* tree, /*!< in: tree to verify */
+ const ib_rbt_node_t* node) /*!< in: start of sub-tree */
+{
+ ulint result;
+
+ if (node != tree->nil) {
+ ulint left_height = rbt_count_black_nodes(tree, node->left);
+
+ ulint right_height = rbt_count_black_nodes(tree, node->right);
+
+ if (left_height == 0
+ || right_height == 0
+ || left_height != right_height) {
+
+ result = 0;
+ } else if (node->color == IB_RBT_RED) {
+
+ /* Case 3 */
+ if (node->left->color != IB_RBT_BLACK
+ || node->right->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+ result = left_height;
+ }
+ /* Check if it's anything other than RED or BLACK. */
+ } else if (node->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+
+ result = right_height + 1;
+ }
+ } else {
+ result = 1;
+ }
+
+ return(result);
+}
+
+/**********************************************************************//**
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child its parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of the tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* right = node->right;
+
+ node->right = right->left;
+
+ if (right->left != nil) {
+ right->left->parent = node;
+ }
+
+ /* Right's new parent was node's parent. */
+ right->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->left) {
+ /* Node was on the left of its parent. */
+ node->parent->left = right;
+ } else {
+ /* Node must have been on the right. */
+ node->parent->right = right;
+ }
+
+ /* Finally, put node on right's left. */
+ right->left = node;
+ node->parent = right;
+}
+
+/**********************************************************************//**
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This also makes node's left child its parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* left = node->left;
+
+ node->left = left->right;
+
+ if (left->right != nil) {
+ left->right->parent = node;
+ }
+
+ /* Left's new parent was node's parent. */
+ left->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->right) {
+		/* Node was on the right of its parent. */
+ node->parent->right = left;
+ } else {
+ /* Node must have been on the left. */
+ node->parent->left = left;
+ }
+
+ /* Finally, put node on left's right. */
+ left->right = node;
+ node->parent = left;
+}
+
+/**********************************************************************//**
+Append a node to the tree. */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+ const ib_rbt_t* tree,
+ ib_rbt_bound_t* parent,
+ ib_rbt_node_t* node)
+{
+ /* Cast away the const. */
+ ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last;
+
+ if (last == tree->root || parent->result < 0) {
+ last->left = node;
+ } else {
+ /* FIXME: We don't handle duplicates (yet)! */
+ ut_a(parent->result != 0);
+
+ last->right = node;
+ }
+
+ node->parent = last;
+
+ return(node);
+}
+
+/**********************************************************************//**
+Generic binary tree insert */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+ ib_rbt_t* tree,
+ const void* key,
+ ib_rbt_node_t* node)
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ parent.result = 0;
+ parent.last = tree->root;
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+
+ parent.last = current;
+
+ if (tree->cmp_arg) {
+ parent.result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent.result = tree->compare(key, current->value);
+ }
+
+ if (parent.result < 0) {
+ current = current->left;
+ } else {
+ current = current->right;
+ }
+ }
+
+ ut_a(current == tree->nil);
+
+ rbt_tree_add_child(tree, &parent, node);
+
+ return(node);
+}
+
+/**********************************************************************//**
+Balance a tree after inserting a node. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+ const ib_rbt_t* tree, /*!< in: tree to balance */
+ ib_rbt_node_t* node) /*!< in: node that was inserted */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* parent = node->parent;
+
+ /* Restore the red-black property. */
+ node->color = IB_RBT_RED;
+
+ while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+ ib_rbt_node_t* grand_parent = parent->parent;
+
+ if (parent == grand_parent->left) {
+ ib_rbt_node_t* uncle = grand_parent->right;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->right) {
+ /* Right is a black node and node is
+ to the right, case 2 - move node
+ up and rotate. */
+ node = parent;
+ rbt_rotate_left(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_right(nil, grand_parent);
+ }
+
+ } else {
+ ib_rbt_node_t* uncle = grand_parent->left;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->left) {
+ /* Left is a black node and node is to
+					the left, case 2 - move node up and
+ rotate. */
+ node = parent;
+ rbt_rotate_right(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_left(nil, grand_parent);
+ }
+ }
+
+ parent = node->parent;
+ }
+
+ /* Color the root black. */
+ ROOT(tree)->color = IB_RBT_BLACK;
+}
+
+/**********************************************************************//**
+Find the given node's successor.
+@return successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_next() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* next = current->right;
+
+ /* Is there a sub-tree to the right that we can follow. */
+ if (next != nil) {
+
+ /* Follow the left most links of the current right child. */
+ while (next->left != nil) {
+ next = next->left;
+ }
+
+ } else { /* We will have to go up the tree to find the successor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ next = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && next == parent->right) {
+ next = parent;
+ parent = next->parent;
+ }
+
+ next = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(next);
+}
+
+/**********************************************************************//**
+Find the given node's predecessor.
+@return predecessor node or NULL if no predecessor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_prev() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* prev = current->left;
+
+ /* Is there a sub-tree to the left that we can follow. */
+ if (prev != nil) {
+
+ /* Follow the right most links of the current left child. */
+ while (prev->right != nil) {
+ prev = prev->right;
+ }
+
+	} else { /* We will have to go up the tree to find the predecessor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ prev = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && prev == parent->left) {
+ prev = parent;
+ parent = prev->parent;
+ }
+
+ prev = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(prev);
+}
+
+/**********************************************************************//**
+Replace node with child. After applying the transformations, eject
+becomes an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+ ib_rbt_node_t* eject, /*!< in: node to eject */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ /* Update the to be ejected node's parent's child pointers. */
+ if (eject->parent->left == eject) {
+ eject->parent->left = node;
+ } else if (eject->parent->right == eject) {
+ eject->parent->right = node;
+ } else {
+ ut_a(0);
+ }
+ /* eject is now an orphan but otherwise its pointers
+ and color are left intact. */
+
+ node->parent = eject->parent;
+}
+
+/**********************************************************************//**
+Replace a node with another node. */
+static
+void
+rbt_replace_node(
+/*=============*/
+ ib_rbt_node_t* replace, /*!< in: node to replace */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ ib_rbt_color_t color = node->color;
+
+ /* Update the node pointers. */
+ node->left = replace->left;
+ node->right = replace->right;
+
+ /* Update the child node pointers. */
+ node->left->parent = node;
+ node->right->parent = node;
+
+ /* Make the parent of replace point to node. */
+ rbt_eject_node(replace, node);
+
+ /* Swap the colors. */
+ node->color = replace->color;
+ replace->color = color;
+}
+
+/**********************************************************************//**
+Detach node from the tree, replacing it with one of its children.
+@return the child node that now occupies the position of the detached node */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to detach */
+{
+ ib_rbt_node_t* child;
+ const ib_rbt_node_t* nil = tree->nil;
+
+ if (node->left != nil && node->right != nil) {
+ /* Case where the node to be deleted has two children. */
+ ib_rbt_node_t* successor = rbt_find_successor(tree, node);
+
+ ut_a(successor != nil);
+ ut_a(successor->parent != nil);
+ ut_a(successor->left == nil);
+
+ child = successor->right;
+
+ /* Remove the successor node and replace with its child. */
+ rbt_eject_node(successor, child);
+
+ /* Replace the node to delete with its successor node. */
+ rbt_replace_node(node, successor);
+ } else {
+ ut_a(node->left == nil || node->right == nil);
+
+ child = (node->left != nil) ? node->left : node->right;
+
+		/* Replace the node to delete with one of its children. */
+ rbt_eject_node(node, child);
+ }
+
+ /* Reset the node links. */
+ node->parent = node->right = node->left = tree->nil;
+
+ return(child);
+}
+
+/**********************************************************************//**
+Rebalance the right sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+
+ sibling = parent->right;
+
+ ut_a(sibling != nil);
+ }
+
+	/* The change above may have violated case 3. */
+ if (sibling->left->color == IB_RBT_BLACK
+ && sibling->right->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->right->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->left->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->left->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, sibling);
+
+ sibling = parent->right;
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->right->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Rebalance the left sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling) /*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+	/* The change above may have violated case 3. */
+ if (sibling->right->color == IB_RBT_BLACK
+ && sibling->left->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->left->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->right->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->right->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, sibling);
+
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->left->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Delete the node and rebalance the tree if necessary */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to remove */
+{
+ /* Detach node and get the node that will be used
+ as rebalance start. */
+ ib_rbt_node_t* child = rbt_detach_node(tree, node);
+
+ if (node->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* last = child;
+
+ ROOT(tree)->color = IB_RBT_RED;
+
+ while (child && child->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* parent = child->parent;
+
+			/* Did the deletion cause an imbalance in the
+			parent's left sub-tree? */
+ if (parent->left == child) {
+
+ child = rbt_balance_right(
+ tree->nil, parent, parent->right);
+
+ } else if (parent->right == child) {
+
+ child = rbt_balance_left(
+ tree->nil, parent, parent->left);
+
+ } else {
+ ut_error;
+ }
+
+ if (child) {
+ last = child;
+ }
+ }
+
+ ut_a(last);
+
+ last->color = IB_RBT_BLACK;
+ ROOT(tree)->color = IB_RBT_BLACK;
+ }
+
+ /* Note that we have removed a node from the tree. */
+ --tree->n_nodes;
+}
+
+/**********************************************************************//**
+Recursively free the nodes. */
+static
+void
+rbt_free_node(
+/*==========*/
+ ib_rbt_node_t* node, /*!< in: node to free */
+ ib_rbt_node_t* nil) /*!< in: rb tree nil node */
+{
+ if (node != nil) {
+ rbt_free_node(node->left, nil);
+ rbt_free_node(node->right, nil);
+
+ ut_free(node);
+ }
+}
+
+/**********************************************************************//**
+Free all the nodes and free the tree. */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree) /*!< in: rb tree to free */
+{
+ rbt_free_node(tree->root, tree->nil);
+ ut_free(tree->nil);
+ ut_free(tree);
+}
+
+/**********************************************************************//**
+Create an instance of a red black tree, whose comparison function takes
+an argument
+@return an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create_arg_cmp(
+/*===============*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_arg_compare
+ compare, /*!< in: fn to compare items */
+ void* cmp_arg) /*!< in: compare fn arg */
+{
+ ib_rbt_t* tree;
+
+ ut_a(cmp_arg);
+
+ tree = rbt_create(sizeof_value, NULL);
+ tree->cmp_arg = cmp_arg;
+ tree->compare_with_arg = compare;
+
+ return(tree);
+}
+
+/**********************************************************************//**
+Create an instance of a red black tree.
+@return an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_t* tree;
+ ib_rbt_node_t* node;
+
+ tree = (ib_rbt_t*) ut_malloc(sizeof(*tree));
+ memset(tree, 0, sizeof(*tree));
+
+ tree->sizeof_value = sizeof_value;
+
+ /* Create the sentinel (NIL) node. */
+ node = tree->nil = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = node;
+
+ /* Create the "fake" root, the real root node will be the
+ left child of this node. */
+ node = tree->root = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = tree->nil;
+
+ tree->compare = compare;
+
+ return(tree);
+}
+
+/**********************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value) /*!< in: value of key, this value
+ is copied to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data. */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* Insert in the tree in the usual way. */
+ rbt_tree_insert(tree, key, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+ return(node);
+}
+
+/**********************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: bounds */
+ const void* value) /*!< in: this value is copied
+ to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* If tree is empty */
+ if (parent->last == NULL) {
+ parent->last = tree->root;
+ }
+
+ /* Append the node, the hope here is that the caller knows
+ what s/he is doing. */
+ rbt_tree_add_child(tree, parent, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(tree));
+#endif
+ return(node);
+}
+
+/**********************************************************************//**
+Find a matching node in the rb tree.
+@return NULL if not found else the node where key was found */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to use for search */
+{
+ const ib_rbt_node_t* current = ROOT(tree);
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ result = tree->compare(key, current->value);
+ }
+
+ if (result < 0) {
+ current = current->left;
+ } else if (result > 0) {
+ current = current->right;
+ } else {
+ break;
+ }
+ }
+
+ return(current != tree->nil ? current : NULL);
+}
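+
+#if defined(IB_RBT_TESTING)
+
+/* A minimal usage sketch: the value is copied into the node by
+rbt_insert(), and the compare function receives pointers to the stored
+values. */
+static
+int
+test_cmp_int(const void* p1, const void* p2)
+{
+	int	i1 = *(const int*) p1;
+	int	i2 = *(const int*) p2;
+
+	return(i1 < i2 ? -1 : (i1 > i2 ? 1 : 0));
+}
+
+void
+test_rbt_int(void)
+{
+	ib_rbt_t*	tree = rbt_create(sizeof(int), test_cmp_int);
+	int		key = 42;
+
+	rbt_insert(tree, &key, &key);	/* value is memcpy()ed */
+
+	ut_a(rbt_lookup(tree, &key) != NULL);
+	ut_a(rbt_delete(tree, &key));
+
+	rbt_free(tree);
+}
+
+#endif /* IB_RBT_TESTING */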
+
+/**********************************************************************//**
+Delete a node identified by key.
+@return TRUE on success, FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to delete */
+{
+ ibool deleted = FALSE;
+ ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+ if (node) {
+ rbt_remove_node_and_rebalance(tree, node);
+
+ ut_free(node);
+ deleted = TRUE;
+ }
+
+ return(deleted);
+}
+
+/**********************************************************************//**
+Remove a node from the rb tree; the node is not freed, that is the
+caller's responsibility.
+@return deleted node but without the const */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* const_node) /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller can access
+ only const nodes */
+{
+ /* Cast away the const. */
+ rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node);
+
+ /* This is to make it easier to do something like this:
+	ut_free(rbt_remove_node(tree, node));
+ */
+
+ return((ib_rbt_node_t*) const_node);
+}
+
+/**********************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node satisfying the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* lb_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ result = tree->compare(key, current->value);
+ }
+
+ if (result > 0) {
+
+ current = current->right;
+
+ } else if (result < 0) {
+
+ lb_node = current;
+ current = current->left;
+
+ } else {
+ lb_node = current;
+ break;
+ }
+ }
+
+ return(lb_node);
+}
+
+/**********************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node satisfying the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* ub_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result;
+
+ if (tree->cmp_arg) {
+ result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ result = tree->compare(key, current->value);
+ }
+
+ if (result > 0) {
+
+ ub_node = current;
+ current = current->right;
+
+ } else if (result < 0) {
+
+ current = current->left;
+
+ } else {
+ ub_node = current;
+ break;
+ }
+ }
+
+ return(ub_node);
+}
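+
+/* Note (an illustration, not part of the original text): with keys
+{10, 20, 30} in the tree and a search key of 25, rbt_lower_bound()
+returns the node holding 30 (the lowest key >= 25), matching
+std::lower_bound(); rbt_upper_bound() returns the node holding 20
+(the greatest key <= 25), which is NOT the same as std::upper_bound().
+With a search key of 35, rbt_lower_bound() returns NULL. */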
+
+/**********************************************************************//**
+Search for the key, recording the last visited node and the result of
+the last comparison in 'parent'.
+@return result of the last comparison: 0 if the key was found */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+	/* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+
+ if (tree->cmp_arg) {
+ parent->result = tree->compare_with_arg(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent->result = tree->compare(key, current->value);
+ }
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/**********************************************************************//**
+Like rbt_search(), but uses the supplied comparison function.
+@return result of the last comparison: 0 if the key was found */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare, /*!< in: fn to compare items */
+ ib_rbt_arg_compare
+ arg_compare) /*!< in: fn to compare items
+ with argument */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+	/* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+
+ if (arg_compare) {
+ ut_ad(tree->cmp_arg);
+ parent->result = arg_compare(
+ tree->cmp_arg, key, current->value);
+ } else {
+ parent->result = compare(key, current->value);
+ }
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/**********************************************************************//**
+Return the left most node in the tree.
+@return the leftmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+	const ib_rbt_t*	tree)	/*!< in: rb tree */
+{
+ ib_rbt_node_t* first = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ first = current;
+ current = current->left;
+ }
+
+ return(first);
+}
+
+/**********************************************************************//**
+Return the right most node in the tree.
+@return the rightmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* last = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ last = current;
+ current = current->right;
+ }
+
+ return(last);
+}
+
+/**********************************************************************//**
+Return the next node.
+@return node next from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_successor(tree, current) : NULL);
+}
+
+/**********************************************************************//**
+Return the previous node.
+@return node prev from current */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: current node */
+{
+ return(current ? rbt_find_predecessor(tree, current) : NULL);
+}
+
+/**********************************************************************//**
+Reset the tree. Delete all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree) /*!< in: rb tree */
+{
+ rbt_free_node(ROOT(tree), tree->nil);
+
+ tree->n_nodes = 0;
+ tree->root->left = tree->root->right = tree->nil;
+}
+
+/**********************************************************************//**
+Merge the nodes from src into dst, skipping keys that already exist
+in dst.
+@return number of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ulint n_merged = 0;
+ const ib_rbt_node_t* src_node = rbt_first(src);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (/* No op */; src_node; src_node = rbt_next(src, src_node)) {
+
+ if (rbt_search(dst, &parent, src_node->value) != 0) {
+ rbt_add_node(dst, &parent, src_node->value);
+ ++n_merged;
+ }
+ }
+
+ return(n_merged);
+}
+
+/**********************************************************************//**
+Merge the nodes from src into dst, skipping keys that already exist
+in dst. Merged nodes are unlinked from src and relinked into dst; as
+a side effect, the duplicates are left behind in src.
+@return number of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* src_node;
+ ulint old_size = rbt_size(dst);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
+ ib_rbt_node_t* prev = src_node;
+
+ src_node = (ib_rbt_node_t*) rbt_next(src, prev);
+
+ /* Skip duplicates. */
+ if (rbt_search(dst, &parent, prev->value) != 0) {
+
+ /* Remove and reset the node but preserve
+ the node (data) value. */
+ rbt_remove_node_and_rebalance(src, prev);
+
+ /* The nil should be taken from the dst tree. */
+ prev->parent = prev->left = prev->right = dst->nil;
+ rbt_tree_add_child(dst, &parent, prev);
+ rbt_balance_tree(dst, prev);
+
+ ++dst->n_nodes;
+ }
+ }
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(dst));
+ ut_a(rbt_validate(src));
+#endif
+ return(rbt_size(dst) - old_size);
+}
+
+/**********************************************************************//**
+Check that every path from the root to the leaves contains the same
+number of black nodes and that the tree nodes are in order.
+@return TRUE if OK, FALSE otherwise */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree) /*!< in: RB tree to validate */
+{
+ if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) {
+ return(rbt_check_ordering(tree));
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print) /*!< in: print function */
+{
+ rbt_print_subtree(tree, ROOT(tree), print);
+}
diff --git a/storage/innobase/ut/ut0rnd.cc b/storage/innobase/ut/ut0rnd.cc
new file mode 100644
index 00000000000..3b4d7381181
--- /dev/null
+++ b/storage/innobase/ut/ut0rnd.cc
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0rnd.cc
+Random numbers and hashing
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+/** These random numbers are used in ut_find_prime */
+/*@{*/
+#define UT_RANDOM_1 1.0412321
+#define UT_RANDOM_2 1.1131347
+#define UT_RANDOM_3 1.0132677
+/*@}*/
+
+/** Seed value of ut_rnd_gen_ulint(). */
+UNIV_INTERN ulint ut_rnd_ulint_counter = 65654363;
+
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+{
+ ulint pow2;
+ ulint i;
+
+ n += 100;
+
+ pow2 = 1;
+ while (pow2 * 2 < n) {
+ pow2 = 2 * pow2;
+ }
+
+ if ((double) n < 1.05 * (double) pow2) {
+ n = (ulint) ((double) n * UT_RANDOM_1);
+ }
+
+ pow2 = 2 * pow2;
+
+ if ((double) n > 0.95 * (double) pow2) {
+ n = (ulint) ((double) n * UT_RANDOM_2);
+ }
+
+ if (n > pow2 - 20) {
+ n += 30;
+ }
+
+ /* Now we have n far enough from powers of 2. To make
+	n more random (especially if it was not near
+ a power of 2), we then multiply it by a random number. */
+
+ n = (ulint) ((double) n * UT_RANDOM_3);
+
+ for (;; n++) {
+ i = 2;
+ while (i * i <= n) {
+ if (n % i == 0) {
+ goto next_n;
+ }
+ i++;
+ }
+
+ /* Found a prime */
+ break;
+next_n: ;
+ }
+
+ return(n);
+}
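+
+/* Illustrative sketch, not part of the original patch: ut_find_prime()
+is typically used to size hash tables, picking a prime cell count for
+an expected number of elements ('n_expected_elems' is hypothetical). */
+#if 0
+	ulint	n_cells = ut_find_prime(n_expected_elems);
+#endif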
diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc
new file mode 100644
index 00000000000..68446cc85ef
--- /dev/null
+++ b/storage/innobase/ut/ut0ut.cc
@@ -0,0 +1,840 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0ut.cc
+Various utilities for Innobase.
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0ut.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "ut0sort.h"
+#include "os0thread.h" /* thread-ID */
+
+#ifdef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "ha_prototypes.h"
+# include "mysql_com.h" /* NAME_LEN */
+#endif /* UNIV_HOTBACKUP */
+
+/** A constant to prevent the compiler from optimizing ut_delay() away. */
+UNIV_INTERN ibool ut_always_false = FALSE;
+
+#ifdef __WIN__
+/*****************************************************************//**
+NOTE: The Windows epoch starts from 1601/01/01 whereas the Unix
+epoch starts from 1970/1/1. For selection of constant see:
+http://support.microsoft.com/kb/167296/ */
+#define WIN_TO_UNIX_DELTA_USEC ((ib_int64_t) 11644473600000000ULL)
+
+
+/*****************************************************************//**
+This is the Windows version of gettimeofday(2).
+@return 0 if all OK else -1 */
+static
+int
+ut_gettimeofday(
+/*============*/
+ struct timeval* tv, /*!< out: Values are relative to Unix epoch */
+ void* tz) /*!< in: not used */
+{
+ FILETIME ft;
+ ib_int64_t tm;
+
+ if (!tv) {
+ errno = EINVAL;
+ return(-1);
+ }
+
+ GetSystemTimeAsFileTime(&ft);
+
+ tm = (ib_int64_t) ft.dwHighDateTime << 32;
+ tm |= ft.dwLowDateTime;
+
+ ut_a(tm >= 0); /* If tm wraps over to negative, the quotient / 10
+ does not work */
+
+ tm /= 10; /* Convert from 100 nsec periods to usec */
+
+ /* If we don't convert to the Unix epoch the value for
+ struct timeval::tv_sec will overflow.*/
+ tm -= WIN_TO_UNIX_DELTA_USEC;
+
+ tv->tv_sec = (long) (tm / 1000000L);
+ tv->tv_usec = (long) (tm % 1000000L);
+
+ return(0);
+}
+#else
+/** An alias for gettimeofday(2). On Microsoft Windows, we have to
+reimplement this function. */
+#define ut_gettimeofday gettimeofday
+#endif
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void)
+/*=========*/
+{
+ return(time(NULL));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Returns system time.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return 0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+ ulint* sec, /*!< out: seconds since the Epoch */
+ ulint* ms) /*!< out: microseconds since the Epoch+*sec */
+{
+ struct timeval tv;
+ int ret;
+ int errno_gettimeofday;
+ int i;
+
+ for (i = 0; i < 10; i++) {
+
+ ret = ut_gettimeofday(&tv, NULL);
+
+ if (ret == -1) {
+ errno_gettimeofday = errno;
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: gettimeofday(): %s\n",
+ strerror(errno_gettimeofday));
+ os_thread_sleep(100000); /* 0.1 sec */
+ errno = errno_gettimeofday;
+ } else {
+ break;
+ }
+ }
+
+ if (ret != -1) {
+ *sec = (ulint) tv.tv_sec;
+ *ms = (ulint) tv.tv_usec;
+ }
+
+ return(ret);
+}
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+ ullint* tloc) /*!< out: us since epoch, if non-NULL */
+{
+ struct timeval tv;
+ ullint us;
+
+ ut_gettimeofday(&tv, NULL);
+
+ us = (ullint) tv.tv_sec * 1000000 + tv.tv_usec;
+
+ if (tloc != NULL) {
+ *tloc = us;
+ }
+
+ return(us);
+}
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void)
+/*============*/
+{
+ struct timeval tv;
+
+ ut_gettimeofday(&tv, NULL);
+
+ return((ulint) tv.tv_sec * 1000 + tv.tv_usec / 1000);
+}
+#endif /* !UNIV_HOTBACKUP */
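+
+/* Illustrative sketch, not part of the original patch: measuring a
+coarse elapsed interval with ut_time_ms(). Since the value may wrap,
+only short heuristic intervals should be measured this way. */
+#if 0
+	ulint	start_ms = ut_time_ms();
+	/* ... do some work ... */
+	ulint	elapsed_ms = ut_time_ms() - start_ms;
+#endif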
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+ ib_time_t time2, /*!< in: time */
+ ib_time_t time1) /*!< in: time */
+{
+ return(difftime(time2, time1));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+{
+ ulint thread_id = 0;
+
+#ifndef UNIV_INNOCHECKSUM
+ thread_id = os_thread_pf(os_thread_get_curr_id());
+#endif
+
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx",
+ (int) cal_tm.wYear,
+ (int) cal_tm.wMonth,
+ (int) cal_tm.wDay,
+ (int) cal_tm.wHour,
+ (int) cal_tm.wMinute,
+ (int) cal_tm.wSecond,
+ thread_id);
+#else
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+#ifdef HAVE_LOCALTIME_R
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ time(&tm);
+ cal_tm_ptr = localtime(&tm);
+#endif
+ fprintf(file, "%d-%02d-%02d %02d:%02d:%02d %lx",
+ cal_tm_ptr->tm_year + 1900,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec,
+ thread_id);
+#endif
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+ (int) cal_tm.wYear % 100,
+ (int) cal_tm.wMonth,
+ (int) cal_tm.wDay,
+ (int) cal_tm.wHour,
+ (int) cal_tm.wMinute,
+ (int) cal_tm.wSecond);
+#else
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+#ifdef HAVE_LOCALTIME_R
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ time(&tm);
+ cal_tm_ptr = localtime(&tm);
+#endif
+ sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+ cal_tm_ptr->tm_year % 100,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec);
+#endif
+}
+
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
+ (int) cal_tm.wYear % 100,
+ (int) cal_tm.wMonth,
+ (int) cal_tm.wDay,
+ (int) cal_tm.wHour,
+ (int) cal_tm.wMinute,
+ (int) cal_tm.wSecond);
+#else
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+#ifdef HAVE_LOCALTIME_R
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ time(&tm);
+ cal_tm_ptr = localtime(&tm);
+#endif
+ sprintf(buf, "%02d%02d%02d_%2d_%02d_%02d",
+ cal_tm_ptr->tm_year % 100,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec);
+#endif
+}
+
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+ ulint* year, /*!< out: current year */
+ ulint* month, /*!< out: month */
+ ulint* day) /*!< out: day */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ *year = (ulint) cal_tm.wYear;
+ *month = (ulint) cal_tm.wMonth;
+ *day = (ulint) cal_tm.wDay;
+#else
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+#ifdef HAVE_LOCALTIME_R
+ struct tm cal_tm;
+ time(&tm);
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ time(&tm);
+ cal_tm_ptr = localtime(&tm);
+#endif
+ *year = (ulint) cal_tm_ptr->tm_year + 1900;
+ *month = (ulint) cal_tm_ptr->tm_mon + 1;
+ *day = (ulint) cal_tm_ptr->tm_mday;
+#endif
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++.
+@return dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+ ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */
+{
+ ulint i, j;
+
+ j = 0;
+
+ for (i = 0; i < delay * 50; i++) {
+ j += i;
+ UT_RELAX_CPU();
+ }
+
+ if (ut_always_false) {
+ ut_always_false = (ibool) j;
+ }
+
+ return(j);
+}
+#endif /* !UNIV_HOTBACKUP */
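+
+/* Illustrative sketch, not part of the original patch: the classic
+spin-wait use of ut_delay(), assuming the ut_rnd_interval() helper
+from ut0rnd.h and the srv_spin_wait_delay tunable; try_acquire() is
+hypothetical. */
+#if 0
+	while (!try_acquire(lock)) {
+		ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+	}
+#endif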
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ASCII. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+{
+ const byte* data;
+ ulint i;
+
+ UNIV_MEM_ASSERT_RW(buf, len);
+
+ fprintf(file, " len %lu; hex ", len);
+
+ for (data = (const byte*) buf, i = 0; i < len; i++) {
+ fprintf(file, "%02lx", (ulong)*data++);
+ }
+
+ fputs("; asc ", file);
+
+ data = (const byte*) buf;
+
+ for (i = 0; i < len; i++) {
+ int c = (int) *data++;
+ putc(isprint(c) ? c : ' ', file);
+ }
+
+ putc(';', file);
+}
+
+/**********************************************************************//**
+Sort function for ulint arrays. */
+UNIV_INTERN
+void
+ut_ulint_sort(
+/*==========*/
+ ulint* arr, /*!< in/out: array to sort */
+ ulint* aux_arr, /*!< in/out: aux array to use in sort */
+ ulint low, /*!< in: lower bound */
+ ulint high) /*!< in: upper bound */
+{
+ UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high,
+ ut_ulint_cmp);
+}
+
+/*************************************************************//**
+Calculates fast the number rounded up to the nearest power of 2.
+@return first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 1;
+
+ ut_ad(n > 0);
+
+ while (res < n) {
+ res = res * 2;
+ }
+
+ return(res);
+}
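+
+/* Worked example, not part of the original patch: ut_2_power_up(100)
+doubles res through 1, 2, 4, ..., 64, 128 and returns 128; an exact
+power of two such as 64 is returned unchanged. */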
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const char* name) /*!< in: name to print */
+{
+ putc('\'', f);
+ for (;;) {
+ int c = *name++;
+ switch (c) {
+ case 0:
+ goto done;
+ case '\'':
+ putc(c, f);
+ /* fall through */
+ default:
+ putc(c, f);
+ }
+ }
+done:
+ putc('\'', f);
+}
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Outputs a NUL-terminated string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name) /*!< in: name to print */
+{
+ ut_print_namel(f, trx, table_id, name, strlen(name));
+}
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+ FILE* f, /*!< in: output stream */
+ const trx_t* trx, /*!< in: transaction (NULL=no quotes) */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name, /*!< in: name to print */
+ ulint namelen)/*!< in: length of name */
+{
+ /* 2 * NAME_LEN for database and table name,
+ and some slack for the #mysql50# prefix and quotes */
+ char buf[3 * NAME_LEN];
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, sizeof buf,
+ name, namelen,
+ trx ? trx->mysql_thd : NULL,
+ table_id);
+
+ fwrite(buf, 1, bufend - buf, f);
+}
+
+/**********************************************************************//**
+Formats a table or index name, quoted as an SQL identifier. If the name
+contains a slash '/', the result will contain two identifiers separated by
+a period (.), as in SQL database_name.identifier.
+@return pointer to 'formatted' */
+UNIV_INTERN
+char*
+ut_format_name(
+/*===========*/
+ const char* name, /*!< in: table or index name, must be
+ '\0'-terminated */
+ ibool is_table, /*!< in: if TRUE then 'name' is a table
+ name */
+ char* formatted, /*!< out: formatted result, will be
+ '\0'-terminated */
+ ulint formatted_size) /*!< out: no more than this number of
+ bytes will be written to 'formatted' */
+{
+ switch (formatted_size) {
+ case 1:
+ formatted[0] = '\0';
+ /* FALL-THROUGH */
+ case 0:
+ return(formatted);
+ }
+
+ char* end;
+
+ end = innobase_convert_name(formatted, formatted_size,
+ name, strlen(name), NULL, is_table);
+
+ /* If the space in 'formatted' was completely used, then sacrifice
+ the last character in order to write '\0' at the end. */
+ if ((ulint) (end - formatted) == formatted_size) {
+ end--;
+ }
+
+ ut_a((ulint) (end - formatted) < formatted_size);
+
+ *end = '\0';
+
+ return(formatted);
+}
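+
+/* Illustrative sketch, not part of the original patch: formatting a
+table name into a stack buffer, assuming the MAX_FULL_NAME_LEN constant
+from univ.i; 'table' is hypothetical. */
+#if 0
+	char	buf[MAX_FULL_NAME_LEN + 1];
+
+	ut_format_name(table->name, TRUE, buf, sizeof(buf));
+	fprintf(stderr, "InnoDB: processing table %s\n", buf);
+#endif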
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src) /*!< in: input file to be appended to output */
+{
+ long len = ftell(src);
+ char buf[4096];
+
+ rewind(src);
+ do {
+ size_t maxs = len < (long) sizeof buf
+ ? (size_t) len
+ : sizeof buf;
+ size_t size = fread(buf, 1, maxs, src);
+ fwrite(buf, 1, size, dest);
+ len -= (long) size;
+ if (size < maxs) {
+ break;
+ }
+ } while (len > 0);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+# include <stdarg.h>
+/**********************************************************************//**
+A substitute for vsnprintf(3): formatted output conversion into
+a limited buffer. Note: this function DOES NOT return the number of
+characters that would have been printed if the buffer were unlimited.
+VC's _vsnprintf() returns -1 in that case, and estimating the length
+with an extra _vscprintf() call would require another copy of "ap",
+but VC does not provide va_copy(). */
+UNIV_INTERN
+void
+ut_vsnprintf(
+/*=========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ va_list ap) /*!< in: format values */
+{
+ _vsnprintf(str, size, fmt, ap);
+ str[size - 1] = '\0';
+}
+
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: format values */
+{
+ int res;
+ va_list ap1;
+ va_list ap2;
+
+ va_start(ap1, fmt);
+ va_start(ap2, fmt);
+
+ res = _vscprintf(fmt, ap1);
+ ut_a(res != -1);
+
+ if (size > 0) {
+ _vsnprintf(str, size, fmt, ap2);
+
+ if ((size_t) res >= size) {
+ str[size - 1] = '\0';
+ }
+ }
+
+ va_end(ap1);
+ va_end(ap2);
+
+ return(res);
+}
+#endif /* __WIN__ */
+
+/*************************************************************//**
+Convert an error number to a human readable text message. The
+returned string is static and should not be freed or modified.
+@return string, describing the error */
+UNIV_INTERN
+const char*
+ut_strerr(
+/*======*/
+ dberr_t num) /*!< in: error number */
+{
+ switch (num) {
+ case DB_SUCCESS:
+ return("Success");
+ case DB_SUCCESS_LOCKED_REC:
+ return("Success, record lock created");
+ case DB_ERROR:
+ return("Generic error");
+ case DB_READ_ONLY:
+ return("Read only transaction");
+ case DB_INTERRUPTED:
+ return("Operation interrupted");
+ case DB_OUT_OF_MEMORY:
+ return("Cannot allocate memory");
+ case DB_OUT_OF_FILE_SPACE:
+ return("Out of disk space");
+ case DB_LOCK_WAIT:
+ return("Lock wait");
+ case DB_DEADLOCK:
+ return("Deadlock");
+ case DB_ROLLBACK:
+ return("Rollback");
+ case DB_DUPLICATE_KEY:
+ return("Duplicate key");
+ case DB_QUE_THR_SUSPENDED:
+ return("The queue thread has been suspended");
+ case DB_MISSING_HISTORY:
+ return("Required history data has been deleted");
+ case DB_CLUSTER_NOT_FOUND:
+ return("Cluster not found");
+ case DB_TABLE_NOT_FOUND:
+ return("Table not found");
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ return("More file space needed");
+ case DB_TABLE_IS_BEING_USED:
+ return("Table is being used");
+ case DB_TOO_BIG_RECORD:
+ return("Record too big");
+ case DB_TOO_BIG_INDEX_COL:
+ return("Index columns size too big");
+ case DB_LOCK_WAIT_TIMEOUT:
+ return("Lock wait timeout");
+ case DB_NO_REFERENCED_ROW:
+ return("Referenced key value not found");
+ case DB_ROW_IS_REFERENCED:
+ return("Row is referenced");
+ case DB_CANNOT_ADD_CONSTRAINT:
+ return("Cannot add constraint");
+ case DB_CORRUPTION:
+ return("Data structure corruption");
+ case DB_CANNOT_DROP_CONSTRAINT:
+ return("Cannot drop constraint");
+ case DB_NO_SAVEPOINT:
+ return("No such savepoint");
+ case DB_TABLESPACE_EXISTS:
+ return("Tablespace already exists");
+ case DB_TABLESPACE_DELETED:
+ return("Tablespace deleted or being deleted");
+ case DB_TABLESPACE_NOT_FOUND:
+ return("Tablespace not found");
+ case DB_LOCK_TABLE_FULL:
+ return("Lock structs have exhausted the buffer pool");
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return("Foreign key activated with duplicate keys");
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ return("Foreign key cascade delete/update exceeds max depth");
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ return("Too many concurrent transactions");
+ case DB_UNSUPPORTED:
+ return("Unsupported");
+ case DB_INVALID_NULL:
+ return("NULL value encountered in NOT NULL column");
+ case DB_STATS_DO_NOT_EXIST:
+ return("Persistent statistics do not exist");
+ case DB_FAIL:
+ return("Failed, retry may succeed");
+ case DB_OVERFLOW:
+ return("Overflow");
+ case DB_UNDERFLOW:
+ return("Underflow");
+ case DB_STRONG_FAIL:
+ return("Failed, retry will not succeed");
+ case DB_ZIP_OVERFLOW:
+ return("Zip overflow");
+ case DB_RECORD_NOT_FOUND:
+ return("Record not found");
+ case DB_CHILD_NO_INDEX:
+ return("No index on referencing keys in referencing table");
+ case DB_PARENT_NO_INDEX:
+ return("No index on referenced keys in referenced table");
+ case DB_FTS_INVALID_DOCID:
+ return("FTS Doc ID cannot be zero");
+ case DB_INDEX_CORRUPT:
+ return("Index corrupted");
+ case DB_UNDO_RECORD_TOO_BIG:
+ return("Undo record too big");
+ case DB_END_OF_INDEX:
+ return("End of index");
+ case DB_IO_ERROR:
+ return("I/O error");
+ case DB_TABLE_IN_FK_CHECK:
+ return("Table is being used in foreign key check");
+ case DB_DATA_MISMATCH:
+ return("data mismatch");
+ case DB_SCHEMA_NOT_LOCKED:
+ return("schema not locked");
+ case DB_NOT_FOUND:
+ return("not found");
+ case DB_ONLINE_LOG_TOO_BIG:
+ return("Log size exceeded during online index creation");
+ case DB_DICT_CHANGED:
+ return("Table dictionary has changed");
+ case DB_IDENTIFIER_TOO_LONG:
+ return("Identifier name is too long");
+ case DB_FTS_EXCEED_RESULT_CACHE_LIMIT:
+ return("FTS query exceeds result cache limit");
+ case DB_TEMP_FILE_WRITE_FAILURE:
+ return("Temp file write failure");
+ case DB_FTS_TOO_MANY_WORDS_IN_PHRASE:
+ return("Too many words in a FTS phrase or proximity search");
+ case DB_TOO_BIG_FOR_REDO:
+ return("BLOB record length is greater than 10%% of redo log");
+
+ /* do not add default: in order to produce a warning if new code
+ is added to the enum but not added here */
+ }
+
+ /* we abort here because if unknown error code is given, this could
+ mean that memory corruption has happened and someone's error-code
+ variable has been overwritten with bogus data */
+ ut_error;
+
+ /* NOT REACHED */
+ return("Unknown error");
+}
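+
+/* Illustrative one-liner, not part of the original patch: reporting a
+dberr_t via the error log ('err' is hypothetical). */
+#if 0
+	fprintf(stderr, "InnoDB: operation failed: %s\n", ut_strerr(err));
+#endif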
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/ut/ut0vec.cc b/storage/innobase/ut/ut0vec.cc
new file mode 100644
index 00000000000..5842d9f1c0e
--- /dev/null
+++ b/storage/innobase/ut/ut0vec.cc
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0vec.cc
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0vec.h"
+#ifdef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+#include "mem0mem.h"
+
+/********************************************************************//**
+Create a new vector with the given initial size.
+@return vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+	ib_alloc_t*	allocator,	/*!< in: vector allocator */
+	ulint		sizeof_value,	/*!< in: size of data item */
+	ulint		size)		/*!< in: initial size */
+{
+ ib_vector_t* vec;
+
+ ut_a(size > 0);
+
+ vec = static_cast<ib_vector_t*>(
+ allocator->mem_malloc(allocator, sizeof(*vec)));
+
+ vec->used = 0;
+ vec->total = size;
+ vec->allocator = allocator;
+ vec->sizeof_value = sizeof_value;
+
+ vec->data = static_cast<void*>(
+ allocator->mem_malloc(allocator, vec->sizeof_value * size));
+
+ return(vec);
+}
+
+/********************************************************************//**
+Resize the vector. Currently the vector can only grow: each resize
+doubles the number of elements it can hold. */
+UNIV_INTERN
+void
+ib_vector_resize(
+/*=============*/
+	ib_vector_t*	vec)	/*!< in: vector */
+{
+ ulint new_total = vec->total * 2;
+ ulint old_size = vec->used * vec->sizeof_value;
+ ulint new_size = new_total * vec->sizeof_value;
+
+ vec->data = static_cast<void*>(vec->allocator->mem_resize(
+ vec->allocator, vec->data, old_size, new_size));
+
+ vec->total = new_total;
+}
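+
+/* Illustrative sketch, not part of the original patch: creating and
+filling a vector of ulints. ib_heap_allocator_create() and
+ib_vector_push() are assumed from ut0vec.h/ut0vec.ic; 'heap' is
+hypothetical. */
+#if 0
+	ib_alloc_t*	heap_alloc = ib_heap_allocator_create(heap);
+	ib_vector_t*	vec = ib_vector_create(heap_alloc, sizeof(ulint), 4);
+	ulint		value = 42;
+
+	/* The vector stores copies of its elements; when full, its
+	capacity is doubled via ib_vector_resize(). */
+	ib_vector_push(vec, &value);
+#endif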
diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc
new file mode 100644
index 00000000000..d1ba36b3b00
--- /dev/null
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -0,0 +1,175 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2011, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include "ut0wqueue.h"
+
+/*******************************************************************//**
+@file ut/ut0wqueue.cc
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void)
+/*===================*/
+{
+ ib_wqueue_t* wq = static_cast<ib_wqueue_t*>(mem_alloc(sizeof(*wq)));
+
+	/* Function ib_wqueue_create() has not been used anywhere yet,
+	so it is not necessary to instrument this mutex. */
+ mutex_create(PFS_NOT_INSTRUMENTED, &wq->mutex, SYNC_WORK_QUEUE);
+
+ wq->items = ib_list_create();
+ wq->event = os_event_create();
+
+ return(wq);
+}
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ mutex_free(&wq->mutex);
+ ib_list_free(wq->items);
+ os_event_free(wq->event);
+
+ mem_free(wq);
+}
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+ ib_wqueue_t* wq, /*!< in: work queue */
+ void* item, /*!< in: work item */
+ mem_heap_t* heap) /*!< in: memory heap to use for allocating the
+ list node */
+{
+ mutex_enter(&wq->mutex);
+
+ ib_list_add_last(wq->items, item, heap);
+ os_event_set(wq->event);
+
+ mutex_exit(&wq->mutex);
+}
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ib_list_node_t* node;
+
+ for (;;) {
+ os_event_wait(wq->event);
+
+ mutex_enter(&wq->mutex);
+
+ node = ib_list_get_first(wq->items);
+
+ if (node) {
+ ib_list_remove(wq->items, node);
+
+ if (!ib_list_get_first(wq->items)) {
+ /* We must reset the event when the list
+ gets emptied. */
+ os_event_reset(wq->event);
+ }
+
+ break;
+ }
+
+ mutex_exit(&wq->mutex);
+ }
+
+ mutex_exit(&wq->mutex);
+
+ return(node->data);
+}
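+
+/* Illustrative sketch, not part of the original patch: a minimal
+producer/consumer pair over the functions above; 'work_item' and
+'heap' are hypothetical. */
+#if 0
+	/* Producer: enqueue and wake one waiting consumer. */
+	ib_wqueue_add(wq, work_item, heap);
+
+	/* Consumer: block until an item is available. */
+	void*	work = ib_wqueue_wait(wq);
+#endif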
+
+
+/********************************************************************//**
+Wait for a work item to appear in the queue, for at most the specified
+time.
+@return work item, or NULL on timeout */
+
+void*
+ib_wqueue_timedwait(
+/*================*/
+	ib_wqueue_t*	wq,		/*!< in: work queue */
+	ib_time_t	wait_in_usecs)	/*!< in: wait time in microseconds */
+{
+ ib_list_node_t* node = NULL;
+
+ for (;;) {
+ ulint error;
+ ib_int64_t sig_count;
+
+ mutex_enter(&wq->mutex);
+
+ node = ib_list_get_first(wq->items);
+
+ if (node) {
+ ib_list_remove(wq->items, node);
+
+ mutex_exit(&wq->mutex);
+ break;
+ }
+
+ sig_count = os_event_reset(wq->event);
+
+ mutex_exit(&wq->mutex);
+
+ error = os_event_wait_time_low(wq->event,
+ (ulint) wait_in_usecs,
+ sig_count);
+
+ if (error == OS_SYNC_TIME_EXCEEDED) {
+ break;
+ }
+ }
+
+ return(node ? node->data : NULL);
+}
+
+/********************************************************************//**
+Check if the queue is empty.
+@return TRUE if the queue is empty, FALSE otherwise */
+
+ibool
+ib_wqueue_is_empty(
+/*===============*/
+	const ib_wqueue_t*	wq)	/*!< in: work queue */
+{
+ return(ib_list_is_empty(wq->items));
+}